diff --git a/SoundScribe/SpeakerID/CITATION.cff b/SoundScribe/SpeakerID/CITATION.cff deleted file mode 100644 index 436750dd0af057b8c4d701cfb2a01a4ca0d4365a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/CITATION.cff +++ /dev/null @@ -1,41 +0,0 @@ -cff-version: 1.2.0 -message: "If you use this software, please cite it as below." -title: "NeMo: a toolkit for Conversational AI and Large Language Models" -url: https://nvidia.github.io/NeMo/ -repository-code: https://github.com/NVIDIA/NeMo -authors: - - family-names: Harper - given-names: Eric - - family-names: Majumdar - given-names: Somshubra - - family-names: Kuchaiev - given-names: Oleksii - - family-names: Jason - given-names: Li - - family-names: Zhang - given-names: Yang - - family-names: Bakhturina - given-names: Evelina - - family-names: Noroozi - given-names: Vahid - - family-names: Subramanian - given-names: Sandeep - - family-names: Nithin - given-names: Koluguri - - family-names: Jocelyn - given-names: Huang - - family-names: Jia - given-names: Fei - - family-names: Balam - given-names: Jagadeesh - - family-names: Yang - given-names: Xuesong - - family-names: Livne - given-names: Micha - - family-names: Dong - given-names: Yi - - family-names: Naren - given-names: Sean - - family-names: Ginsburg - given-names: Boris - diff --git a/SoundScribe/SpeakerID/CONTRIBUTING.md b/SoundScribe/SpeakerID/CONTRIBUTING.md deleted file mode 100644 index 621a37a171b7bb06a10491350697cdd671aa7a11..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/CONTRIBUTING.md +++ /dev/null @@ -1,79 +0,0 @@ -# Contributions are welcome! - -We do all of NeMo's development in the open. Contributions from NeMo community are welcome. - - -# Pull Requests (PR) Guidelines - -**Send your PRs to the `main` branch** - -1) Make sure your PR does one thing. Have a clear answer to "What does this PR do?". -2) Read General Principles and style guide below -3) Make sure you sign your commits. E.g. use ``git commit -s`` when before your commit -4) Make sure all unittests finish successfully before sending PR ``pytest`` or (if yor dev box does not have GPU) ``pytest --cpu`` from NeMo's root folder -5) Send your PR and request a review - -## Unit tests -Quick tests (locally, while developing) -``` -pytest -# If you don't have NVIDIA GPU do: -# pytest --cpu -``` -Full tests, including pre-trained model downloads -``` -pytest --with_downloads -``` - -## Whom should you ask for review: -1. For changes to NeMo's core: @ericharper, @titu1994, @blisc, or @okuchaiev -1. For changes to NeMo's ASR collection: @titu1994, @redoctopus, @jbalam-nv, or @okuchaiev -1. For changes to NeMo's NLP collection: @MaximumEntropy, @ericharper, @ekmb, @yzhang123, @VahidooX, @vladgets, or @okuchaiev -1. For changes to NeMo's TTS collection: @blisc, or @okuchaiev - -Note that some people may self-assign to review your PR - in which case, please wait for them to add a review. - -Your pull requests must pass all checks and peer-review before they can be merged. - -# General principles -1. **User-oriented**: make it easy for end users, even at the cost of writing more code in the background -1. **Robust**: make it hard for users to make mistakes. -1. **Well-tested**: please add simple, fast unittests. Consider adding CI tests for end-to-end functionality. -1. **Reusable**: for every piece of code, think about how it can be reused in the future and make it easy to be reused. -1. **Readable**: code should be easier to read. -1. 
**Legal**: if you copy even one line of code from the Internet, make sure that the code allows the license that NeMo supports. Give credit and link back to the code. -1. **Sensible**: code should make sense. If you think a piece of code might be confusing, write comments. - -## Class naming conventions -* No “I”, “Interface”, “NM” nor “NeMo” pre/postfixes anywhere -* Core interfaces have simple names: Typing, Cloud, Serialization, FileIO* -* Core classes have the simplest names ever: NeuralModule, Model, Graph, Dataset, Loss, Module* -* Abstract classes in the Model hierarchy have Model postfix -* A config class for MyModel should be called MyModelConfig -* Leaf Neural Module classes have simple names without any postfixes (e.g. AudioPreprocess) -* Leaf Datasets have Dataset postfix (e.g. AudioToSpeechLabelDataset) -* Leaf Losses have Loss postfix (e.g. CTCLoss) -* Leaf Models do not have any postfix, just name (e.g. QuartzNet) - -## Python style -We use ``black`` as our style guide. To check whether your code will pass style check (from the NeMo's repo folder) run: -``python setup.py style`` and if it does not pass run ``python setup.py style --fix``. - -1. Include docstrings for every class and method exposed to the user. -1. Use Python 3 type hints for every class and method exposed to the user. -1. Avoid wild import: ``from X import *`` unless in ``X.py``, ``__all__`` is defined. -1. Minimize the use of ``**kwargs``. -1. ``RaiseError`` is preferred to ``assert``. Write: ```if X: raise Error``` instead of ```assert X```. -1. Classes are preferred to standalone methods. -1. Methods should be atomic. A method shouldn't be longer than 75 lines, e.g. can be fit into the computer screen without scrolling. -1. If a method has arguments that don't fit into one line, each argument should be in its own line for readability. -1. Add ``__init__.py`` for every folder. -1. F-strings are prefered to formatted strings. -1. Loggers are preferred to print. In NeMo, you can use logger from ``from nemo.utils import logging`` -1. Private functions (functions start with ``_``) shouldn't be called outside its host file. -1. If a comment lasts multiple lines, use ``'''`` instead of ``#``. - -# Collections -Collection is a logical grouping of related Neural Modules. It is a grouping of modules that share a domain area or semantics. -When contributing module to a collection, please make sure it belongs to that category. -If you would like to start a new one and contribute back to the platform, you are very welcome to do so. diff --git a/SoundScribe/SpeakerID/Dockerfile b/SoundScribe/SpeakerID/Dockerfile deleted file mode 100644 index 06f96a091a2232174854390c74c17af2dd02c62a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/Dockerfile +++ /dev/null @@ -1,140 +0,0 @@ -# syntax=docker/dockerfile:experimental - -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
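The deleted CONTRIBUTING.md above spells out NeMo's class-naming and Python style rules (type hints, docstrings, f-strings, `from nemo.utils import logging`, explicit raises over asserts, `Dataset` postfix, `MyModelConfig` naming) but gives no end-to-end example. The sketch below shows how a small leaf class might look under those conventions; `AudioToLabelDataset` and its config fields are hypothetical names chosen for illustration, not classes from the repository.

```python
from dataclasses import dataclass

from nemo.utils import logging  # the guide prefers loggers over print


@dataclass
class AudioToLabelDatasetConfig:
    """Config for AudioToLabelDataset (a config for MyModel is named MyModelConfig)."""

    manifest_filepath: str = ""
    sample_rate: int = 16000


class AudioToLabelDataset:
    """Leaf dataset class: a plain name with the Dataset postfix, documented and type-hinted."""

    def __init__(self, cfg: AudioToLabelDatasetConfig) -> None:
        if not cfg.manifest_filepath:
            # explicit raises are preferred to bare asserts
            raise ValueError("manifest_filepath must be provided")
        self.cfg = cfg
        logging.info(f"Loading manifest from {cfg.manifest_filepath}")  # f-strings over str.format
```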
- -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.08-py3 - -# build an image that includes only the nemo dependencies, ensures that dependencies -# are included first for optimal caching, and useful for building a development -# image (by specifying build target as `nemo-deps`) -FROM ${BASE_IMAGE} as nemo-deps - -# dependency flags; should be declared after FROM -# torchaudio: not required by default -ARG REQUIRE_TORCHAUDIO=false -# k2: not required by default -ARG REQUIRE_K2=false -# ais cli: not required by default, install only if required -ARG REQUIRE_AIS_CLI=false - -# Ensure apt-get won't prompt for selecting options -ENV DEBIAN_FRONTEND=noninteractive -# libavdevice-dev rerquired for latest torchaudio -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y \ - libsndfile1 sox \ - libfreetype6 \ - swig \ - ffmpeg \ - libavdevice-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /workspace/ -# install megatron core, this can be removed once 0.3 pip package is released -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \ - pip install -e . - -WORKDIR /tmp/ - -# Distributed Adam support for multiple dtypes -RUN git clone https://github.com/NVIDIA/apex.git && \ - cd apex && \ - git checkout 52e18c894223800cb611682dce27d88050edf1de && \ - pip3 install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ - -# uninstall stuff from base container -RUN pip3 uninstall -y sacrebleu torchtext - -# build torchaudio -WORKDIR /tmp/torchaudio_build -COPY scripts/installers /tmp/torchaudio_build/scripts/installers/ -RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_torchaudio_latest.sh); INSTALL_CODE=$?; \ - echo ${INSTALL_MSG}; \ - if [ ${INSTALL_CODE} -ne 0 ]; then \ - echo "torchaudio installation failed"; \ - if [ "${REQUIRE_TORCHAUDIO}" = true ]; then \ - exit ${INSTALL_CODE}; \ - else echo "Skipping failed torchaudio installation"; fi \ - else echo "torchaudio installed successfully"; fi - -# install nemo dependencies -WORKDIR /tmp/nemo -COPY requirements . -RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done - -# install flash attention dependencies -RUN pip install flash-attn -# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3 -RUN pip install triton==2.0.0.dev20221202 -# install numba for latest containers -RUN pip install numba>=0.57.1 - -# install k2, skip if installation fails -COPY scripts /tmp/nemo/scripts/ -RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh); INSTALL_CODE=$?; \ - echo ${INSTALL_MSG}; \ - if [ ${INSTALL_CODE} -ne 0 ]; then \ - echo "k2 installation failed"; \ - if [ "${REQUIRE_K2}" = true ]; then \ - exit ${INSTALL_CODE}; \ - else echo "Skipping failed k2 installation"; fi \ - else echo "k2 installed successfully"; fi - -# copy nemo source into a scratch image -FROM scratch as nemo-src -COPY . . - -# start building the final container -FROM nemo-deps as nemo -ARG NEMO_VERSION=1.21.0 - -# Check that NEMO_VERSION is set. Build will fail without this. 
Expose NEMO and base container -# version information as runtime environment variable for introspection purposes -RUN /usr/bin/test -n "$NEMO_VERSION" && \ - /bin/echo "export NEMO_VERSION=${NEMO_VERSION}" >> /root/.bashrc && \ - /bin/echo "export BASE_IMAGE=${BASE_IMAGE}" >> /root/.bashrc - -# Install NeMo -RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]" - -# Check install -RUN python -c "import nemo.collections.nlp as nemo_nlp" && \ - python -c "import nemo.collections.tts as nemo_tts" && \ - python -c "import nemo_text_processing.text_normalization as text_normalization" - - -# copy scripts/examples/tests into container for end user -WORKDIR /workspace/nemo -COPY scripts /workspace/nemo/scripts -COPY examples /workspace/nemo/examples -COPY tests /workspace/nemo/tests -COPY tutorials /workspace/nemo/tutorials -# COPY README.rst LICENSE /workspace/nemo/ - -RUN printf "#!/bin/bash\njupyter lab --no-browser --allow-root --ip=0.0.0.0" >> start-jupyter.sh && \ - chmod +x start-jupyter.sh - -# If required, install AIS CLI -RUN if [ "${REQUIRE_AIS_CLI}" = true ]; then \ - INSTALL_MSG=$(/bin/bash scripts/installers/install_ais_cli_latest.sh); INSTALL_CODE=$?; \ - echo ${INSTALL_MSG}; \ - if [ ${INSTALL_CODE} -ne 0 ]; then \ - echo "AIS CLI installation failed"; \ - exit ${INSTALL_CODE}; \ - else echo "AIS CLI installed successfully"; fi \ - else echo "Skipping AIS CLI installation"; fi diff --git a/SoundScribe/SpeakerID/Jenkinsfile b/SoundScribe/SpeakerID/Jenkinsfile deleted file mode 100644 index cff8c0f03924a384e97325893a401e7d61b3e31a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/Jenkinsfile +++ /dev/null @@ -1,4862 +0,0 @@ -pipeline { - agent { - docker { - image 'nvcr.io/nvidia/pytorch:23.09-py3' - args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1 --env HYDRA_FULL_ERROR=1' - } - } - options { - timeout(time: 8, unit: 'HOURS') - disableConcurrentBuilds(abortPrevious: true) - } - - stages { - - stage('Add git safe directory'){ - steps{ - sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /raid/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - sh 'git config --global --add safe.directory /mnt/D3/JenkinsWorkDir/workspace/NeMo_$GIT_BRANCH' - } - } - - stage('nvidia-smi'){ - steps{ - sh 'nvidia-smi' - } - } - - stage('PyTorch version') { - steps { - sh 'python -c "import torch; print(torch.__version__)"' - sh 'python -c "import torchvision; print(torchvision.__version__)"' - } - } - - stage('Install test requirements') { - steps { - sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt' - } - } - - stage('Code formatting checks') { - steps { - sh 'python setup.py style' - } - } - - stage('Copyright Headers check') { - steps { - sh 'python tests/check_copyright_header.py --dir .' 
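The deleted Dockerfile above gates torchaudio, k2, and the AIS CLI behind `REQUIRE_*` build args: each installer's exit code is captured, and the build only aborts when the corresponding flag is true. A rough Python sketch of that same "optional unless required" pattern is shown here for illustration; the helper name and the environment-variable lookup are assumptions, not code from the image.

```python
# Illustrative only: mirrors the Dockerfile's REQUIRE_TORCHAUDIO / REQUIRE_K2 logic
# (skip a failed optional dependency unless it was explicitly required).
import importlib
import os
import sys


def check_optional(module_name: str, require_env: str) -> None:
    """Import an optional dependency; fail hard only if the REQUIRE_* flag is set."""
    required = os.environ.get(require_env, "false").lower() == "true"
    try:
        importlib.import_module(module_name)
        print(f"{module_name} available")
    except ImportError as err:
        print(f"{module_name} unavailable: {err}")
        if required:
            sys.exit(1)  # analogous to `exit ${INSTALL_CODE}` when REQUIRE_* is true
        print(f"Skipping optional dependency {module_name}")


check_optional("torchaudio", "REQUIRE_TORCHAUDIO")
check_optional("k2", "REQUIRE_K2")
```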
- } - } - - stage('NeMo Installation') { - steps { - sh './reinstall.sh release' - } - } - - // megatron-core 0.3 has been pinned in the requirements, this should not be needed on r1.21.0 - // stage('Megatron Core installation') { - // steps { - // // pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/ab0336a5c8eab77aa74ae604ba1e73decbf6d560 - // // ToT for 23.08 branch - // sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ - // cd Megatron-LM && \ - // git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \ - // pip install -e .' - // } - // } - - - stage('PyTorch Lightning version') { - steps { - sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' - } - } - - stage('PyTorch Lightning DDP Checks') { - steps { - sh 'CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"' - } - } - - stage('Basic Import Checks') { - steps { - sh 'python -c "import nemo.collections.asr as nemo_asr"' - sh 'python -c "import nemo.collections.nlp as nemo_nlp"' - sh 'python -c "import nemo.collections.tts as nemo_tts"' - } - } - stage('L0: Unit Tests GPU') { - steps { - sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads' - } - } - - stage('L0: Unit Tests CPU') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' - } - } - - // TODO: this requires TE >= v0.11 which is not available in 23.06. - // please uncomment this test once mcore CI is ready. - stage('L2: Community LLM Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Llama') { - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ - --in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \ - --out-file=/home/TestData/nlp/megatron_llama/ci.nemo \ - --precision=16' - sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo' - } - } - stage('StarCoder') { - steps { - sh 'python scripts/nlp_language_modeling/convert_starcoder_hf_to_nemo.py \ - --config examples/nlp/language_modeling/conf/megatron_gpt_config.yaml \ - --input /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ - --output /home/TestData/nlp/megatron_gpt/starcoder-ci-hf' - sh 'rm -f /home/TestData/nlp/megatron_gpt/starcoder-ci-hf/megatron_starcoder_tp1_pp1.nemo' - } - } - } - } - - stage('L2: ASR dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Text WPE - CitriNet') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" --config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=[1] \ - 
trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_results' - } - } - - stage('Speech Pre-training - CitriNet') { - steps { - sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_pre_training_results' - sh 'rm -rf examples/asr/speech_pre_training_results' - } - } - - stage('Speech To Text Finetuning') { - steps { - sh 'python examples/asr/speech_to_text_finetune.py \ - --config-path="conf" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_finetuning_results' - sh 'rm -rf examples/asr/speech_finetuning_results' - } - } - - // TODO: Please Fix Me - // Error locating target 'nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder', see chained exception above. - // stage('L2: Speech Pre-training - Wav2Vec') { - // steps { - // sh 'python examples/asr/speech_pretraining/speech_pre_training.py \ - // --config-path="../conf/ssl/wav2vec/" --config-name="wav2vec_ci" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_pre_training_results' - // sh 'rm -rf examples/asr/speech_pre_training_results' - // } - // } - - stage('L2: Speech to Text WPE - Conformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_conformer_results' - } - } - } - } - - stage('L2: ASR dev run - part two') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Speech to Text WPE - Squeezeformer') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" 
\ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results' - sh 'rm -rf examples/asr/speech_to_text_wpe_squeezeformer_results' - } - } - } - } - - stage('L2: Speech to Text EMA') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - - } - - stage('L2: Speaker dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speaker Recognition') { - steps { - sh 'python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results' - sh 'rm -rf examples/speaker_tasks/recognition/speaker_recognition_results' - } - } - - stage('Speaker Diarization') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_results' - } - } - - stage('Speech to Label') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - - stage('Speaker Diarization with ASR Inference') { 
- steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=examples/speaker_tasks/diarization/speaker_diarization_asr_results' - sh 'rm -rf examples/speaker_tasks/diarization/speaker_diarization_asr_results' - } - } - - stage('Clustering Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/clustering_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/clustering_diarizer_results' - } - } - - stage('Neural Diarizer Inference') { - steps { - sh 'python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=examples/speaker_tasks/diarization/neural_diarizer_results' - sh 'rm -rf examples/speaker_tasks/diarization/neural_diarizer_results' - } - } - - stage('Multispeaker ASR Data Simulation') { - steps { - sh 'python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=./test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60' - sh 'rm -rf ./test_simulator' - } - } - } - } - // TODO: Enable test after 21.08 container is used. 
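Nearly every CI stage above points `manifest_filepath` at files such as `/home/TestData/an4_dataset/an4_train.json`, but the manifest layout itself never appears in this diff. As a hedged illustration, the sketch below writes a tiny manifest in the JSON-lines form NeMo's ASR and speaker examples typically expect; the file names, durations, and transcripts are invented.

```python
# Illustrative sketch only: assumes the JSON-lines manifest layout commonly used by
# NeMo ASR/speaker recipes (one JSON object per line with audio_filepath, duration, text).
import json

entries = [
    {"audio_filepath": "audio/sample_0001.wav", "duration": 2.4, "text": "hello world"},
    {"audio_filepath": "audio/sample_0002.wav", "duration": 1.7, "text": "good morning"},
]

with open("toy_train_manifest.json", "w", encoding="utf-8") as fh:
    for entry in entries:
        fh.write(json.dumps(entry) + "\n")  # one object per line, not a JSON array
```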
- // stage('L2: ASR DALI dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // sh 'rm -rf examples/asr/speech_to_text_results' - // } - // } - // stage('Speech to Text BPE - DALI AudioToMelSpectrogramPreprocessor') { - // steps { - // sh 'python examples/asr/asr_ctc/speech_to_text_bpe.py \ - // --config-path="../conf/citrinet/" --config-name="config_bpe" \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // +model.train_ds.use_dali=True \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // +model.validation_ds.use_dali=True \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_wpe_results' - // } - // } - // // TODO: This would fail due to an unnecessary torchaudio import. - // // To be enabled once torchaudio is available in the container used for CI - // // stage('Speech to Text - DALI AudioToMFCCPreprocessor') { - // // steps { - // // sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - // // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // // +model.train_ds.use_dali=True \ - // // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // // +model.validation_ds.use_dali=True \ - // // model.preprocessor._target_=nemo.collections.asr.modules.AudioToMFCCPreprocessor \ - // // ~model.preprocessor.normalize \ - // // ~model.preprocessor.features \ - // // ~model.preprocessor.frame_splicing \ - // // ~model.preprocessor.dither \ - // // ~model.preprocessor.stft_conv \ - // // +model.n_mels=64 \ - // // +model.n_mfcc=64 \ - // // trainer.devices=[1] \ - // // trainer.accelerator="gpu" \ - // // +trainer.fast_dev_run=True \ - // // exp_manager.exp_dir=examples/asr/speech_to_text_results' - // // sh 'rm -rf examples/asr/speech_to_text_results' - // // } - // // } - // } - // } - - // TODO: Add back once CI is updated - // stage('L2: ASR RNNT dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text - RNNT') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // 
exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_results' - // } - // } - // stage('L2: Speech to Text RNNT WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \ - // --config-path="../conf/contextnet_rnnt/" --config-name="config_rnnt_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_rnnt_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_rnnt_wpe_results' - // } - // } - // stage('L3: Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - // stage('L2: Hybrid ASR RNNT-CTC dev run') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Speech to Text Hybrid Transducer-CTC WPE') { - // steps { - // sh 'STRICT_NUMBA_COMPAT_CHECK=false python examples/asr/asr_hybrid_transducer_ctc/speech_to_text_hybrid_rnnt_ctc_bpe.py \ - // --config-path="../conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc/" --config-name="conformer_hybrid_transducer_ctc_bpe.yaml" \ - // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - // model.encoder.n_layers= 2 \ - // model.train_ds.batch_size=2 \ - // model.validation_ds.batch_size=2 \ - // model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - // model.tokenizer.type="wpe" \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=True \ - // exp_manager.exp_dir=examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // sh 'rm -rf examples/asr/speech_to_text_hybrid_transducer_ctc_wpe_results' - // } - // } - // } - // } - - stage('L2: ASR Multi-dataloader dev run') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text multi-dataloader') { - steps { - sh 'python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - 
model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=examples/asr/speech_to_text_results' - sh 'rm -rf examples/asr/speech_to_text_results' - } - } - - stage('Speech to Label multi-dataloader') { - steps { - sh 'python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=examples/asr/speech_to_label_results' - sh 'rm -rf examples/asr/speech_to_label_results' - } - } - } - } - - stage('L2: ASR Adapters') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Linear Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_results' - } - } - stage('RelPos MHA Adapters') { - steps { - sh 'python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results' - sh 'rm -rf examples/asr/speech_to_text_adapters_mha_results' - } - } - - } - } - - stage('L2: Speech Transcription') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Speech to Text Transcribe') { - steps { - sh 'python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="stt_test_res.json" \ - amp=true' - sh 'rm -rf stt_test_res.json' - } - } - } - } - stage('L2: Transducer alignment') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Running pytest') { - steps { - sh 'pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1' - } - } - } - } - - stage('L2: Segmentation Tool') { - when { - 
anyOf { - branch 'main' - changeRequest target: 'main' - } - } - stages { - stage('Install ctc_segmentation requirements') { - steps { - sh 'cd tools/ctc_segmentation && \ - pip install -r requirements.txt && \ - apt-get update && apt-get install libsox-fmt-all -y' - } - } - - stage('Parallel ctc_segmentation test') { - failFast true - parallel { - stage('L2: Eng CitriNet with .wav') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/eng/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/eng/output${TIME}/verified_segments/nv_test_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/eng/output${TIME}' - } - } - stage('L2: Ru QN with mp3') { - steps { - sh 'cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ - --DATA_DIR=/home/TestData/ctc_segmentation/ru \ - --OUTPUT_DIR=/home/TestData/ctc_segmentation/ru/output${TIME} \ - --LANGUAGE=ru \ - --ADDITIONAL_SPLIT_SYMBOLS=";" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ - -g /home/TestData/ctc_segmentation/ru/output${TIME}/verified_segments/ru_segments.txt && \ - rm -rf /home/TestData/ctc_segmentation/ru/output${TIME}' - } - } - } - } - } - } - - stage('L2: G2P Models') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('G2P Conformer training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=[0] \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text' - } - } - // TODO: pleasefixme @redoctopus - // stage('ByT5G2P training, evaluation and inference') { - // steps { - // sh 'TRANSFORMERS_OFFLINE=1 && cd examples/tts/g2p && \ - // TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ - // python g2p_train_and_evaluate.py \ - // train_manifest=/home/TestData/g2p/g2p.json \ - // validation_manifest=/home/TestData/g2p/g2p.json \ - // model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - // trainer.max_epochs=1 \ - // model.max_source_len=64 \ - // trainer.devices=[1] \ - // do_training=True \ - // do_testing=True \ - // exp_manager.exp_dir=${OUTPUT_DIR_T5} \ - // +exp_manager.use_datetime_version=False\ - // +exp_manager.version=test && \ - // python 
g2p_inference.py \ - // pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ - // manifest_filepath=/home/TestData/g2p/g2p.json \ - // phoneme_field=text && TRANSFORMERS_OFFLINE=1' - // } - // } - stage('HeteronymClassificationModel training, evaluation and inference') { - steps { - sh 'cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json' - } - } - } - } - - // TODO: add test once megatron-bert is supported again - // stage('L2: Multi-GPU Megatron finetuning') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: Cased Megatron finetuning on MRPC') { - // steps { - // sh 'cd examples/nlp/glue_benchmark && \ - // python glue_benchmark.py \ - // model.dataset.data_dir=/home/TestData/nlp/glue_fake/MRPC \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // model.dataset.use_cache=false \ - // model.language_model.pretrained_model_name=megatron-bert-345m-cased \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // exp_manager=null' - // } - // } - // } - // } - - stage('L2: STS-b') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GLUE STS-b with AlBERT') { - steps { - sh 'python examples/nlp/glue_benchmark/glue_benchmark.py \ - model.dataset.use_cache=false \ - model.task_name=sts-b \ - model.dataset.data_dir=/home/TestData/nlp/glue_fake/STS-B \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.language_model.pretrained_model_name=albert-base-v1 \ - exp_manager=null' - } - } - stage('Test Restore Punctuation & Capitalization with AlBERT') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_albert.nemo \ - +model.test_ds.use_cache=false \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && \ - rm -rf "${data_dir}"' - } - } -// stage('Test Restore Punctuation & Capitalization with RoBERTa') { -// steps { -// sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ -// cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ -// python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ -// +do_training=false \ -// +do_testing=true \ -// pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_and_Capitalization_roberta.nemo \ -// 
+model.test_ds.use_cache=false \ -// ~model.train_ds \ -// ~model.validation_ds \ -// model.test_ds.ds_item="${data_dir}" \ -// trainer.devices=[1] \ -// trainer.accelerator="gpu" \ -// exp_manager=null && \ -// rm -rf "${data_dir}"' -// } -// } - } - } - stage('L2: Dialogue Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Intent and slot classification using GPT') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.dataset.dialogues_example_dir=sgd_gen_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.tokenizer.special_tokens={pad_token:"endoftext"} \ - model.tokenizer.tokenizer_name=gpt2 \ - model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_outputs' - } - } - stage('Intent and slot classification using SGDQA') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs' - } - } - stage('Intent and slot classification using IntentSlotClassificationModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs && TRANSFORMERS_OFFLINE=1' - } - } - stage('Intent classification using ZeroShotIntentModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example 
is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs && TRANSFORMERS_OFFLINE=1' - } - } - stage('Design Intent classification using ZeroShotIntentModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs && TRANSFORMERS_OFFLINE=1' - } - } - stage('Design Intent classification using ZeroShotIntentModel BART Classifier') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs && TRANSFORMERS_OFFLINE=1' - } - } - stage('Design Intent classification using DialogueNearestNeighbourModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs && TRANSFORMERS_OFFLINE=1' - } - } - } - } - stage('L2: Dialogue Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - 
model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s' - } - } - stage('Dialogue: SGD Based Answer Extender using DialogueS2SGenerationModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_answer_extender_s2s' - } - } - } - } -// stage('L2: Dialogue Generation Part 2') { -// when { -// anyOf { -// branch 'main' -// changeRequest target: 'main' -// } -// } -// failFast true -// parallel { -// stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { -// steps { -// sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ -// python dialogue.py \ -// do_training=False \ -// model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -// model.dataset.dialogues_example_dir=answer_extender \ -// model.library=huggingface \ -// model.dataset.task=ms_marco \ -// model.dataset.debug_mode=True \ -// trainer.val_check_interval=0.0 \ -// trainer.devices=[0] \ -// model.dataset.use_cache=false \ -// model.language_model.pretrained_model_name=gpt2 \ -// trainer.accelerator=gpu \ -// exp_manager=null && \ -// rm -rf answer_extender' -// } -// } -// } -// } - stage('L2: COPY') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { - steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender' - } - } - } - } - stage('L2: Duplex Text Normalization') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('Duplex Text Normalization with Tarred dataset') { - steps { - sh 'cd examples/nlp/duplex_text_normalization && \ - 
python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv' - } - } - } - } - // Runs out of memory on the 12G TITAN V (GPU 0 on main CI) - // TODO: add when megatron bert is supported again in NeMo - // stage('L2: MegaBERT Token Classification') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - // model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ - // model.train_ds.batch_size=10 \ - // model.dataset.max_seq_length=50 \ - // model.dataset.use_cache=false \ - // trainer.accelerator=gpu \ - // trainer.strategy=ddp \ - // trainer.precision=16 \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // exp_manager=null' - // } - // } - - stage('L2: BERT Text Classification') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Text Classification with BERT Test') { - steps { - sh 'cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null' - } - } - } - } - - stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BERT SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - 
model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - stage('BERT SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - } - } - - stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('BART SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - stage('BART SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - } - } - - stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('GPT2 SQUAD 1.1') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ 
- model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - stage('GPT2 SQUAD 2.0') { - // Cannot do fast_dev_run because squad needs whole dev dataset - steps { - sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - exp_manager=null && TRANSFORMERS_OFFLINE=1' - } - } - } - } - - stage('L2: Intent and Slot Classification Tasks') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints' - sh 'rm -rf checkpoints' - } - } - stage('L2: Multi-Label Intent and Slot Classification') { - steps { - sh 'cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2' - sh 'rm -rf checkpoints2' - } - } - } - } - - // TODO: add when megatron-bert is supported again - // stage('L2: Model Parallel Size 2 Megatron Text Classification') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // 
model.nemo_path=null \ - // ~model.infer_samples \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Autoresume') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python text_classification_with_bert.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // trainer.max_epochs=1 \ - // +trainer.fast_dev_run=true \ - // model.dataset.num_classes=6 \ - // model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - // model.train_ds.batch_size=4 \ - // model.language_model.pretrained_model_name=megatron-bert-uncased \ - // model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - // model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - // model.nemo_path=null \ - // ~model.infer_samples \ - // +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - // +exp_manager.resume_if_exists=true' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/text_classification && \ - // python model_parallel_text_classification_evaluation.py \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // trainer.num_nodes=1 \ - // model.dataset.num_classes=6 \ - // model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - // model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - // exp_manager=null' - // } - // } - - // stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { - // when { - // anyOf{ - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps{ - // sh 'cd examples/nlp/token_classification && \ - // python token_classification_train.py \ - // pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - // model.dataset.data_dir=/home/TestData/nlp/ner/ \ - // model.train_ds.batch_size=2 \ - // model.dataset.use_cache=false \ - // trainer.devices=[0,1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // model.dataset.class_balancing="weighted_loss" \ - // exp_manager=null' - // } - // } - - stage('L2: Parallel NLP Examples 2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('NER finetuning from pretrained Test') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null' - } - } - stage ('Punctuation and capitalization finetuning from pretrained test') { - steps { - sh 'cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - 
model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null && \ - rm -rf "${data_dir}"' - } - } - stage ('NER with TurkuNLP/bert-base-finnish-cased-v1') { - steps { - sh 'cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null' - } - } - stage('Evaluation script for Token Classification') { - steps { - sh 'python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo' - } - } - stage('Evaluation script for Punctuation') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo && \ - rm -rf "${data_dir}"' - } - } - stage('L2: Punctuation & Capitalization, 2GPUs with DistilBERT, Fine-tuning on different data') { - steps { - sh 'cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}"' - } - } - } - } - stage('Punctuation & Capitalization tarred dataset') { - when { - 
anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('create and use tarred dataset') { - steps { - sh 'data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output && \ - rm -rf "${output_dir}" "${data_dir}"' - } - } - } - } - stage('Punctuation & Capitalization, Different ways of passing labels to model') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.label_vocab_dir') { - steps { - sh 'cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - 
+model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - stage('Punctuation & Capitalization, Using model.common_datasest_parameters.{punct,capit}_label_ids') { - steps { - sh 'cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - conf_name=punctuation_capitalization_config_with_ids && \ - cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - "${work_dir}/${conf_name}.yaml" && \ - python punctuation_capitalization_train_evaluate.py \ - --config-path "${work_dir}" \ - --config-name "${conf_name}" \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}"' - } - } - } - } - stage('Punctuation & Capitalization inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - stages { - stage('Restore punctuation and capitalization in long text') { - steps { - sh 'output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32 && \ - rm -rf "${output_dir}"' - } - } - } - } - - stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') { - when { - anyOf { - branch 'main' - changeRequest target: 
'main' - } - } - failFast true - parallel { - stage('L2: Pretraining BERT pretraining from Text') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=PretrainingBERTFromText \ - ' - sh 'rm -f /home/TestData/nlp/wikitext-2/*.pkl' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromText' - sh 'ls -lha examples/nlp/language_modeling' - } - } - stage('L2: Pretraining BERT from Preprocessed') { - steps { - sh 'cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - ' - sh 'rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed' - sh 'ls -lha examples/nlp/language_modeling' - } - } - } - } - - stage('L2: Entity Linking') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage ('Self Alignment Pretraining BERT') { - steps { - sh 'cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. 
\ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null' - } - } - } - } - - // TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 - // is in the release container - stage('L2: NMT Attention is All You Need Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Training Post-LN') { - steps { - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - ' - sh 'python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True \ - ' - sh 'rm -rf 
examples/nlp/machine_translation/nmt_results' - } - } - - stage('L2: NMT Training Pre-LN') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[1] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - stage('L2: NMT Multi-Validation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - ' - } - } - } - } - - stage('L2: NMT Attention is All You Need Inference') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Inference - PostLN') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/TransformerLargeDe-En.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de \ - ' - } - } - stage('L2: NMT Inference - Pre-LN') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/en_de_24x6_preln.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang de \ - --source_lang en \ - ' - } - } - } - } - - stage('L2: NMT Attention is All You Need Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "cd examples/nlp/machine_translation && \ - python 
enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/en_de_24x6_preln.nemo \ - trainer.devices=[0] \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true \ - " - sh "rm -rf examples/nlp/machine_translation/nmt_finetune" - } - } - - - stage('L2: NMT Tarred Dataset Creation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('L2: NMT Auto Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - ' - } - } - - stage('L2: NMT Script Tarred Dataset Creation') { - steps { - sh 'cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 \ - ' - } - } - } - } - stage('L2: Megatron NMT Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - 
model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - // Change val_check_interval to 1 for resume, as len(dataloader) is 1 due to max_steps being the same as that of training, and Lightning 2.0 raises an error - // if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ -
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model" - sh "rm -rf examples/nlp/machine_translation/megatron_nmt_results" - } - } - stage('L2: Megatron BART Perceiver MIM Training TP=2') { - // Testing Megatron hidden transformations - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - // Change val_check_interval to 1 for resume, as len(dataloader) is 1 due to max_steps being the same as that of training, and Lightning 2.0 raises an error - // if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ -
model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5" - sh "rm -rf examples/nlp/language_modeling/megatron_mim_results" - } - } - // stage('L2: NMT Bottleneck Fallback') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('L2: seq2seq (no bottleneck)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=seq2seq \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - // model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - // model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck Architecture') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('Bridge Encoder (identity)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=bridge \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // 
model.encoder.hidden_init_method=identity \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('Perceiver Encoder (params)') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=nll \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - // stage('L2: NMT Bottleneck LVM') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // parallel { - // stage('VAE') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=vae \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // 
model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // trainer.devices=[0] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // stage('MIM') { - // steps { - // sh 'cd examples/nlp/machine_translation && \ - // enc_dec_nmt-bottleneck.py \ - // --config-path=conf \ - // --config-name=aayn_bottleneck \ - // do_testing=true \ - // model.model_type=mim \ - // model.encoder.arch=perceiver \ - // model.encoder.hidden_steps=1 \ - // model.encoder.hidden_blocks=1 \ - // model.encoder.hidden_init_method=params \ - // model.encoder.hidden_size=64 \ - // model.encoder.inner_size=128 \ - // model.encoder.num_attention_heads=2 \ - // model.encoder.num_layers=2 \ - // model.decoder.hidden_size=64 \ - // model.decoder.inner_size=128 \ - // model.decoder.num_attention_heads=2 \ - // model.decoder.num_layers=2 \ - // model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - // model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - // model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - // trainer.devices=[1] \ - // trainer.accelerator="gpu" \ - // +trainer.fast_dev_run=true \ - // +trainer.limit_test_batches=2 \ - // exp_manager=null \ - // ' - // } - // } - // } - // } - stage('L2: Megatron Bert Pretraining and Resume Training with Pipeline Parallelism') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ -
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron Bert Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings" - sh "rm -rf examples/nlp/language_modeling/bert_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/bert_index_mappings" - } - } - stage('L2: Megatron RETRO Pretraining and Resume Training') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ - model.data.data_prefix='' \ - model.data.knn_index='' \ - model.data.retrieval_prefix='' \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - 
model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True" - sh "rm -rf examples/nlp/language_modeling/retro_results" - } - } - stage('L2: Megatron RETRO muTransfer Pretraining Performance') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=100 \ - trainer.log_every_n_steps=1 \ - trainer.precision=16 \ - trainer.val_check_interval=100 \ - trainer.limit_val_batches=0 \ - trainer.gradient_clip_val=1.0 \ - +trainer.num_sanity_val_steps=0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - +exp_manager.version=smalltest \ - model.data.neighbors=2 \ - model.megatron_amp_O2=False \ - model.apply_query_key_layer_scaling=False \ - model.tensor_model_parallel_size=1 \ - model.optim.name=muadamw \ - model.optim.weight_decay=0.1 \ - model.optim.betas=[0.9,0.95] \ - model.optim.lr=6e-4 \ - model.optim.sched.warmup_steps=1000 \ - model.optim.sched.constant_steps=0 \ - model.optim.sched.min_lr=6e-5 \ - model.add_position_embedding=False \ - model.enc_num_layers=2 \ - model.dec_num_layers=6 \ - model.enc_cross_attention=[0] \ - model.dec_cross_attention=[3,5] \ - model.hidden_size=96 \ - model.ffn_hidden_size=384 \ - model.init_method_std=0.023 \ - model.num_attention_heads=12 \ - model.max_position_embeddings=1024 \ - model.encoder_seq_length=1024 \ - model.tokenizer.library=megatron \ - model.tokenizer.type=GPT2BPETokenizer \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - model.data.num_workers=8 \ - model.micro_batch_size=8 \ - model.normalization=rmsnorm \ - model.transformer_block_type=pre_ln \ - model.bias_activation_fusion=True \ - model.bias_dropout_add_fusion=False \ - model.masked_softmax_fusion=True \ - model.hidden_dropout=0 \ - model.attention_dropout=0 \ - model.fp32_residual_connection=True \ - model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml" - sh '''python -c "import pandas as pd -import pathlib -from pandas.testing import assert_frame_equal -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -import torch -if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): - import sys - sys.exit(0) -event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] -ea = EventAccumulator(str(event_file)).Reload() -vals = [] -for i in ea.Scalars('reduced_train_loss'): - vals.append(i.value) -training_curve = pd.DataFrame({'loss': vals}) -gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') -assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' - sh "rm -rf examples/nlp/language_modeling/retro_results" - } - } - stage('L2: BioMegatron Bert NER Task') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - 
failFast true - steps { - sh "python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null" - sh "rm -rf examples/nlp/language_modeling/token_classification_results" - } - } - stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - stage('L2: Megatron GPT with Rope Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=rope \ - //model.rotary_percentage=0.5 \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - 
//model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - - // This test requires Ampere but some of the test GPUs are Volta - // Need to add a check for compute capability before uncommenting this test - // stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // trainer.devices=2 \ - // trainer.accelerator=gpu \ - // trainer.log_every_n_steps=1 \ - // trainer.val_check_interval=2 \ - // trainer.limit_val_batches=2 \ - // trainer.accumulate_grad_batches=1 \ - // trainer.max_steps=3 \ - // trainer.precision=16 \ - // trainer.gradient_clip_val=1.0 \ - // exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // model.tensor_model_parallel_size=2 \ - // model.optim.name=fused_adam \ - // model.optim.lr=2e-4 \ - // model.optim.sched.warmup_steps=1 \ - // model.optim.sched.constant_steps=1 \ - // model.optim.sched.min_lr=8e-5 \ - // model.max_position_embeddings=128 \ - // model.encoder_seq_length=128 \ - // model.data.seq_length=128 \ - // model.position_embedding_type=rope \ - // model.rotary_percentage=0.5 \ - // model.normalization=rmsnorm \ - // model.bias=False \ - // model.bias_activation_fusion=False \ - // model.bias_dropout_add_fusion=False \ - // model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // model.num_layers=8 \ - // model.hidden_size=256 \ - // model.num_attention_heads=8 \ - // model.activations_checkpoint_method='block' \ - // model.activations_checkpoint_granularity='full' \ - // model.activations_checkpoint_num_layers=1 \ - // model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // model.use_flash_attention=True " - // // commented out to save time on github ci @adithyare - // //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - // //trainer.devices=2 \ - // //trainer.accelerator=gpu \ - // //trainer.log_every_n_steps=1 \ - // //trainer.val_check_interval=2 \ - // //trainer.limit_val_batches=1 \ - // //trainer.accumulate_grad_batches=1 \ - // //trainer.max_steps=6 \ - // //trainer.precision=16 \ - // //trainer.gradient_clip_val=1.0 \ - // //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - // //exp_manager.resume_if_exists=True \ - // //model.tensor_model_parallel_size=2 \ - // //model.optim.name=fused_adam \ - // //model.optim.lr=2e-4 \ - // //model.optim.sched.warmup_steps=2 \ - // //model.optim.sched.constant_steps=2 \ - // //model.optim.sched.min_lr=8e-5 \ - // //model.max_position_embeddings=128 \ - // 
//model.encoder_seq_length=128 \ - // //model.data.seq_length=128 \ - // //model.position_embedding_type=rope \ - // //model.rotary_percentage=0.5 \ - // //model.normalization=rmsnorm \ - // //model.bias=False \ - // //model.bias_activation_fusion=False \ - // //model.bias_dropout_add_fusion=False \ - // //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - // //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - // //model.num_layers=8 \ - // //model.hidden_size=256 \ - // //model.num_attention_heads=8 \ - // //model.activations_checkpoint_method='block' \ - // //model.activations_checkpoint_granularity='full' \ - // //model.activations_checkpoint_num_layers=1 \ - // //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - // //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - // //model.use_flash_attention=True" - // sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - // sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - // } - // } - stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // not testing resume functionality to save time on ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - //exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - 
//model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=alibi \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_granularity='full' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - // commented out to save time on github ci @adithyare - //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - //trainer.devices=2 \ - //trainer.accelerator=gpu \ - //trainer.log_every_n_steps=1 \ - //trainer.val_check_interval=2 \ - //trainer.limit_val_batches=1 \ - //trainer.accumulate_grad_batches=1 \ - //trainer.max_steps=6 \ - //trainer.precision=16 \ - //trainer.gradient_clip_val=1.0 \ - //exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - 
//exp_manager.resume_if_exists=True \ - //model.tensor_model_parallel_size=2 \ - //model.optim.name=fused_adam \ - //model.optim.lr=2e-4 \ - //model.optim.sched.warmup_steps=2 \ - //model.optim.sched.constant_steps=2 \ - //model.optim.sched.min_lr=8e-5 \ - //model.max_position_embeddings=128 \ - //model.encoder_seq_length=128 \ - //model.data.seq_length=128 \ - //model.position_embedding_type=kerple \ - //model.normalization=rmsnorm \ - //model.bias=False \ - //model.bias_activation_fusion=False \ - //model.bias_dropout_add_fusion=False \ - //model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - //model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - //model.num_layers=8 \ - //model.hidden_size=256 \ - //model.num_attention_heads=8 \ - //model.activations_checkpoint_method='block' \ - //model.activations_checkpoint_granularity='full' \ - //model.activations_checkpoint_num_layers=1 \ - //model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - //model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - 
model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings" - } - } - // @athitten Remove /home/TestData/nlp/megatron_sft/trec.jsonl for validation and test file until we have support for multiple dataloaders in lightning 2.0 - stage('L2: Megatron GPT Finetuning PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - 
model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT Finetuning StarCoder PP=1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0]" - sh "rm -rf examples/nlp/language_modeling/gpt_sft_results" - } - } - stage('L2: Megatron GPT PEFT Lora PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2" - } - } - stage('L2: 
Megatron GPT PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/lora_tuning_tp2" - } - } - stage('L2: Megatron GPT Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=['How to fix GPU memory? 
A:'] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT Eval PP2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32" - } - } - stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \ - model.data.test_ds.names=['test'] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='examples/nlp/language_modeling/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='examples/nlp/language_modeling/out.jsonl' && \ - rm -rf examples/nlp/language_modeling/out.jsonl" - } - } - - // TODO: Add this test back. Test was failing on CI machines due to HW error - // stage('L2: Megatron GPT Convert from Megatron-LM checkpoint and Eval') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "python -m torch.distributed.launch --nproc_per_node=2 \ - // examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - // --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - // --checkpoint_name=model_optim_rng.pt \ - // --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - // --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - // --model_type=gpt \ - // --pipeline_model_parallel_size=1 \ - // --gpus_per_node=2 \ - // --tensor_model_parallel_size=2" - // sh "python examples/nlp/language_modeling/megatron_gpt_eval.py \ - // --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - // --tokens_to_generate=32 \ - // --tensor_model_parallel_size=2 \ - // --prompt='This is a test.'" - // sh "rm examples/nlp/language_modeling/small_gpt.nemo" - // } - // } - stage('L2: Megatron Change Partitions') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel{ - stage('Reduce TP Num Partitions (2 to 1) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 1 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 2" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo" - } - } - stage('Increase TP Num Partitions (2 to 4) and PP Num Partitions (1 to 2)'){ - steps{ - sh "python 
examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file \ - /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file \ - /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size \ - 2 \ - --target_tensor_model_parallel_size \ - 4 \ - --pipeline_model_parallel_size \ - 1 \ - --target_pipeline_model_parallel_size \ - 1" - sh "rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo" - } - } - } - } - stage('L2: Megatron T5 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - 
model.decoder.activation='fast-swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with ALiBi Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - 
model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - 
model.share_decoder_tokens_head_embeddings=False" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='swiglu' \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='pre_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh 
"python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 w/ Mixture of Expert Pretraining') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation='gelu' \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='pre_ln' \ - model.decoder.transformer_block_type='post_ln' \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - - stage('L2: Megatron UL2 Pretraining and Resume Training TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - 
trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='swiglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type='normformer' \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type='normformer' \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings" - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" - } - } - stage('L2: Megatron T5 Eval') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps{ - sh "python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file \ - /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt \ - 'How do I fix my GPU memory issue? I am seeing out of memory.' 
\ - --tensor_model_parallel_size 1" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='reglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='reglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}'" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron BART Pretraining and Resume Training, PP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - 
trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation='geglu' \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method='block' \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation='geglu' \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method='block' \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]" - sh "rm -rf examples/nlp/language_modeling/bart_pretrain_results" - } - } - stage('L2: Megatron T5 GLUE/XNLI Finetuning') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - // TODO(Oktai15): update it in 1.8.0 version - stage('T5 GLUE RTE') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - 
model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_glue_results" - } - } - stage('T5 GLUE XNLI') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - " - sh "rm -rf examples/nlp/language_modeling/t5_xnli_results" - } - } - } - } - - stage('L2: Megatron T5 PEFT Lora TP=2') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/home/TestData/nlp/t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme='lora' \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel]" - sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=['quarel4'] \ 
- model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/t5_lora_tuning_tp2/out' \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path='/home/TestData/nlp/t5_lora_tuning_tp2/out.jsonl'" - sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2" - } - } - - - stage('L2: Megatron Mock Data Generation') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - parallel { - stage('MockGPTDataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - } - } - stage('MockT5Dataset') { - steps { - sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] \ - " - sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" - } - } - } - } - stage('L2: TTS Fast dev runs 1') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - parallel { - stage('Tacotron 2') { - steps { - sh 'python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch \ - ' - } - } - stage('WaveGlow') { - steps { - sh 'python examples/tts/waveglow.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch' - } - } - stage('FastPitch') { - steps { - sh 'python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - 
model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('RADTTS') { - steps { - sh 'python examples/tts/radtts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - export_dir=/home/TestData/radtts_test \ - model.optim.lr=0.0001 \ - model.modelConfig.decoder_use_partial_padding=True \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('Mixer-TTS') { - steps { - sh 'python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs' - } - } - stage('Hifigan') { - steps { - sh 'python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch' - } - } - } - } - - stage('L??: Speech Checkpoints tests') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh 'CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ - batch_size=64 \ - tolerance=0.1012' - sh 'rm -f examples/asr/evaluation_transcripts.json' - } - } - } - - post { - always { - sh 'chmod -R 777 .' 
- cleanWs() - } - } -} diff --git a/SoundScribe/SpeakerID/LICENSE b/SoundScribe/SpeakerID/LICENSE deleted file mode 100644 index f49a4e16e68b128803cc2dcea614603632b04eac..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. 
Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. 
Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/SoundScribe/SpeakerID/README.rst b/SoundScribe/SpeakerID/README.rst deleted file mode 100644 index d05a9b63c6422d95d16f1b4afda8ae6adb28b3ed..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/README.rst +++ /dev/null @@ -1,387 +0,0 @@ - -|status| |documentation| |codeql| |license| |pypi| |pyversion| |downloads| |black| - -.. |status| image:: http://www.repostatus.org/badges/latest/active.svg - :target: http://www.repostatus.org/#active - :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed. - -.. |documentation| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main - :alt: Documentation - :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/ - -.. |license| image:: https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg - :target: https://github.com/NVIDIA/NeMo/blob/master/LICENSE - :alt: NeMo core license and license for collections in this repo - -.. |pypi| image:: https://badge.fury.io/py/nemo-toolkit.svg - :target: https://badge.fury.io/py/nemo-toolkit - :alt: Release version - -.. |pyversion| image:: https://img.shields.io/pypi/pyversions/nemo-toolkit.svg - :target: https://badge.fury.io/py/nemo-toolkit - :alt: Python version - -.. |downloads| image:: https://static.pepy.tech/personalized-badge/nemo-toolkit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads - :target: https://pepy.tech/project/nemo-toolkit - :alt: PyPi total downloads - -.. |codeql| image:: https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&event=push - :target: https://github.com/nvidia/nemo/actions/workflows/codeql.yml - :alt: CodeQL - -.. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - :alt: Code style: black - -.. _main-readme: - -**NVIDIA NeMo** -=============== - -Introduction ------------- - -NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR), -text-to-speech synthesis (TTS), large language models (LLMs), and -natural language processing (NLP). -The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models) -and make it easier to create new `conversational AI models `_. - -All NeMo models are trained with `Lightning `_ and -training is automatically scalable to 1000s of GPUs. -Additionally, NeMo Megatron LLM models can be trained up to 1 trillion parameters using tensor and pipeline model parallelism. -NeMo models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva `_. - -Getting started with NeMo is simple. -State of the Art pretrained NeMo models are freely available on `HuggingFace Hub `_ and -`NVIDIA NGC `_. -These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code. - -We have extensive `tutorials `_ that -can be run on `Google Colab `_. 
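As a minimal sketch of that "few lines of code" workflow (assuming ``nemo_toolkit[all]`` is installed and a local ``audio.wav`` file is available; the ``QuartzNet15x5Base-En`` checkpoint name matches the one exercised in the CI pipeline above), transcription can look roughly like this:

.. code-block:: python

    # Minimal sketch: load a pretrained NeMo ASR checkpoint and transcribe one file.
    # Assumes nemo_toolkit[all] is installed and ./audio.wav exists locally.
    import nemo.collections.asr as nemo_asr

    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="QuartzNet15x5Base-En")
    transcriptions = asr_model.transcribe(["audio.wav"])
    print(transcriptions[0])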
- -For advanced users that want to train NeMo models from scratch or finetune existing NeMo models -we have a full suite of `example scripts `_ that support multi-GPU/multi-node training. - -For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher `_. -The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator `_ -which can be used to find the optimal model parallel configuration for training on a specific cluster. - -Key Features ------------- - -* Speech processing - * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) `_ - * `Pretrained models `_ available in 14+ languages - * `Automatic Speech Recognition (ASR) `_ - * Supported ASR `models `_: - * Jasper, QuartzNet, CitriNet, ContextNet - * Conformer-CTC, Conformer-Transducer, FastConformer-CTC, FastConformer-Transducer - * Squeezeformer-CTC and Squeezeformer-Transducer - * LSTM-Transducer (RNNT) and LSTM-CTC - * Supports the following decoders/losses: - * CTC - * Transducer/RNNT - * Hybrid Transducer/CTC - * NeMo Original `Multi-blank Transducers `_ and `Token-and-Duration Transducers (TDT) `_ - * Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ - * `Cache-aware Streaming Conformer `_ with multiple lookaheads. - * Beam Search decoding - * `Language Modelling for ASR (CTC and RNNT) `_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer - * `Support of long audios for Conformer with memory efficient local attention `_ - * `Speech Classification, Speech Command Recognition and Language Identification `_: MatchboxNet (Command Recognition), AmberNet (LangID) - * `Voice activity Detection (VAD) `_: MarbleNet - * ASR with VAD Inference - `Example `_ - * `Speaker Recognition `_: TitaNet, ECAPA_TDNN, SpeakerNet - * `Speaker Diarization `_ - * Clustering Diarizer: TitaNet, ECAPA_TDNN, SpeakerNet - * Neural Diarizer: MSDD (Multi-scale Diarization Decoder) - * `Speech Intent Detection and Slot Filling `_: Conformer-Transformer -* Natural Language Processing - * `NeMo Megatron pre-training of Large Language Models `_ - * `Neural Machine Translation (NMT) `_ - * `Punctuation and Capitalization `_ - * `Token classification (named entity recognition) `_ - * `Text classification `_ - * `Joint Intent and Slot Classification `_ - * `Question answering `_ - * `GLUE benchmark `_ - * `Information retrieval `_ - * `Entity Linking `_ - * `Dialogue State Tracking `_ - * `Prompt Learning `_ - * `NGC collection of pre-trained NLP models. `_ - * `Synthetic Tabular Data Generation `_ -* Text-to-Speech Synthesis (TTS): - * `Documentation `_ - * Mel-Spectrogram generators: FastPitch, SSL FastPitch, Mixer-TTS/Mixer-TTS-X, RAD-TTS, Tacotron2 - * Vocoders: HiFiGAN, UnivNet, WaveGlow - * End-to-End Models: VITS - * `Pre-trained Model Checkpoints in NVIDIA GPU Cloud (NGC) `_ -* `Tools `_ - * `Text Processing (text normalization and inverse text normalization) `_ - * `NeMo Forced Aligner `_ - * `CTC-Segmentation tool `_ - * `Speech Data Explorer `_: a dash-based tool for interactive exploration of ASR/TTS datasets - * `Speech Data Processor `_ - - -Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes. - -Requirements ------------- - -1) Python 3.10 or above -2) Pytorch 1.13.1 or above -3) NVIDIA GPU, if you intend to do model training - -Documentation -------------- - -.. 
|main| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=main - :alt: Documentation Status - :scale: 100% - :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/ - -.. |stable| image:: https://readthedocs.com/projects/nvidia-nemo/badge/?version=stable - :alt: Documentation Status - :scale: 100% - :target: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/ - -+---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ -| Version | Status | Description | -+=========+=============+==========================================================================================================================================+ -| Latest | |main| | `Documentation of the latest (i.e. main) branch. `_ | -+---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ -| Stable | |stable| | `Documentation of the stable (i.e. most recent release) branch. `_ | -+---------+-------------+------------------------------------------------------------------------------------------------------------------------------------------+ - -Tutorials ---------- -A great way to start with NeMo is by checking `one of our tutorials `_. - -You can also get a high-level overview of NeMo by watching the talk *NVIDIA NeMo: Toolkit for Conversational AI*, presented at PyData Yerevan 2022: - -|pydata| - -.. |pydata| image:: https://img.youtube.com/vi/J-P6Sczmas8/maxres3.jpg - :target: https://www.youtube.com/embed/J-P6Sczmas8?mute=0&start=14&autoplay=0 - :width: 600 - :alt: NeMo presentation at PyData@Yerevan 2022 - -Getting help with NeMo ----------------------- -FAQ can be found on NeMo's `Discussions board `_. You are welcome to ask questions or start discussions there. - - -Installation ------------- -Conda -~~~~~ - -We recommend installing NeMo in a fresh Conda environment. - -.. code-block:: bash - - conda create --name nemo python==3.10.12 - conda activate nemo - -Install PyTorch using their `configurator `_. - -.. code-block:: bash - - conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia - -The command used to install PyTorch may depend on your system. Please use the configurator linked above to find the right command for your system. - -Pip -~~~ -Use this installation mode if you want the latest released version. - -.. code-block:: bash - - apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython - pip install nemo_toolkit['all'] - -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. - -Pip from source -~~~~~~~~~~~~~~~ -Use this installation mode if you want the version from a particular GitHub branch (e.g main). - -.. code-block:: bash - - apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython - python -m pip install git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all] - - -From source -~~~~~~~~~~~ -Use this installation mode if you are contributing to NeMo. - -.. code-block:: bash - - apt-get update && apt-get install -y libsndfile1 ffmpeg - git clone https://github.com/NVIDIA/NeMo - cd NeMo - ./reinstall.sh - -If you only want the toolkit without additional conda-based dependencies, you may replace ``reinstall.sh`` -with ``pip install -e .`` when your PWD is the root of the NeMo repository. 
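A minimal sketch of that editable install, assuming the repository has already been cloned as shown above:

.. code-block:: bash

    # from the root of the cloned NeMo repository: editable install,
    # skipping the extra conda-based dependencies pulled in by reinstall.sh
    cd NeMo
    pip install -e .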
- -Mac computers with Apple silicon -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To install NeMo on Mac with Apple M-Series GPU: - -- create a new Conda environment - -- install PyTorch 2.0 or higher - -- run the following code: - -.. code-block:: shell - - # [optional] install mecab using Homebrew, to use sacrebleu for NLP collection - # you can install Homebrew here: https://brew.sh - brew install mecab - - # [optional] install pynini using Conda, to use text normalization - conda install -c conda-forge pynini - - # install Cython manually - pip install cython - - # clone the repo and install in development mode - git clone https://github.com/NVIDIA/NeMo - cd NeMo - ./reinstall.sh - -RNNT -~~~~ -Note that RNNT requires numba to be installed from conda. - -.. code-block:: bash - - conda remove numba - pip uninstall numba - conda install -c conda-forge numba - -NeMo Megatron -~~~~~~~~~~~~~ -NeMo Megatron training requires NVIDIA Apex to be installed. -Install it manually if not using the NVIDIA PyTorch container. - -To install Apex, run - -.. code-block:: bash - - git clone https://github.com/NVIDIA/apex.git - cd apex - git checkout 52e18c894223800cb611682dce27d88050edf1de - pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ - -It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies. - -While installing Apex, it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. -This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 - -cuda-nvprof is needed to install Apex. The version should match the CUDA version that you are using: - -.. code-block:: bash - - conda install -c nvidia cuda-nvprof=11.8 - -packaging is also needed: - -.. code-block:: bash - - pip install packaging - -With the latest versions of Apex, the `pyproject.toml` file in Apex may need to be deleted in order to install locally. - - -Transformer Engine -~~~~~~~~~~~~~~~~~~ -NeMo Megatron GPT has been integrated with `NVIDIA Transformer Engine `_ -Transformer Engine enables FP8 training on NVIDIA Hopper GPUs. -`Install `_ it manually if not using the NVIDIA PyTorch container. - -.. code-block:: bash - - pip install --upgrade git+https://github.com/NVIDIA/TransformerEngine.git@stable - -It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Transformer Engine or any other dependencies. - -Transformer Engine requires PyTorch to be built with CUDA 11.8. - - -Flash Attention -~~~~~~~~~~~~~~~~~~~~ -Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn `_. - -.. code-block:: bash - - pip install flash-attn - pip install triton==2.0.0.dev20221202 - -NLP inference UI -~~~~~~~~~~~~~~~~~~~~ -To launch the inference web UI server, please install the gradio `gradio `_. - -.. code-block:: bash - - pip install gradio==3.34.0 - -NeMo Text Processing -~~~~~~~~~~~~~~~~~~~~ -NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separate repository `https://github.com/NVIDIA/NeMo-text-processing `_. 
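A rough sketch of installing that standalone package; the PyPI name below is inferred from the repository name and is an assumption rather than something this README specifies:

.. code-block:: bash

    # assumed package name for the separate NeMo-text-processing repository
    pip install nemo_text_processing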
- -Docker containers: -~~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.20.0`` comes with container ``nemo:23.06``, you may find more details about released containers in `releases page `_. - -To use built container, please run - -.. code-block:: bash - - docker pull nvcr.io/nvidia/nemo:23.06 - -To build a nemo container with Dockerfile from a branch, please run - -.. code-block:: bash - - DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . - - -If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub. - -.. code-block:: bash - - docker run --gpus all -it --rm -v :/NeMo --shm-size=8g \ - -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ - stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3 - -Examples --------- - -Many examples can be found under the `"Examples" `_ folder. - - -Contributing ------------- - -We welcome community contributions! Please refer to `CONTRIBUTING.md `_ for the process. - -Publications ------------- - -We provide an ever-growing list of `publications `_ that utilize the NeMo framework. - -If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. -Please refer to the instructions in the `README of that branch `_. - -License -------- -NeMo is released under an `Apache 2.0 license `_. diff --git a/SoundScribe/SpeakerID/ci.groovy b/SoundScribe/SpeakerID/ci.groovy deleted file mode 100644 index 27ad659b99a1746685045a6fcdcd2d5874ead849..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/ci.groovy +++ /dev/null @@ -1,119 +0,0 @@ -@Library('blossom-github-lib@master') -import ipp.blossom.* - -podTemplate(cloud:'sc-ipp-blossom-prod', yaml : """ -apiVersion: v1 -kind: Pod -metadata: - labels: - some-label: some-label-value -spec: - volumes: - - name: scratch - nfs: - server: ipp1-cdot01-col01 - path: /vol/scratch1/scratch.okuchaiev_blossom - containers: - - name: latestdlfw - image: nvcr.io/nvidia/pytorch:23.02-py3 - command: - - cat - volumeMounts: - - name: scratch - mountPath: /testdata - resources: - limits: - nvidia.com/gpu: 2 - restartPolicy: Never - backoffLimit: 4 - tty: true - shm-size: 32g - nodeSelector: - kubernetes.io/os: linux - nvidia.com/gpu_type: "Tesla_T4x4" - nvidia.com/node_type: gpu_tester - nvidia.com/driver_version: "510.20" -""" -) { - node(POD_LABEL) { - def githubHelper - stage('Get Token') { - withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) { - // create new instance of helper object - githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData) - } - - } - def stageName = '' - try { - currentBuild.description = githubHelper.getBuildDescription() - container('latestdlfw') { - stage('Code checkout') { - // update status on github - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING) - checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/"+githubHelper.getPRNumber()]], - doGenerateSubmoduleConfigurations: false, - submoduleCfg: [], - userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]] - } - - stage('Code Style') { - sh "apt-get update && \ - apt-get install -y bc && \ - nvidia-smi && \ - pip install -r requirements/requirements_test.txt 
&& \ - python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \ - ls -l /home && ls -l /home/TestData" - } - - stage('Installation') { - sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release" - } - - stage('L0: GPU unit tests') { - sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'" - } - - parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here - [ - "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null \ - '}, - "L1: Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager=null \ - '} - ] - )//end of parallel - } - githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS) - } - catch (Exception ex){ - currentBuild.result = 'FAILURE' - println ex - githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE) - } - - } - } \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/Makefile b/SoundScribe/SpeakerID/docs/Makefile deleted file mode 100644 index 417fe2a0b149f603832abdd4dc070f30a331a502..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/Makefile +++ /dev/null @@ -1,216 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - -.PHONY: clean -clean: - rm -rf $(BUILDDIR)/* - -.PHONY: html -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -.PHONY: dirhtml -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -.PHONY: singlehtml -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -.PHONY: pickle -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -.PHONY: json -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -.PHONY: htmlhelp -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -.PHONY: qthelp -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OpenSeq2Seq.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OpenSeq2Seq.qhc" - -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. 
You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/OpenSeq2Seq" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/OpenSeq2Seq" - @echo "# devhelp" - -.PHONY: epub -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -.PHONY: latex -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -.PHONY: latexpdf -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: latexpdfja -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: text -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -.PHONY: man -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -.PHONY: texinfo -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -.PHONY: info -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -.PHONY: gettext -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -.PHONY: changes -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -.PHONY: linkcheck -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -.PHONY: doctest -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -.PHONY: xml -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -.PHONY: pseudoxml -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. 
The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/SoundScribe/SpeakerID/docs/source/_static/css/custom.css b/SoundScribe/SpeakerID/docs/source/_static/css/custom.css deleted file mode 100644 index cf0ad0ff2d7f5b0561ddcad8c3220276d0168d56..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/_static/css/custom.css +++ /dev/null @@ -1,366 +0,0 @@ -body { - font-size: 100%; - font-family: 'NVIDIA Sans', sans-serif; -} - - -/* Width of template */ - -.wy-nav-content { - max-width: 1200px !important; -} - - - -/* Standard Text Formatting */ - -h1 { - color: #76b900; - text-align: center; - /* background-color: #ffffff; */ -} - -h2 { - color: #ffffff; - /* background-color: #ffffff; */ - /* #76b900 */ - Padding: 5px; -} - -h3 { - padding-top: 0px; - border-top: solid 3px #000000; - /* #76b900 */ - border-bottom: solid 3px #000000; - /* #76b900 */ -} - -p { - margin-bottom: 24px; -} - -/* Link Colors */ -a { - color: #76b900; -} - -a:visited { - color: #218219; -} - -.container-xl { - margin-right: unset; - margin-left: unset; -} - -section { - overflow-x: auto; -} - -/* ----------------------------------------------TABLES--------------------------------------- */ -section table { - overflow-x: auto; - display: block; -} - -table { - font-size: small; -} - -/* Table head Color */ -thead td { - background-color: #333333 !important; -} - -.row-odd p { - /*padding-bottom: 0px;*/ - /*margin-bottom: 0px;*/ -} - -/* even rows*/ - -.row-even tr { - background-color: #e5f1e6 !important; -} - -/* odd rows*/ - - -.wy-table-responsive table tr { - background-color: #ffffff !important; -} - - - -.wy-table-responsive table td { - white-space: normal; -} - - -/* Removes bottom margin in tables*/ - -.rst-content .line-block { - margin-bottom: 0px; -} - -.wy-table-responsive { - overflow: visible !important; -} - -/* reduces the size of text in multiline table columns. 
*/ - -.rst-content table.docutils td { - font-size: 80%; -} - -.rst-content dl:not(.docutils) dt { - - background-color: inherit; - color: #000000; - border-top: solid 0px #000000; - -} - -.rst-content dl:not(.docutils) dt:before { - color: #333333; -} - -.rst-content .line-block { - margin-bottom: 0px; -} - -.wy-side-nav-search, -.wy-nav-top { - background-color: #000000; - padding: 0; -} - -.wy-side-nav-search img { - padding: 0px; - padding: 0px 0px; - margin-bottom: 0; -} - -.wy-side-nav-search input[type=text] { - border-radius: 0px; -} - - -.wy-menu-vertical p.caption { - color: #76b900; -} - - -.wy-side-nav-search>a img.logo, -.wy-side-nav-search .wy-dropdown>a img.logo { - margin: 0px 0px 0px 0px; -} - -.wy-nav-content { - margin: 0; - min-height: 100%; - height: 100%; - background: #ffffff; -} - -/* List (numbered, bulleted) padding Fix */ - - -.wy-plain-list-decimal li { - margin-top: -6px; - margin-bottom: -6px; -} - -.rst-content .section ol.loweralpha { - margin-top: -6px; - margin-bottom: 12px; -} - -.wy-plain-list-disc, -.rst-content .toctree-wrapper ul, -article ul { - margin-top: 0px !important; - margin-bottom: 12px; -} - -/* Alert Boxes */ -/* Background color of Alert Box Title */ - -.rst-content .section ul { - margin-top: -12px; - margin-bottom: 16px; -} - -.wy-alert.wy-alert-info .wy-alert-title, -.rst-content .note .wy-alert-title, -.rst-content .wy-alert-info.attention .wy-alert-title, -.rst-content .wy-alert-info.caution .wy-alert-title, -.rst-content .wy-alert-info.danger .wy-alert-title, -.rst-content .wy-alert-info.error .wy-alert-title, -.rst-content .wy-alert-info.hint .wy-alert-title, -.rst-content .wy-alert-info.important .wy-alert-title, -.rst-content .wy-alert-info.tip .wy-alert-title, -.rst-content .wy-alert-info.warning .wy-alert-title, -.rst-content .seealso .wy-alert-title, -.rst-content .wy-alert-info.admonition-todo .wy-alert-title, -.rst-content .wy-alert-info.admonition .wy-alert-title, -.wy-alert.wy-alert-info .rst-content .admonition-title, -.rst-content .wy-alert.wy-alert-info .admonition-title, -.rst-content .note .admonition-title, -.rst-content .wy-alert-info.attention .admonition-title, -.rst-content .wy-alert-info.caution .admonition-title, -.rst-content .wy-alert-info.danger .admonition-title, -.rst-content .wy-alert-info.error .admonition-title, -.rst-content .wy-alert-info.hint .admonition-title, -.rst-content .wy-alert-info.important .admonition-title, -.rst-content .wy-alert-info.tip .admonition-title, -.rst-content .wy-alert-info.warning .admonition-title, -.rst-content .seealso .admonition-title, -.rst-content .wy-alert-info.admonition-todo .admonition-title, -.rst-content .wy-alert-info.admonition .admonition-title { - background: #76b900; -} - -/* Background and Font Color of Alert Box Main Body*/ -.wy-alert.wy-alert-info, -.rst-content .note, -.rst-content .wy-alert-info.attention, -.rst-content .wy-alert-info.caution, -.rst-content .wy-alert-info.danger, -.rst-content .wy-alert-info.error, -.rst-content .wy-alert-info.hint, -.rst-content .wy-alert-info.important, -.rst-content .wy-alert-info.tip, -.rst-content .wy-alert-info.warning, -.rst-content .seealso, -.rst-content .wy-alert-info.admonition-todo, -.rst-content .wy-alert-info.admonition { - background: #333333; - color: #999999; -} - -.section { - margin-top: 50px; -} - -/* Logo */ -.navbar-brand-box { - background-color: #ffffff; -} - -/* ---------------------------------------------- Media Queries --------------------------------------- */ -@media (min-width: 1200px) { - 
.container-xl { - max-width: 100%; - } -} - -@media (min-width: none) { - body { - font-size: 18px; - } - - #site-navigation nav ul.nav { - font-size: 18px; - } - - #site-navigation nav.bd-links p { - font-size: 18px; - } - - #site-navigation { - width: 350px; - } - - .toc-h2 { - font-size: 18px; - } - - .toc-h3 { - font-size: 1rem; - } - - .toc-h4 { - font-size: 0.85rem; - } - - .header-article .bd-toc { - font-size: 18px; - } - - #main-content>div { - margin-left: 10%; - margin-right: 10%; - } -} - -/* ---------------------------------------------- NVIDIA Sans --------------------------------------- */ - -:root { - --md-text-font: "NVIDIA Sans"; - /* --md-code-font: "NVIDIA Sans"; */ -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/5/2/52891dda673228d54e5d57bf1e4a3880d4b22405.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/0/e090b7dda7a582522c7f9045c6ce949cce60134f.woff) format("woff"); - font-weight: 300; - font-style: normal; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/1/a107baabcbf6b241099122336bce7429bcfd377a.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/a/3a6060a4e3bce70e5552ba0de8af4b22c6cf9144.woff) format("woff"); - font-weight: 300; - font-style: italic; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/9/9920d2b172b01d92fc9c1c0e521dcf45b59c47c3.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/6/c/6c7d947928a7e4ef3e80ed409bef6c243f2148cb.woff) format("woff"); - font-weight: 400; - font-style: normal; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/8/e8e63fe1244372cd942d957f44a5616a1eba0644.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/f/0f1fb2af0283ab09d36e7097bb07d895c3228f12.woff) format("woff"); - font-weight: 400; - font-style: italic; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/9/79d3c513a9cd72c59f65354f39f89ca52dc17dd2.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/2/5/2581ac533f5d01f4985d8a7245b0766b4630ced8.woff) format("woff"); - font-weight: 500; - font-style: normal; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/9/39d9ef1ee9770dd503f19bb2ace2fdb4eff3bb50.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/b/7bb5d5e2e71b2e13c8098b2e67c0a0ed9258e6c7.woff) format("woff"); - font-weight: 500; - font-style: italic; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/5/05276a55a43eb3f74981ec1e93252727afcd9d16.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/c/9cfec7ed941b06564aa4d5ca14610e81542d070f.woff) format("woff"); - font-weight: 700; - font-style: normal; -} - -@font-face { - font-family: "NVIDIA Sans"; - src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/e/aebd14d09ba56f541e1b8735fb051e33710f9ae7.woff2) format("woff2"), - url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/d/edbdabef43acc5c12e84a94baaa5542c9404cfeb.woff) format("woff"); - font-weight: 700; - font-style: italic; -} \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/_static/js/pk_scripts.js 
b/SoundScribe/SpeakerID/docs/source/_static/js/pk_scripts.js deleted file mode 100644 index 23c74982d3aad11ab94c31e623b60e0ebe66972a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/_static/js/pk_scripts.js +++ /dev/null @@ -1,19 +0,0 @@ -document.addEventListener("DOMContentLoaded", function () { - var params = window.location.search.substring(1).split("&").reduce(function (params, param) { - if (!param) { - return params; - } - - var values = param.split("="); - var name = values[0]; - var value = values[1]; - params[name] = value; - return params; - }, {}); - - var form = document.getElementById("feedback-form"); - for (var name in params) { - var input = form.querySelector("[name=" + name + "]"); - input.value = params[name]; - } -}); \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/_templates/layout.html b/SoundScribe/SpeakerID/docs/source/_templates/layout.html deleted file mode 100644 index c8651c293491cbc0eadfc36138eaadcb9552d359..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/_templates/layout.html +++ /dev/null @@ -1,14 +0,0 @@ -{% extends "!layout.html" %} - -{% block extrahead %} - - - -{% endblock %} - -{% block footer %} - - - -{% endblock %} \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/api.rst b/SoundScribe/SpeakerID/docs/source/asr/api.rst deleted file mode 100644 index 1d880018fd150161a2ca87a3be99461af92b8ca8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/api.rst +++ /dev/null @@ -1,322 +0,0 @@ -NeMo ASR collection API -======================= - - -Model Classes -------------- - -.. autoclass:: nemo.collections.asr.models.EncDecCTCModel - :show-inheritance: - :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.EncDecCTCModelBPE - :show-inheritance: - :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.EncDecRNNTModel - :show-inheritance: - :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.EncDecRNNTBPEModel - :show-inheritance: - :members: transcribe, change_vocabulary, setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.EncDecClassificationModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.EncDecSpeakerLabelModel - :show-inheritance: - :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact - - -.. autoclass:: nemo.collections.asr.models.hybrid_asr_tts_models.ASRWithTTSModel - :show-inheritance: - :members: from_asr_config, from_pretrained_models, save_asr_model_to, setup_training_data - -.. _confidence-ensembles-api: - -.. autoclass:: nemo.collections.asr.models.confidence_ensembles.ConfidenceEnsembleModel - :show-inheritance: - :members: transcribe - -Modules -------- - -.. autoclass:: nemo.collections.asr.modules.ConvASREncoder - :show-inheritance: - :members: - -.. 
autoclass:: nemo.collections.asr.modules.ConvASRDecoder - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.ConvASRDecoderClassification - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.SpeakerDecoder - :show-inheritance: - :members: - -.. _conformer-encoder-api: - -.. autoclass:: nemo.collections.asr.modules.ConformerEncoder - :show-inheritance: - :members: - -.. _squeezeformer-encoder-api: - -.. autoclass:: nemo.collections.asr.modules.SqueezeformerEncoder - :show-inheritance: - :members: - -.. _rnn-encoder-api: - -.. autoclass:: nemo.collections.asr.modules.RNNEncoder - :show-inheritance: - :members: - -.. _rnnt-decoder-api: - -.. autoclass:: nemo.collections.asr.modules.RNNTDecoder - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.StatelessTransducerDecoder - :show-inheritance: - :members: - -.. _rnnt-joint-api: - -.. autoclass:: nemo.collections.asr.modules.RNNTJoint - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.SampledRNNTJoint - :show-inheritance: - :members: - - - -Parts ------ - -.. autoclass:: nemo.collections.asr.parts.submodules.jasper.JasperBlock - :show-inheritance: - :members: - - -Mixins ------- - -.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRBPEMixin - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.mixins.interctc_mixin.InterCTCMixin - :show-inheritance: - :members: - -Datasets --------- - -Character Encoding Datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.data.audio_to_text.AudioToCharDataset - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset - :show-inheritance: - :members: - - -Text-to-Text Datasets for Hybrid ASR-TTS models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.data.text_to_text.TextToTextDataset - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.data.text_to_text.TextToTextIterableDataset - :show-inheritance: - :members: - - -Subword Encoding Datasets -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.data.audio_to_text.AudioToBPEDataset - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset - :show-inheritance: - :members: - -Audio Preprocessors -------------------- - -.. autoclass:: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.AudioToMFCCPreprocessor - :show-inheritance: - :members: - -Audio Augmentors ----------------- - -.. autoclass:: nemo.collections.asr.modules.SpectrogramAugmentation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.modules.CropOrPadSpectrogramAugmentation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.SpeedPerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TimeStretchPerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.GainPerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ImpulsePerturbation - :show-inheritance: - :members: - -.. 
autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ShiftPerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.NoisePerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.WhiteNoisePerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.RirAndNoisePerturbation - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TranscodePerturbation - :show-inheritance: - :members: - -Miscellaneous Classes ---------------------- - -CTC Decoding -~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.metrics.wer.CTCDecoding - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.metrics.wer_bpe.CTCBPEDecoding - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.ctc_greedy_decoding.GreedyCTCInfer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.ctc_beam_decoding.BeamCTCInfer - :show-inheritance: - :members: - -RNNT Decoding -~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.metrics.rnnt_wer.RNNTDecoding - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.metrics.rnnt_wer_bpe.RNNTBPEDecoding - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyRNNTInfer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyBatchedRNNTInfer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_beam_decoding.BeamRNNTInfer - :show-inheritance: - :members: - -Hypotheses -~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.Hypothesis - :show-inheritance: - :no-members: - -.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.NBestHypotheses - :show-inheritance: - :no-members: - -Adapter Networks -~~~~~~~~~~~~~~~~ - -.. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MultiHeadAttentionAdapter - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionMultiHeadAttentionAdapter - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.PositionalEncodingAdapter - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionalEncodingAdapter - :show-inheritance: - :members: - :member-order: bysource - - -Adapter Strategies -~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MHAResidualAddAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - diff --git a/SoundScribe/SpeakerID/docs/source/asr/asr_all.bib b/SoundScribe/SpeakerID/docs/source/asr/asr_all.bib deleted file mode 100644 index 01c765f68f371557b5b37d38598c899711c7abf7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/asr_all.bib +++ /dev/null @@ -1,1043 +0,0 @@ -@article{matchboxnet, - title={{MatchboxNet}: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition}, - author={Majumdar, Somshubra and Ginsburg, Boris}, - journal={Proc. Interspeech 2020}, - year={2020} -} - -@article{marblenet, - title={MarbleNet: Deep 1D Time-Channel Separable Convolutional Neural Network for Voice Activity Detection}, - author={Jia, Fei and Majumdar, Somshubra and Ginsburg, Boris}, - journal={arXiv preprint arXiv:2010.13886}, - year={2020} -} - -@inproceedings{panayotov2015librispeech, - title={Librispeech: an ASR corpus based on public domain audio books}, - author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev}, - booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on}, - pages={5206--5210}, - year={2015}, - organization={IEEE} -} - -@article{luong17, - author = {Minh{-}Thang Luong and Eugene Brevdo and Rui Zhao}, - title = {Neural Machine Translation (seq2seq) Tutorial}, - journal = {https://github.com/tensorflow/nmt}, - year = {2017}, -} - -@INPROCEEDINGS{LaurentSeqWiseBN, -author={C. {Laurent} and G. {Pereyra} and P. {Brakel} and Y. {Zhang} and Y. {Bengio}}, -booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, -title={Batch normalized recurrent neural networks}, -year={2016}, -volume={}, -number={}, -pages={2657-2661}, -keywords={feedforward neural nets;learning (artificial intelligence);recurrent neural nets;speech recognition;batch normalized recurrent neural networks;RNN;sequential data;long-term dependency learning;convergence rate improvement;intermediate representation normalization;feedforward neural networks;speech recognition task;language modeling;training criterion;Training;Recurrent neural networks;Convergence;Speech recognition;Computer architecture;Speech;batch normalization;RNN;LSTM;optimization}, -doi={10.1109/ICASSP.2016.7472159}, -ISSN={2379-190X}, -month={March},} - -@article{graves2005, - author = {Alex Graves and Jurgen Schmidhuber}, - title = {Framewise phoneme classification with bidirectional LSTM and other neural network architectures}, - journal = {Neural Networks, vol. 
18}, - pages={602–-610}, - year = {2005}, -} - -@inproceedings{graves2006, - title={Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks}, - author={Graves, Alex and Fern{\'a}ndez, Santiago and Gomez, Faustino and Schmidhuber, J{\"u}rgen}, - booktitle={Proceedings of the 23rd international conference on Machine learning}, - pages={369--376}, - year={2006}, - organization={ACM} -} - -@article{li2019jasper, - title={Jasper: An End-to-End Convolutional Neural Acoustic Model}, - author={Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris and Leary, Ryan and Kuchaiev, Oleksii and Cohen, Jonathan M and Nguyen, Huyen and Gadde, Ravi Teja}, - journal={arXiv preprint arXiv:1904.03288}, - year={2019} -} - -@misc{ardila2019common, - title={Common Voice: A Massively-Multilingual Speech Corpus}, - author={Rosana Ardila and Megan Branson and Kelly Davis and Michael Henretty and Michael Kohler and Josh Meyer and Reuben Morais and Lindsay Saunders and Francis M. Tyers and Gregor Weber}, - year={2019}, - eprint={1912.06670}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@article{graves2012, - title={Sequence Transduction with Recurrent Neural Networks}, - author={Graves, Alex}, - journal={arXiv preprint arXiv:1211.3711}, - year={2012} -} - - -@article{graves2013, - title={Generating sequences with recurrent neural networks}, - author={Graves, Alex}, - journal={arXiv preprint arXiv:1308.0850}, - year={2013} -} - -@article{sergeev2018horovod, - title={Horovod: fast and easy distributed deep learning in TensorFlow}, - author={Sergeev, Alexander and Del Balso, Mike}, - journal={arXiv preprint arXiv:1802.05799}, - year={2018} -} - -@misc{NVVolta, - title = {NVIDIA TESLA V100 GPU ARCHITECTURE}, - howpublished = {\url{http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf}}, - note = {Accessed: 2018-10-09} -} - -@article{NVTuring, - title = {NVIDIA TURING GPU ARCHITECTURE}, - howpublished = {\url{https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf}}, - author = {NVIDIA}, - year = {2018}, - note = {Accessed: 2018-10-09} -} - -@misc{Rygaard2015, - title = {Using Synthesized Speech to Improve Speech Recognition for Low-Resource Languages}, - author = {Luise Valentin Rygaard}, - howpublished = {\url{https://parasol.tamu.edu/dreu2015/Rygaard/report.pdf}}, - year = {2015}, -} - -@misc{OpenSeq2Seq, - title = {OpenSeq2Seq: extensible toolkit for distributed and mixed precision training of sequence-to-sequence models}, - author = {Kuchaiev, Oleksii and Ginsburg, Boris and Gitman, Igor and Lavrukhin,Vitaly and Case, Carl and Micikevicius, Paulius}, - howpublished = {\url{https://arxiv.org/abs/1805.10387}}, - year = {2018}, -} - -@misc{MPGuide, - title = {Training with Mixed Precision}, - howpublished = {\url{http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/}}, - note = {Accessed: 2018-04-06}, -} - -@misc{Mozilla, - title = {Mozilla: A Journey to less than 10\% Word Error Rate}, - howpublished = {\url{https://hacks.mozilla.org/2017/11/a-journey-to-10-word-error-rate/}}, - note = {Accessed: 2018-04-06}, -} - -@article{Waibel1989, - title={A time-delay neural network architecture for isolated word recognition}, - author={Waibel, Alexander, and Hanazawa, Toshiyki and Hinton,Geoffrey and Shirano, Kiyohiro and Lang, Kevin }, - journal={IEEE Trans. 
on Acoustics, Speech and Signal Processing}, - year={1989} -} - -@article{Lang1990, - title={A time-delay neural network architecture for isolated word recognition}, - author={Lang, Kevin and Waibel, Alexander, and Hinton,Geoffrey }, - journal={Neural Networks}, - year={1990} -} - -@book{Bengio1996, - Author = {Bengio, Y.}, - Publisher = {International Thomson Computer Press}, - Title = {Neural Networks for Speech and Sequence Recognition}, - Year = {1996} -} - -@article{Bengio1992, - title={Global optimization of a neural network-hidden Markov model hybrid}, - author={Bengio, Y., and De Mori, R., and Flammia, G., and Kompe, R. }, - journal={IEEE Transactions on Neural Networks, 3(2), 252–259}, - year={1992} -} - -@article{Bourlard1994, - title={Connectionist speech recognition: a hybrid approach}, - author={Bourlard, H. A. and Morgan, N.}, - journal={volume 247 Springer }, - year={1994} -} - -@article{srivastava14a, - author = {Nitish Srivastava, and Geoffrey Hinton, and Alex Krizhevsky, and Ilya Sutskever, and Ruslan Salakhutdinov}, - title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, - journal = {Journal of Machine Learning Research}, - year = {2014}, - volume = {15}, - pages = {1929-1958}, - url = {http://jmlr.org/papers/v15/srivastava14a.html} -} - - -@article{Hinton2012, - title={Deep Neural Networks for Acoustic Modeling in Speech Recognition}, - author={ Hinton,Geoffrey and Deng, Li and Yu, Dong and Dahl,George and Mohamed,Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Kingsbury, Brian and Sainath, Tara}, - journal={IEEE Signal Processing Magazine}, - year={2012} -} - -@article{Graves2014, - title={Towards End-to-End Speech Recognition with Recurrent Neural Networks}, - author={Graves, Alex and Jaitly, Navdeep}, - journal={International Conference on Machine Learning}, - year={2014} -} - -@article{Chorowski2014, - title={End-to-end Continuous Speech Recognition using Attention-based Recurrent NN: First Results}, - author={ Chorowski, Jan, and Bahdanau, Dzmitry , and Cho, Kyunghyun , and Bengio, Yoshua }, - journal={Neural Information Processing Systems: Workshop Deep Learning and Representation Learning Workshop }, - year={2014} -} - -@article{Sak2014, - title={Long short-term memory recurrent neural network architectures for large scale acoustic modeling}, - author={Sak, Hasim and Senior, Andrew and Beaufays, Francoise }, - journal={Interspeech 2014}, - year={2014} -} - -@article{Ko2015, - title={Audio Augmentation for Speech Recognition}, - author={Tom, Ko and Vijayaditya, Peddinti and Daniel, Povey - and Sanjeev, Khudanpur }, - journal={Interspeech 2015}, - year={2015} -} - -@article{Tjandra2017, - title={Listening while Speaking: Speech Chain by Deep Learning}, - author={Andros, Tjandra and Sakriani, Sakti and Satoshi, Nakamura }, - journal={ASRU 2017}, - year={2017} -} - -@article{Tjandra2018, - title={Machine Speech Chain with One-shot Speaker Adaptation}, - author={Andros, Tjandra and Sakriani, Sakti and Satoshi, Nakamura }, - journal={Interspeech 2018}, - year={2018} -} - -@article{bahdanau2014neural, - title={Neural machine translation by jointly learning to align and translate}, - author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, - journal={arXiv preprint arXiv:1409.0473}, - year={2014} -} - -@article{cho2014learning, - title={Learning phrase representations using RNN encoder-decoder for statistical machine translation}, - author={Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and 
Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua}, - journal={arXiv preprint arXiv:1406.1078}, - year={2014} -} - -@article{rush2015neural, - title={A neural attention model for abstractive sentence summarization}, - author={Rush, Alexander M and Chopra, Sumit and Weston, Jason}, - journal={arXiv preprint arXiv:1509.00685}, - year={2015} -} - -@article{micikevicius2017mixed, - title={Mixed precision training}, - author={Micikevicius, Paulius and Narang, Sharan and Alben, Jonah and Diamos, Gregory and Elsen, Erich and Garcia, David and Ginsburg, Boris and Houston, Michael and Kuchaev, Oleksii and Venkatesh, Ganesh and others}, - journal={arXiv preprint arXiv:1710.03740}, - year={2017} -} - -@ARTICLE{Britz:2017, - author = {{Britz}, Denny and {Goldie}, Anna and {Luong}, Thang and {Le}, Quoc}, - title = {Massive Exploration of Neural Machine Translation Architectures}, - journal = {ArXiv e-prints arXiv:1703.03906}, - archivePrefix = "arXiv", - eprinttype = {arxiv}, - eprint = {1703.03906}, - primaryClass = "cs.CL", - keywords = {Computer Science - Computation and Language}, - year = 2017, - month = mar -} - -@inproceedings{abadi2016tensorflow, - title={TensorFlow: A System for Large-Scale Machine Learning.}, - author={Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others}, - booktitle={OSDI}, - volume={16}, - pages={265--283}, - year={2016} -} - -@article{tensor2tensor, - author = {Ashish Vaswani and Samy Bengio and Eugene Brevdo and Francois Chollet and Aidan N. Gomez and Stephan Gouws and Llion Jones and \L{}ukasz Kaiser and Nal Kalchbrenner and Niki Parmar and Ryan Sepassi and - Noam Shazeer and Jakob Uszkoreit}, - title = {Tensor2Tensor for Neural Machine Translation}, - journal = {CoRR}, - volume = {abs/1803.07416}, - year = {2018}, - url = {http://arxiv.org/abs/1803.07416}, -} - -@article{gehring2017convs2s, - author = {Gehring, Jonas, and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N}, - title = "{Convolutional Sequence to Sequence Learning}", - journal = {ArXiv e-prints arXiv:1705.03122}, - archivePrefix = "arXiv", - eprinttype = {arxiv}, - eprint = {1705.03122}, - primaryClass = "cs.CL", - keywords = {Computer Science - Computation and Language}, - year = 2017, - month = May, -} - -@inproceedings{chan2015, - title={Listen, attend and spell}, - author={Chan, William and Jaitly, Navdeep and Le, Quoc V and Vinyals, Oriol}, - booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2016 IEEE International Conference on}, - pages={5206--5210}, - year={2016}, - organization={IEEE} -} - -@inproceedings{xu2015show, - title={Show, attend and tell: Neural image caption generation with visual attention}, - author={Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhudinov, Ruslan and Zemel, Rich and Bengio, Yoshua}, - booktitle={International Conference on Machine Learning}, - pages={2048--2057}, - year={2015} -} - -@incollection{Sutskever2014, - title = {Sequence to Sequence Learning with Neural Networks}, - author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V}, - booktitle = {Advances in Neural Information Processing Systems 27}, - editor = {Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence and K. Q. 
Weinberger}, - pages = {3104--3112}, - year = {2014}, - publisher = {Curran Associates, Inc.}, - url = {http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf} -} - -@article{DeepSpeech2014, - title = {Deep Speech: Scaling up end-to-end speech recognition}, - author = {Awni Y. Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng}, - journal = {CoRR}, - volume = {abs/1412.5567}, - year = {2014}, - url = {http://arxiv.org/abs/1412.5567}, - archivePrefix = {arXiv}, - eprint = {1412.5567}, - timestamp = {Mon, 13 Aug 2018 16:48:07 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/HannunCCCDEPSSCN14}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@inproceedings{DeepSpeech2, - author = {Amodei, Dario and Ananthanarayanan, Sundaram and Anubhai, Rishita and Bai, Jingliang and Battenberg, Eric and Case, Carl and Casper, Jared and Catanzaro, Bryan and Cheng, Qiang and Chen, Guoliang and Chen, Jie and Chen, Jingdong and Chen, Zhijie and Chrzanowski, Mike and Coates, Adam and Diamos, Greg and Ding, Ke and Du, Niandong and Elsen, Erich and Engel, Jesse and Fang, Weiwei and Fan, Linxi and Fougner, Christopher and Gao, Liang and Gong, Caixia and Hannun, Awni and Han, Tony and Johannes, Lappi Vaino and Jiang, Bing and Ju, Cai and Jun, Billy and LeGresley, Patrick and Lin, Libby and Liu, Junjie and Liu, Yang and Li, Weigao and Li, Xiangang and Ma, Dongpeng and Narang, Sharan and Ng, Andrew and Ozair, Sherjil and Peng, Yiping and Prenger, Ryan and Qian, Sheng and Quan, Zongfeng and Raiman, Jonathan and Rao, Vinay and Satheesh, Sanjeev and Seetapun, David and Sengupta, Shubho and Srinet, Kavya and Sriram, Anuroop and Tang, Haiyuan and Tang, Liliang and Wang, Chong and Wang, Jidong and Wang, Kaifu and Wang, Yi and Wang, Zhijian and Wang, Zhiqian and Wu, Shuang and Wei, Likai and Xiao, Bo and Xie, Wen and Xie, Yan and Yogatama, Dani and Yuan, Bin and Zhan, Jun and Zhu, Zhenyao}, - title = {Deep Speech 2: End-to-end Speech Recognition in English and Mandarin}, - booktitle = {Proceedings of the 33rd International Conference on International Conference on Machine Learning - Volume 48}, - series = {ICML'16}, - year = {2016}, - location = {New York, NY, USA}, - pages = {173--182}, - numpages = {10}, - url = {http://dl.acm.org/citation.cfm?id=3045390.3045410}, - acmid = {3045410}, - publisher = {JMLR.org}, -} - -@inproceedings{prabhavalkar2017comparison, - title={A comparison of sequence-to-sequence models for speech recognition}, - author={Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N and Li, Bo and Johnson, Leif and Jaitly, Navdeep}, - booktitle={Proc. 
Interspeech}, - pages={939--943}, - year={2017} -} - -@article{chiu2017state, - title={State-of-the-art speech recognition with sequence-to-sequence models}, - author={Chiu, Chung-Cheng and Sainath, Tara N and Wu, Yonghui and Prabhavalkar, Rohit and Nguyen, Patrick and Chen, Zhifeng and Kannan, Anjuli and Weiss, Ron J and Rao, Kanishka and Gonina, Katya and others}, - journal={arXiv preprint arXiv:1712.01769}, - year={2017} -} - -@misc{NVMixed, - title = {{NVIDA's Mixed-Precision Training - TensorFlow example}}, - howpublished = {\url{https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/#example_tensorflow}}, - author={NVIDIA}, - note = {Accessed: 2018-10-09}, - year={2018} -} - -@article{gehring2017, - title={Convolutional sequence to sequence learning}, - author={Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N}, - journal={arXiv preprint arXiv:1705.03122}, - year={2017} -} - -@article{collobert2016, - title={Wav2letter: an end-to-end convnet-based speech recognition system}, - author={Collobert, Ronan and Puhrsch, Christian and Synnaeve, Gabriel}, - journal={arXiv preprint arXiv:1609.03193}, - year={2016} -} - -@inproceedings{Zhang2016, -author={Ying Zhang and Mohammad Pezeshki and Philémon Brakel and Saizheng Zhang and César Laurent and Yoshua Bengio and Aaron Courville}, -title={Towards End-to-End Speech Recognition with Deep Convolutional Neural Networks}, -year=2016, -booktitle={Interspeech 2016}, -doi={10.21437/Interspeech.2016-1446}, -url={http://dx.doi.org/10.21437/Interspeech.2016-1446}, -pages={410--414} -} - -@inproceedings{Zhang2017, - title={Very deep convolutional networks for end-to-end speech recognition}, - author={Zhang, Yu, and Chan, William, and Jaitly, Navdeep}, - booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2017 IEEE International Conference on}, - year={2017}, - organization={IEEE} -} - - -@article{Wang2017, - title={Tacotron: Towards End-to-End Speech Synthesis}, - author={ Wang, Yuxuan, and Skerry-Ryan, RJ, and Stanton, Daisy and Wu, Yonghui and Weiss, Ron, and Jaitly, Navdeep and Yang, Zongheng and Xiao, Ying and Chen,Zhifeng and Bengio, Samy and Le, Quoc and Agiomyrgiannakis, Yannis and Clark,Rob and Saurous, Rif A.}, - journal={arXiv preprint arXiv:1703.10135}, - year={2017} -} - -@article{griffin1984signal, - title={Signal estimation from modified short-time Fourier transform}, - author={Griffin, Daniel and Lim, Jae}, - journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, - volume={32}, - number={2}, - pages={236--243}, - year={1984}, - publisher={IEEE} -} - -@misc{ito2017lj, - title={The LJ speech dataset}, - author={Ito, Keith and others}, - year={2017} -} - -@misc{mailabs, - title = {{The M-AILABS Speech Dataset}}, - howpublished = {\url{http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/}}, - author={M-AILABS}, - note = {Accessed: 2018-10-09}, - year={2018} -} - -@article{merity2016pointer, - title={Pointer sentinel mixture models}, - author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, - journal={arXiv preprint arXiv:1609.07843}, - year={2016} -} - -@inproceedings{socher2013recursive, - title={Recursive deep models for semantic compositionality over a sentiment treebank}, - author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D and Ng, Andrew and Potts, Christopher}, - booktitle={Proceedings of the 2013 conference on empirical methods in natural language processing}, - 
pages={1631--1642}, - year={2013} -} - -@InProceedings{maas-EtAl:2011:ACL-HLT2011, - author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, - title = {Learning Word Vectors for Sentiment Analysis}, - booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, - month = {June}, - year = {2011}, - address = {Portland, Oregon, USA}, - publisher = {Association for Computational Linguistics}, - pages = {142--150}, - url = {http://www.aclweb.org/anthology/P11-1015} -} - -@inproceedings{Povey2018SemiOrthogonalLM, - title={Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks}, - author={Daniel Povey and Gaofeng Cheng and Yiming Wang and Ke Li and Hainan Xu and Mahsa Yarmohammadi and Sanjeev Khudanpur}, - booktitle={Interspeech}, - year={2018} -} - -@article{CAPIO2017, - author = {Kyu J. Han and Akshay Chandrashekaran and Jungsuk Kim and Ian R. Lane}, - title = {The {CAPIO} 2017 Conversational Speech Recognition System}, - journal = {CoRR}, - volume = {abs/1801.00059}, - year = {2018}, - url = {http://arxiv.org/abs/1801.00059}, - archivePrefix = {arXiv}, - eprint = {1801.00059}, - timestamp = {Mon, 13 Aug 2018 16:49:10 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1801-00059}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{WaveNet, - author = {A{\"{a}}ron van den Oord and Sander Dieleman and Heiga Zen and Karen Simonyan and Oriol Vinyals and Alex Graves and Nal Kalchbrenner and Andrew W. Senior and Koray Kavukcuoglu}, - title = {WaveNet: {A} Generative Model for Raw Audio}, - journal = {CoRR}, - volume = {abs/1609.03499}, - year = {2016}, - url = {http://arxiv.org/abs/1609.03499}, - archivePrefix = {arXiv}, - eprint = {1609.03499}, - timestamp = {Mon, 13 Aug 2018 16:49:15 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/OordDZSVGKSK16}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{FacebookGERENGBackTranslation, - author = {Rico Sennrich and Barry Haddow and Alexandra Birch}, - title = {Improving Neural Machine Translation Models with Monolingual Data}, - journal = {CoRR}, - volume = {abs/1511.06709}, - year = {2015}, - url = {http://arxiv.org/abs/1511.06709}, - archivePrefix = {arXiv}, - eprint = {1511.06709}, - timestamp = {Mon, 13 Aug 2018 16:47:05 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/SennrichHB15a}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{GlobalStyleTokens, - author = {Yuxuan Wang and Daisy Stanton and Yu Zhang and R. J. Skerry{-}Ryan and Eric Battenberg and Joel Shor and Ying Xiao and Fei Ren and Ye Jia and Rif A. 
Saurous}, - title = {Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis}, - journal = {CoRR}, - volume = {abs/1803.09017}, - year = {2018}, - url = {http://arxiv.org/abs/1803.09017}, - archivePrefix = {arXiv}, - eprint = {1803.09017}, - timestamp = {Mon, 13 Aug 2018 16:46:53 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1803-09017}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{IoffeS15BatchNorm, - author = {Sergey Ioffe and Christian Szegedy}, - title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, - journal = {CoRR}, - volume = {abs/1502.03167}, - year = {2015}, - url = {http://arxiv.org/abs/1502.03167}, - archivePrefix = {arXiv}, - eprint = {1502.03167}, - timestamp = {Mon, 13 Aug 2018 16:47:06 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/IoffeS15}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{kingma, - author = {Diederik P. Kingma and - Jimmy Ba}, - title = {Adam: {A} Method for Stochastic Optimization}, - journal = {CoRR}, - volume = {abs/1412.6980}, - year = {2014}, - url = {http://arxiv.org/abs/1412.6980}, - archivePrefix = {arXiv}, - eprint = {1412.6980}, - timestamp = {Mon, 13 Aug 2018 01:00:00 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/KingmaB14}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@incollection{Salimans2016WeightNorm, - title = {Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks}, - author = {Salimans, Tim and Kingma, Durk P}, - booktitle = {Advances in Neural Information Processing Systems 29}, - editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett}, - pages = {901--909}, - year = {2016}, - publisher = {Curran Associates, Inc.}, - url = {http://papers.nips.cc/paper/6114-weight-normalization-a-simple-reparameterization-to-accelerate-training-of-deep-neural-networks.pdf} -} - -@article{wu2016google, - title={Google's neural machine translation system: Bridging the gap between human and machine translation}, - author={Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Zolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and others}, - journal={arXiv preprint arXiv:1609.08144}, - year={2016} -} - -@inproceedings{opennmt, - author = {Guillaume Klein and Yoon Kim and Yuntian Deng and Jean Senellart and Alexander M. Rush}, - title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, - booktitle = {Proc. ACL}, - year = {2017}, - url = {https://doi.org/10.18653/v1/P17-4012}, - doi = {10.18653/v1/P17-4012} -} - -@article{paszke2017automatic, - title={Automatic differentiation in PyTorch}, - author={Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam}, - year={2017} -} - -@article{yu2014introduction, - title={An introduction to computational networks and the computational network toolkit}, - author={Yu, Dong and Eversole, Adam and Seltzer, Mike and Yao, Kaisheng and Huang, Zhiheng and Guenter, Brian and Kuchaiev, Oleksii and Zhang, Yu and Seide, Frank and Wang, Huaming and others}, - journal={Microsoft Technical Report MSR-TR-2014--112}, - year={2014} -} - -@article{nvidia2017v100, - title={V100 GPU architecture. The world’s most advanced data center GPU. 
Version WP-08608-001\_v1. 1}, - author={NVIDIA, Tesla}, - journal={NVIDIA. Aug}, - pages={108}, - year={2017} -} - -@article{Ba2016LayerNorm, - author = {Jimmy Lei Ba and Jamie Ryan Kiros and Geoffrey E Hinton}, - title = {Layer normalization}, - journal = {CoRR}, - volume = {abs/1607.06450}, - year = {2016}, - url = {http://arxiv.org/abs/1607.06450}, - archivePrefix = {arXiv}, -} - -@inproceedings{Dauphin2017GLU, - author = {Dauphin, Yann N. and Fan, Angela and Auli, Michael and Grangier, David}, - title = {Language Modeling with Gated Convolutional Networks}, - booktitle = {Proceedings of the 34th International Conference on Machine Learning - Volume 70}, - series = {ICML'17}, - year = {2017}, - location = {Sydney, NSW, Australia}, - pages = {933--941}, - numpages = {9}, - url = {http://dl.acm.org/citation.cfm?id=3305381.3305478}, - acmid = {3305478}, - publisher = {JMLR.org}, -} - -@incollection{Oord2016PixelCNN, -title = {Conditional Image Generation with PixelCNN Decoders}, -author = {van den Oord, Aaron and Kalchbrenner, Nal and Espeholt, Lasse and kavukcuoglu, koray and Vinyals, Oriol and Graves, Alex}, -booktitle = {Advances in Neural Information Processing Systems 29}, -editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett}, -pages = {4790--4798}, -year = {2016}, -publisher = {Curran Associates, Inc.}, -url = {http://papers.nips.cc/paper/6527-conditional-image-generation-with-pixelcnn-decoders.pdf} -} - -@article{he2015, - title={Deep residual learning for image recognition}, - author={K. He, and X. Zhang, and S. Ren, and J. Sun}, - journal={arXiv preprint arXiv:1512.03385}, - year={2015} -} - -@article{huang2016, - title={Densely Connected Convolutional Networks}, - author={Gao Huang, and Zhuang Liu, and Laurens van der Maaten, and Kilian Q. Weinberger}, - journal={arXiv preprint arXiv:1608.06993}, - year={2016} -} - -@inproceedings{heafield2011kenlm, - title={KenLM: Faster and smaller language model queries}, - author={Heafield, Kenneth}, - booktitle={Proceedings of the sixth workshop on statistical machine translation}, - pages={187--197}, - year={2011}, - organization={Association for Computational Linguistics} -} - -@article{dai2018transformer, - title={Transformer-XL: Language Modeling with Longer-Term Dependency}, - author={Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan}, - year={2018}, - journal = {CoRR}, - volume = {abs/1901.02860}, - url = {http://arxiv.org/abs/1901.02860}, - archivePrefix = {arXiv}, - eprint = {1901.02860}, - timestamp = {Fri, 01 Feb 2019 13:39:59 +0100}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1901-02860}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@inproceedings{Saon+2016, -author={George Saon and Tom Sercu and Steven Rennie and Hong-Kwang J. Kuo}, -title={The IBM 2016 English Conversational Telephone Speech Recognition System}, -year=2016, -booktitle={Interspeech 2016}, -doi={10.21437/Interspeech.2016-1460}, -url={http://dx.doi.org/10.21437/Interspeech.2016-1460}, -pages={7--11} -} - -@INPROCEEDINGS{Sercu-2016, -author={T. {Sercu} and C. {Puhrsch} and B. {Kingsbury} and Y. 
{LeCun}}, -booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, -title={Very deep multilingual convolutional neural networks for LVCSR}, -year={2016}, -volume={}, -number={}, -pages={4955-4959}, -keywords={natural language processing;neural nets;speech recognition;very deep multilingual convolutional neural networks;LVCSR;CNN;large vocabulary continuous speech recognition systems;word error rate;Training;Context;Hidden Markov models;Neural networks;Computer architecture;Kernel;Training data;Convolutional Networks;Multilingual;Acoustic Modeling;Speech Recognition;Neural Networks}, -doi={10.1109/ICASSP.2016.7472620}, -ISSN={2379-190X}, -month={March},} - - -@inproceedings{Sercu+2016, -author={Tom Sercu and Vaibhava Goel}, -title={Advances in Very Deep Convolutional Neural Networks for LVCSR}, -year=2016, -booktitle={Interspeech 2016}, -doi={10.21437/Interspeech.2016-1033}, -url={http://dx.doi.org/10.21437/Interspeech.2016-1033}, -pages={3429--3433} -} - -@INPROCEEDINGS{Xiong-2018, -author={W. {Xiong} and L. {Wu} and F. {Alleva} and J. {Droppo} and X. {Huang} and A. {Stolcke}}, -booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, -title={The Microsoft 2017 Conversational Speech Recognition System}, -year={2018}, -volume={}, -number={}, -pages={5934-5938}, -keywords={convolution;feedforward neural nets;natural language processing;speaker recognition;speech processing;language model rescoring step;senone level;switchboard domains;character-based LSTM language models;NIST 2000 switchboard test set;frame level;word-level voting;acoustic model posteriors;dialog session aware LSTM language models;CNN-BLSTM acoustic model;Microsoft 2017 conversational speech recognition system;Acoustics;Error analysis;Training;Speech recognition;Switches;Computational modeling;Context modeling;Conversational speech recognition;CNN;LACE;BLSTM;LSTM-LM;system combination;human parity}, -doi={10.1109/ICASSP.2018.8461870}, -ISSN={2379-190X}, -month={April},} - -@inproceedings{zeyer2018improved, - author={Albert Zeyer and Kazuki Irie and Ralf Schlüter and Hermann Ney}, - title={Improved Training of End-to-end Attention Models for Speech Recognition}, - year=2018, - booktitle={Proc. 
Interspeech 2018}, - pages={7--11}, - doi={10.21437/Interspeech.2018-1616}, - url={http://dx.doi.org/10.21437/Interspeech.2018-1616} -} - -@article{Wav2LetterV2, - author = {Vitaliy Liptchinsky and - Gabriel Synnaeve and - Ronan Collobert}, - title = {Letter-Based Speech Recognition with Gated ConvNets}, - journal = {CoRR}, - volume = {abs/1712.09444}, - year = {2017}, - url = {http://arxiv.org/abs/1712.09444}, - archivePrefix = {arXiv}, - eprint = {1712.09444}, - timestamp = {Mon, 13 Aug 2018 16:46:33 +0200}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1712-09444}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{zeghidour2018, - author = {Neil Zeghidour and - Qiantong Xu and - Vitaliy Liptchinsky and - Nicolas Usunier and - Gabriel Synnaeve and - Ronan Collobert}, - title = {Fully Convolutional Speech Recognition}, - journal = {CoRR}, - volume = {abs/1812.06864}, - year = {2018}, - url = {http://arxiv.org/abs/1812.06864}, - archivePrefix = {arXiv}, - eprint = {1812.06864}, - timestamp = {Tue, 01 Jan 2019 15:01:25 +0100}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1812-06864}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@inproceedings{Hadian2018, - author={Hossein Hadian and Hossein Sameti and Daniel Povey and Sanjeev Khudanpur}, - title={End-to-end Speech Recognition Using Lattice-free MMI}, - year=2018, - booktitle={Proc. Interspeech 2018}, - pages={12--16}, - doi={10.21437/Interspeech.2018-1423}, - url={http://dx.doi.org/10.21437/Interspeech.2018-1423} -} - -@inproceedings{Tang2018, - author={Jian Tang and Yan Song and Lirong Dai and Ian McLoughlin}, - title={Acoustic Modeling with Densely Connected Residual Network for Multichannel Speech Recognition}, - year=2018, - booktitle={Proc. Interspeech 2018}, - pages={1783--1787}, - doi={10.21437/Interspeech.2018-1089}, - url={http://dx.doi.org/10.21437/Interspeech.2018-1089} -} - -@article{Kurata2017LanguageMW, - title={Language modeling with highway LSTM}, - author={Gakuto Kurata and Bhuvana Ramabhadran and George Saon and Abhinav Sethy}, - journal={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)}, - year={2017}, - pages={244-251} -} - -@inproceedings{Saon2017, - author={George Saon and Gakuto Kurata and Tom Sercu and Kartik Audhkhasi and Samuel Thomas and Dimitrios Dimitriadis and Xiaodong Cui and Bhuvana Ramabhadran and Michael Picheny and Lynn-Li Lim and Bergul Roomi and Phil Hall}, - title={English Conversational Telephone Speech Recognition by Humans and Machines}, - year=2017, - booktitle={Proc. 
Interspeech 2017}, - pages={132--136}, - doi={10.21437/Interspeech.2017-405}, - url={http://dx.doi.org/10.21437/Interspeech.2017-405} -} - -@inproceedings{Povey+2016, -author={Daniel Povey and Vijayaditya Peddinti and Daniel Galvez and Pegah Ghahremani and Vimal Manohar and Xingyu Na and Yiming Wang and Sanjeev Khudanpur}, -title={Purely Sequence-Trained Neural Networks for ASR Based on Lattice-Free MMI}, -year=2016, -booktitle={Interspeech 2016}, -doi={10.21437/Interspeech.2016-595}, -url={http://dx.doi.org/10.21437/Interspeech.2016-595}, -pages={2751--2755} -} - -@article{Yang2018, - author = {Xuerui Yang and - Jiwei Li and - Xi Zhou}, - title = {A novel pyramidal-FSMN architecture with lattice-free {MMI} for speech - recognition}, - journal = {CoRR}, - volume = {abs/1810.11352}, - year = {2018}, - url = {http://arxiv.org/abs/1810.11352}, - archivePrefix = {arXiv}, - eprint = {1810.11352}, - timestamp = {Wed, 31 Oct 2018 14:24:29 +0100}, - biburl = {https://dblp.org/rec/bib/journals/corr/abs-1810-11352}, - bibsource = {dblp computer science bibliography, https://dblp.org} -} - -@article{liptchinsky2017based, - title={Letter-Based Speech Recognition with Gated ConvNets}, - author={Liptchinsky, Vitaliy and Synnaeve, Gabriel and Collobert, Ronan}, - journal={arXiv preprint arXiv:1712.09444}, - year={2017} -} - -@inproceedings{Weng2018, - author={Chao Weng and Jia Cui and Guangsen Wang and Jun Wang and Chengzhu Yu and Dan Su and Dong Yu}, - title={Improving Attention Based Sequence-to-Sequence Models for End-to-End English Conversational Speech Recognition}, - year=2018, - booktitle={Proc. Interspeech 2018}, - pages={761--765}, - doi={10.21437/Interspeech.2018-1030}, - url={http://dx.doi.org/10.21437/Interspeech.2018-1030} -} - -@INPROCEEDINGS{Battenberg2017, -author={E. {Battenberg} and J. {Chen} and R. {Child} and A. {Coates} and Y. G. Y. {Li} and H. {Liu} and S. {Satheesh} and A. {Sriram} and Z. {Zhu}}, -booktitle={2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)}, -title={Exploring neural transducers for end-to-end speech recognition}, -year={2017}, -volume={}, -number={}, -pages={206-213}, -keywords={recurrent neural nets;speech recognition;Hub500 benchmark;CTC models;speech recognition pipeline;RNN-Transducer models;language model;Seq2Seq models;end-to-end speech recognition;neural transducers;Decoding;Hidden Markov models;Transducers;Task analysis;Speech;Mathematical model;Neural networks}, -doi={10.1109/ASRU.2017.8268937}, -ISSN={}, -month={Dec}, -} - -@inproceedings{ -loshchilov2018, -title={Decoupled Weight Decay Regularization}, -author={Ilya Loshchilov and Frank Hutter}, -booktitle={International Conference on Learning Representations}, -year={2019}, -url={https://openreview.net/forum?id=Bkg6RiCqY7}, -} - -@article{zhang2017ndadam, - author = {Zijun Zhang and Lin Ma and Zongpeng Li and Chuan Wu}, - title = {Normalized Direction-preserving Adam}, - journal = {arXiv e-prints arXiv:1709.04546}, - year = {2017}, -} - -@article{park2019, - author = {{Park}, Daniel S. and {Chan}, William and {Zhang}, Yu and - {Chiu}, Chung-Cheng and {Zoph}, Barret and {Cubuk}, Ekin D. 
and - {Le}, Quoc V.}, - title = "{SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition}", - journal = {arXiv e-prints}, - year = "2019", - eid = {arXiv:1904.08779}, - eprint = {1904.08779}, -} - -@article{novograd2019, - author = {{Ginsburg}, Boris and {Castonguay}, Patrice and {Hrinchuk}, Oleksii and - {Kuchaiev}, Oleksii and {Lavrukhin}, Vitaly and {Leary}, Ryan and - {Li}, Jason and {Nguyen}, Huyen and {Cohen}, Jonathan M.}, - title = "{Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks}", - journal = {arXiv e-prints}, - year = "2019", - eid = {arXiv:1905.11286}, - eprint = {1905.11286}, -} - -@article{kriman2019quartznet, - title={Quartznet: {Deep} automatic speech recognition with 1d time-channel separable convolutions}, - author={Kriman, Samuel and Beliaev, Stanislav and Ginsburg, Boris and Huang, Jocelyn and Kuchaiev, Oleksii and Lavrukhin, Vitaly and Leary, Ryan and Li, Jason and Zhang, Yang}, - journal={arXiv preprint arXiv:1910.10261}, - year={2019} -} - -@misc{itu1988g711, - title={{ITU-T} {G.711} - {Pulse} code modulation ({PCM}) of voice frequencies}, - author={ITU-T Geneva Switzerland}, - year={1988}, -} - -@article{han2020contextnet, - title={ContextNet: Improving convolutional neural networks for automatic speech recognition with global context}, - author={Han, Wei and Zhang, Zhengdong and Zhang, Yu and Yu, Jiahui and Chiu, Chung-Cheng and Qin, James and Gulati, Anmol and Pang, Ruoming and Wu, Yonghui}, - journal={arXiv:2005.03191}, - year={2020} -} - -@inproceedings{hu2018squeeze, - title={Squeeze-and-excitation networks}, - author={Hu, Jie and Shen, Li and Sun, Gang}, - booktitle={ICVPR}, - year={2018} -} - -@article{koluguri2020speakernet, - title={SpeakerNet: 1D Depth-wise Separable Convolutional Network for Text-Independent Speaker Recognition and Verification}, - author={Koluguri, Nithin Rao and Li, Jason and Lavrukhin, Vitaly and Ginsburg, Boris}, - journal={arXiv preprint arXiv:2010.12653}, - year={2020} -} - -@article{gulati2020conformer, - title={Conformer: Convolution-augmented transformer for speech recognition}, - author={Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and others}, - journal={arXiv preprint arXiv:2005.08100}, - year={2020} -} - -@article{koluguri2021titanet, - title={TitaNet: Neural Model for speaker representation with 1D Depth-wise separable convolutions and global context}, - author={Koluguri, Nithin Rao and Park, Taejin and Ginsburg, Boris}, - journal={arXiv preprint arXiv:2110.04410}, - year={2021} -} - -@article{Dawalatabad_2021, - title={ECAPA-TDNN Embeddings for Speaker Diarization}, - url={http://dx.doi.org/10.21437/Interspeech.2021-941}, - DOI={10.21437/interspeech.2021-941}, - journal={Interspeech 2021}, - publisher={ISCA}, - author={Dawalatabad, Nauman and Ravanelli, Mirco and Grondin, François and Thienpondt, Jenthe and Desplanques, Brecht and Na, Hwidong}, - year={2021}, - month={Aug} -} - -@article{park2022multi, - title = {Multi-scale Speaker Diarization with Dynamic Scale Weighting}, - author = {Park, Tae Jin and Koluguri, Nithin Rao and Balam, Jagadeesh and Ginsburg, Boris}, - journal = {https://arxiv.org/abs/2203.15974}, - year = {2022} -} - - -@inproceedings{he2019streaming, - title={Streaming end-to-end speech recognition for mobile devices}, - author={He, Yanzhang and Sainath, Tara N and Prabhavalkar, Rohit and McGraw, Ian and Alvarez, Raziel 
and Zhao, Ding and Rybach, David and Kannan, Anjuli and Wu, Yonghui and Pang, Ruoming and others}, - booktitle={ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - pages={6381--6385}, - year={2019}, - organization={IEEE} -} - -@misc{wav2vec2, - doi = {10.48550/ARXIV.2006.11477}, - url = {https://arxiv.org/abs/2006.11477}, - author = {Baevski, Alexei and Zhou, Henry and Mohamed, Abdelrahman and Auli, Michael}, - title = {wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations}, - publisher = {arXiv}, - year = {2020}, - copyright = {arXiv.org perpetual, non-exclusive license} -} - -@misc{w2v_bert, - doi = {10.48550/ARXIV.2108.06209}, - url = {https://arxiv.org/abs/2108.06209}, - author = {Chung, Yu-An and Zhang, Yu and Han, Wei and Chiu, Chung-Cheng and Qin, James and Pang, Ruoming and Wu, Yonghui}, - title = {W2v-BERT: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training}, - publisher = {arXiv}, - year = {2021}, - copyright = {arXiv.org perpetual, non-exclusive license} -} - -@misc{ssl_inter, - doi = {10.48550/ARXIV.2112.08778}, - url = {https://arxiv.org/abs/2112.08778}, - author = {Wang, Chengyi and Wu, Yu and Chen, Sanyuan and Liu, Shujie and Li, Jinyu and Qian, Yao and Yang, Zhenglu}, - title = {Self-Supervised Learning for speech recognition with Intermediate layer supervision}, - publisher = {arXiv}, - year = {2021}, - copyright = {arXiv.org perpetual, non-exclusive license} -} - -@misc{kim2022squeezeformer, - doi = {10.48550/ARXIV.2206.00888}, - url = {https://arxiv.org/abs/2206.00888}, - author = {Kim, Sehoon and Gholami, Amir and Shaw, Albert and Lee, Nicholas and Mangalam, Karttikeya and Malik, Jitendra and Mahoney, Michael W. and Keutzer, Kurt}, - keywords = {Audio and Speech Processing (eess.AS), Computation and Language (cs.CL), Sound (cs.SD), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Squeezeformer: An Efficient Transformer for Automatic Speech Recognition}, - publisher = {arXiv}, - year = {2022}, - copyright = {arXiv.org perpetual, non-exclusive license} -} - -@misc{park2022multi, - doi = {10.48550/ARXIV.2203.15974}, - url = {https://arxiv.org/abs/2203.15974}, - author = {Park, Tae Jin and Koluguri, Nithin Rao and Balam, Jagadeesh and Ginsburg, Boris}, - keywords = {Audio and Speech Processing (eess.AS), Computation and Language (cs.CL), FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Multi-scale Speaker Diarization with Dynamic Scale Weighting}, - publisher = {arXiv}, - year = {2022}, - copyright = {Creative Commons Attribution 4.0 International} -} diff --git a/SoundScribe/SpeakerID/docs/source/asr/asr_language_modeling.rst b/SoundScribe/SpeakerID/docs/source/asr/asr_language_modeling.rst deleted file mode 100644 index bb823cb252c03a1db6a8a72dd7b5c20ed8fc6b55..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/asr_language_modeling.rst +++ /dev/null @@ -1,548 +0,0 @@ -##################### -ASR Language Modeling -##################### - -Language models have shown to help the accuracy of ASR models. 
NeMo supports the following two approaches to incorporate language models into the ASR models: - -* :ref:`ngram_modeling` -* :ref:`neural_rescoring` - -It is possible to use both approaches on the same ASR model. - - -.. _ngram_modeling: - -************************ -N-gram Language Modeling -************************ - -In this approach, an N-gram LM is trained on text data, then it is used in fusion with beam search decoding to find the -best candidates. The beam search decoders in NeMo support language models trained with KenLM library ( -`https://github.com/kpu/kenlm `__). -The beam search decoders and KenLM library are not installed by default in NeMo, and you need to install them to be -able to use beam search decoding and N-gram LM. -Please refer to `scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh `__ -on how to install them. Alternatively, you can build Docker image -`scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. - -NeMo supports both character-based and BPE-based models for N-gram LMs. An N-gram LM can be used with beam search -decoders on top of the ASR models to produce more accurate candidates. The beam search decoder would incorporate -the scores produced by the N-gram LM into its score calculations as the following: - -.. code-block:: - - final_score = acoustic_score + beam_alpha*lm_score + beam_beta*seq_length - -where acoustic_score is the score predicted by the acoustic encoder and lm_score is the one estimated by the LM. -Parameter 'beam_alpha' specifies amount of importance to place on the N-gram language model, and 'beam_beta' is a -penalty term to consider the sequence length in the scores. Larger alpha means more importance on the LM and less -importance on the acoustic model. Negative values for beta will give penalty to longer sequences and make the decoder -to prefer shorter predictions, while positive values would result in longer candidates. - -.. _train-ngram-lm: - -Train N-gram LM -=============== - -The script to train an N-gram language model with KenLM can be found at -`scripts/asr_language_modeling/ngram_lm/train_kenlm.py `__. - -This script would train an N-gram language model with KenLM library which can be used with the beam search decoders -on top of the ASR models. This script supports both character level and BPE level encodings and models which are -detected automatically from the type of the model. - - -You may train the N-gram model as the following: - -.. code-block:: - - python train_kenlm.py nemo_model_file= \ - train_paths= \ - kenlm_bin_path= \ - kenlm_model_file= \ - ngram_length= \ - preserve_arpa=true - -The `train_paths` parameter allows for various input types, such as a list of text files, JSON manifests, or directories, to be used as the training data. -If the file's extension is anything other than `.json`, it assumes that data format is plain text. For plain text format, each line should contain one -sample. For JSON manifest file, the file need to contain json formatted samples per each line like this: - -.. code-block:: - - {"audio_filepath": "/data_path/file1.wav", "text": "The transcript of the audio file."} - -It just extracts the `text` field from each line to create the training text file. After the N-gram model is trained, -it is stored at the path specified by `kenlm_model_file`. 
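-
-As a rough illustration of the manifest handling described above, the following minimal Python sketch (it is not part of ``train_kenlm.py``, and the file names are hypothetical) shows how the ``text`` field can be pulled out of a JSON manifest to build a plain-text corpus for KenLM:
-
-.. code-block:: python
-
-    import json
-
-    # Each line of a NeMo manifest is a standalone JSON object containing a "text" field.
-    with open("train_manifest.json", "r", encoding="utf-8") as manifest, \
-         open("lm_corpus.txt", "w", encoding="utf-8") as corpus:
-        for line in manifest:
-            line = line.strip()
-            if not line:
-                continue
-            sample = json.loads(line)
-            # Only the transcript is needed to train the N-gram LM.
-            corpus.write(sample["text"] + "\n")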
- -The following is the list of the arguments for the training script: - -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| **Argument** | **Type** | **Default** | **Description** | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| nemo_model_file | str | Required | The path to `.nemo` file of the ASR model, or name of a pretrained NeMo model to extract a tokenizer. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| train_paths | List[str] | Required | List of training files or folders. Files can be a plain text file or ".json" manifest or ".json.gz". | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| kenlm_model_file | str | Required | The path to store the KenLM binary model file. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| kenlm_bin_path | str | Required | The path to the bin folder of KenLM. It is a folder named `bin` under where KenLM is installed. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| ngram_length** | int | Required | Specifies order of N-gram LM. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| ngram_prune | List[int] | [0] | List of thresholds to prune N-grams. Example: [0,0,1]. See Pruning section on the https://kheafield.com/code/kenlm/estimation | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| cache_path | str | "" | Cache path to save tokenized files. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| preserve_arpa | bool | ``False`` | Whether to preserve the intermediate ARPA file after construction of the BIN file. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| verbose | int | 1 | Verbose level. | -+------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ - -** Note: Recommend to use 6 as the order of the N-gram model for BPE-based models. Higher orders may need the re-compilation of KenLM to support it. - -Evaluate by Beam Search Decoding and N-gram LM -============================================== - -NeMo's beam search decoders are capable of using the KenLM's N-gram models to find the best candidates. -The script to evaluate an ASR model with beam search decoding and N-gram models can be found at -`scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__. - -This script has a large number of possible argument overrides, therefore it is advised to use ``python eval_beamsearch_ngram.py --help`` to see the full list of arguments. - -You may evaluate an ASR model as the following: - -.. 
code-block:: - - python eval_beamsearch_ngram.py nemo_model_file= \ - input_manifest= \ - beam_width=[] \ - beam_alpha=[] \ - beam_beta=[] \ - preds_output_folder= \ - probs_cache_file=null \ - decoding_mode=beamsearch_ngram \ - decoding_strategy="" - -It can evaluate a model in the three following modes by setting the argument `--decoding_mode`: - -* greedy: Just greedy decoding is done, and no beam search decoding is performed. -* beamsearch: The beam search decoding is done but without using the N-gram language model, final results would be equivalent to setting the weight of LM (beam_beta) to zero. -* beamsearch_ngram: The beam search decoding is done with N-gram LM. - -The `beamsearch` mode would evaluate by beam search decoding without any language model. -It would report the performances in terms of Word Error Rate (WER) and Character Error Rate (CER). Moreover, -the WER/CER of the model when the best candidate is selected among the candidates is also reported as the best WER/CER. -It can be an indicator of how good the predicted candidates are. - -The script would initially load the ASR model and predict the outputs of the model's encoder as log probabilities. -This part would be computed in batches on a device selected by `--device`, which can be CPU (`--device=cpu`) or a -single GPU (`--device=cuda:0`). The batch size of this part can get specified by `--acoustic_batch_size`. You may use -the largest batch size feasible to speed up the step of calculating the log probabilities. You may also use `--use_amp` -to speed up the calculation of log probabilities and make it possible to use larger sizes for `--acoustic_batch_size`. -Currently multi-GPU is not supported for calculating the log probabilities, but using `--probs_cache_file` can help. -It stores the log probabilities produced from the model's encoder into a pickle file so that next time the first step -can get skipped. - -The following is the list of the important arguments for the evaluation script: - -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| **Argument** | **Type** | **Default** | **Description** | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| nemo_model_file | str | Required | The path of the `.nemo` file of the ASR model to extract the tokenizer. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| input_manifest | str | Required | Path to the training file, it can be a text file or JSON manifest. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| kenlm_model_file | str | Required | The path to store the KenLM binary model file. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| preds_output_folder | str | None | The path to an optional folder to store the predictions. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| probs_cache_file | str | None | The cache file for storing the outputs of the model. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| acoustic_batch_size | int | 16 | The batch size to calculate log probabilities. 
| -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| use_amp | bool | False | Whether to use AMP if available to calculate log probabilities. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| device | str | cuda | The device to load the model onto to calculate log probabilities. | -| | | | It can `cpu`, `cuda`, `cuda:0`, `cuda:1`, ... | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| decoding_mode | str | beamsearch_ngram | The decoding scheme to be used for evaluation. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| beam_width | float | Required | List of the width or list of the widths of the beam search decoding. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| beam_alpha | float | Required | List of the alpha parameter for the beam search decoding. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| beam_beta | float | Required | List of the beta parameter for the beam search decoding. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| beam_batch_size | int | 128 | The batch size to be used for beam search decoding. | -| | | | Larger batch size can be a little faster, but uses larger memory. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| decoding_strategy | str | beam | String argument for type of decoding strategy for the model. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| decoding | Dict | BeamCTC | Subdict of beam search configs. Values found via | -| | Config | InferConfig | python eval_beamsearch_ngram.py --help | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| text_processing.do_lowercase | bool | ``False`` | Whether to make the training text all lower case. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| text_processing.punctuation_marks | str | "" | String with punctuation marks to process. Example: ".\,?" | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| text_processing.rm_punctuation | bool | ``False``| Whether to remove punctuation marks from text. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ -| text_processing.separate_punctuation | bool |``True``| Whether to separate punctuation with the previous word by space. | -+---------------------+----------+------------------+-------------------------------------------------------------------------+ - -Width of the beam search (`--beam_width`) specifies the number of top candidates/predictions the beam search decoder -would search for. Larger beams result in more accurate but slower predictions. - -.. 
note:: - - The ``eval_beamsearch_ngram.py`` script contains the entire subconfig used for CTC Beam Decoding. - Therefore it is possible to forward arguments for various beam search libraries such as ``flashlight`` - and ``pyctcdecode`` via the ``decoding`` subconfig. - -There is also a tutorial to learn more about evaluating the ASR models with N-gram LM here: -`Offline ASR Inference with Beam Search and External Language Model Rescoring `_ - -Beam Search Engines -------------------- - -NeMo ASR CTC supports multiple beam search engines for decoding. The default engine is ``beam`` which is the OpenSeq2Seq -decoding library. - -OpenSeq2Seq (``beam``) -~~~~~~~~~~~~~~~~~~~~~~ - -CPU-based beam search engine that is quite efficient and supports char and subword models. It requires a character/subword -KenLM model to be provided. - -The config for this decoding library is described above. - -Flashlight (``flashlight``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Flashlight is a C++ library for ASR decoding provided at `https://github.com/flashlight/flashlight `_. It is a CPU and CUDA-based beam search engine that is quite efficient and supports -char and subword models. It an ARPA KenLM file. - -It supports several advanced features such as lexicon based / lexicon free decoding, beam pruning threshold, and more. - -.. code-block:: python - - @dataclass - class FlashlightConfig: - lexicon_path: Optional[str] = None - boost_path: Optional[str] = None - beam_size_token: int = 16 - beam_threshold: float = 20.0 - unk_weight: float = -math.inf - sil_weight: float = 0.0 - -.. code-block:: - - # Lexicon-based decoding - python eval_beamsearch_ngram.py ... \ - decoding_strategy="flashlight" \ - decoding.beam.flashlight_cfg.lexicon_path='/path/to/lexicon.lexicon' \ - decoding.beam.flashlight_cfg.beam_size_token = 32 \ - decoding.beam.flashlight_cfg.beam_threshold = 25.0 - - # Lexicon-free decoding - python eval_beamsearch_ngram.py ... \ - decoding_strategy="flashlight" \ - decoding.beam.flashlight_cfg.beam_size_token = 32 \ - decoding.beam.flashlight_cfg.beam_threshold = 25.0 - -PyCTCDecode (``pyctcdecode``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyCTCDecode is a Python library for ASR decoding provided at `https://github.com/kensho-technologies/pyctcdecode `_. It is a CPU-based beam search engine that is somewhat efficient for a pure python library, and supports char and subword models. It requires a character/subword KenLM ARPA / BINARY model to be provided. - -It has advanced features such as word boosting which can be useful for transcript customization. - -.. code-block:: python - - @dataclass - class PyCTCDecodeConfig: - beam_prune_logp: float = -10.0 - token_min_logp: float = -5.0 - prune_history: bool = False - hotwords: Optional[List[str]] = None - hotword_weight: float = 10.0 - -.. code-block:: - - # PyCTCDecoding - python eval_beamsearch_ngram.py ... \ - decoding_strategy="pyctcdecode" \ - decoding.beam.pyctcdecode_cfg.beam_prune_logp = -10. \ - decoding.beam.pyctcdecode_cfg.token_min_logp = -5. \ - decoding.beam.pyctcdecode_cfg.hotwords=[] \ - decoding.beam.pyctcdecode_cfg.hotword_weight=10.0 - - -Hyperparameter Grid Search --------------------------- - -Beam search decoding with N-gram LM has three main hyperparameters: `beam_width`, `beam_alpha`, and `beam_beta`. -The accuracy of the model is dependent to the values of these parameters, specially beam_alpha and beam_beta. -You may specify a single or list of values for each of these parameters to perform grid search. 
It then performs beam search decoding over all combinations of these three hyperparameters.
-For instance, the following set of parameters would result in 2*1*2=4 beam search decodings:
-
-.. code-block::
-
-    python eval_beamsearch_ngram.py ... \
-        beam_width=[64,128] \
-        beam_alpha=[1.0] \
-        beam_beta=[1.0,0.5]
-
-
-Beam search ngram decoding for Transducer models (RNNT and HAT)
-===============================================================
-
-A similar script to evaluate an RNNT/HAT model with beam search decoding and N-gram models can be found at
-`scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py `_
-
-.. code-block::
-
-    python eval_beamsearch_ngram_transducer.py nemo_model_file= \
-        input_manifest= \
-        beam_width=[] \
-        beam_alpha=[] \
-        preds_output_folder= \
-        probs_cache_file=null \
-        decoding_strategy= \
-        maes_prefix_alpha=[] \
-        maes_expansion_gamma=[] \
-        hat_subtract_ilm= \
-        hat_ilm_weight=[]
-
-
-.. _neural_rescoring:
-
-****************
-Neural Rescoring
-****************
-
-In this approach, a neural network is used to score candidates. A candidate is a text transcript predicted by the decoder of the ASR model.
-The top K candidates produced by beam search decoding (with a beam width of K) are given to a neural language model to rank them.
-The language model assigns a score to each candidate.
-This score is usually combined with the scores from the beam search decoding to produce the final scores and rankings.
-
-Train Neural Rescorer
-=====================
-
-An example script to train such a language model with Transformer can be found at `examples/nlp/language_modeling/transformer_lm.py `__.
-It trains a ``TransformerLMModel`` which can be used as a neural rescorer for an ASR system. Full documentation on language model training is available at:
-
-:doc:`../nlp/language_modeling`
-
-You may also use a pretrained language model from the HuggingFace library, such as Transformer-XL or GPT, instead of training your own model.
-Models like BERT and RoBERTa are not supported by this script, as they are trained as masked language models and are neither efficient nor effective at scoring sentences out of the box.
-
-
-Evaluation
-==========
-
-Given a trained TransformerLMModel `.nemo` file or a pretrained HF model, the script available at
-`scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py `__
-can be used to re-score beams obtained with an ASR model. You need the `.tsv` file containing the candidates produced
-by the acoustic model and the beam search decoding to use this script. The candidates can be the result of just the beam
-search decoding or the result of fusion with an N-gram LM. You may generate this file by specifying `--preds_output_folder` for
-`scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__.
-
-The neural rescorer rescores the beams/candidates using the two parameters `rescorer_alpha` and `rescorer_beta` as follows:
-
-.. code-block::
-
-    final_score = beam_search_score + rescorer_alpha*neural_rescorer_score + rescorer_beta*seq_length
-
-The parameter `rescorer_alpha` specifies the amount of importance to place on the neural rescorer model, and `rescorer_beta` is
-a penalty term to account for the sequence length in the scores. They have effects similar to the parameters
-`beam_alpha` and `beam_beta` of the beam search decoder and N-gram LM.
-
-Follow these steps to evaluate a neural LM:
-
-#. 
Obtain `.tsv` file with beams and their corresponding scores. Scores can be from a regular beam search decoder or - in fusion with an N-gram LM scores. For a given beam size `beam_size` and a number of examples - for evaluation `num_eval_examples`, it should contain (`num_eval_examples` x `beam_size`) lines of - form `beam_candidate_text \t score`. This file can be generated by `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__ - -#. Rescore the candidates by `scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py `__. - -.. code-block:: - - python eval_neural_rescorer.py - --lm_model=[path to .nemo file of the LM or the name of a HF pretrained model] - --beams_file=[path to beams .tsv file] - --beam_size=[size of the beams] - --eval_manifest=[path to eval manifest .json file] - --batch_size=[batch size used for inference on the LM model] - --alpha=[the value for the parameter rescorer_alpha] - --beta=[the value for the parameter rescorer_beta] - --scores_output_file=[the optional path to store the rescored candidates] - -The candidates along with their new scores would be stored at the file specified by `--scores_output_file`. - -The following is the list of the arguments for the evaluation script: - -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| **Argument** |**Type**| **Default** | **Description** | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| lm_model | str | Required | The path of the '.nemo' file of an ASR model, or the name of a | -| | | | HuggingFace pretrained model like 'transfo-xl-wt103' or 'gpt2' | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| eval_manifest | str | Required | Path to the evaluation manifest file (.json manifest file) | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| beams_file | str | Required | path to beams file (.tsv) containing the candidates and their scores | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| beam_size | int | Required | The width of the beams (number of candidates) generated by the decoder | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| alpha | float | None | The value for parameter rescorer_alpha | -| | | | Not passing value would enable linear search for rescorer_alpha | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| beta | float | None | The value for parameter rescorer_beta | -| | | | Not passing value would enable linear search for rescorer_beta | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| batch_size | int | 16 | The batch size used to calculate the scores | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| max_seq_length | int | 512 | Maximum sequence length (in tokens) for the input | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| scores_output_file | str | None | The optional file to 
store the rescored beams | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| use_amp | bool | ``False`` | Whether to use AMP if available calculate the scores | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ -| device | str | cuda | The device to load LM model onto to calculate the scores | -| | | | It can be 'cpu', 'cuda', 'cuda:0', 'cuda:1', ... | -+---------------------+--------+------------------+-------------------------------------------------------------------------+ - - -Hyperparameter Linear Search ----------------------------- - -This script also supports linear search for parameters `alpha` and `beta`. If any of the two is not -provided, a linear search is performed to find the best value for that parameter. When linear search is used, initially -`beta` is set to zero and the best value for `alpha` is found, then `alpha` is fixed with -that value and another linear search is done to find the best value for `beta`. -If any of the of these two parameters is already specified, then search for that one is skipped. After each search for a -parameter, the plot of WER% for different values of the parameter is also shown. - -It is recommended to first use the linear search for both parameters on a validation set by not providing any values for `--alpha` and `--beta`. -Then check the WER curves and decide on the best values for each parameter. Finally, evaluate the best values on the test set. - - -Word Boosting -============= - -The Flashlight decoder supports word boosting during CTC decoding using a KenLM binary and corresponding lexicon. Word boosting only -works in lexicon decoding mode, it does not work in lexicon-free mode. Word boosting allows one to bias the decoder for certain words, -such that you can manually increase or decrease the probability of emitting certain words. This can be very helpful if you have certain -uncommon or industry-specific words which you want to ensure transcribe correctly. - -For more information on word boosting, see `here `__ -and `here `__ - -In order to use word boosting in Nemo, you need to create a simple tab-separated text file which contains each word to be boosted, followed by -tab, and then the boosted score for that word. - -For example: - -.. code-block:: - - nvidia 40 - geforce 50 - riva 80 - turing 30 - badword -100 - -Positive scores boost words higher in the LM decoding step so they show up more frequently, whereas negative scores -squelch words so they show up less frequently. The recommended range for the boost score is +/- 20 to 100. - -The boost file handles both in-vocabulary words and OOV words just fine, so you can specify both IV and OOV words with corresponding scores. - -You can then pass this file to your flashlight config object during decoding: - -.. code-block:: - - # Lexicon-based decoding - python eval_beamsearch_ngram.py ... \ - decoding_strategy="flashlight" \ - decoding.beam.flashlight_cfg.lexicon_path='/path/to/lexicon.lexicon' \ - decoding.beam.flashlight_cfg.boost_path='/path/to/my_boost_file.boost' \ - decoding.beam.flashlight_cfg.beam_size_token = 32 \ - decoding.beam.flashlight_cfg.beam_threshold = 25.0 - -Combine N-gram Language Models -============================== - -Before combining N-gram LMs install required OpenGrm NGram library using `scripts/installers/install_opengrm.sh `__. 
-Alternatively, you can use Docker image `scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. - -To combine two N-gram language models, you can use the script ngram_merge.py located at -`scripts/asr_language_modeling/ngram_lm/ngram_merge.py `__. - -This script interpolate two ARPA N-gram language models and creates a KenLM binary file that can be used with the beam search decoders on top of ASR models. -You can specify weights (`--alpha` and `--beta`) for each of the models (`--ngram_a` and `--ngram_b`) correspondingly: `alpha` * `ngram_a` + `beta` * `ngram_b`. -This script supports both character level and BPE level encodings and models which are detected automatically from the type of the model. - -To combine two N-gram models, you can use the following command: - -.. code-block:: - - python ngram_merge.py --kenlm_bin_path \ - --ngram_bin_path \ - --arpa_a \ - --alpha \ - --arpa_b \ - --beta \ - --out_path - - - -If you provide `--test_file` and `--nemo_model_file`, the script will calculate the perplexity of the resulting N-gram model on the test set. -Note, the result of each step during the process is cached in the temporary file in the `--out_path`, to speed up further run. -You can use the `--force` flag to discard the cache and recalculate everything from scratch. - -.. code-block:: - - python ngram_merge.py --kenlm_bin_path \ - --ngram_bin_path \ - --arpa_a \ - --alpha \ - --arpa_b \ - --beta \ - --out_path - --nemo_model_file \ - --test_file \ - --symbols \ - --force - - -The following is the list of the arguments for the opengrm script: - -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| **Argument** |**Type**| **Default** | **Description** | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| kenlm_bin_path | str | Required | The path to the bin folder of KenLM library. It is a folder named `bin` under where KenLM is installed. | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| ngram_bin_path | str | Required | The path to the bin folder of OpenGrm Ngram. It is a folder named `bin` under where OpenGrm Ngram is installed. | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| arpa_a | str | Required | Path to the ARPA N-gram model file A | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| alpha | float | Required | Weight of N-gram model A | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| arpa_b | int | Required | Path to the ARPA N-gram model file B | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| beta | float | Required | Weight of N-gram model B | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| out_path | str | Required | Path for writing temporary and resulting files. | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| test_file | str | None | Path to test file to count perplexity if provided. 
| -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided.| -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ -| force | bool | ``False`` | Whether to recompile and rewrite all files | -+----------------------+--------+------------------+-------------------------------------------------------------------------+ diff --git a/SoundScribe/SpeakerID/docs/source/asr/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/configs.rst deleted file mode 100644 index 30c8a74c5176c0b7e0c4f2b60f6e19cb19804c6c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/configs.rst +++ /dev/null @@ -1,1110 +0,0 @@ -NeMo ASR Configuration Files -============================ - -This section describes the NeMo configuration file setup that is specific to models in the ASR collection. For general information -about how to set up and run experiments that is common to all NeMo models (e.g. Experiment Manager and PyTorch Lightning trainer -parameters), see the :doc:`../core/core` section. - -The model section of the NeMo ASR configuration files generally requires information about the dataset(s) being used, the preprocessor -for audio files, parameters for any augmentation being performed, as well as the model architecture specification. The sections on -this page cover each of these in more detail. - -Example configuration files for all of the NeMo ASR scripts can be found in the -`config directory of the examples `_. - - -Dataset Configuration ---------------------- - -Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and -``test_ds`` sections in the configuration file, respectively. Depending on the task, there may be arguments specifying the sample rate -of the audio files, the vocabulary of the dataset (for character prediction), whether or not to shuffle the dataset, and so on. You may -also decide to leave fields such as the ``manifest_filepath`` blank, to be specified via the command-line at runtime. - -Any initialization parameter that is accepted for the Dataset class used in the experiment can be set in the config file. -Refer to the `Datasets <./api.html#Datasets>`__ section of the API for a list of Datasets and their respective parameters. - -An example ASR train and validation configuration should look similar to the following: - -.. code-block:: yaml - - # Specified at the beginning of the config file - labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", - "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] - - model: - train_ds: - manifest_filepath: ??? 
- sample_rate: 16000 - labels: *labels # Uses the labels above - batch_size: 32 - trim_silence: True - max_duration: 16.7 - shuffle: True - num_workers: 8 - pin_memory: true - # tarred datasets - is_tarred: false # If set to true, uses the tarred version of the Dataset - tarred_audio_filepaths: null # Not used if is_tarred is false - shuffle_n: 2048 # Not used if is_tarred is false - # bucketing params - bucketing_strategy: "synced_randomized" - bucketing_batch_size: null - bucketing_weights: null - - validation_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: *labels # Uses the labels above - batch_size: 32 - shuffle: False # No need to shuffle the validation data - num_workers: 8 - pin_memory: true - -There are two ways to test/validate on more than one manifest: - -- Specify a list in the `manifest_filepath` field. Results will be reported for each, the first one being used for overall loss / WER (specify `val_dl_idx` if you wish to change that). In this case, all manifests will share configuration parameters. -- Use the ds_item key and pass a list of config objects to it. This allows you to use differently configured datasets for validation, e.g. - -.. code-block:: yaml - - model: - validation_ds: - ds_item: - - name: dataset1 - manifest_filepath: ??? - # Config parameters for dataset1 - ... - - name: dataset2 - manifest_filepath: ??? - # Config parameters for dataset2 - ... - -By default, dataloaders are set up when the model is instantiated. However, dataloader setup can be deferred to -model's `setup()` method by setting ``defer_setup`` in the configuration. - -For example, training data setup can be deferred as follows: - -.. code-block:: yaml - - model: - train_ds: - # Configure training data as usual - ... - # Defer train dataloader setup from `__init__` to `setup` - defer_setup: true - - -Preprocessor Configuration --------------------------- - -If you are loading audio files for your experiment, you will likely want to use a preprocessor to convert from the -raw audio signal to features (e.g. mel-spectrogram or MFCC). The ``preprocessor`` section of the config specifies the audio -preprocessor to be used via the ``_target_`` field, as well as any initialization parameters for that preprocessor. - -An example of specifying a preprocessor is as follows: - -.. code-block:: yaml - - model: - ... - preprocessor: - # _target_ is the audio preprocessor module you want to use - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.02 - ... - # Other parameters for the preprocessor - -Refer to the `Audio Preprocessors <./api.html#Audio Preprocessors>`__ API section for the preprocessor options, expected arguments, -and defaults. - -Augmentation Configurations ---------------------------- - -There are a few on-the-fly spectrogram augmentation options for NeMo ASR, which can be specified by the -configuration file using a ``spec_augment`` section. - -For example, there are options for `Cutout `_ and -`SpecAugment `_ available via the ``SpectrogramAugmentation`` module. - -The following example sets up both ``Cutout`` (via the ``rect_*`` parameters) and ``SpecAugment`` (via the ``freq_*`` -and ``time_*`` parameters). - -.. code-block:: yaml - - model: - ... 
- spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - # Cutout parameters - rect_masks: 5 # Number of rectangles to cut from any given spectrogram - rect_freq: 50 # Max cut of size 50 along the frequency dimension - rect_time: 120 # Max cut of size 120 along the time dimension - # SpecAugment parameters - freq_masks: 2 # Cut two frequency bands - freq_width: 15 # ... of width 15 at maximum - time_masks: 5 # Cut out 10 time bands - time_width: 25 # ... of width 25 at maximum - -You can use any combination of ``Cutout``, frequency/time ``SpecAugment``, or neither of them. - -With NeMo ASR, you can also add augmentation pipelines that can be used to simulate various kinds of noise -added to audio in the channel. Augmentors in a pipeline are applied on the audio data read in the data layer. Online -augmentors can be specified in the config file using an ``augmentor`` section in ``train_ds``. The following example -adds an augmentation pipeline that first adds white noise to an audio sample with a probability of 0.5 and at a level -randomly picked between -50 dB and -10 dB and then passes the resultant samples through a room impulse response randomly -picked from the manifest file provided for ``impulse`` augmentation in the config file. - -.. code-block:: yaml - - model: - ... - train_ds: - ... - augmentor: - white_noise: - prob: 0.5 - min_level: -50 - max_level: -10 - impulse: - prob: 0.3 - manifest_path: /path/to/impulse_manifest.json - -Refer to the `Audio Augmentors <./api.html#Audio Augmentors>`__ API section for more details. - -Tokenizer Configurations ------------------------- - -Some models utilize sub-word encoding via an external tokenizer instead of explicitly defining their vocabulary. - -For such models, a ``tokenizer`` section is added to the model config. ASR models currently support two types of -custom tokenizers: - -- Google Sentencepiece tokenizers (tokenizer type of ``bpe`` in the config) -- HuggingFace WordPiece tokenizers (tokenizer type of ``wpe`` in the config) -- Aggregate tokenizers ((tokenizer type of ``agg`` in the config), see below) - -In order to build custom tokenizers, refer to the ``ASR_with_Subword_Tokenization`` notebook available in the -ASR tutorials directory. - -The following example sets up a ``SentencePiece Tokenizer`` at a path specified by the user: - -.. code-block:: yaml - - model: - ... - tokenizer: - dir: "" - type: "bpe" # can be "bpe" or "wpe" - -The Aggregate (``agg``) tokenizer feature makes it possible to combine tokenizers in order to train multilingual -models. The config file would look like this: - -.. code-block:: yaml - - model: - ... - tokenizer: - type: "agg" # aggregate tokenizer - langs: - en: - dir: "" - type: "bpe" # can be "bpe" or "wpe" - es: - dir: "" - type: "bpe" # can be "bpe" or "wpe" - -In the above config file, each language is associated with its own pre-trained tokenizer, which gets assigned -a token id range in the order the tokenizers are listed. To train a multilingual model, one needs to populate the -``lang`` field in the manifest file, allowing the routing of each sample to the correct tokenizer. At inference time, -the routing is done based on the inferred token id range. - -For models which utilize sub-word tokenization, we share the decoder module (``ConvASRDecoder``) with character tokenization models. -All parameters are shared, but for models which utilize sub-word encoding, there are minor differences when setting up the config. 
For -such models, the tokenizer is utilized to fill in the missing information when the model is constructed automatically. - -For example, a decoder config corresponding to a sub-word tokenization model should look similar to the following: - -.. code-block:: yaml - - model: - ... - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: *enc_final - num_classes: -1 # filled with vocabulary size from tokenizer at runtime - vocabulary: [] # filled with vocabulary from tokenizer at runtime - - -On-the-fly Code Switching -------------------------- - -Nemo supports creating code-switched synthetic utterances on-the-fly during training/validation/testing. This allows you to create ASR models which -support intra-utterance code switching. If you have Nemo formatted audio data on disk (either JSON manifests or tarred audio data), you -can easily mix as many of these audio sources together as desired by adding some extra parameters to your `train_ds`, `validation_ds`, and `test_ds`. - -Please note that this allows you to mix any kind of audio sources together to create synthetic utterances which sample from all sources. The most -common use case for this is blending different languages together to create a multilingual code-switched model, but you can also blend -together different audio sources from the same languages (or language families), to create noise robust data, or mix fast and slow speech from the -same language. - -For multilingual code-switched models, we recommend using AggTokenizer for your Tokenizer if mixing different languages. - -The following example shows how to mix 3 different languages: English (en), German (de), and Japanese (ja) added to the `train_ds` model block, however -you can add similar logic to your `validation_ds` and `test_ds` blocks for on-the-fly code-switched validation and test data too. This example mixes -together 3 languages, but you can use as many as you want. However, be advised that the more languages you add, the higher your `min_duration` and `max_duration` -need to be set to ensure all languages are sampled into each synthetic utterance, and setting these hyperparameters higher will use more VRAM per mini-batch during -training and evaluation. - -.. code-block:: yaml - - model: - train_ds: - manifest_filepath: [/path/to/EN/tarred_manifest.json, /path/to/DE/tarred_manifest.json, /path/to/JA/tarred_manifest.json] - tarred_audio_filepaths: ['/path/to/EN/tars/audio__OP_0..511_CL_.tar', '/path/to/DE/tars/audio__OP_0..1023_CL_.tar', '/path/to/JA/tars/audio__OP_0..2047_CL_.tar'] - is_code_switched: true - is_tarred: true - shuffle: true - code_switched: # add this block for code-switching - min_duration: 12 # the minimum number of seconds for each synthetic code-switched utterance - max_duration: 20 # the maximum number of seconds for each synthetic code-switched utterance - min_monolingual: 0.3 # the minimum percentage of utterances which will be pure monolingual (0.3 = 30%) - probs: [0.25, 0.5, 0.25] # the probability to sample each language (matches order of `language` above) if not provided, assumes uniform distribution - force_monochannel: true # if your source data is multi-channel, then setting this to True will force the synthetic utterances to be mono-channel - sampling_scales: 0.75 # allows you to down/up sample individual languages. 
Can set this as an array for individual languages, or a scalar for all languages - seed: 123 # add a seed for replicability in future runs (highly useful for `validation_ds` and `test_ds`) - - -Model Architecture Configurations ---------------------------------- - -Each configuration file should describe the model architecture being used for the experiment. Models in the NeMo ASR collection need -an ``encoder`` section and a ``decoder`` section, with the ``_target_`` field specifying the module to use for each. - -Here is the list of the parameters in the model section which are shared among most of the ASR models: - -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ -| **Parameter** | **Datatype** | **Description** | **Supported Values** | -+=========================+==================+===============================================================================================================+=================================+ -| :code:`log_prediction` | bool | Whether a random sample should be printed in the output at each step, along with its predicted transcript. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ -| :code:`ctc_reduction` | string | Specifies the reduction type of CTC loss. Defaults to ``mean_batch`` which would take the average over the | :code:`none`, | -| | | batch after taking the average over the length of each sample. | :code:`mean_batch` | -| | | | :code:`mean`, :code:`sum` | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ - -The following sections go into more detail about the specific configurations of each model architecture. - -For more information about the ASR models, refer to the :doc:`Models <./models>` section. - -Jasper and QuartzNet -~~~~~~~~~~~~~~~~~~~~ - -The `Jasper <./models.html#Jasper>`__ and `QuartzNet <./models.html#QuartzNet>`__ models are very similar, and as such the components in their -configs are very similar as well. - -Both architectures use the ``ConvASREncoder`` for the ``encoder``, with parameters detailed in the table below. The encoder parameters -include details about the Jasper/QuartzNet ``[BxR]`` encoder architecture, including how many blocks to use (``B``), how many times -to repeat each sub-block (``R``), and the convolution parameters for each block. - -The number of blocks ``B`` is determined by the number of list elements under ``jasper`` minus the one prologue and two epilogue blocks. -The number of sub-blocks ``R`` is determined by setting the ``repeat`` parameter. - -To use QuartzNet (which uses more compact time-channel separable convolutions) instead of Jasper, add :code:`separable: true` to all -but the last block in the architecture. - -Change the parameter name ``jasper``. 
- -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+ -| **Parameter** | **Datatype** | **Description** | **Supported Values** | -+=========================+==================+===============================================================================================================+=====================================+ -| :code:`feat_in` | int | The number of input features. Should be equal to :code:`features` in the preprocessor parameters. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+ -| :code:`activation` | string | Which activation function to use in the encoder. | :code:`hardtanh`, :code:`relu`, | -| | | | :code:`selu`, :code:`swish` | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+ -| :code:`conv_mask` | bool | Whether to use masked convolutions in the encoder. Defaults to ``true``. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+ -| :code:`jasper` | | A list of blocks that specifies your encoder architecture. Each entry in this list represents one block in | | -| | | the architecture and contains the parameters for that block, including convolution parameters, dropout, and | | -| | | the number of times the block is repeated. Refer to the `Jasper `_ and | | -| | | `QuartzNet `_ papers for details about specific model configurations. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+-------------------------------------+ - -A QuartzNet 15x5 (fifteen blocks, each sub-block repeated five times) encoder configuration should look similar to the following example: - -.. code-block:: yaml - - # Specified at the beginning of the file for convenience - n_mels: &n_mels 64 # Used for both the preprocessor and encoder as number of input features - repeat: &repeat 5 # R=5 - dropout: &dropout 0.0 - separable: &separable true # Set to true for QN. Set to false for Jasper. - - model: - ... - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels # Should match "features" in the preprocessor. - activation: relu - conv_mask: true - - jasper: # This field name should be "jasper" for both types of models. - - # Prologue block - - dilation: [1] - dropout: *dropout - filters: 256 - kernel: [33] - repeat: 1 # Prologue block is not repeated. - residual: false - separable: *separable - stride: [2] - - # Block 1 - - dilation: [1] - dropout: *dropout - filters: 256 - kernel: [33] - repeat: *repeat - residual: true - separable: *separable - stride: [1] - - ... 
# Entries for blocks 2~14 - - # Block 15 - - dilation: [1] - dropout: *dropout - filters: 512 - kernel: [75] - repeat: *repeat - residual: true - separable: *separable - stride: [1] - - # Two epilogue blocks - - dilation: [2] - dropout: *dropout - filters: 512 - kernel: [87] - repeat: 1 # Epilogue blocks are not repeated - residual: false - separable: *separable - stride: [1] - - - dilation: [1] - dropout: *dropout - filters: &enc_filters 1024 - kernel: [1] - repeat: 1 # Epilogue blocks are not repeated - residual: false - stride: [1] - -Both Jasper and QuartzNet use the ``ConvASRDecoder`` as the decoder. The decoder parameters are detailed in the following table. - -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ -| **Parameter** | **Datatype** | **Description** | **Supported Values** | -+=========================+==================+===============================================================================================================+=================================+ -| :code:`feat_in` | int | The number of input features to the decoder. Should be equal to the number of filters in the last block of | | -| | | the encoder. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ -| :code:`vocabulary` | list | A list of the valid output characters for your model. For example, for an English dataset, this could be a | | -| | | list of all lowercase letters, space, and apostrophe. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ -| :code:`num_classes` | int | Number of output classes, i.e. the length of :code:`vocabulary`. | | -+-------------------------+------------------+---------------------------------------------------------------------------------------------------------------+---------------------------------+ - -For example, a decoder config corresponding to the encoder above should look similar to the following: - -.. code-block:: yaml - - model: - ... - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: *enc_filters - vocabulary: *labels - num_classes: 28 # Length of the vocabulary list - -Citrinet -~~~~~~~~ - -The `Citrinet <./models.html#Citrinet>`__ and `QuartzNet <./models.html#QuartzNet>`__ models are very similar, and as such the -components in their configs are very similar as well. Citrinet utilizes Squeeze and Excitation, as well as sub-word tokenization, in -contrast to QuartzNet. Depending on the dataset, we utilize different tokenizers. For Librispeech, we utilize the HuggingFace WordPiece -tokenizer, and for all other datasets we utilize the Google Sentencepiece tokenizer - usually the ``unigram`` tokenizer type. - -Both architectures use the ``ConvASREncoder`` for the ``encoder``, with parameters detailed above. The encoder parameters include -details about the Citrinet-C encoder architecture, including how many filters are used per channel (``C``). The Citrinet-C -configuration is a shortform notation for Citrinet-21x5xC, such that ``B = 21`` and ``R = 5`` are the default and should generally -not be changed. 
- -To use Citrinet instead of QuartzNet, refer to the ``citrinet_512.yaml`` configuration found inside the ``examples/asr/conf/citrinet`` -directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.submodules.jasper.JasperBlock` as ``Jasper`` or -``QuartzNet``. - -While the configs for Citrinet and QuartzNet are similar, we note the additional flags used for Citrinet below. Refer to the -``JasperBlock`` documentation for the meaning of these arguments. - -+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+ -| **Parameter** | **Datatype** | **Description** | **Supported Values** | -+===========================+==================+===========================================================================================================+===================================+ -| :code:`se` | bool | Whether to apply squeeze-and-excitation mechanism or not. | :code:`true` or :code:`false` | -+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+ -| :code:`se_context_size` | int | SE context size. -1 means global context. | :code:`-1` or :code:`+ve int` | -+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+ -| :code:`stride_last` | bool | Stride on the final repeated block or all repeated blocks. | :code:`true` or :code:`false` | -+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+ -| :code:`residual_mode` | str | Type of residual branch to construct. | :code:`"add"` or | -| | | Can be pointwise residual addition or pointwise strided residual attention | :code:`"stride_add"` | -+---------------------------+------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------+ - -A Citrinet-512 config should look similar to the following: - -.. code-block:: yaml - - model: - ... - # Specify some defaults across the entire model - model_defaults: - repeat: 5 - dropout: 0.1 - separable: true - se: true - se_context_size: -1 - ... - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels # Should match "features" in the preprocessor. - activation: relu - conv_mask: true - - jasper: # This field name should be "jasper" for the JasperBlock (which constructs Citrinet). - - # Prologue block - - filters: 512 - repeat: 1 - kernel: [5] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - # Block 1 - - filters: 512 - repeat: ${model.model_defaults.repeat} - kernel: [11] - stride: [2] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - stride_last: true - residual_mode: "stride_add" - - ... 
# Entries for blocks 2~21 - - # Block 22 - - filters: 512 - repeat: ${model.model_defaults.repeat} - kernel: [39] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - # Epilogue block - - - filters: &enc_final 640 - repeat: 1 - kernel: [41] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} -
-As mentioned above, Citrinet uses the ``ConvASRDecoder`` as the decoder layer, similar to QuartzNet. The only difference is that the configuration must be -adjusted slightly, since Citrinet utilizes sub-word tokenization. -
-.. note:: - The following information is relevant to any of the above models that implement their encoder as a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` and utilize the ``SqueezeExcite`` mechanism. -
-The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin.change_conv_asr_se_context_window` method: -
-.. code-block:: python -
- # Here, model can be any model that has a `ConvASREncoder` as its encoder and utilizes `SqueezeExcite` blocks - # `context_window` : an integer representing the number of timeframes (each corresponding to some window stride). - # `update_config` : a bool flag which determines whether the config of the model should be updated to reflect the new context window. -
- # Here, we specify that 128 timeframes of 0.01s stride should be the context window - # This is equivalent to a 128 * 0.01s context window for `SqueezeExcite` - model.change_conv_asr_se_context_window(context_window=128, update_config=True) -
-Conformer-CTC -~~~~~~~~~~~~~ -
-The config files for the Conformer-CTC model, with character-based encoding and sub-word encoding, can be found at -``/examples/asr/conf/conformer/conformer_ctc_char.yaml`` and ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` -respectively. The main components of the `Conformer-CTC <./models.html#Conformer-CTC>`__ config include the following sections: -
-* ``train_ds``, ``validation_ds``, and ``test_ds`` -* optimizer (``optim``) -* augmentation (``spec_augment``) -* ``decoder`` -* ``trainer`` -* ``exp_manager`` -
-These sections are configured similarly to those of other ASR models such as `QuartzNet <./models.html#QuartzNet>`__. There should also be a tokenizer section where you can -specify the tokenizer if you want to use sub-word encoding instead of character-based encoding. - -
-The encoder section includes the details about the Conformer-CTC encoder architecture. You may find more information in the -config files and also :ref:`nemo.collections.asr.modules.ConformerEncoder `.
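As a quick illustration, the following is a minimal sketch of how such a config file can be loaded and inspected programmatically with OmegaConf before launching training; the config path and tokenizer directory below are illustrative placeholders.

.. code-block:: python

    from omegaconf import OmegaConf

    # Load the sub-word (BPE) Conformer-CTC config shipped with NeMo.
    cfg = OmegaConf.load("examples/asr/conf/conformer/conformer_ctc_bpe.yaml")

    # Point the tokenizer section at a pre-built sub-word tokenizer (hypothetical path).
    cfg.model.tokenizer.dir = "tokenizers/librispeech_unigram_v1024"
    cfg.model.tokenizer.type = "bpe"

    # Inspect the encoder sub-config discussed above.
    print(OmegaConf.to_yaml(cfg.model.encoder))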
Squeezeformer-CTC -~~~~~~~~~~~~~~~~~ -
-The config files for the Squeezeformer-CTC model, with character-based encoding and sub-word encoding, can be found at -``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml`` -respectively. Components of the configs of `Squeezeformer-CTC <./models.html#Squeezeformer-CTC>`__ are similar to those of `Conformer-CTC <./configs.html#Conformer-CTC>`__. -
-The encoder section includes the details about the Squeezeformer-CTC encoder architecture. You may find more information in the -config files and also :ref:`nemo.collections.asr.modules.SqueezeformerEncoder `. - -
-ContextNet -~~~~~~~~~~ -
-Please refer to the model page of `ContextNet <./models.html#ContextNet>`__ for more information on this model. -
-Conformer-Transducer -~~~~~~~~~~~~~~~~~~~~ -
-Please refer to the model page of `Conformer-Transducer <./models.html#Conformer-Transducer>`__ for more information on this model. -
-LSTM-Transducer and LSTM-CTC -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -
-The config files for LSTM-Transducer and LSTM-CTC models can be found at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml`` and ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml`` respectively. -Most of the configs are similar to those of other CTC or Transducer models. The main difference is the encoder part. -The encoder section includes the details about the RNN-based encoder architecture. You may find more information in the -config files and also :ref:`nemo.collections.asr.modules.RNNEncoder `. - -
-InterCTC Config ---------------- -
-All CTC-based models also support `InterCTC loss `_. To use it, you need to specify -two parameters, as in the example below: -
-.. code-block:: yaml -
- model: - # ... - interctc: - loss_weights: [0.3] - apply_at_layers: [8] -
-which can be used to reproduce the default setup from the paper (assuming the total number of layers is 18). -You can also specify multiple CTC losses from different layers, e.g., to get 2 losses from layers 3 and 8 with -weights 0.1 and 0.3, specify: -
-.. code-block:: yaml -
- model: - # ... - interctc: - loss_weights: [0.1, 0.3] - apply_at_layers: [3, 8] -
-Note that the final-layer CTC loss weight is automatically computed so that -all weights sum to 1 (0.6 in the example above). - -
-Stochastic Depth Config ------------------------ -
-`Stochastic Depth `_ is a useful technique for regularizing ASR model training. -Currently, it is only supported for :ref:`nemo.collections.asr.modules.ConformerEncoder `. To -use it, specify the following parameters in the encoder config file to reproduce the default setup from the paper: -
-.. code-block:: yaml -
- model: - # ... - encoder: - # ... - stochastic_depth_drop_prob: 0.3 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 -
-See :ref:`documentation of ConformerEncoder ` for more details. Note that stochastic depth -is supported for both CTC and Transducer model variations (or any other kind of model/loss that uses -Conformer as the encoder). - -
-Transducer Configurations ------------------------- -
-All CTC-based ASR model configs can be modified to support Transducer loss training. Below, we discuss the modifications required in the config to enable Transducer training. All modifications are made to the ``model`` config. -
-Model Defaults -~~~~~~~~~~~~~~ -
-``model.model_defaults`` is a subsection of the model config that holds default values shared across the entire model. -
-There are three values that are primary components of a transducer model. They are: -
-* ``enc_hidden``: The hidden dimension of the final layer of the Encoder network. -* ``pred_hidden``: The hidden dimension of the final layer of the Prediction network.
-* ``joint_hidden``: The hidden dimension of the intermediate layer of the Joint network. -
-One can access these values inside the config by using OmegaConf interpolation as follows: -
-.. code-block:: yaml -
- model: - ... - model_defaults: - enc_hidden: 256 - pred_hidden: 256 - joint_hidden: 256 - ... - decoder: - ... - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} -
-Acoustic Encoder Model -~~~~~~~~~~~~~~~~~~~~~~ -
-The transducer model is composed of three models combined. One of these models is the Acoustic (encoder) model. You should be able to drop any CTC Acoustic model config into this section of the transducer config. -
-The only condition that needs to be met is that **the final layer of the acoustic model must have the hidden dimension defined in ``model_defaults.enc_hidden``**. -
-Decoder / Prediction Model -~~~~~~~~~~~~~~~~~~~~~~~~~~ -
-The Prediction model is generally an autoregressive, causal model that consumes text tokens and returns embeddings that will be used by the Joint model. The base config for an LSTM-based Prediction network can be found in the ``decoder`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information, refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section. -
-**This config can be copy-pasted into any custom transducer model with no modification.** -
-Let us discuss some of the important arguments: -
-* ``blank_as_pad``: In ordinary transducer models, the embedding matrix does not acknowledge the ``Transducer Blank`` token (similar to CTC Blank). However, this causes the autoregressive loop to be more complicated and less efficient. Instead, this flag, which is set by default, will add the ``Transducer Blank`` token to the embedding matrix - and use it as a pad value (zeros tensor). This enables more efficient inference without harming training. For further information, refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section. -
-* ``prednet.pred_hidden``: The hidden dimension of the LSTM and the output dimension of the Prediction network. -
-.. code-block:: yaml -
- decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null - random_state_sampling: false - blank_as_pad: true -
- prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.0 -
-Joint Model -~~~~~~~~~~~ -
-The Joint model is a simple feed-forward Multi-Layer Perceptron network. This MLP accepts the output of the Acoustic and Prediction models and computes a joint probability distribution over the entire vocabulary space. The base config for the Joint network can be found in the ``joint`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information, refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section. -
-**This config can be copy-pasted into any custom transducer model with no modification.** -
-The Joint model config has several essential components which we discuss below: -
-* ``log_softmax``: Due to the cost of computing softmax on such large tensors, the Numba CUDA implementation of RNNT loss will implicitly compute the log softmax when called (so its inputs should be logits). The CPU version of the loss doesn't face such memory issues, so it requires log-probabilities instead.
Since the behaviour differs between CPU and GPU, the ``None`` value will automatically switch behaviour depending on whether the input tensor is on a CPU or GPU device. -
-* ``preserve_memory``: This flag will call ``torch.cuda.empty_cache()`` at certain critical sections when computing the Joint tensor. While this operation might allow us to preserve some memory, the ``empty_cache()`` operation is tremendously slow and will slow down training by an order of magnitude or more. It is available to use but not recommended. -
-* ``fuse_loss_wer``: This flag performs "batch splitting" and then "fused loss + metric" calculation. It will be discussed in detail in the next tutorial that will train a Transducer model. -
-* ``fused_batch_size``: When the above flag is set to True, the model will have two distinct "batch sizes". The batch size provided in the three data loader configs (``model.*_ds.batch_size``) will now be the ``Acoustic model`` batch size, whereas the ``fused_batch_size`` will be the batch size of the ``Prediction model``, the ``Joint model``, the ``transducer loss`` module and the ``decoding`` module. -
-* ``jointnet.joint_hidden``: The hidden intermediate dimension of the joint network. -
-.. code-block:: yaml -
- joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # sets it according to cpu/gpu device -
- # fused mode - fuse_loss_wer: false - fused_batch_size: 16 -
- jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.0 -
-Sampled Softmax Joint Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^ -
-There are some situations where a large vocabulary must be used with a Transducer model - such as for multilingual models with a large -number of languages. In this setting, the memory cost of training the Transducer network can become prohibitive for such a large vocabulary. -
-For such cases, one can utilize the ``SampledRNNTJoint`` module instead of the usual ``RNNTJoint`` module, in order -to compute the loss using a sampled subset of the vocabulary rather than the full vocabulary. -
-It adds only one additional parameter: -
-* ``n_samples``: Specifies the minimum number of tokens to sample from the vocabulary space, - excluding the RNNT blank token. If a given value is larger than the entire vocabulary size, - then the full vocabulary will be used. -
-The only difference in config required is to replace ``nemo.collections.asr.modules.RNNTJoint`` with ``nemo.collections.asr.modules.SampledRNNTJoint``: -
-.. code-block:: yaml -
- joint: - _target_: nemo.collections.asr.modules.SampledRNNTJoint - n_samples: 500 - ... # All other arguments from RNNTJoint can be used after this. - -
-Effect of Batch Splitting / Fused Batch step -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -
-The following information explains why memory is an issue when training Transducer models and how NeMo tackles the issue with its Fused Batch step. The material can be read for a thorough understanding; otherwise, it can be skipped. You can also follow these steps in the "ASR_with_Transducers" tutorial. -
-**Diving deeper into the memory costs of Transducer Joint** -
-One of the significant limitations of Transducers is the exorbitant memory cost of computing the Joint module. The Joint module is composed of two steps.
- -1) Projecting the Acoustic and Transcription feature dimensions to some standard hidden dimension (specified by model.model_defaults.joint_hidden) - -2) Projecting this intermediate hidden dimension to the final vocabulary space to obtain the transcription. - -Take the following example. - -BS=32 ; T (after 2x stride) = 800, U (with character encoding) = 400-450 tokens, Vocabulary size V = 28 (26 alphabet chars, space and apostrophe). Let the hidden dimension of the Joint model be 640 (Most Google Transducer papers use hidden dimension of 640). - -* :math:`Memory \, (Hidden, \, gb) = 32 \times 800 \times 450 \times 640 \times 4 = 29.49` gigabytes (4 bytes per float). - -* :math:`Memory \, (Joint, \, gb) = 32 \times 800 \times 450 \times 28 \times 4 = 1.290` gigabytes (4 bytes per float) - -**NOTE**: This is just for the forward pass! We need to double this memory to store gradients! This much memory is also just for the Joint model **alone**. Far more memory is required for the Prediction model as well as the large Acoustic model itself and its gradients! - -Even with mixed precision, that's $\sim 30$ GB of GPU RAM for just 1 part of the network + its gradients. - -Effect of Fused Batch Step -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The fundamental problem is that the joint tensor grows in size when ``[T x U]`` grows in size. This growth in memory cost is due to many reasons - either by model construction (downsampling) or the choice of dataset preprocessing (character tokenization vs. sub-word tokenization). - -Another dimension that NeMo can control is **batch**. Due to how we batch our samples, small and large samples all get clumped together into a single batch. So even though the individual samples are not all as long as the maximum length of T and U in that batch, when a batch of such samples is constructed, it will consume a significant amount of memory for the sake of compute efficiency. - -So as is always the case - **trade-off compute speed for memory savings**. - -The fused operation goes as follows : - -1) Forward the entire acoustic model in a single pass. (Use global batch size here for acoustic model - found in ``model.*_ds.batch_size``) - -2) Split the Acoustic Model's logits by ``fused_batch_size`` and loop over these sub-batches. - -3) Construct a sub-batch of same ``fused_batch_size`` for the Prediction model. Now the target sequence length is :math:`U_{sub-batch} < U`. - -4) Feed this :math:`U_{sub-batch}` into the Joint model, along with a sub-batch from the Acoustic model (with :math:`T_{sub-batch} < T)`. Remember, we only have to slice off a part of the acoustic model here since we have the full batch of samples :math:`(B, T, D)` from the acoustic model. - -5) Performing steps (3) and (4) yields :math:`T_{sub-batch}` and :math:`U_{sub-batch}`. Perform sub-batch joint step - costing an intermediate :math:`(B, T_{sub-batch}, U_{sub-batch}, V)` in memory. - -6) Compute loss on sub-batch and preserve in a list to be later concatenated. - -7) Compute sub-batch metrics (such as Character / Word Error Rate) using the above Joint tensor and sub-batch of ground truth labels. Preserve the scores to be averaged across the entire batch later. - -8) Delete the sub-batch joint matrix :math:`(B, T_{sub-batch}, U_{sub-batch}, V)`. Only gradients from .backward() are preserved now in the computation graph. - -9) Repeat steps (3) - (8) until all sub-batches are consumed. - -10) Cleanup step. Compute full batch WER and log. 
Concatenate loss list and pass to PTL to compute the equivalent of the original (full batch) Joint step. Delete ancillary objects necessary for sub-batching. - -Transducer Decoding -~~~~~~~~~~~~~~~~~~~ - -Models which have been trained with CTC can transcribe text simply by performing a regular argmax over the output of their decoder. For transducer-based models, the three networks must operate in a synchronized manner in order to transcribe the acoustic features. The base config for the Transducer decoding step can be found in the the ``decoding`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section. - -**This config can be copy-pasted into any custom transducer model with no modification.** - -The most important component at the top level is the ``strategy``. It can take one of many values: - -* ``greedy``: This is sample-level greedy decoding. It is generally exceptionally slow as each sample in the batch will be decoded independently. For publications, this should be used alongside batch size of 1 for exact results. - -* ``greedy_batch``: This is the general default and should nearly match the ``greedy`` decoding scores (if the acoustic features are not affected by feature mixing in batch mode). Even for small batch sizes, this strategy is significantly faster than ``greedy``. - -* ``beam``: Runs beam search with the implicit language model of the Prediction model. It will generally be quite slow, and might need some tuning of the beam size to get better transcriptions. - -* ``tsd``: Time synchronous decoding. Please refer to the paper: `Alignment-Length Synchronous Decoding for RNN Transducer `_ for details on the algorithm implemented. Time synchronous decoding (TSD) execution time grows by the factor T * max_symmetric_expansions. For longer sequences, T is greater and can therefore take a long time for beams to obtain good results. TSD also requires more memory to execute. - -* ``alsd``: Alignment-length synchronous decoding. Please refer to the paper: `Alignment-Length Synchronous Decoding for RNN Transducer `_ for details on the algorithm implemented. Alignment-length synchronous decoding (ALSD) execution time is faster than TSD, with a growth factor of T + U_max, where U_max is the maximum target length expected during execution. Generally, T + U_max < T * max_symmetric_expansions. However, ALSD beams are non-unique. Therefore it is required to use larger beam sizes to achieve the same (or close to the same) decoding accuracy as TSD. For a given decoding accuracy, it is possible to attain faster decoding via ALSD than TSD. - -* ``maes``: Modified Adaptive Expansion Search Decoding. Please refer to the paper `Accelerating RNN Transducer Inference via Adaptive Expansion Search `_. Modified Adaptive Synchronous Decoding (mAES) execution time is adaptive w.r.t the number of expansions (for tokens) required per timestep. The number of expansions can usually be constrained to 1 or 2, and in most cases 2 is sufficient. This beam search technique can possibly obtain superior WER while sacrificing some evaluation time. - -.. code-block:: yaml - - decoding: - strategy: "greedy_batch" - - # preserve decoding alignments - preserve_alignments: false - - # Overrides the fused batch size after training. 
- # Setting it to -1 will process the whole batch at once when combined with the `greedy_batch` decoding strategy - fused_batch_size: -1 -
- # greedy strategy config - greedy: - max_symbols: 10 -
- # beam strategy config - beam: - beam_size: 2 - score_norm: true - softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax - tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0 - alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0 - maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0 - maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0 - maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0 - maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0 -
-Transducer Loss -~~~~~~~~~~~~~~~ -
-This section configures the type of Transducer loss itself, along with possible sub-sections. By default, an optimized implementation of Transducer loss will be used, which depends on Numba for CUDA acceleration. The base config for the Transducer loss section can be found in the ``loss`` section of `ContextNet <./models.html#ContextNet>`__ or other Transducer architectures. For further information, refer to the ``Intro to Transducers`` tutorial in the ASR tutorial section. -
-**This config can be copy-pasted into any custom transducer model with no modification.** -
-The loss config is based on a resolver pattern and can be used as follows: -
-1) ``loss_name``: ``default`` is generally a good option. This selects one of the available resolved losses and matches the kwargs passed via the explicit ``{loss_name}_kwargs`` sub-config. -
-2) ``{loss_name}_kwargs``: This sub-config is passed to the resolved loss above and can be used to configure the resolved loss. - -
-.. code-block:: yaml -
- loss: - loss_name: "default" - warprnnt_numba_kwargs: - fastemit_lambda: 0.0 -
-FastEmit Regularization -^^^^^^^^^^^^^^^^^^^^^^^ -
-FastEmit Regularization is supported for the default Numba-based WarpRNNT loss. This recently proposed regularization approach, `FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization `_, allows near-direct control over the latency of transducer models. -
-Refer to the above paper for results and recommendations regarding ``fastemit_lambda``. - -
-.. _Hybrid-ASR-TTS_model__Config: -
-Hybrid ASR-TTS Model Configuration ----------------------------------- -
-:ref:`Hybrid ASR-TTS model ` consists of three parts: -
-* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) -* TTS Mel Spectrogram Generator (currently, only the :ref:`FastPitch ` model is supported) -* :ref:`Enhancer model ` (optional) -
-The config also allows specifying a :ref:`text-only dataset `. -
-Main parts of the config: -
-* ASR model - * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once; the config of the ASR model is then stored in the ``asr_model`` field - * ``asr_model_type``: needed only when training from scratch.
``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel`` - * ``asr_model_fuse_bn``: fusing BatchNorm in the pretrained ASR model, can improve quality in finetuning scenario -* TTS model - * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field -* Enhancer model - * ``enhancer_model_path``: optional path to the enhancer model. Loaded only once, the config is stored in the ``enhancer_model`` field -* ``train_ds`` - * ``text_data``: properties related to text-only data - * ``manifest_filepath``: path (or paths) to :ref:`text-only dataset ` manifests - * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training) - * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words - * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value. - * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). Correspond to ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the :mod:`ConcatDataset `. - * all other components are similar to conventional ASR models -* ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model - - -.. code-block:: yaml - - model: - sample_rate: 16000 - - # asr model - asr_model_path: ??? - asr_model: null - asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred - asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models - - # tts model - tts_model_path: ??? - tts_model: null - - # enhancer model - enhancer_model_path: null - enhancer_model: null - - train_ds: - text_data: - manifest_filepath: ??? - speakers_filepath: ??? - min_words: 1 - max_words: 45 # 45 - recommended value, ~16.7 sec for LibriSpeech - tokenizer_workers: 1 - asr_tts_sampling_technique: round-robin # random, round-robin, temperature - asr_tts_sampling_temperature: null - asr_tts_sampling_probabilities: null # [0.5,0.5] – ASR,TTS - manifest_filepath: ??? - batch_size: 16 # you may increase batch_size if your memory allows - # other params - -Finetuning with Text-Only Data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To finetune existing ASR model using text-only data use ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py`` script with the corresponding config ``/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml``. - -Please specify paths to all the required models (ASR, TTS, and Enhancer checkpoints), along with ``train_ds.text_data.manifest_filepath`` and ``train_ds.text_data.speakers_filepath``. - -.. 
code-block:: shell -
- python speech_to_text_bpe_with_text_finetune.py \ - model.asr_model_path= \ - model.tts_model_path= \ - model.enhancer_model_path= \ - model.asr_model_fuse_bn= \ - model.train_ds.manifest_filepath= \ - model.train_ds.text_data.manifest_filepath= \ - model.train_ds.text_data.speakers_filepath= \ - model.train_ds.text_data.tokenizer_workers=4 \ - model.validation_ds.manifest_filepath= \ - model.train_ds.batch_size= -
-Training from Scratch -~~~~~~~~~~~~~~~~~~~~~ -
-To train an ASR model from scratch using text-only data, use the ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with a conventional ASR model config, e.g. ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``. -
-Please specify the ASR model type, the path to the TTS model, and (optionally) the enhancer, along with the text-only data-related fields. -Use ``++`` or ``+`` markers for these options, since the options are not present in the original ASR model config. -
-.. code-block:: shell -
- python speech_to_text_bpe_with_text.py \ - ++asr_model_type= \ - ++tts_model_path= \ - ++enhancer_model_path= \ - ++model.train_ds.text_data.manifest_filepath= \ - ++model.train_ds.text_data.speakers_filepath= \ - ++model.train_ds.text_data.min_words=1 \ - ++model.train_ds.text_data.max_words=45 \ - ++model.train_ds.text_data.tokenizer_workers=4 -
-Fine-tuning Configurations -------------------------- -
-All ASR scripts support easy fine-tuning by partially/fully loading the pretrained weights from a checkpoint into the **currently instantiated model**. Note that the currently instantiated model should have parameters that match the pre-trained checkpoint (so that the weights may load properly). In order to directly fine-tune a pre-existing checkpoint, please follow the tutorial `ASR Language Fine-tuning. `_ -
-Models can be fine-tuned in two ways: -* By updating or retaining the current tokenizer alone -* By updating the model architecture and tokenizer -
-Fine-tuning by updating or retaining current tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -
-In this case, the model architecture is not updated. The model is initialized with the pre-trained weights in one of -two ways: -
-1) Providing a path to a NeMo model (via ``init_from_nemo_model``) -2) Providing the name of a pretrained NeMo model (which will be downloaded from the cloud) (via ``init_from_pretrained_model``) -
-Then users can either keep the existing tokenizer or update the tokenizer with a new vocabulary. This is useful when users don't want to update the model architecture -but want to update the tokenizer with a new vocabulary. -
-The same script can be used to fine-tune CTC, RNNT, or Hybrid models as well. -
-The ``/examples/asr/speech_to_text_finetune.py`` script supports this type of fine-tuning with the following arguments: -
-.. code-block:: sh -
- python examples/asr/speech_to_text_finetune.py \ - --config-path= \ - --config-name= \ - model.train_ds.manifest_filepath="" \ - model.validation_ds.manifest_filepath="" \ - model.tokenizer.update_tokenizer= \ # True to update tokenizer, False to retain existing tokenizer - model.tokenizer.dir= \ # Path to tokenizer dir when update_tokenizer=True - model.tokenizer.type= \ # tokenizer type when update_tokenizer=True - trainer.devices=-1 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - +init_from_nemo_model="" (or +init_from_pretrained_model="")
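The tokenizer update can also be performed programmatically. The following is a minimal sketch that restores a pretrained BPE-based checkpoint and swaps in a new sub-word tokenizer before fine-tuning; the model name and tokenizer directory are placeholders, and the exact behaviour depends on the model class being restored.

.. code-block:: python

    from nemo.collections.asr.models import ASRModel

    # Restore a pretrained BPE-based checkpoint from the cloud (model name is illustrative).
    model = ASRModel.from_pretrained(model_name="stt_en_conformer_ctc_large")

    # Swap in a tokenizer built for the target domain (hypothetical directory).
    # For BPE-based models this also updates the decoder vocabulary accordingly.
    model.change_vocabulary(
        new_tokenizer_dir="tokenizers/my_domain_unigram_v1024",
        new_tokenizer_type="bpe",
    )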
Fine-tuning by changing model architecture and tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -
-If users want to update the model architecture as well, they can use the following script. -
-Pre-trained weights can be provided in multiple ways: -
-1) Providing a path to a NeMo model (via ``init_from_nemo_model``) -2) Providing the name of a pretrained NeMo model (which will be downloaded from the cloud) (via ``init_from_pretrained_model``) -3) Providing a path to a PyTorch Lightning checkpoint file (via ``init_from_ptl_ckpt``) -
-There are multiple ASR subtasks inside the ``examples/asr/`` directory; substitute the appropriate subtask in the path below. -
-.. code-block:: sh -
- python examples/asr//script_to_.py \ - --config-path= \ - --config-name= \ - model.train_ds.manifest_filepath="" \ - model.validation_ds.manifest_filepath="" \ - trainer.devices=-1 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - +init_from_nemo_model="" # (or +init_from_pretrained_model, +init_from_ptl_ckpt ) -
-To reinitialize part of the model (so that it differs from the pretrained model), users can specify the modules to include or exclude through the config: -
-.. code-block:: yaml -
- init_from_nemo_model: "" - asr_model: - include: ["preprocessor","encoder"] - exclude: ["decoder"] -
-Fine-tuning Execution Flow Diagram ---------------------------------- -
-When preparing your own training or fine-tuning scripts, please follow the execution flow diagram order for correct inference.
- -Depending on the type of model, there may be extra steps that must be performed - - -* CTC Models - `Examples directory for CTC Models `_ -* RNN Transducer Models - `Examples directory for Transducer Models `_ diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/asrlm_results.csv b/SoundScribe/SpeakerID/docs/source/asr/data/asrlm_results.csv deleted file mode 100644 index d9a395cb8b75ffc59300341feeeba2c8be2aa5fa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/asrlm_results.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Model Base Class,Model Card -asrlm_en_transformer_large_ls,TransformerLMModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:asrlm_en_transformer_large_ls" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_by.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_by.csv deleted file mode 100644 index 750dfd82ff94354f5ccbd2e361d0c33ca0eea463..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_by.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model,Model Base Class,Model Card -stt_by_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_by_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ca.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ca.csv deleted file mode 100644 index bd7e174b922faaf7e099ad843ede10cda6b46b8c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ca.csv +++ /dev/null @@ -1,4 +0,0 @@ -Model,Model Base Class,Model Card -stt_ca_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5" -stt_ca_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large" -stt_ca_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_transducer_large" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_code_switching.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_code_switching.csv deleted file mode 100644 index 1320f19911e681be3cc3440651a6288e8891159f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_code_switching.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model,Model Base Class,Model Card -stt_enes_conformer_ctc_large_codesw,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large_codesw" -stt_enes_conformer_transducer_large_codesw,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_transducer_large_codesw" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_de.csv deleted file mode 100644 index 6084e95c37c0a8c405f782764a1ffd0a3eeaa2d0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_de.csv +++ /dev/null @@ -1,7 +0,0 @@ -Model,Model Base Class,Model Card -stt_de_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5" -stt_de_citrinet_1024,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024" -stt_de_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_contextnet_1024" 
-stt_de_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large" -stt_de_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_transducer_large" -stt_de_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_fastconformer_hybrid_large_pc" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_en.csv deleted file mode 100644 index 1669ecdeefb5a4b777a814b351b0d5e7756beb50..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_en.csv +++ /dev/null @@ -1,41 +0,0 @@ -Model Name,Model Base Class,Model Card -QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels" -stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr" -stt_en_citrinet_256,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256" -stt_en_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512" -stt_en_citrinet_1024,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024" -stt_en_citrinet_256_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25" -stt_en_citrinet_512_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25" -stt_en_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25" -stt_en_contextnet_256_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256_mls" -stt_en_contextnet_512_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512_mls" -stt_en_contextnet_1024_mls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024_mls" -stt_en_contextnet_256,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256" -stt_en_contextnet_512,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512" -stt_en_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024" -stt_en_conformer_ctc_small,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small" -stt_en_conformer_ctc_medium,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium" -stt_en_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large" -stt_en_conformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge" -stt_en_conformer_ctc_small_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls" -stt_en_conformer_ctc_medium_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls" -stt_en_conformer_ctc_large_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls" -stt_en_conformer_transducer_large_ls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large_ls" 
-stt_en_conformer_transducer_small,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_small" -stt_en_conformer_transducer_medium,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_medium" -stt_en_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large" -stt_en_conformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge" -stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge" -stt_en_fastconformer_ctc_large_ls,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large_ls" -stt_en_fastconformer_transducer_large_ls,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large_ls" -stt_en_fastconformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large" -stt_en_fastconformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large" -stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" -stt_en_fastconformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge" -stt_en_fastconformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge" -stt_en_fastconformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge" -stt_en_fastconformer_ctc_xxlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xxlarge" -stt_en_fastconformer_hybrid_large_streaming_80ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_80ms" -stt_en_fastconformer_hybrid_large_streaming_480ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_480ms" -stt_en_fastconformer_hybrid_large_streaming_1040ms,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_1040ms" -stt_en_fastconformer_hybrid_large_streaming_multi,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_multi" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_es.csv deleted file mode 100644 index 0fa8b0ecedf1633030aac2d12ecbe3d90273360b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_es.csv +++ /dev/null @@ -1,8 +0,0 @@ -Model,Model Base Class,Model Card -stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5" -stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512" -stt_es_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25" 
-stt_es_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large" -stt_es_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_transducer_large" -stt_es_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_contextnet_1024" -stt_es_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_fr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_fr.csv deleted file mode 100644 index 0f17318caeadfc304eabdcc5b5d70390de616451..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_fr.csv +++ /dev/null @@ -1,9 +0,0 @@ -Model,Model Base Class,Model Card -stt_fr_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5" -stt_fr_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_citrinet_1024_gamma_0_25" -stt_fr_no_hyphen_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_citrinet_1024_gamma_0_25" -stt_fr_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_contextnet_1024" -stt_fr_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_ctc_large" -stt_fr_no_hyphen_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_ctc_large" -stt_fr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_conformer_transducer_large" -stt_fr_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hi.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hi.csv deleted file mode 100644 index 4d3df532ed2e253f74d0e7a0c66f5ae0381bb75e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hi.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Model Base Class,Model Card -stt_hi_conformer_ctc_medium,EncDecCTCModelBPE,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_hi_conformer_ctc_medium" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hr.csv deleted file mode 100644 index 35a5b5f04f3996e2826f44e6e2e21f623f8bf841..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_hr.csv +++ /dev/null @@ -1,4 +0,0 @@ -Model,Model Base Class,Model Card -stt_hr_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large" -stt_hr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large" -stt_hr_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_it.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_it.csv deleted file mode 100644 index 
2301949665734fd272dcab30bbdc6cd5723414cd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_it.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model,Model Base Class,Model Card -stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5" -stt_it_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_fastconformer_hybrid_large_pc" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_kab.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_kab.csv deleted file mode 100644 index 76a54cfe42de4de29f8d702b50262abc546b193a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_kab.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model,Model Base Class,Model Card -stt_kab_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_kab_conformer_transducer_large" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_mr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_mr.csv deleted file mode 100644 index 00ae7211bd75729b5305deca275e26c140896e0c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_mr.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Model Base Class,Model Card -stt_mr_conformer_ctc_medium,EncDecCTCModelBPE,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_mr_conformer_ctc_medium" - diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_multilingual.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_multilingual.csv deleted file mode 100644 index c56f05def825f16cfa637783507322068237228a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_multilingual.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model,Model Base Class,Model Card -stt_enes_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large" -stt_enes_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_transducer_large" -stt_multilingual_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_multilingual_fastconformer_hybrid_large_pc" -stt_multilingual_fastconformer_hybrid_large_pc_blend_eu,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_multilingual_fastconformer_hybrid_large_pc_blend_eu" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_pl.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_pl.csv deleted file mode 100644 index e3ad9bdb50b780065bdf18d328011c821f7fc240..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_pl.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model,Model Base Class,Model Card -stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5" -stt_pl_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ru.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ru.csv deleted file mode 100644 index 66b9b321f5fe96b4d0fe698daaf1f0ba01cb4343..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ru.csv +++ /dev/null @@ -1,4 +0,0 @@ -Model,Model Base Class,Model Card -stt_ru_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5" -stt_ru_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_fastconformer_hybrid_large_pc" - diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_rw.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_rw.csv deleted file mode 100644 index 0264fc8a70cdc40ba96ce38701c7e837ee773f4f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_rw.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model,Model Base Class,Model Card -stt_rw_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large" -stt_rw_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_transducer_large" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ua.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ua.csv deleted file mode 100644 index df1b6c383d3be39e2b6b731918dc60fffd77e999..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_ua.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model,Model Base Class,Model Card -stt_ua_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ua_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_zh.csv b/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_zh.csv deleted file mode 100644 index 3d98f2fa4cec36cc1652fe2a18e4032dd0e377eb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/benchmark_zh.csv +++ /dev/null @@ -1,4 +0,0 @@ -Model,Model Base Class,Model Card -stt_zh_citrinet_512,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512" -stt_zh_citrinet_1024_gamma_0_25,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25" -stt_zh_conformer_transducer_large,EncDecRNNTModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_conformer_transducer_large" diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/be/conformer_be.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/be/conformer_be.csv deleted file mode 100644 index 12fcfe0e554bced7865e409b5ad57e6557441425..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/be/conformer_be.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Test-Set v10 (be) -stt_be_conformer_ctc_large,be,4.7 % -stt_be_conformer_transducer_large,be,3.8 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/by/fastconformer_by.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/by/fastconformer_by.csv deleted file mode 100644 index c03cc945d99dde5c272d45f6960ccd4a3eadde06..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/by/fastconformer_by.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set v12.0 (be),MCV Test-Set v12.0 (be) -stt_by_fastconformer_hybrid_large_pc,by,2.7 %,2.7 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/conformer_ca.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/conformer_ca.csv deleted 
file mode 100644 index bc30b90a25b4f9dcf1d782fe2e4b44484b265b2c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/conformer_ca.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (ca),MCV Dev-Set v9.0 (ca),MCV Test-Set v9.0 (ca) -stt_ca_conformer_ctc_large,ca,,4.70,4.27 -stt_ca_conformer_transducer_large,ca,,4.43,3.85 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv deleted file mode 100644 index 6b826662e25ecdeeb1d68483f6f2260e0f9f38fd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ca/quartznet15x5_ca.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (ca),MCV Dev-Set v9.0 (ca),MCV Test-Set v9.0 (ca) -stt_ca_quartznet15x5,ca,6.0,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/citrinet_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/citrinet_de.csv deleted file mode 100644 index 1768373077b9c80671389e9f364ccc3c86614691..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/citrinet_de.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_citrinet_1024,de,,,6.63,,7.59,4.06,5.07,12.33,10.02 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/conformer_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/conformer_de.csv deleted file mode 100644 index 1bd1443de00e3d424b0a5aa5988c036b7a16137d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/conformer_de.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_conformer_ctc_large,de,,,5.84,,6.68,3.85,4.63,12.56,10.51 -stt_de_conformer_transducer_large,de,,,4.75,,5.36,3.46,4.19,11.21,9.14 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/contextnet_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/contextnet_de.csv deleted file mode 100644 index 40be2181a77f86afa75884202e6259cc7b7f1df5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/contextnet_de.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_contextnet_1024,de,,,4.76,,5.5,3.53,4.2,11.32,9.4 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/fastconformer_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/fastconformer_de.csv deleted file mode 100644 index fe6e6491f4438af2b1dd84d86010a650c07e0692..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/fastconformer_de.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) 
(de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_fastconformer_hybrid_large_pc,de,,4.2 %,,4.9 %,,3.3 %,3.8 %,10.8 %,8.7 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/quartznet15x5_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/quartznet15x5_de.csv deleted file mode 100644 index 22da250a97f358200fac2b1a5816f0fe7bd2e9f4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/de/quartznet15x5_de.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_quartznet15x5,de,11.78,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/citrinet_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/citrinet_en.csv deleted file mode 100644 index 47f180e7298e22e6bee67d866426368ef999556b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/citrinet_en.csv +++ /dev/null @@ -1,7 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_citrinet_256,en,,,4.2 % WER,10.7 % WER,4.4 % WER,10.7 % WER,,,,,,,,,,,, -stt_en_citrinet_512,en,,,3.7 % WER,8.9 % WER,3.7 % WER,8.9 % WER,,,,,,,,,,,, -stt_en_citrinet_1024,en,,,3.7 % WER,8.3 % WER,3.6 % WER,7.9 % WER,,,,,,,,,,,, -stt_en_citrinet_256_gamma_0_25,en,,,4.7 %,10.6 %,4.8 %,10.7 %,,,,,8.3 %,,,,,,5.8 %,3.6 % -stt_en_citrinet_512_gamma_0_25,en,,,4.0 %,9.0 %,3.9 %,9.0 %,,,,,6.9 %,,,,,,4.4 %,3.6 % -stt_en_citrinet_1024_gamma_0_25,en,,,3.4 %,7.7 %,3.4 %,7.6 %,,,,,6.2 %,,,,,,4.0 %,2.5 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/conformer_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/conformer_en.csv deleted file mode 100644 index 2b31a07b842a3a4f1db291931f167f6873fc652e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/conformer_en.csv +++ /dev/null @@ -1,28 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_conformer_ctc_small,en,,,3.6,8.1,3.7,8.1,,,,,,,,,,,, -stt_en_conformer_ctc_medium,en,,,2.5,5.8,2.6,5.9,,,,,,,,,,,, -stt_en_conformer_ctc_large,en,,,1.9,4.4,2.1,4.5,,,,,,,,,,,, -stt_en_conformer_ctc_xlarge,en,,,1.77 %,3.79 %,2.00 %,3.74 %,,7.88 %,,5.99 %,,6.44 %,22.90 %,5.50 %,,,2.36 %, -stt_en_conformer_ctc_small_ls,en,,,3.3,8.8,3.4,8.8,,,,,,,,,,,, -stt_en_conformer_ctc_medium_ls,en,,,2.7,7.4,3.0,7.3,,,,,,,,,,,, -stt_en_conformer_ctc_large_ls,en,,,2.4,6.2,2.7,6.0,,,,,,,,,,,, -stt_en_conformer_transducer_small,en,,,2.8,6.6,2.5,6.6,,,,,,,,,,,, -stt_en_conformer_transducer_medium,en,,,2.0,4.6,2.1,4.7,,,,,,,,,,,, -stt_en_conformer_transducer_large,en,,,1.6,3.5,1.7,3.7,,,,,,,,,,,, 
-stt_en_conformer_transducer_large_ls,en,,,2.1,5.0,2.3,5.1,,,,,,,,,,,, -stt_en_conformer_transducer_xlarge,en,,,1.48 %,2.95 %,1.62 %,3.01 %,,6.46 %,4.59 %,5.32 %,5.70 %,6.47 %,21.32 %,,,,2.05 %,1.17 % -stt_en_conformer_transducer_xxlarge,en,,,1.52 %,3.09 %,1.72 %,3.14 %,,,5.29 %,5.85 %,6.64 %,,,,,,2.42 %,1.49 % -stt_en_fastconformer_hybrid_large_streaming_80ms (CTC),en,,,,,3.5 %,8.1 %,,,10.2 %,7.2 %,,,,,,,3.5 %,2.3 % -stt_en_fastconformer_hybrid_large_streaming_480ms (CTC),en,,,,,3.6 %,7.5 %,,,9.8 %,7.0 %,,,,,,,3.5 %,2.1 % -stt_en_fastconformer_hybrid_large_streaming_1040ms (CTC),en,,,,,2.7 %,6.4 %,,,9.0 %,7.0 %,,,,,,,3.2 %,1.9 % -stt_en_fastconformer_hybrid_large_streaming_80ms (RNNT),en,,,,,2.7 %,6.5 %,,,9.1 %,6.9 %,,,,,,,3.2 %,1.9 % -stt_en_fastconformer_hybrid_large_streaming_480ms (RNNT),en,,,,,2.7 %,6.1 %,,,8.5 %,6.7 %,,,,,,,3.1 %,1.8 % -stt_en_fastconformer_hybrid_large_streaming_1040ms (RNNT),en,,,,,2.3 %,5.5 %,,,8.0 %,6.6 %,,,,,,,2.9 %,1.6 % -stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 0ms),en,,,,,,7.0 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 80ms),en,,,,,,6.4 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 480),en,,,,,,5.7 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (RNNT - 1040),en,,,,,,5.4 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 0ms),en,,,,,,8.4 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 80ms),en,,,,,,7.8 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 480),en,,,,,,6.7 %,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_streaming_multi (CTC - 1040),en,,,,,,6.2 %,,,,,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/contextnet_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/contextnet_en.csv deleted file mode 100644 index 6f986e28039a5b7471e36c67729f87f66019b621..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/contextnet_en.csv +++ /dev/null @@ -1,7 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_contextnet_256,en,,,3.3 %,7.9 %,3.3 %,8.0 %,,,9.7 %,11.0 %,7.1 %,,,,,,4.6 %,3.2 % -stt_en_contextnet_512,en,,,2.0 %,4.8 %,2.2 %,5.0 %,,,6.6 %,7.3 %,5.9 %,,,,,,2.8 %,1.4 % -stt_en_contextnet_1024,en,,,1.7 %,3.8 %,1.9 %,4.0 %,,7.9 %,,5.9 %,5.2 %,6.5 %,21.7 %,4.7 %,,,2.3 %,1.3 % -stt_en_contextnet_256_mls,en,,,,9.0 %,,9.2 %,,,9.4 %,10.9 %,,,,,,,, -stt_en_contextnet_512_mls,en,,,,5.2 %,,5.2 %,,,5.6 %,6.6 %,,,,,,,, -stt_en_contextnet_1024_mls,en,,,,4.1 %,,4.2 %,,,4.6 %,5.6 %,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/fastconformer_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/fastconformer_en.csv deleted file mode 100644 index e993273dfbf42a2aceae5c3a5938fd6f97aa58fa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/fastconformer_en.csv +++ /dev/null @@ -1,4 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 
83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_fastconformer_ctc_large,en,,,1.9,4.2,2.1,4.2,,,,,,,,,,,, -stt_en_fastconformer_transducer_large,en,,,2.0,3.8,1.8,3.8,,,,,,,,,,,, -stt_en_fastconformer_hybrid_large_pc,en,8.0 %,10.3 %,,,2.0 %,4.1 %,8.2 %,,,4.5 %,4.6 %,,,,2.3 %,4.5 %,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/jasper10x5dr_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/jasper10x5dr_en.csv deleted file mode 100644 index a812337ac0ebc2204bd31aedb320907dde29b0f7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/jasper10x5dr_en.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_jasper10x5dr,en,,,3.74,10.21,,,,,,,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/quartznet15x5_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/quartznet15x5_en.csv deleted file mode 100644 index 67b52bc9a0da5355447b2ac4d0440ccbc4a647e3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/quartznet15x5_en.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_quartznet15x5,en,,,4.38,11.3,,,,,,,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/squeezeformer_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/squeezeformer_en.csv deleted file mode 100644 index ecd18cc40b972529be0df68e795044c9be67d0f9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/en/squeezeformer_en.csv +++ /dev/null @@ -1,7 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 -stt_en_squeezeformer_ctc_xsmall_ls,en,,,3.6 %,9.7 %,3.8 %,9.4 %,,,,,,,,,,,, -stt_en_squeezeformer_ctc_small_ls,en,,,2.9 %,7.4 %,3.1 %,7.4 %,,,,,,,,,,,, -stt_en_squeezeformer_ctc_small_medium_ls,en,,,2.7 %,7.0 %,2.8 %,7.1 %,,,,,,,,,,,, -stt_en_squeezeformer_ctc_medium_ls,en,,,2.4 %,6.2 %,2.6 %,6.3 %,,,,,,,,,,,, -stt_en_squeezeformer_ctc_medium_large_ls,en,,,2.3 %,6.0 %,2.5 %,5.9 %,,,,,,,,,,,, -stt_en_squeezeformer_ctc_large_ls,en,,,2.3 %,5.7 %,2.4 %,5.7 %,,,,,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/conformer_enes.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/conformer_enes.csv deleted file mode 100644 index 983e664d4de1b60053cb43a039b0c4ceafe10a00..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/conformer_enes.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model Name,Language,Fisher-Dev-En,Fisher-Dev-Es,Fisher-Test-En,Fisher-Test-Es,Librispeech Dev-Clean,Librispeech 
Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Dev-Set v7.0 (en),MCV Dev-Set v7.0 (es),MCV Test-Set v7.0 (en),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Dev (es),MLS Test (en),MLS Test (es),VoxPopuli Dev (en),VoxPopuli Dev (es),VoxPopuli Test (en),VoxPopuli Test (es) -stt_enes_conformer_ctc_large,enes,,16.7 %,,,2.2 %,5.5 %,2.6 %,5.5 %,5.8 %,,,,,3.5 %,,,,5.7 %,, -stt_enes_conformer_ctc_large_codesw,enes,,16.51 %,,16.31 %,2.22 %,5.36 %,2.55 %,5.38 %,,5.00 %,,5.51 %,,3.46 %,,3.73 %,,5.58 %,,6.63 % -stt_enes_conformer_transducer_large,enes,,16.2 %,,,2.0 %,4.6 %,2.2 %,4.6 %,5.0 %,,,,,3.3 %,,,,5.3 %,, -stt_enes_conformer_transducer_large_codesw,enes,15.70 %,,15.66 %,,1.97 %,4.54 %,2.17 %,4.53 %,4.51 %,,5.06 %,,3.27 %,,3.67 %,,5.28 %,,6.54 %, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/contextnet_enes.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/contextnet_enes.csv deleted file mode 100644 index 72a895303bbbd8b43fd94d2f66a93753b28af9e5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/enes/contextnet_enes.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Fisher-Dev-En,Fisher-Dev-Es,Fisher-Test-En,Fisher-Test-Es,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Dev-Set v7.0 (en),MCV Dev-Set v7.0 (es),MCV Test-Set v7.0 (en),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Dev (es),MLS Test (en),MLS Test (es),VoxPopuli Dev (en),VoxPopuli Dev (es),VoxPopuli Test (en),VoxPopuli Test (es) -stt_enes_contextnet_large,enes,,14.8 %,,,2.2 %,5.6 %,2.3 %,5.5 %,4.7 %,,,,,3.0 %,,,,5.0 %,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/eo/conformer_eo.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/eo/conformer_eo.csv deleted file mode 100644 index f77d4c0eadfcc5d7b954a06df2036cfbbcf50f3d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/eo/conformer_eo.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Dev-Set v11.0 (eo),MCV Test-Set v11.0 (eo) -stt_eo_conformer_ctc_large,eo,2.9 %,4.8 % -stt_eo_conformer_transducer_large,eo,2.4 %,4.0 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/citrinet_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/citrinet_es.csv deleted file mode 100644 index 9471293dd227424466084fb01392dd185e6681d8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/citrinet_es.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) 
(es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_citrinet_512,es,,,,,,9.1 % WER,,,10.3 % WER,,,4.9 % WER,5.2 % WER,, -stt_es_citrinet_1024_gamma_0_25,es,19.9 %,21.3 %,19.1 %,15.8 %,15.9 %,,,6.1 %,,,6.8 %,3.5 %,4.1 %,5.6 %,7.0 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/conformer_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/conformer_es.csv deleted file mode 100644 index e7e47cbdc068e6361ac511e43b83b9a306c5ea3c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/conformer_es.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_conformer_ctc_large,es,23.7 %,25.3 %,22.4 %,18.3 %,18.5 %,,,6.3 %,,,6.9 %,4.3 %,4.2 %,6.1 %,7.5 % -stt_es_conformer_transducer_large,es,18.0 %,19.4 %,17.2 %,14.7 %,14.8 %,,,4.6 %,,,5.2 %,2.7 %,3.2 %,4.7 %,6.0 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/contextnet_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/contextnet_es.csv deleted file mode 100644 index 9f75e2a70bce4383a2566787e7fac81a6d8b9fd9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/contextnet_es.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_contextnet_1024,es,19.1 %,20.7 %,18.2 %,15.3 %,15.1 %,,,4.8 %,,,5.2 %,3.1 %,3.5 %,5.1 %,6.2 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/fastconformer_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/fastconformer_es.csv deleted file mode 100644 index a6c12afe95e1f7cf42d6090d80805c13ac038881..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/fastconformer_es.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_fastconformer_hybrid_large_pc,es,,,,29.4 %,28.9 %,,7.1 %,,,7.5 %,,10.6 %,11.8 %,8.6 %,9.8 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/quartznet15x5_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/quartznet15x5_es.csv deleted file mode 100644 index 54de5e94025b4462cae4797da4cd35a2c433d0bd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/es/quartznet15x5_es.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) 
(es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_quartznet15x5,es,,,,,,12.97,,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/citrinet_fr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/citrinet_fr.csv deleted file mode 100644 index 651dcb8494400ac5053dff0280ed10ab88350abf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/citrinet_fr.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) -stt_fr_citrinet_1024_gamma_0_25,fr,,10.76,9.90,12.20,11.11,6.66,6.19,5.53,5.12 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/conformer_fr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/conformer_fr.csv deleted file mode 100644 index 8f74dfe8cae047091e8c7f6081dcc24aca37df10..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/conformer_fr.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) -stt_fr_conformer_ctc_large,fr,,8.35,7.88,9.63,9.01,5.88,5.90,4.91,4.63 -stt_fr_conformer_transducer_large,fr,,6.85,,7.95,,5.05,,4.10, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/contextnet_fr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/contextnet_fr.csv deleted file mode 100644 index 71f601871d1522745af18714f8691675d1c4d468..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/contextnet_fr.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) -stt_fr_contextnet_1024,fr,,8.32,,9.42,,6.02,,5.01, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv deleted file mode 100644 index a30f447f42818eeb4e8e1e871e0d15dabe1955e1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/fr/quartznet15x5_fr.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (fr),MCV Dev-Set v7.0 (fr),MCV Dev-Set v7.0 (fr) (No Hyphen),MCV Test-Set v7.0 (fr),MCV Test-Set v7.0 (fr) (No Hyphen),MLS Dev (en),MLS Dev (en) (No Hyphen),MLS Test (en),MLS Test (en) (No Hyphen) -stt_fr_quartznet15x5,fr,14.01,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/conformer_hr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/conformer_hr.csv deleted file mode 100644 index 4cfd3f79a89fd55cfdfc964b2b8dd82f34cb978b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/conformer_hr.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,ParlaSpeech Dev-Set v1.0 (hr),ParlaSpeech Test-Set v1.0 (hr),Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) 
(hr) -stt_hr_conformer_ctc_large,hr,4.43,4.70,, -stt_hr_conformer_transducer_large,hr,4.56,4.69,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/fastconformer_hr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/fastconformer_hr.csv deleted file mode 100644 index ee54e981e7aa3763a795ba74483e027e405a27d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/hr/fastconformer_hr.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,ParlaSpeech Dev-Set v1.0 (hr),ParlaSpeech Test-Set v1.0 (hr),Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) (hr) -stt_hr_fastconformer_hybrid_large_pc,hr,,,4.5 %,4.2 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/conformer_it.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/conformer_it.csv deleted file mode 100644 index c86a906e982ceac07cfab57c747aab174bcba327..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/conformer_it.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_conformer_ctc_large,it,,5.38,,5.92,,13.16,10.62,13.43,16.75 -stt_it_conformer_transducer_large,it,,4.80,,5.24,,14.62,12.18,12.00,15.15 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/fastconformer_it.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/fastconformer_it.csv deleted file mode 100644 index 3a684662295e60377ca7db25f57ba66c42805eb7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/fastconformer_it.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_fastconformer_hybrid_large_pc,it,,,5.2 %,,5.8 %,13.6 %,11.5 %,12.7 %,15.6 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/quartznet15x5_it.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/quartznet15x5_it.csv deleted file mode 100644 index f22cfda089dcba68165881c91d5c00b76a2676ce..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/it/quartznet15x5_it.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) 
(it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_quartznet15x5,it,15.22,,,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/kab/conformer_kab.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/kab/conformer_kab.csv deleted file mode 100644 index 9db989dc23778d02a380c391f9c78c7b0b0694a8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/kab/conformer_kab.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Test-Set v10.0 (kab) -stt_kab_conformer_transducer_large,kab,18.86 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/fastconformer_pl.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/fastconformer_pl.csv deleted file mode 100644 index 8cf9a506b70468c59b03c7194ae0eb46f44fbf70..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/fastconformer_pl.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (pl),MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) -stt_pl_fastconformer_hybrid_large_pc,pl,,6.0 %,8.7 %,7.1 %,5.8 %,11.3 %,8.5 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv deleted file mode 100644 index 98c80fdd54013a610982f4488048ff9cdfb875e8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set (v??) (pl),MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) -stt_pl_quartznet15x5,pl,14,,,,,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/conformer_ru.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/conformer_ru.csv deleted file mode 100644 index a4f2c20a2726a8cd55a79578edf4497dee88838b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/conformer_ru.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,GOLOS Crowd Test-Set (v??) (ru),GOLOS Farfield Test-Set (v??) (ru),Librispeech Test,MCV Dev-Set (v??) (ru),MCV Dev-Set v10.0 (ru),MCV Test-Set v10.0 (ru) -stt_ru_conformer_ctc_large,ru,2.8 %,7.1 %,13.5 %,,3.9 %,4.3 % -stt_ru_conformer_transducer_large,ru,2.7%,7.6%,12.0%,,3.5%,4.0% diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv deleted file mode 100644 index db86ab2e8b6bf179d9c94659ba24f2231149b4fc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ru/quartznet15x5_ru.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,GOLOS Crowd Test-Set (v??) (ru),GOLOS Farfield Test-Set (v??) (ru),Librispeech Test,MCV Dev-Set (v??) 
(ru),MCV Dev-Set v10.0 (ru),MCV Test-Set v10.0 (ru) -stt_ru_quartznet15x5,ru,,,,16.23,, diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/rw/conformer_rw.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/rw/conformer_rw.csv deleted file mode 100644 index e5544a8067d55fc7d9753391e7dfefa90cca5336..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/rw/conformer_rw.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,MCV Test-Set v9.0 (rw) -stt_rw_conformer_ctc_large,rw,18.2 % -stt_rw_conformer_transducer_large,rw,16.2 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ua/fastconformer_ua.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/ua/fastconformer_ua.csv deleted file mode 100644 index c325a73c5f53647a416841fc0b83422736362e7d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/ua/fastconformer_ua.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Test-Set v12.0 (ua) -stt_ua_fastconformer_hybrid_large_pc,ua,5.2 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/citrinet_zh.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/citrinet_zh.csv deleted file mode 100644 index 2ad05e0233e1a103dbef05728d49535d70dcdbc5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/citrinet_zh.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Language,AIShell Dev-Android v2,AIShell Dev-Ios v1,AIShell Dev-Ios v2,AIShell Dev-Mic v2,AIShell Test-Android v2,AIShell Test-Ios v1,AIShell Test-Ios v2,AIShell Test-Mic v2 -stt_zh_citrinet_512,zh,,6.25%,,,,6.44%,, -stt_zh_citrinet_1024_gamma_0_25,zh,5.2 %,,4.8 %,5.2 %,5.5 %,,5.1 %,5.5 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/conformer_zh.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/conformer_zh.csv deleted file mode 100644 index 8d0ef96dc8d9d7766eb13fc7c6d08717c129132a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores/zh/conformer_zh.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,AIShell Dev-Android v2,AIShell Dev-Ios v1,AIShell Dev-Ios v2,AIShell Dev-Mic v2,AIShell Test-Android v2,AIShell Test-Ios v1,AIShell Test-Ios v2,AIShell Test-Mic v2 -stt_zh_conformer_transducer_large,zh,3.4,,3.2,3.4,3.4,,3.2,3.4 diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/by/fastconformer_by.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/by/fastconformer_by.csv deleted file mode 100644 index 88f5e320f088545ee617f8a1be399ff583b1e9e6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/by/fastconformer_by.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set v12.0 (be),MCV Test-Set v12.0 (be) -stt_by_fastconformer_hybrid_large_pc,by,3.8 %,3.9 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/de/fastconformer_de.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/de/fastconformer_de.csv deleted file mode 100644 index f862289184600b29ad00eed52a8357273fe122e9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/de/fastconformer_de.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set v12.0 (de),MCV Test-Set v12.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_fastconformer_hybrid_large_pc,de,4.7 %,5.4 %,10.1 %,11.1 %,12.6 %,10.4 % diff --git 
a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/en/fastconformer_en.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/en/fastconformer_en.csv deleted file mode 100644 index 9495643af30df3c2fdd4e37fb4d71add382e4bfb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/en/fastconformer_en.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MLS Test (en),NSC Part1,SPGI Test,VoxPopuli Test (en) -stt_en_fastconformer_hybrid_large_pc,en,12.5 %,19.0 %,7.3 %,9.2 %,10.1 %,12.7 %,7.2 %,5.1 %,6.7 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/es/fastconformer_es.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/es/fastconformer_es.csv deleted file mode 100644 index 501771865ed8495b9f90d524222e08ebf0ab3a65..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/es/fastconformer_es.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set v12.0 (es),MCV Test-Set v12.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_fastconformer_hybrid_large_pc,es,14.7 %,14.6 %,4.5 %,5.0 %,3.1 %,3.9 %,4.4 %,5.6 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv deleted file mode 100644 index 3c024c09f329d2aa2f3bff642aca1e10140681aa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) 
(hr) -stt_hr_fastconformer_hybrid_large_pc,hr,10.4 %,8.7 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/it/fastconformer_it.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/it/fastconformer_it.csv deleted file mode 100644 index 6bcf2c0b44001ec8c2f00638aec85a400e782d2c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/it/fastconformer_it.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set v12.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_fastconformer_hybrid_large_pc,it,7.8 %,8.2 %,26.4 %,22.5 %,16.8 %,19.6 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv deleted file mode 100644 index 5cbadae40b599924eab259da979d7a368e426925..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) -stt_pl_fastconformer_hybrid_large_pc,pl,8.9 %,11.0 %,16.0 %,11.0 %,14.0 %,11.4 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv b/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv deleted file mode 100644 index b486fa23aeb3bea1186976515a488b5a38a9bac1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Language,MCV Test-Set v12.0 (ua) -stt_ua_fastconformer_hybrid_large_pc,ua,7.3 % diff --git a/SoundScribe/SpeakerID/docs/source/asr/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/datasets.rst deleted file mode 100644 index 95cbed26427d5322ba88e3d4dbaf2d5255c20398..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/datasets.rst +++ /dev/null @@ -1,525 +0,0 @@ -Datasets -======== - -NeMo has scripts to convert several common ASR datasets into the format expected by the ``nemo_asr`` collection. You can get started -with those datasets by following the instructions to run those scripts in the section appropriate to each dataset below. - -If the user has their own data and want to preprocess it to use with NeMo ASR models, refer to the `Preparing Custom ASR Data`_ section. - -If the user already has a dataset that you want to convert to a tarred format, refer to the `Tarred Datasets`_ section. - -.. _LibriSpeech_dataset: - -LibriSpeech ------------ - -Run the following scripts to download the LibriSpeech data and convert it into the format expected by `nemo_asr`. At least 250GB free -space is required. - -.. code-block:: bash - - # install sox - sudo apt-get install sox - mkdir data - python get_librispeech_data.py --data_root=data --data_set=ALL - -After this, the ``data`` folder should contain wav files and ``.json`` manifests for NeMo ASR datalayer. - -Each line is a training example. ``audio_filepath`` contains the path to the wav file, ``duration`` is the duration in seconds, and ``text`` is the transcript: - -.. 
code-block:: json - - {"audio_filepath": "/1355-39947-0000.wav", "duration": 11.3, "text": "psychotherapy and the community both the physician and the patient find their place in the community the life interests of which are superior to the interests of the individual"} - {"audio_filepath": "/1355-39947-0001.wav", "duration": 15.905, "text": "it is an unavoidable question how far from the higher point of view of the social mind the psychotherapeutic efforts should be encouraged or suppressed are there any conditions which suggest suspicion of or direct opposition to such curative work"} - -Fisher English Training Speech ------------------------------- - -Run these scripts to convert the Fisher English Training Speech data into a format expected by the ``nemo_asr`` collection. - -In brief, the following scripts convert the ``.sph`` files to ``.wav``, slices those files into smaller audio samples, matches the -smaller slices with their corresponding transcripts, and splits the resulting audio segments into train, validation, and test sets -(with one manifest each). - -.. note:: - - 106 GB of space is required to run the ``.wav`` conversion - - additional 105 GB is required for the slicing and matching - - ``sph2pipe`` is required in order to run the ``.wav`` conversion - -**Instructions** - -The following scripts assume that you already have the Fisher dataset from the Linguistic Data Consortium, with a directory structure -that looks similar to the following: - -.. code-block:: bash - - FisherEnglishTrainingSpeech/ - ├── LDC2004S13-Part1 - │   ├── fe_03_p1_transcripts - │   ├── fisher_eng_tr_sp_d1 - │   ├── fisher_eng_tr_sp_d2 - │   ├── fisher_eng_tr_sp_d3 - │   └── ... - └── LDC2005S13-Part2 - ├── fe_03_p2_transcripts - ├── fe_03_p2_sph1 - ├── fe_03_p2_sph2 - ├── fe_03_p2_sph3 - └── ... - -The transcripts that will be used are located in the ``fe_03_p<1,2>_transcripts/data/trans`` directory. The audio files (``.sph``) -are located in the remaining directories in an ``audio`` subdirectory. - -#. Convert the audio files from ``.sph`` to ``.wav`` by running: - - .. code-block:: bash - - cd /scripts/dataset_processing - python fisher_audio_to_wav.py \ - --data_root= --dest_root= - - This will place the unsliced ``.wav`` files in ``/LDC200[4,5]S13-Part[1,2]/audio-wav/``. It will take several - minutes to run. - -#. Process the transcripts and slice the audio data. - - .. code-block:: bash - - python process_fisher_data.py \ - --audio_root= --transcript_root= \ - --dest_root= \ - --remove_noises - - This script splits the full dataset into train, validation, test sets, and places the audio slices in the corresponding folders - in the destination directory. One manifest is written out per set, which includes each slice's transcript, duration, and path. - - This will likely take around 20 minutes to run. Once finished, delete the 10 minute long ``.wav`` files. - -2000 HUB5 English Evaluation Speech ------------------------------------ - -Run the following script to convert the HUB5 data into a format expected by the ``nemo_asr`` collection. - -Similarly, to the Fisher dataset processing scripts, this script converts the ``.sph`` files to ``.wav``, slices the audio files and -transcripts into utterances, and combines them into segments of some minimum length (default is 10 seconds). The resulting segments -are all written out to an audio directory and the corresponding transcripts are written to a manifest JSON file. - -.. 
note:: - - 5 GB of free space is required to run this script - - ``sph2pipe`` is also required to be installed - -This script assumes you already have the 2000 HUB5 dataset from the Linguistic Data Consortium. - -Run the following command to process the 2000 HUB5 English Evaluation Speech samples: - -.. code-block:: bash - - python process_hub5_data.py \ - --data_root= \ - --dest_root= - -You can optionally include ``--min_slice_duration=`` if you would like to change the minimum audio segment duration. - -AN4 Dataset ------------ - -This is a small dataset recorded and distributed by Carnegie Mellon University. It consists of recordings of people spelling out -addresses, names, etc. Information about this dataset can be found on the `official CMU site `_. - -#. `Download and extract the dataset `_ (which is labeled "NIST's Sphere audio (.sph) format (64M)". - -#. Convert the ``.sph`` files to ``.wav`` using sox, and build one training and one test manifest. - - .. code-block:: bash - - python process_an4_data.py --data_root= - -After the script finishes, the ``train_manifest.json`` and ``test_manifest.json`` can be found in the ``/an4/`` directory. - -Aishell-1 ---------- - -To download the Aishell-1 data and convert it into a format expected by ``nemo_asr``, run: - -.. code-block:: bash - - # install sox - sudo apt-get install sox - mkdir data - python get_aishell_data.py --data_root=data - -After the script finishes, the ``data`` folder should contain a ``data_aishell`` folder which contains a wav file, a transcript folder, and related ``.json`` and ``vocab.txt`` files. - -Aishell-2 ---------- - -To process the AIShell-2 dataset, in the command below, set the data folder of AIShell-2 using ``--audio_folder`` and where to push -these files using ``--dest_folder``. In order to generate files in the supported format of ``nemo_asr``, run: - -.. code-block:: bash - - python process_aishell2_data.py --audio_folder= --dest_folder= - -After the script finishes, the ``train.json``, ``dev.json``, ``test.json``, and ``vocab.txt`` files can be found in the ``dest_folder`` directory. - -.. _section-with-manifest-format-explanation: - -Preparing Custom ASR Data -------------------------- - -The ``nemo_asr`` collection expects each dataset to consist of a set of utterances in individual audio files plus -a manifest that describes the dataset, with information about one utterance per line (``.json``). -The audio files can be of any format supported by `Pydub `_, though we recommend -WAV files as they are the default and have been most thoroughly tested. - -There should be one manifest file per dataset that will be passed in, therefore, if the user wants separate training and validation -datasets, they should also have separate manifests. Otherwise, they will be loading validation data with their training data and vice -versa. - -Each line of the manifest should be in the following format: - -.. code-block:: json - - {"audio_filepath": "/path/to/audio.wav", "text": "the transcription of the utterance", "duration": 23.147} - -The :code:`audio_filepath` field should provide an absolute path to the ``.wav`` file corresponding to the utterance. -The :code:`text` field should contain the full transcript for the utterance, and the :code:`duration` field should -reflect the duration of the utterance in seconds. - -Each entry in the manifest (describing one audio file) should be bordered by '{' and '}' and must -be contained on one line. 
The fields that describe the file should be separated by commas, and have the form :code:`"field_name": value`, -as shown above. There should be no extra lines in the manifest, i.e. there should be exactly as many lines in the manifest as -there are audio files in the dataset. - -Since the manifest specifies the path for each utterance, the audio files do not have to be located -in the same directory as the manifest, or even in any specific directory structure. - -Once there is a manifest that describes each audio file in the dataset, use the dataset by passing -in the manifest file path in the experiment config file, e.g. as ``training_ds.manifest_filepath=``. - -Tarred Datasets ---------------- - -If experiments are run on a cluster with datasets stored on a distributed file system, the user will likely -want to avoid constantly reading multiple small files and would prefer tarring their audio files. -There are tarred versions of some NeMo ASR dataset classes for this case, such as the ``TarredAudioToCharDataset`` -(corresponding to the ``AudioToCharDataset``) and the ``TarredAudioToBPEDataset`` (corresponding to the -``AudioToBPEDataset``). The tarred audio dataset classes in NeMo use `WebDataset `_. - -To use an existing tarred dataset instead of a non-tarred dataset, set ``is_tarred: true`` in -the experiment config file. Then, pass in the paths to all of the audio tarballs in ``tarred_audio_filepaths``, either as a list -of filepaths, e.g. ``['/data/shard1.tar', '/data/shard2.tar']``, or in a single brace-expandable string, e.g. -``'/data/shard_{1..64}.tar'`` or ``'/data/shard__OP_1..64_CL_'`` (recommended, see note below). - -.. note:: - For brace expansion, there may be cases where ``{x..y}`` syntax cannot be used due to shell interference. This occurs most commonly - inside SLURM scripts. Therefore, we provide a few equivalent replacements. Supported opening braces (equivalent to ``{``) are ``(``, - ``[``, ``<`` and the special tag ``_OP_``. Supported closing braces (equivalent to ``}``) are ``)``, ``]``, ``>`` and the special - tag ``_CL_``. For SLURM based tasks, we suggest the use of the special tags for ease of use. - -As with non-tarred datasets, the manifest file should be passed in ``manifest_filepath``. The dataloader assumes that the length -of the manifest after filtering is the correct size of the dataset for reporting training progress. - -The ``tarred_shard_strategy`` field of the config file can be set if you have multiple shards and are running an experiment with -multiple workers. It defaults to ``scatter``, which preallocates a set of shards per worker that does not change during runtime. -Note that this strategy, on specific occasions (when the number of shards is not divisible by ``world_size``), will not sample -the entire dataset. As an alternative, the ``replicate`` strategy preallocates the entire set of shards to every worker and does not -change it during runtime. The benefit of this strategy is that it allows each worker to sample data points from the entire dataset -independently of others. Note, though, that more than one worker may sample the same shard, and even sample the same data points! -As such, there is no guarantee that all samples in the dataset will be sampled at least once during one epoch. Note that -for these reasons it is not advisable to use tarred datasets as validation and test datasets. 
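Putting the options above together, the following is a minimal sketch of selecting an existing tarred dataset from the command line, in the same Hydra-override style as the bucketing examples below; the paths are placeholders and the exact keys should be verified against the config of the model being trained.

.. code::

    python speech_to_text_bpe.py
        ...
        model.train_ds.is_tarred=true
        model.train_ds.tarred_audio_filepaths='/data/shard__OP_1..64_CL_.tar'
        model.train_ds.manifest_filepath=/data/tarred_audio_manifest.json
        model.train_ds.tarred_shard_strategy=scatter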
- -For more information about the individual tarred datasets and the parameters available, including shuffling options, -see the corresponding class APIs in the `Datasets <./api.html#Datasets>`__ section. - -.. warning:: - If using multiple workers, the number of shards should be divisible by the world size to ensure an even - split among workers. If it is not divisible, logging will give a warning and training will proceed, but it will likely hang at the last epoch. - In addition, if using distributed processing, each shard must have the same number of entries after filtering is - applied such that each worker ends up with the same number of files. We currently do not check for this in any dataloader, but the user's - program may hang if the shards are uneven. - -Sharded Manifests -~~~~~~~~~~~~~~~~~ -If your dataset / manifest is large, you may wish to use sharded manifest files instead of a single manifest file. The naming convention -is identical to the audio tarballs and there should be a 1:1 relationship between a sharded audio tarfile and its manifest shard; e.g. -``'/data/sharded_manifests/manifest__OP_1..64_CL_'`` in the above example. Using sharded manifests improves job startup times and -decreases memory usage, as each worker only loads manifest shards for the corresponding audio shards instead of the entire manifest. - -To enable sharded manifest filename expansion, set the ``shard_manifests`` field of the config file to true. In addition, the -``defer_setup`` flag needs to be true as well, so that the dataloader will be initialized after the DDP and its length can be collected from -the distributed workers. - - -Conversion to Tarred Datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can easily convert your existing NeMo-compatible ASR datasets using the -`conversion script here `_. - -.. code:: bash - - python convert_to_tarred_audio_dataset.py \ - --manifest_path= \ - --target_dir= \ - --num_shards= \ - --max_duration= \ - --min_duration= \ - --shuffle --shuffle_seed=0 - -This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filters -audio files according to ``min_duration`` and ``max_duration``, and tars the remaining audio files to the directory -``--target_dir`` in ``n`` shards, along with separate manifest and metadata files. - -The files in the target directory should look similar to the following: - -.. code:: - - target_dir/ - ├── audio_1.tar - ├── audio_2.tar - ├── ... - ├── metadata.yaml - ├── tarred_audio_manifest.json - ├── sharded_manifests/ - ├── manifest_1.json - ├── ... - └── manifest_N.json - - -Note that file structures are flattened such that all audio files are at the top level in each tarball. This ensures that -filenames are unique in the tarred dataset and the filepaths do not contain "-sub"; forward slashes in each ``audio_filepath`` are -simply converted to underscores. For example, a manifest entry for ``/data/directory1/file.wav`` would be ``_data_directory1_file.wav`` -in the tarred dataset manifest, and ``/data/directory2/file.wav`` would be converted to ``_data_directory2_file.wav``. - -Sharded manifests are generated by default; this behavior can be toggled via the ``no_shard_manifests`` flag. - -Bucketing Datasets ------------------- - -For training ASR models, audio clips with different lengths may be grouped into a batch, which makes it necessary to pad them all to the same length. -This extra padding is a significant source of wasted computation. 
Splitting the training samples into buckets with different lengths and sampling from the same bucket for each batch increases computational efficiency. -It may result in a training speedup of more than 2x. To enable and use the bucketing feature, you need to create the bucketing version of the dataset by using the `conversion script here `_. -You may use --buckets_num to specify the number of buckets (4 to 8 buckets are recommended). It creates multiple tarred datasets, one per bucket, based on the audio durations. The range of [min_duration, max_duration) is split into equal-sized buckets. - - -To enable the bucketing feature in the dataset section of the config files, you need to pass the multiple tarred datasets as a list of lists. -If the user passes just a list of strings, the datasets simply get concatenated, which is different from bucketing. -Here is an example for 4 buckets and 512 shards: - -.. code:: - - python speech_to_text_bpe.py - ... - model.train_ds.manifest_filepath=[[PATH_TO_TARS/bucket1/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket2/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket3/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket4/tarred_audio_manifest.json]] - model.train_ds.tarred_audio_filepaths=[[PATH_TO_TARS/bucket1/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket2/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket3/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket4/audio__OP_0..511_CL_.tar]] - -When bucketing is enabled, in each epoch all GPUs first use the first bucket, then move to the second bucket, and so on. This guarantees that all GPUs use the same bucket at the same time. It reduces the amount of padding in each batch and speeds up training significantly with little impact on accuracy. - -There are two types of batching: - -* Fixed-size bucketing: all batches have the same number of samples, specified by train_ds.batch_size -* Adaptive-size bucketing: uses different batch sizes for each bucket. - -Adaptive-size bucketing helps to increase GPU utilization and speed up training: -batches sampled from buckets with shorter audio lengths can be larger, which increases GPU utilization. -You may use train_ds.bucketing_batch_size to enable adaptive batching and specify the batch sizes for the buckets. -When bucketing_batch_size is not set, train_ds.batch_size is used for all buckets (fixed-size bucketing). - -bucketing_batch_size can be set as an integer or a list of integers to explicitly specify the batch size for each bucket. -If bucketing_batch_size is set to an integer, linear scaling is used to scale up the batch sizes for buckets with shorter audio. For example, setting train_ds.bucketing_batch_size=8 for 4 buckets would use the sizes [32,24,16,8] for the different buckets. -When bucketing_batch_size is set, train_ds.batch_size needs to be set to 1. - -Training an ASR model on audio sorted by length may affect the accuracy of the model, so we introduced some strategies to mitigate this. -We support three types of bucketing strategies (a combined example is sketched after this list): - -* fixed_order: the same order of buckets is used for all epochs -* synced_randomized (default): each epoch uses a different order of buckets; the order of the buckets is shuffled every epoch. -* fully_randomized: similar to synced_randomized, but each GPU has its own random order, so GPUs are not synced. 
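As referenced above, here is a minimal sketch combining adaptive bucketing with a bucketing strategy, in the same style as the examples in this section; the bucketed ``manifest_filepath`` and ``tarred_audio_filepaths`` lists are the same as in the 4-bucket example above and are omitted here. With ``bucketing_batch_size=8`` and 4 buckets, linear scaling would give per-bucket batch sizes of [32,24,16,8].

.. code::

    python speech_to_text_bpe.py
        ...
        model.train_ds.batch_size=1
        model.train_ds.bucketing_batch_size=8
        model.train_ds.bucketing_strategy=synced_randomized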
- -The parameter train_ds.bucketing_strategy can be set to specify one of these strategies. The recommended strategy is synced_randomized, which gives the highest training speedup. -The fully_randomized strategy has a lower speedup than synced_randomized but may give better accuracy. - -Bucketing may improve the training speed by more than 2x but may slightly affect the final accuracy of the model. Training for more epochs and using the 'synced_randomized' strategy help to close this gap. -Currently, the bucketing feature is only supported for tarred datasets. - -Upsampling Datasets -------------------- - -Buckets may also be 'weighted' to allow multiple runs through a target dataset during each training epoch. This can be beneficial in cases where a dataset is composed of several component sets of unequal sizes and one desires to mitigate bias towards the larger sets through oversampling. - -Weighting is managed with the `bucketing_weights` parameter. After passing your composite tarred datasets in the format described above for bucketing, pass a list of integers (one per bucket) to indicate how many times a manifest should be read during training. - -For example, by passing `[2,1,1,3]` to the code below: - -.. code:: - - python speech_to_text_bpe.py - ... - model.train_ds.manifest_filepath=[[PATH_TO_TARS/bucket1/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket2/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket3/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket4/tarred_audio_manifest.json]] - model.train_ds.tarred_audio_filepaths=[[PATH_TO_TARS/bucket1/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket2/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket3/audio__OP_0..511_CL_.tar], - [PATH_TO_TARS/bucket4/audio__OP_0..511_CL_.tar]] - ... - model.train_ds.bucketing_weights=[2,1,1,3] - -NeMo will configure training so that all data in `bucket1` will be present twice in a training epoch, `bucket4` will be present three times, and data from `bucket2` and `bucket3` will occur only once each. Note that this will increase the effective amount of data present during training and thus affect training time per epoch. - -If using adaptive bucketing, note that the same batch size will be assigned to each instance of the upsampled data. That is, given the following: - -.. code:: - - python speech_to_text_bpe.py - ... - model.train_ds.manifest_filepath=[[PATH_TO_TARS/bucket1/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket2/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket3/tarred_audio_manifest.json], - [PATH_TO_TARS/bucket4/tarred_audio_manifest.json]] - ... - ... - model.train_ds.bucketing_weights=[2,1,1,3] - model.train_ds.bucketing_batch_size=[4,4,4,2] - -All instances of data from `bucket4` will still be trained with a batch size of 2 while all others will have a batch size of 4. As with standard bucketing, this requires ``batch_size`` to be set to 1. -If `bucketing_batch_size` is not specified, all datasets will be passed with the same fixed batch size as specified by the `batch_size` parameter. - -It is recommended to set the bucketing strategy to `fully_randomized` during multi-GPU training to prevent possible dataset bias during training. - - -Datasets on AIStore -------------------- - -`AIStore `_ is an open-source lightweight object storage system focused on large-scale deep learning. 
-AIStore is designed to scale linearly with each added storage node; it can be deployed on any Linux machine and can provide a unified namespace across multiple remote backends, such as Amazon S3, Google Cloud, and Microsoft Azure. -More details are provided in the `documentation `_ and the `repository `_ of the AIStore project. - -NeMo currently supports datasets from an AIStore bucket provider under the ``ais://`` namespace. - -AIStore Setup -~~~~~~~~~~~~~ - -NeMo currently relies on the AIStore (AIS) command-line interface (CLI) to handle the supported datasets. -The CLI is available in current NeMo Docker containers. -If necessary, the CLI can be configured using the instructions provided in the `AIStore CLI `_ documentation. - -To start using the AIS CLI to access data on an AIS cluster, an endpoint needs to be configured. -The endpoint is configured by setting the ``AIS_ENDPOINT`` environment variable before using the CLI - -.. code:: - - export AIS_ENDPOINT=http://hostname:port - ais --help - -In the above, ``hostname:port`` denotes the address of an AIS gateway. -For example, the address could be ``localhost:51080`` if testing using a local `minimal production-ready standalone Docker container `_. - -Dataset Setup -~~~~~~~~~~~~~ - -Currently, both tarred and non-tarred datasets are supported. -For any dataset, the corresponding manifest file is cached locally and processed as a regular manifest file. -For non-tarred datasets, the audio data is also cached locally. -For tarred datasets, shards from the AIS cluster are used by piping ``ais get`` to WebDataset. - -Tarred Dataset from AIS -^^^^^^^^^^^^^^^^^^^^^^^ - -A tarred dataset can be easily used as described in the :ref:`Tarred Datasets` section by providing paths to manifests on an AIS cluster. -For example, a tarred dataset from an AIS cluster can be configured as - -.. code:: - - manifest_filepath='ais://bucket/tarred_audio_manifest.json' - tarred_audio_filepaths='ais://bucket/shard_{1..64}.tar' - -:ref:`Bucketing Datasets` are configured in a similar way by providing paths on an AIS cluster. - -Non-tarred Dataset from AIS -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A non-tarred dataset can be easily used by providing a manifest file path on an AIS cluster - -.. code:: - - manifest_filepath='ais://bucket/dataset_manifest.json' - -Note that the manifest file is assumed to contain audio file paths relative to the manifest location. -For example, the manifest file may have lines in the following format - -.. code-block:: json - - {"audio_filepath": "path/to/audio.wav", "text": "transcription of the utterance", "duration": 23.147} - -The corresponding audio file would be downloaded from ``ais://bucket/path/to/audio.wav``. - -Cache configuration -^^^^^^^^^^^^^^^^^^^ - -Manifests and audio files from non-tarred datasets will be cached locally. -The location of the cache can be configured by setting two environment variables - -- ``NEMO_DATA_STORE_CACHE_DIR``: path to a location which can be used to cache the data -- ``NEMO_DATA_STORE_CACHE_SHARED``: flag to denote whether the cache location is shared between the compute nodes - -In a multi-node environment, the cache location may or may not be shared between the nodes. -This can be configured by setting ``NEMO_DATA_STORE_CACHE_SHARED`` to ``1`` when the location is shared between the nodes or to ``0`` when each node has a separate cache. - -When a globally shared cache is available, the data should be cached only once from the global rank zero node. 
-
-When a globally shared cache is available, the data should be cached only once from the global rank zero node.
-When a node-specific cache is used, the data should be cached only once by each local rank zero node.
-To control this behavior using `torch.distributed.barrier`, instantiation of the corresponding dataloader needs to be deferred to ``ModelPT::setup``, to ensure a distributed environment has been initialized.
-This can be achieved by setting ``defer_setup`` as follows:
-
-.. code:: shell
-
-    ++model.train_ds.defer_setup=true
-    ++model.validation_ds.defer_setup=true
-    ++model.test_ds.defer_setup=true
-
-
-Complete Example
-^^^^^^^^^^^^^^^^
-
-An example using an AIS cluster at ``hostname:port``, with a tarred dataset for training, a non-tarred dataset for validation, and node-specific caching, is given below:
-
-.. code:: shell
-
-    export AIS_ENDPOINT=http://hostname:port \
-    && export NEMO_DATA_STORE_CACHE_DIR=/tmp \
-    && export NEMO_DATA_STORE_CACHE_SHARED=0 \
-    python speech_to_text_bpe.py \
-    ...
-    model.train_ds.manifest_filepath=ais://train_bucket/tarred_audio_manifest.json \
-    model.train_ds.tarred_audio_filepaths=ais://train_bucket/audio__OP_0..511_CL_.tar \
-    ++model.train_ds.defer_setup=true \
-    model.validation_ds.manifest_filepath=ais://validation_bucket/validation_manifest.json \
-    ++model.validation_ds.defer_setup=true
-
-
-.. _Hybrid-ASR-TTS_model__Text-Only-Data:
-
-Preparing Text-Only Data for Hybrid ASR-TTS Models
---------------------------------------------------
-
-:ref:`Hybrid ASR-TTS models ` require a text-only dataset for training the ASR model.
-Each record in the dataset (in a ``.json`` file) should contain the following fields:
-
-* ``text``: text to use as a target for the ASR model
-* ``tts_text`` and/or ``tts_text_normalized``: text to use as a source for the TTS model. ``tts_text_normalized`` should contain normalized text for the TTS model. If there is no such field, ``tts_text`` will be used after normalization using the normalizer from the TTS model. It is highly recommended to normalize the text and create the ``tts_text_normalized`` field manually, since current normalizers are unsuitable for processing a large amount of text on the fly.
-
-**Example record:**
-
-.. code-block:: json
-
-    {"text": "target for one hundred billion parameters asr model",
-     "tts_text": "Target for 100B parameters ASR model.",
-     "tts_text_normalized": "Target for one hundred billion parameters ASR model."}
diff --git a/SoundScribe/SpeakerID/docs/source/asr/examples/kinyarwanda_asr.rst b/SoundScribe/SpeakerID/docs/source/asr/examples/kinyarwanda_asr.rst
deleted file mode 100644
index bd1eac94e31f0a690cb2d6b9b65bc86921ecabce..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/docs/source/asr/examples/kinyarwanda_asr.rst
+++ /dev/null
@@ -1,631 +0,0 @@
-########################################################################
-Example: Kinyarwanda ASR using Mozilla Common Voice Dataset
-########################################################################
-
-In this example, we describe the essential steps of training an ASR model for a new language (Kinyarwanda), namely:
-
-* Data preprocessing
-* Building tokenizers
-* Tarred datasets and bucketing
-* Training from scratch and finetuning
-* Inference and evaluation
-
-
-**************************
-Kinyarwanda Speech Dataset
-**************************
-We use the `Mozilla Common Voice `_ dataset for Kinyarwanda, which is a large dataset with 2000+ hours of audio data.
-
-**Note**: You need to download this dataset yourself.
-
-Mozilla distributes the dataset in tsv+mp3 format.
-After downloading and unpacking, the dataset has the following structure - -.. code-block:: bash - - ├── cv-corpus-9.0-2022-04-27 - │ └── rw - │ ├── clips [here are all audio files, e.g. common_voice_rw_26260276.mp3] - │ ├── dev.tsv - │ ├── invalidated.tsv - │ ├── other.tsv - │ ├── reported.tsv - │ ├── test.tsv - │ ├── train.tsv - │ └── validated.tsv - -Mozilla provides **train/dev/test** split of the data, so we can just use it. -Let's look at the format of a .tsv file - -.. code-block:: bash - - head train.tsv - -.. code-block:: bash - - client_id path sentence up_votes down_votes age gender accents locale segment - e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a common_voice_rw_26273273.mp3 kandi tuguwe neza kugira ngo twakire amagambo y’ukuri, 2 0 twenties male rw - e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a common_voice_rw_26273478.mp3 Simbi na we akajya kwiga nubwo byari bigoye 2 0 twenties male rw - e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a common_voice_rw_26273483.mp3 Inshuti yanjye yaje kunsura ku biro byanjye. 2 0 twenties male rw - e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a common_voice_rw_26273488.mp3 Grand Canyon ni ahantu hazwi cyane ba mukerarugendo. 2 0 twenties male rw - -Each line corresponds to one record (usually one sentence) and contains: - -* name of the audio file -* corresponding transcription -* meta information: client_id, age, gender, etc. - - -Resampling and creating manifests -################################# - -To be able to use a dataset with NeMo Toolkit, we first need to - -* Convert *.tsv* files to *.json* manifests -* Convert *.mp3* files to *.wav* with sample rate of 16000 - -To convert a .tsv file to .json manifest, we used the following script - -.. code-block:: bash - - python tsv_to_json.py \ - --tsv=cv-corpus-9.0-2022-04-27/rw/train.tsv \ - --folder=cv-corpus-9.0-2022-04-27/rw/clips \ - --sampling_count=-1 - -**tsv_to_json.py**: - -.. code-block:: python - - import pandas as pd - import json - import tqdm - import argparse - - parser = argparse.ArgumentParser("MCV TSV-to-JSON converter") - parser.add_argument("--tsv", required=True, type=str, help="Input TSV file") - parser.add_argument("--sampling_count", required=True, type=int, help="Number of examples, you want, use -1 for all examples") - parser.add_argument("--folder", required=True, type=str, help="Relative path to folder with audio files") - args = parser.parse_args() - - df = pd.read_csv(args.tsv, sep='\t') - with open(args.tsv.replace('.tsv', '.json'), 'w') as fo: - mod = 1 - if args.sampling_count > 0: - mod = len(df) // args.sampling_count - for idx in tqdm.tqdm(range(len(df))): - if idx % mod != 0: - continue - item = { - 'audio_filepath': args.folder + "/" + df['path'][idx], - 'text': df['sentence'][idx], - 'up_votes': int(df['up_votes'][idx]), 'down_votes': int(df['down_votes'][idx]), - 'age': df['age'][idx], 'gender': df['gender'][idx], 'accents': df['accents'][idx], - 'client_id': df['client_id'][idx] - } - fo.write(json.dumps(item) + "\n") - -This script will create a corresponding **train.json** manifest near the initial **train.tsv**. It will look like this: - -.. 
code-block:: bash - - {"audio_filepath": "cv-corpus-9.0-2022-04-27/rw/clips/common_voice_rw_26273273.mp3", "text": "kandi tuguwe neza kugira ngo twakire amagambo y\u2019ukuri,", "up_votes": 2, "down_votes": 0, "age": "twenties", "gender": "male", "accents": NaN, "client_id": "e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a"} - {"audio_filepath": "cv-corpus-9.0-2022-04-27/rw/clips/common_voice_rw_26273478.mp3", "text": "Simbi na we akajya kwiga nubwo byari bigoye", "up_votes": 2, "down_votes": 0, "age": "twenties", "gender": "male", "accents": NaN, "client_id": "e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a"} - {"audio_filepath": "cv-corpus-9.0-2022-04-27/rw/clips/common_voice_rw_26273483.mp3", "text": "Inshuti yanjye yaje kunsura ku biro byanjye.", "up_votes": 2, "down_votes": 0, "age": "twenties", "gender": "male", "accents": NaN, "client_id": "e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a"} - {"audio_filepath": "cv-corpus-9.0-2022-04-27/rw/clips/common_voice_rw_26273488.mp3", "text": "Grand Canyon ni ahantu hazwi cyane ba mukerarugendo.", "up_votes": 2, "down_votes": 0, "age": "twenties", "gender": "male", "accents": NaN, "client_id": "e2a04c0ecacf81302f4270a3dddaa7a131420f6b7319208473af17d4adf3724ad9a3b6cdee107e2f321495db86f114a50c396e0928464a58dfad472130e7514a"} - -For resampling we used the following script: - -.. code-block:: bash - - mkdir train - python ../decode_resample.py \ - --manifest=cv-corpus-9.0-2022-04-27/rw/train.json \ - --destination_folder=./train - -**decode_resample.py**: - -.. code-block:: python - - import argparse - import os - import json - - import sox - from sox import Transformer - import tqdm - import multiprocessing - from tqdm.contrib.concurrent import process_map - - - parser = argparse.ArgumentParser() - parser.add_argument('--manifest', required=True, type=str, help='path to the original manifest') - parser.add_argument("--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset.") - parser.add_argument("--destination_folder", required=True, type=str, help="Destination folder where audio files will be stored") - args = parser.parse_args() - - - def process(x): - if not isinstance(x['text'], str): - x['text'] = '' - else: - x['text'] = x['text'].lower().strip() - _, file_with_ext = os.path.split(x['audio_filepath']) - name, ext = os.path.splitext(file_with_ext) - output_wav_path = args.destination_folder + "/" + name + '.wav' - if not os.path.exists(output_wav_path): - tfm = Transformer() - tfm.rate(samplerate=16000) - tfm.channels(n_channels=1) - tfm.build(input_filepath=x['audio_filepath'], - output_filepath=output_wav_path) - x['duration'] = sox.file_info.duration(output_wav_path) - x['audio_filepath'] = output_wav_path - return x - - - def load_data(manifest): - data = [] - with open(manifest, 'r') as f: - for line in tqdm.tqdm(f): - item = json.loads(line) - data.append(item) - return data - - - data = load_data(args.manifest) - - data_new = process_map(process, data, max_workers=args.num_workers, chunksize=100) - - with open(args.manifest.replace('.json', '_decoded.json'), 'w') as f: - for item in tqdm.tqdm(data_new): - f.write(json.dumps(item) + '\n') - -It will write the resampled .wav-files to the specified directory and save a new json manifest with corrected 
audiopaths. - -**Note:** You need to repeat these steps for **test.tsv** and **dev.tsv** as well. - -****************** -Data Preprocessing -****************** - -Before we start training the model on the above manifest files, we need to preprocess the text data. Data pre-processing is done to reduce ambiguity in transcripts. This is an essential step, and often requires moderate expertise in the language. - -We used the following script -**prepare_dataset_kinyarwanda.py**: - -.. code-block:: python - - import json - import os - import re - from collections import defaultdict - from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest - from tqdm.auto import tqdm - - def write_processed_manifest(data, original_path): - original_manifest_name = os.path.basename(original_path) - new_manifest_name = original_manifest_name.replace(".json", "_processed.json") - - manifest_dir = os.path.split(original_path)[0] - filepath = os.path.join(manifest_dir, new_manifest_name) - write_manifest(filepath, data) - print(f"Finished writing manifest: {filepath}") - return filepath - - - # calculate the character set - def get_charset(manifest_data): - charset = defaultdict(int) - for row in tqdm(manifest_data, desc="Computing character set"): - text = row['text'] - for character in text: - charset[character] += 1 - return charset - - - # Preprocessing steps - def remove_special_characters(data): - chars_to_ignore_regex = "[\.\,\?\:\-!;()«»…\]\[/\*–‽+&_\\½√>€™$•¼}{~—=“\"”″‟„]" - apostrophes_regex = "[’'‘`ʽ']" - data["text"] = re.sub(chars_to_ignore_regex, " ", data["text"]) # replace punctuation by space - data["text"] = re.sub(apostrophes_regex, "'", data["text"]) # replace different apostrophes by one - data["text"] = re.sub(r"'+", "'", data["text"]) # merge multiple apostrophes - - # remove spaces where apostrophe marks a deleted vowel - # this rule is taken from https://huggingface.co/lucio/wav2vec2-large-xlsr-kinyarwanda-apostrophied - data["text"] = re.sub(r"([b-df-hj-np-tv-z])' ([aeiou])", r"\1'\2", data["text"]) - - data["text"] = re.sub(r" '", " ", data["text"]) # delete apostrophes at the beginning of word - data["text"] = re.sub(r"' ", " ", data["text"]) # delete apostrophes at the end of word - data["text"] = re.sub(r" +", " ", data["text"]) # merge multiple spaces - return data - - - def replace_diacritics(data): - data["text"] = re.sub(r"[éèëēê]", "e", data["text"]) - data["text"] = re.sub(r"[ãâāá]", "a", data["text"]) - data["text"] = re.sub(r"[úūü]", "u", data["text"]) - data["text"] = re.sub(r"[ôōó]", "o", data["text"]) - data["text"] = re.sub(r"[ćç]", "c", data["text"]) - data["text"] = re.sub(r"[ïī]", "i", data["text"]) - data["text"] = re.sub(r"[ñ]", "n", data["text"]) - return data - - - def remove_oov_characters(data): - oov_regex = "[^ 'aiuenrbomkygwthszdcjfvplxq]" - data["text"] = re.sub(oov_regex, "", data["text"]) # delete oov characters - data["text"] = data["text"].strip() - return data - - - # Processing pipeline - def apply_preprocessors(manifest, preprocessors): - for processor in preprocessors: - for idx in tqdm(range(len(manifest)), desc=f"Applying {processor.__name__}"): - manifest[idx] = processor(manifest[idx]) - - print("Finished processing manifest !") - return manifest - - - # List of pre-processing functions - PREPROCESSORS = [ - remove_special_characters, - replace_diacritics, - remove_oov_characters, - ] - - train_manifest = "train_decoded.json" - dev_manifest = "dev_decoded.json" - test_manifest = "test_decoded.json" - - train_data = 
read_manifest(train_manifest) - dev_data = read_manifest(dev_manifest) - test_data = read_manifest(test_manifest) - - # Apply preprocessing - train_data_processed = apply_preprocessors(train_data, PREPROCESSORS) - dev_data_processed = apply_preprocessors(dev_data, PREPROCESSORS) - test_data_processed = apply_preprocessors(test_data, PREPROCESSORS) - - # Write new manifests - train_manifest_cleaned = write_processed_manifest(train_data_processed, train_manifest) - dev_manifest_cleaned = write_processed_manifest(dev_data_processed, dev_manifest) - test_manifest_cleaned = write_processed_manifest(test_data_processed, test_manifest) - -It performs the following operations: - -* Remove all punctuation except for apostrophes -* Replace different kinds of apostrophes by one -* Lowercase -* Replace rare characters with diacritics (e.g. [éèëēê] => e) -* Delete all remaining out-of-vocabulary (OOV) characters - -The final Kinyarwanda alphabet in all trancripts consists of Latin letters, space and apostrophe. - -******************* -Building Tokenizers -******************* - -Though it is possible to train character-based ASR model, usually we get some improvement in quality and speed if we predict longer units. The commonly used tokenization algorithm is called `Byte-pair encoding `_. This is a deterministic tokenization algorithm based on corpus statistics. It splits the words to subtokens and the beginning of word is marked by special symbol so it's easy to restore the original words. -NeMo toolkit supports on-the-fly subword tokenization, so you need not modify the transcripts, but need to pass your tokenizer via the model config. NeMo supports both Word Piece Tokenizer (via HuggingFace) and Sentence Piece Tokenizer (via Google SentencePiece library) -For Kinyarwanda experiments we used 128 subtokens for the CTC model and 1024 subtokens for the Transducer model. The tokenizers for these models were built using the text transcripts of the train set with this script. For vocabulary of size 1024 we restrict maximum subtoken length to 4 symbols (2 symbols for size 128) to avoid populating vocabulary with specific frequent words from the dataset. This does not affect the model performance and potentially helps to adapt to other domain without retraining tokenizer. -We used the following script from NeMo toolkit to create `Sentencepiece `_ tokenizers with different vocabulary sizes (128 and 1024 subtokens) - -.. code-block:: bash - - python ${NEMO_ROOT}/scripts/tokenizers/process_asr_text_tokenizer.py \ - --manifest=dev_decoded_processed.json,train_decoded_processed.json \ - --vocab_size=1024 \ - --data_root=tokenizer_bpe_maxlen_4 \ - --tokenizer="spe" \ - --spe_type=bpe \ - --spe_character_coverage=1.0 \ - --spe_max_sentencepiece_length=4 \ - --log - - python ${NEMO_ROOT}/scripts/tokenizers/process_asr_text_tokenizer.py \ - --manifest=dev_decoded_processed.json,train_decoded_processed.json \ - --vocab_size=128 \ - --data_root=tokenizer_bpe_maxlen_2 \ - --tokenizer="spe" \ - --spe_type=bpe \ - --spe_character_coverage=1.0 \ - --spe_max_sentencepiece_length=2 \ - --log - -Most of the arguments are similar to those explained in the `ASR with Subword Tokenization tutorial `_. - -The resulting tokenizer is a folder like that: - -.. code-block:: bash - - ├── tokenizer_spe_bpe_v1024_max_4 - │ ├── tokenizer.model - │ ├── tokenizer.vocab - │ └── vocab.txt - -Remember that you will need to pass the path to tokenizer in the model config. -You can see all the subtokens in the **vocab.txt** file. 
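-
-To double-check what the resulting tokenizer does, you can load ``tokenizer.model`` with the ``sentencepiece`` Python package and encode a sample sentence. This is only an illustrative sketch: adjust the path to wherever the tokenizer folder was written (here we assume the 1024-subtoken tokenizer built above), and the example sentence is one of the lowercased transcripts shown earlier:
-
-.. code-block:: python
-
-    import sentencepiece as spm
-
-    # Load the BPE model produced by process_asr_text_tokenizer.py
-    # (path assumed from the folder layout shown above).
-    sp = spm.SentencePieceProcessor(
-        model_file="tokenizer_bpe_maxlen_4/tokenizer_spe_bpe_v1024_max_4/tokenizer.model"
-    )
-
-    text = "inshuti yanjye yaje kunsura ku biro byanjye"
-    print(sp.encode(text, out_type=str))  # subtoken strings; '▁' marks the start of a word
-    print(sp.encode(text, out_type=int))  # integer ids, as consumed by the ASR model
-    print("vocabulary size:", sp.get_piece_size())
-
-If the segmentation looks reasonable (frequent syllables and short words map to single subtokens), the tokenizer is ready to be referenced from the model config.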
- -***************************** -Tarred datasets and bucketing -***************************** - -There are two useful techniques for training on large datasets. - -* Tarred dataset allows to store the dataset as large .tar files instead of small separate audio files. It speeds up the training and minimizes the load on the network in the cluster. -* Bucketing groups utterances with similar duration. It reduces padding and speeds up the training. - -The NeMo toolkit provides a script to implement both of these techniques. - -.. code-block:: bash - - ## create tarred dataset with 1 bucket - python ${NEMO_ROOT}/scripts/speech_recognition/convert_to_tarred_audio_dataset.py \ - --manifest_path=train_decoded_processed.json \ - --target_dir=train_tarred_1bk \ - --num_shards=1024 \ - --max_duration=11.0 \ - --min_duration=1.0 \ - --shuffle \ - --shuffle_seed=1 \ - --sort_in_shards \ - --workers=-1 - - - ## create tarred dataset with 4 buckets - python ${NEMO_ROOT}/scripts/speech_recognition/convert_to_tarred_audio_dataset.py \ - --manifest_path=train_decoded_processed.json \ - --target_dir=train_tarred_4bk \ - --num_shards=1024 \ - --max_duration=11.0 \ - --min_duration=1.0 \ - --shuffle \ - --shuffle_seed=1 \ - --sort_in_shards \ - --workers=-1 \ - --buckets_num=4 - -**Note**: we only need to process train data, dev and test are usually much smaller and can be used as is. - -Our final dataset folder looks like this: - -.. code-block:: bash - - ├── dev [15988 .wav files] - ├── dev_decoded_processed.json (dev manifest) - ├── test [16213 .wav files] - ├── test_decoded_processed.json (test manifest) - └── train_tarred_1bk - ├── metadata.yaml - ├── tarred_audio_manifest.json - └── [1024 .tar files] - -In case of 4 buckets it will look like: - -.. code-block:: bash - - └── train_tarred_4bk - ├── bucket1 - ├── metadata.yaml - ├── tarred_audio_manifest.json - └── [1024 .tar files] - ├── bucket2 - ... - ├── bucket3 - └── bucket4 - -************************************ -Training from scratch and finetuning -************************************ - -ASR models -########## - -Our goal was to train two ASR models with different architectures: `Conformer-CTC `_ and `Conformer-Transducer `_, with around 120 million parameters. -The CTC model predicts output tokens for each timestep. The outputs are assumed to be independent of each other. As a result the CTC models work faster but they can produce outputs that are inconsistent with each other. CTC models are often combined with external language models in production. In contrast, the Transducer models contain the decoding part which generates the output tokens one by one and the next token prediction depends on this history. Due to autoregressive nature of decoding the inference speed is several times slower than that of CTC models, but the quality is usually better because it can incorporate language model information within the same model. - -Training scripts and configs -############################ - -To train a Conformer-CTC model, we use `speech_to_text_ctc_bpe.py `_ with the default config `conformer_ctc_bpe.yaml `_. -To train a Conformer-Transducer model, we use `speech_to_text_rnnt_bpe.py `_ with the default config `conformer_transducer_bpe.yaml `_. -Any options of default config can be overwritten from command line. -Usually we should provide the options related to the dataset and tokenizer. - -This is an example of how we can run the training script: - -.. 
code-block:: bash - - TOKENIZER=tokenizers/tokenizer_spe_bpe_v1024_max_4/ - TRAIN_MANIFEST=data/train_tarred_1bk/tarred_audio_manifest.json - TRAIN_FILEPATHS=data/train_tarred_1bk/audio__OP_0..1023_CL_.tar - VAL_MANIFEST=data/dev_decoded_processed.json - TEST_MANIFEST=data/test_decoded_processed.json - - python ${NEMO_ROOT}/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path=../conf/conformer/ \ - --config-name=conformer_ctc_bpe \ - exp_manager.name="Some name of our experiment" \ - exp_manager.resume_if_exists=true \ - exp_manager.resume_ignore_no_checkpoint=true \ - exp_manager.exp_dir=results/ \ - model.tokenizer.dir=$TOKENIZER \ - model.train_ds.is_tarred=true \ - model.train_ds.tarred_audio_filepaths=$TRAIN_FILEPATHS \ - model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ - model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.test_ds.manifest_filepath=$TEST_MANIFEST - -The option *exp_manager.resume_if_exists=true* allows to resume training. Actually you can stop training at any moment and then continue from the last checkpoint. -When the training is finished, the final model will be saved as *.nemo* file inside the folder that we specified in *exp_manager.exp_dir*. - -Training dynamics -################# - -The figure below shows the training dynamics when we train Kinyarwanda models **from scratch**. In these experiments we used the hyperparameters from the default configs, the training was run on 2 nodes with 16 gpus per node, training batch size was 32. We see that Transducer model achieves better quality than CTC. - - .. image:: ../images/kinyarwanda_from_scratch.png - :align: center - :alt: Training dynamics of Kinyarwanda models trained from scratch - :width: 800px - -Finetuning from another model -############################# - -Often it's a good idea to initialize our ASR model with the weights of some other pretrained model, for example, a model for another language. It usually makes our model to converge faster and achieve better quality, especially if the dataset for our target language is small. - -Though Kinyarwanda dataset is rather large, we also tried finetuning Kinyarwanda Conformer-Transducer model from different pretrained checkpoints, namely: - -* English Conformer-Transducer checkpoint -* Self-supervised Learning (SSL) checkpoint trained on English data -* SSL checkpoint trained on multilingual data - -To initialize from **non-SSL checkpoint** we should simply add the option `+init_from_pretrained_model`: - -.. code-block:: bash - - INIT_MODEL='stt_en_conformer_ctc_large' - - python ${NEMO_ROOT}/examples/asr/asr_ctc/speech_to_text_ctc_bpe.py - ...[same options as in the previous example]... - +init_from_pretrained_model=${INIT_MODEL} - -In that case the pretrained model `stt_en_conformer_ctc_large `_ will be automatically downloaded from NVIDIA GPU Cloud(NGC) and used to initialize weights before training. - -To initialize from **SSL checkpoint** we should edit our training script like the following code: - -.. 
code-block:: python - - import nemo.collections.asr as nemo_asr - ssl_model = nemo_asr.models.ssl_models.SpeechEncDecSelfSupervisedModel.from_pretrained(model_name='ssl_en_conformer_large') - - # define fine-tune model - asr_model = nemo_asr.models.EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer) - - # load ssl checkpoint - asr_model.load_state_dict(ssl_model.state_dict(), strict=False) - - del ssl_model - -When using finetuning you probably will need to change the some hyperparameters from the default config, especially the learning rate and learning rate policy. In the experiments below we used *model.optim.sched.name=CosineAnnealing* and *model.optim.lr=1e-3*. - -The figure below compares the training dynamics for three Conformer-Transducer models. They differ only by how they are initialized. We see that finetuning leads to faster convergence and better quality. Initializing from SSL gives lowest WER at earlier stages, but in a longer period it performs worse. - - .. image:: ../images/kinyarwanda_finetuning.png - :align: center - :alt: Training dynamics of Kinyarwanda models trained from scratch and finetuned from different pretrained checkpoints - :width: 800px - -************************ -Inference and evaluation -************************ - -Running the inference -##################### - -To run the inference we need a pretrained model. This can be either a `.nemo` file that we get after the training is finished, or any published model from `NGC `_. -We run the inference using the following script: - -.. code-block:: bash - - python ${NEMO_ROOT}/examples/asr/transcribe_speech.py \ - model_path=.nemo \ - dataset_manifest=./test_decoded_processed.json \ - output_filename=./test_with_predictions.json \ - batch_size=8 \ - cuda=1 \ - amp=True - -To run inference with NVIDIA's Kinyarwanda checkpoints `STT Rw Conformer-CTC Large `_ or `STT Rw Conformer-Transducer Large `_ use: - -.. code-block:: bash - - python ${NEMO_ROOT}/examples/asr/transcribe_speech.py \ - pretrained_name="stt_rw_conformer_ctc_large" \ - dataset_manifest=test_decoded_processed.json \ - output_filename=./pred_ctc.json \ - batch_size=8 \ - cuda=1 \ - amp=True - -**Note:** If you want to transcribe new audios, you can pass a folder with audio files using `audio_dir` parameter instead of `dataset_manifest`. - -After the inference is finished the `output_filename` is a `.json` manifest augmented with a new field `pred_text` containing the resulting transcript. Example: - -.. 
code-block:: - - {"audio_filepath": "test/common_voice_rw_19835615.wav", "text": "kw'ibumoso", "up_votes": 2, "down_votes": 0, "age": NaN, "gender": NaN, "accents": NaN, "client_id": "66675a7003e6baa3e7d4af01bff8324ac3c5f15e7f8918180799dd2928227c791f19e2811f9ec5779a2b06dac1b7a97fa7740dcfe98646ea1b5e106250c260be", "duration": 3.672, "pred_text": "n'ibumoso"} - {"audio_filepath": "test/common_voice_rw_24795878.wav", "text": "ni ryari uheruka kurya urusenda", "up_votes": 2, "down_votes": 0, "age": NaN, "gender": NaN, "accents": NaN, "client_id": "90e0438947a75b6c0cf59a0444aee3b81a76c5f9459c4b22df2e14b4ce257aeacaed8ac6092bfcd75b8e831633d58a84287fd62190c21d70d75efe8d93ed74ed", "duration": 3.312, "pred_text": "ni ryari uheruka kurya urusenda"} - {"audio_filepath": "test/common_voice_rw_24256935.wav", "text": "umunani", "up_votes": 2, "down_votes": 0, "age": NaN, "gender": NaN, "accents": NaN, "client_id": "974d4876e99e7437183c20f9107053acc9e514379d448bcf00aaaabc0927f5380128af86d39650867fa80a82525110dfc40784a5371c989de1a5bdf531f6d943", "duration": 3.24, "pred_text": "umunani"} - -Word Error Rate (WER) and Character Error Rate (CER) -#################################################### - -As soon as we have a manifest file with `text` and `pred_text` we can measure the quality of predictions of our model. - -.. code-block:: bash - - # Calculate WER - python ${NEMO_ROOT}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=test_with_predictions.json \ - use_cer=False \ - only_score_manifest=True - - # Calculate CER - python ${NEMO_ROOT}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=test_with_predictions.json \ - use_cer=True \ - only_score_manifest=True - - -Evaluation of NVIDIA's Kinyarwanda checkpoints -############################################## - -If you run inference and evaluation of NVIDIA's published Kinyarwanda models, you should get metrics like these: - -+----------------------------------+-------+-------+ -| Model | WER % | CER % | -+==================================+=======+=======+ -| stt_rw_conformer_ctc_large | 18.22 | 5.45 | -+----------------------------------+-------+-------+ -| stt_rw_conformer_trasducer_large | 16.19 | 5.7 | -+----------------------------------+-------+-------+ - -Error analysis -############## - -Still, even WER of 16% is not as good as we usually get for other languages trained with NeMo toolkit, so we may want to look at the errors that the model makes to better understand what's the problem. - -We can use `Speech Data Explorer `_ to analyze the errors. - -If we run - -.. code-block:: bash - - python ${NEMO_ROOT}/tools/speech_data_explorer/data_explorer.py - -it will start a local server, and provide a http address to open from the browser. -In the UI we can see the model predictions and their diff with the reference, and also we can listen to the corresponding audio. We also can sort the sentences by descending WER and look through the top of them. - -The error analysis showed several problems concerning the Kinyarwanda dataset: - -* Noisy multi-speaker records (e.g. common_voice_rw_19830859.wav) -* Bad quality of record (e.g. 
common_voice_rw_24452415.wav) -* Orthographic variability related to space/no space/apostrophe - * *kugira ngo / kugirango* - * *nkuko / nk'uko* - * *n iyo / n'iyo* -* Multiple orthographic variants for foreign words - * *telefoni / telephone* - * *film / filime* - * *isiraheli / israel* - * *radio / radiyo* - * *kongo / congo* -* l/r variability - * *abamalayika / abamarayika* - - diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/citrinet_vertical.png b/SoundScribe/SpeakerID/docs/source/asr/images/citrinet_vertical.png deleted file mode 100644 index a9a7f8c2e8530dd866aa46aa95f5c59fa56b442f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/citrinet_vertical.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/conformer_ctc.png b/SoundScribe/SpeakerID/docs/source/asr/images/conformer_ctc.png deleted file mode 100644 index e491856502ed39793d631633cfc3d4da5b7d5e53..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/conformer_ctc.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/ctc_asr.png b/SoundScribe/SpeakerID/docs/source/asr/images/ctc_asr.png deleted file mode 100644 index 56ef705602da10b6dca5e5f01afd66bc1f5230d7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/ctc_asr.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/hat.png b/SoundScribe/SpeakerID/docs/source/asr/images/hat.png deleted file mode 100644 index 4631fe89211d20a9afe5cd038bb2c8618b3f1d4b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/hat.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/hybrid_asr_tts_model.png b/SoundScribe/SpeakerID/docs/source/asr/images/hybrid_asr_tts_model.png deleted file mode 100644 index 458c47b8b7723aeb1244511d2949a7c8110c131a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/hybrid_asr_tts_model.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/jasper_layers.png b/SoundScribe/SpeakerID/docs/source/asr/images/jasper_layers.png deleted file mode 100644 index af88379af98f95cd02141b9dfcb669ead346700c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/jasper_layers.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/jasper_vertical.png b/SoundScribe/SpeakerID/docs/source/asr/images/jasper_vertical.png deleted file mode 100644 index 13ba93c2c73b0b186a26076e1581698ede4f6da5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/jasper_vertical.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_finetuning.png b/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_finetuning.png deleted file mode 100644 index 1cbce4a6293f78dbcc7201fba03dd7f5d0480753..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_finetuning.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_from_scratch.png b/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_from_scratch.png deleted file mode 100644 index 2aa58ab61c16f12a4d5ee9e24b34e55391f41feb..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/docs/source/asr/images/kinyarwanda_from_scratch.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/quartz_vertical.png b/SoundScribe/SpeakerID/docs/source/asr/images/quartz_vertical.png deleted file mode 100644 index 4cbede907736f631cb9774b207289a35dd07c757..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/quartz_vertical.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/images/squeezeformer.png b/SoundScribe/SpeakerID/docs/source/asr/images/squeezeformer.png deleted file mode 100644 index b6e1b218499cf461bab48e832c66c59c741cc1cb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/images/squeezeformer.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/intro.rst b/SoundScribe/SpeakerID/docs/source/asr/intro.rst deleted file mode 100644 index 2ac27c4312dc75d928e8c3fb29c32bba4dbc381d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/intro.rst +++ /dev/null @@ -1,195 +0,0 @@ -Automatic Speech Recognition (ASR) -================================== - -Automatic Speech Recognition (ASR), also known as Speech To Text (STT), refers to the problem of automatically transcribing spoken language. -You can use NeMo to transcribe speech using open-sourced pretrained models in :ref:`14+ languages `, or :doc:`train your own<./examples/kinyarwanda_asr>` ASR models. - - - -Transcribe speech with 3 lines of code ----------------------------------------- -After :ref:`installing NeMo`, you can transcribe an audio file as follows: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large") - transcript = asr_model.transcribe(["path/to/audio_file.wav"]) - -Obtain word timestamps -^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can also obtain timestamps for each word in the transcription as follows: - -.. 
code-block:: python - - # import nemo_asr and instantiate asr_model as above - import nemo.collections.asr as nemo_asr - asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large") - - # update decoding config to preserve alignments and compute timestamps - from omegaconf import OmegaConf, open_dict - decoding_cfg = asr_model.cfg.decoding - with open_dict(decoding_cfg): - decoding_cfg.preserve_alignments = True - decoding_cfg.compute_timestamps = True - asr_model.change_decoding_strategy(decoding_cfg) - - # specify flag `return_hypotheses=True`` - hypotheses = asr_model.transcribe(["path/to/audio_file.wav"], return_hypotheses=True) - - # if hypotheses form a tuple (from RNNT), extract just "best" hypotheses - if type(hypotheses) == tuple and len(hypotheses) == 2: - hypotheses = hypotheses[0] - - timestamp_dict = hypotheses[0].timestep # extract timesteps from hypothesis of first (and only) audio file - print("Hypothesis contains following timestep information :", list(timestamp_dict.keys())) - - # For a FastConformer model, you can display the word timestamps as follows: - # 80ms is duration of a timestep at output of the Conformer - time_stride = 8 * asr_model.cfg.preprocessor.window_stride - - word_timestamps = timestamp_dict['word'] - - for stamp in word_timestamps: - start = stamp['start_offset'] * time_stride - end = stamp['end_offset'] * time_stride - word = stamp['char'] if 'char' in stamp else stamp['word'] - - print(f"Time : {start:0.2f} - {end:0.2f} - {word}") - -Transcribe speech via command line ----------------------------------- -You can also transcribe speech via the command line using the following `script `_, for example: - -.. code-block:: bash - - python /blob/main/examples/asr/transcribe_speech.py \ - pretrained_name="stt_en_fastconformer_transducer_large" \ - audio_dir= # path to dir containing audio files to transcribe - -The script will save all transcriptions in a JSONL file where each line corresponds to an audio file in ````. -This file will correspond to a format that NeMo commonly uses for saving model predictions, and also for storing -input data for training and evaluation. You can learn more about the format that NeMo uses for these files -(which we refer to as "manifest files") :ref:`here`. - -You can also specify the files to be transcribed inside a manifest file, and pass that in using the argument -``dataset_manifest=`` instead of ``audio_dir``. - - -Incorporate a language model (LM) to improve ASR transcriptions ---------------------------------------------------------------- - -You can often get a boost to transcription accuracy by using a Language Model to help choose words that are more likely -to be spoken in a sentence. - -You can get a good improvement in transcription accuracy even using a simple N-gram LM. - -After :ref:`training ` an N-gram LM, you can use it for transcribing audio as follows: - -1. Install the OpenSeq2Seq beam search decoding and KenLM libraries using `this script `_. -2. Perform transcription using `this script `_: - -.. code-block:: bash - - python eval_beamsearch_ngram.py nemo_model_file= \ - input_manifest= \ - beam_width=[] \ - beam_alpha=[] \ - beam_beta=[] \ - preds_output_folder= \ - probs_cache_file=null \ - decoding_mode=beamsearch_ngram \ - decoding_strategy="" - -See more information about LM decoding :doc:`here <./asr_language_modeling>`. - -Use real-time transcription ---------------------------- - -It is possible to use NeMo to transcribe speech in real-time. 
You can find an example of how to do -this in the following `notebook tutorial `_. - - -Try different ASR models ------------------------- - -NeMo offers a variety of open-sourced pretrained ASR models that vary by model architecture: - -* **encoder architecture** (FastConformer, Conformer, Citrinet, etc.), -* **decoder architecture** (Transducer, CTC & hybrid of the two), -* **size** of the model (small, medium, large, etc.). - -The pretrained models also vary by: - -* **language** (English, Spanish, etc., including some **multilingual** and **code-switching** models), -* whether the output text contains **punctuation & capitalization** or not. - -The NeMo ASR checkpoints can be found on `HuggingFace `_, or on `NGC `_. All models released by the NeMo team can be found on NGC, and some of those are also available on HuggingFace. - -All NeMo ASR checkpoints open-sourced by the NeMo team follow the following naming convention: -``stt_{language}_{encoder name}_{decoder name}_{model size}{_optional descriptor}``. - -You can load the checkpoints automatically using the ``ASRModel.from_pretrained()`` class method, for example: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - # model will be fetched from NGC - asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large") - # if model name is prepended with "nvidia/", the model will be fetched from huggingface - asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/stt_en_fastconformer_transducer_large") - # you can also load open-sourced NeMo models released by other HF users using: - # asr_model = nemo_asr.models.ASRModel.from_pretrained("/") - -See further documentation about :doc:`loading checkpoints <./results>`, a full :ref:`list ` of models and their :doc:`benchmark scores <./score>`. - -There is also more information about the ASR model architectures available in NeMo :doc:`here <./models>`. - - -Try out NeMo ASR transcription in your browser ----------------------------------------------- -You can try out transcription with NeMo ASR models without leaving your browser, by using the HuggingFace Space embedded below. - -.. raw:: html - - - - - - -ASR tutorial notebooks ----------------------- -Hands-on speech recognition tutorial notebooks can be found under `the ASR tutorials folder `_. -If you are a beginner to NeMo, consider trying out the `ASR with NeMo `_ tutorial. -This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab. - -ASR model configuration ------------------------ -Documentation regarding the configuration files specific to the ``nemo_asr`` models can be found in the :doc:`Configuration Files <./configs>` section. - -Preparing ASR datasets ----------------------- -NeMo includes preprocessing scripts for several common ASR datasets. The :doc:`Datasets <./datasets>` section contains instructions on -running those scripts. It also includes guidance for creating your own NeMo-compatible dataset, if you have your own data. - -Further information -------------------- -For more information, see additional sections in the ASR docs on the left-hand-side menu or in the list below: - -.. 
toctree:: - :maxdepth: 1 - - models - datasets - asr_language_modeling - results - scores - configs - api - resources - examples/kinyarwanda_asr.rst diff --git a/SoundScribe/SpeakerID/docs/source/asr/models.rst b/SoundScribe/SpeakerID/docs/source/asr/models.rst deleted file mode 100644 index a4f77625dff8bd7c685b471b2c1c589617787185..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/models.rst +++ /dev/null @@ -1,408 +0,0 @@ -Models -====== - -This section gives a brief overview of the models that NeMo's ASR collection currently supports. - -Each of these models can be used with the example ASR scripts (in the ``/examples/asr`` directory) by -specifying the model architecture in the config file used. Examples of config files for each model can be found in -the ``/examples/asr/conf`` directory. - -For more information about the config files and how they should be structured, refer to the :doc:`./configs` section. - -Pretrained checkpoints for all of these models, as well as instructions on how to load them, can be found in the :doc:`./results` -section. You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. The checkpoints section -also contains benchmark results for the available ASR models. - -.. _Jasper_model: - -Jasper ------- - -Jasper ("Just Another Speech Recognizer") :cite:`asr-models-li2019jasper` is a deep time delay neural network (TDNN) comprising of -blocks of 1D-convolutional layers. The Jasper family of models are denoted as ``Jasper_[BxR]`` where ``B`` is the number of blocks -and ``R`` is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/jasper_vertical.png - :align: center - :alt: jasper model - :scale: 50% - -Jasper models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -QuartzNet ---------- - -QuartzNet :cite:`asr-models-kriman2019quartznet` is a version of Jasper :cite:`asr-models-li2019jasper` model with separable -convolutions and larger filters. It can achieve performance similar to Jasper but with an order of magnitude fewer parameters. -Similarly to Jasper, the QuartzNet family of models are denoted as ``QuartzNet_[BxR]`` where ``B`` is the number of blocks and ``R`` -is the number of convolutional sub-blocks within a block. Each sub-block contains a 1-D *separable* convolution, batch normalization, -ReLU, and dropout: - - .. image:: images/quartz_vertical.png - :align: center - :alt: quartznet model - :scale: 40% - -QuartzNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModel` class. - -.. _Citrinet_model: - -Citrinet --------- - -Citrinet is a version of QuartzNet :cite:`asr-models-kriman2019quartznet` that extends ContextNet :cite:`asr-models-han2020contextnet`, -utilizing subword encoding (via Word Piece tokenization) and Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to -obtain highly accurate audio transcripts while utilizing a non-autoregressive CTC based decoding scheme for efficient inference. - - .. image:: images/citrinet_vertical.png - :align: center - :alt: citrinet model - :scale: 50% - -Citrinet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class. - -.. _ContextNet_model: - -ContextNet ----------- - -ContextNet is a model uses Transducer/RNNT loss/decoder and is introduced in :cite:`asr-models-han2020contextnet`. 
-It uses Squeeze-and-Excitation mechanism :cite:`asr-models-hu2018squeeze` to model larger context. -Unlike Citrinet, it has an autoregressive decoding scheme. - -ContextNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class for a -model with sub-word encoding and :class:`~nemo.collections.asr.models.EncDecRNNTModel` for char-based encoding. - -You may find the example config files of ContextNet model with character-based encoding at -``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml``. - -.. _Conformer-CTC_model: - -Conformer-CTC -------------- - -Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a -similar encoder as the original Conformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. -We also drop the LSTM decoder and instead use a linear decoder on the top of the encoder. This model uses the combination of -self-attention and convolution modules to achieve the best of the two approaches, the self-attention layers can learn the global -interaction while the convolutions efficiently capture the local correlations. The self-attention modules support both regular -self-attention with absolute positional encoding, and also Transformer-XL's self-attention with relative positional encodings. - -Here is the overall architecture of the encoder of Conformer-CTC: - - .. image:: images/conformer_ctc.png - :align: center - :alt: Conformer-CTC Model - :scale: 50% - -This model supports both the sub-word level and character level encodings. You can find more details on the config files for the -Conformer-CTC models at `Conformer-CTC <./configs.html#conformer-ctc>`_. The variant with sub-word encoding is a BPE-based model -which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the -character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`. - -You may find the example config files of Conformer-CTC model with character-based encoding at -``/examples/asr/conf/conformer/conformer_ctc_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml``. - -.. _Conformer-Transducer_model: - -Conformer-Transducer --------------------- - -Conformer-Transducer is the Conformer model introduced in :cite:`asr-models-gulati2020conformer` and uses RNNT/Transducer loss/decoder. -It has the same encoder as Conformer-CTC but utilizes RNNT/Transducer loss/decoder which makes it an autoregressive model. - -Most of the config file for Conformer-Transducer models are similar to Conformer-CTC except the sections related to the decoder and loss: decoder, loss, joint, decoding. -You may take a look at our `tutorials page <../starthere/tutorials.html>`_ on Transducer models to become familiar with their configs: -`Introduction to Transducers `_ and -`ASR with Transducers `_ -You can find more details on the config files for the Conformer-Transducer models at `Conformer-CTC <./configs.html#conformer-ctc>`_. - -This model supports both the sub-word level and character level encodings. 
The variant with sub-word encoding is a BPE-based model -which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecRNNTBPEModel` class, while the -character-based variant is based on :class:`~nemo.collections.asr.models.EncDecRNNTModel`. - -You may find the example config files of Conformer-Transducer model with character-based encoding at -``/examples/asr/conf/conformer/conformer_transducer_char.yaml`` and -with sub-word encoding at ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``. - -Fast-Conformer --------------- - -The Fast Conformer (CTC and RNNT) models have a faster version of the Conformer encoder and differ from it as follows: - -* 8x depthwise convolutional subsampling with 256 channels -* Reduced convolutional kernel size of 9 in the conformer blocks - -The Fast Conformer encoder is about 2.4x faster than the regular Conformer encoder without a significant model quality degradation. -128 subsampling channels yield a 2.7x speedup vs baseline but model quality starts to degrade. -With local attention, inference is possible on audios >1 hrs (256 subsampling channels) / >2 hrs (128 channels). - -Fast Conformer models were trained using CosineAnnealing (instead of Noam) as the scheduler. - -You may find the example CTC config at -``/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml`` and -the transducer config at ``/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml`` - -Note that both configs are subword-based (BPE). - -You can also train these models with longformer-style attention (https://arxiv.org/abs/2004.05150) using the following configs: CTC config at -``/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml`` and transducer config at ``/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml`` -This allows using the model on longer audio (up to 70 minutes with Fast Conformer). Note that the Fast Conformer checkpoints -can be used with limited context attention even if trained with full context. However, if you also want to use global tokens, -which help aggregate information from outside the limited context, then training is required. - -You may find more examples under ``/examples/asr/conf/fastconformer/``. - -Cache-aware Streaming Conformer -------------------------------- - -Buffered streaming uses overlapping chunks to make an offline ASR model to be used for streaming with reasonable accuracy. However, it uses significant amount of duplication in computations due to the overlapping chunks. -Also there is a accuracy gep between the offline model and the streaming one as there is inconsistency between how we train the model and how we perform inference for streaming. -The Cache-aware Streaming Conformer models would tackle and address these disadvantages. These streaming Conformers are trained with limited right context that it would make it possible to match how the model is being used in both the training and inference. -They also uses caching to store intermediate activations to avoid any duplication in compute. -The cache-aware approach is supported for both the Conformer-CTC and Conformer-Transducer and enables the model to be used very efficiently for streaming. - -Three categories of layers in Conformer have access to right tokens: 1-depthwise convolutions 2-self-attention, and 3-convolutions in the downsampling layers. 
-Streaming Conformer models uses causal convolutions or convolutions with lower right context and also self-attention with limited right context to limit the effective right context for the input. -The model trained with such limitations can be used in streaming mode and give the exact same outputs and accuracy as when the whole audio is given to the model in offline mode. -These model can use caching mechanism to store and reuse the activations during streaming inference to avoid any duplications in the computations as much as possible. - -We support the following three right context modeling: - -* fully causal model with zero look-ahead: tokens would not see any future tokens. convolution layers are all causal and right tokens are masked for self-attention. - -It gives zero latency but with limited accuracy. -To train such a model, you need to set `model.encoder.att_context_size=[left_context,0]` and `model.encoder.conv_context_size=causal` in the config. - -* regular look-ahead: convolutions would be able to see few future frames, and self-attention would also see the same number of future tokens. - -In this approach the activations for the look-ahead part is not cached and recalculated in the next chunks. The right context in each layer should be a small number as multiple layers would increase the effective context size and then increase the look-ahead size and latency. -For example for a model of 17 layers with 4x downsampling and 10ms window shift, then even 2 right context in each layer means 17*2*10*4=1360ms look-ahead. Each step after the downsampling corresponds to 4*10=40ms. - -* chunk-aware look-ahead: input is split into equal chunks. Convolutions are fully causal while self-attention layers would be able to see all the tokens in their corresponding chunk. - -For example, in a model which chunk size of 20 tokens, tokens at the first position of each chunk would see all the next 19 tokens while the last token would see zero future tokens. -This approach is more efficient than regular look-ahead in terms of computations as the activations for most of the look-ahead part would be cached and there is close to zero duplications in the calculations. -In terms of accuracy, this approach gives similar or even better results in term of accuracy than regular look-ahead as each token in each layer have access to more tokens on average. That is why we recommend to use this approach for streaming. Therefore we recommend to use the chunk-aware for cache-aware models. - -** Note: Latencies are based on the assumption that the forward time of the network is zero and it just estimates the time needed after a frame would be available until it is passed through the model. - -Approaches with non-zero look-ahead can give significantly better accuracy by sacrificing latency. The latency can get controlled by the left context size. Increasing the right context would help the accuracy to a limit but would increase the computation time. - -In all modes, left context can be controlled by the number of tokens to be visible in the self-attention and the kernel size of the convolutions. -For example, if left context of self-attention in each layer is set to 20 tokens and there are 10 layers of Conformer, then effective left context is 20*10=200 tokens. -Left context of self-attention for regular look-ahead can be set as any number while it should be set as a multiplication of the right context in chunk-aware look-ahead. 
-For convolutions, if we use a left context of 30 in such model, then there would be 30*10=300 effective left context. -Left context of convolutions is dependent to the their kernel size while it can be any number for self-attention layers. Higher left context for self-attention means larger cache and more computations for the self-attention. -Self-attention left context of around 6 secs would give close result to have unlimited left context. For a model with 4x downsampling and shift window of 10ms in the preprocessor, each token corresponds to 4*10=40ms. - -If striding approach is used for downsampling, all the convolutions in downsampling would be fully causal and don't see future tokens. - -* Multiple Look-aheads -We support multiple look-aheads for cahce-aware models. You may specify a list of context sizes for att_context_size. -During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. -For example you may enable multiple look-aheads by setting `model.encoder.att_context_size=[[70,13],[70,6],[70,1],[70,0]]` for the training. -The first item in the list would be the default during test/validation/inference. To switch between different look-aheads, you may use the method `asr_model.encoder.set_default_att_context_size(att_context_size)` or set the att_context_size like the following when using the script `speech_transcribe.py`: - -.. code-block:: bash - - python [NEMO_GIT_FOLDER]/examples/asr/transcribe_speech.py \ - pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \ - audio_dir="" \ - att_context_size=[70,0] - -.. - -You may find the example config files for cache-aware streaming FastConformer models at -``/examples/asr/conf/fastconformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml`` for Transducer variant and -at ``/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe.yaml`` for CTC variant. It is recommended to use FastConformer as they are more than 2X faster in both training and inference than regular Conformer. -The hybrid versions of FastConformer can be found here: ``/examples/asr/conf/conformer/hybrid_cache_aware_streaming/`` - -Examples for regular Conformer can be found at -``/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml`` for Transducer variant and -at ``/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe.yaml`` for CTC variant. - -To simulate cache-aware streaming, you may use the script at ``/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py``. It can simulate streaming in single stream or multi-stream mode (in batches) for an ASR model. -This script can be used for models trained offline with full-context but the accuracy would not be great unless the chunk size is large enough which would result in high latency. -It is recommended to train a model in streaming model with limited context for this script. More info can be found in the script. - -Note cache-aware streaming models are being exported without caching support by default. -To include caching support, `model.set_export_config({'cache_support' : 'True'})` should be called before export. -Or, if ``/scripts/export.py`` is being used: -`python export.py cache_aware_conformer.nemo cache_aware_conformer.onnx --export-config cache_support=True` - -.. _LSTM-Transducer_model: - -LSTM-Transducer ---------------- - -LSTM-Transducer is a model which uses RNNs (eg. LSTM) in the encoder. 
The architecture of this model follows the suggestions in :cite:`asr-models-he2019streaming`.
-It uses an RNNT/Transducer loss/decoder. The encoder consists of RNN layers (LSTM by default) with a lower projection size to increase efficiency.
-Layer norm is added between the layers to stabilize training.
-It can be trained/used in unidirectional or bidirectional mode. The unidirectional mode is fully causal and can easily be used for simple and efficient streaming. However, the accuracy of this model is generally lower than that of other models like Conformer and Citrinet.
-
-This model supports both sub-word level and character level encodings. You may find the example config file of the RNNT model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_transducer_bpe.yaml``.
-You can find more details on the config files for the RNNT models at `LSTM-Transducer <./configs.html#lstm-transducer>`_.
-
-.. _LSTM-CTC_model:
-
-LSTM-CTC
---------
-
-The LSTM-CTC model is a CTC variant of the LSTM-Transducer model which uses CTC loss/decoding instead of Transducer.
-You may find the example config file of the LSTM-CTC model with wordpiece encoding at ``/examples/asr/conf/lstm/lstm_ctc_bpe.yaml``.
-
-.. _Squeezeformer-CTC_model:
-
-Squeezeformer-CTC
------------------
-
-Squeezeformer-CTC is a CTC-based variant of the Squeezeformer model introduced in :cite:`asr-models-kim2022squeezeformer`. Squeezeformer-CTC has a
-similar encoder to the original Squeezeformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model. The vast majority of the architecture is similar to the Conformer model, so please refer to `Conformer-CTC <./models.html#conformer-ctc>`_.
-
-The model primarily differs from Conformer in the following ways:
-
-* Temporal U-Net style time reduction, effectively reducing memory consumption and FLOPs for execution.
-* Unified activations throughout the model.
-* Simplification of module structure, removal of redundant layers.
-
-Here is the overall architecture of the encoder of Squeezeformer-CTC:
-
-    .. image:: images/squeezeformer.png
-        :align: center
-        :alt: Squeezeformer-CTC Model
-        :scale: 50%
-
-This model supports both sub-word level and character level encodings. You can find more details on the config files for the
-Squeezeformer-CTC models at `Squeezeformer-CTC <./configs.html#squeezeformer-ctc>`_. The variant with sub-word encoding is a BPE-based model
-which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecCTCModelBPE` class, while the
-character-based variant is based on :class:`~nemo.collections.asr.models.EncDecCTCModel`.
-
-You may find the example config files of the Squeezeformer-CTC model with character-based encoding at
-``/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml`` and
-with sub-word encoding at ``/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml``.
-
-.. _Hybrid-Transducer_CTC_model:
-
-Hybrid-Transducer-CTC
----------------------
-
-Hybrid RNNT-CTC models are a group of models with both RNNT and CTC decoders. Training a unified model speeds up convergence for the CTC models and enables
-the user to use a single model which works as both a CTC and an RNNT model. This category can be used with any of the ASR models.
-Hybrid models use two decoders, CTC and RNNT, on top of the encoder. The default decoding strategy after training is RNNT; a short sketch of switching between the two decoders follows.
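-
-The following is a minimal sketch (not taken from the NeMo examples) of switching between the two decoders at inference time; it assumes a hybrid checkpoint has already been loaded into ``asr_model`` (for example via ``restore_from()`` or ``from_pretrained()``) and that ``sample.wav`` is a placeholder audio file.
-
-.. code-block:: python
-
-    # Minimal sketch: toggle the active decoder of a loaded hybrid RNNT-CTC model.
-    asr_model.change_decoding_strategy(decoder_type='ctc')    # decode with the CTC head
-    ctc_hyps = asr_model.transcribe(paths2audio_files=['sample.wav'])
-
-    asr_model.change_decoding_strategy(decoder_type='rnnt')   # back to the default RNNT head
-    rnnt_hyps = asr_model.transcribe(paths2audio_files=['sample.wav'])
-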
-As illustrated in the sketch above, users may call ``asr_model.change_decoding_strategy(decoder_type='ctc')`` (or ``decoder_type='rnnt'``) to change the default decoding.
-
-The variant with sub-word encoding is a BPE-based model
-which can be instantiated using the :class:`~nemo.collections.asr.models.EncDecHybridRNNTCTCBPEModel` class, while the
-character-based variant is based on :class:`~nemo.collections.asr.models.EncDecHybridRNNTCTCModel`.
-
-You may use the example scripts under ``/examples/asr/asr_hybrid_transducer_ctc`` for both the char-based and sub-word encodings.
-These examples can be used to train any Hybrid ASR model like Conformer, Citrinet, QuartzNet, etc.
-
-You may find the example config files of the Conformer variant of such hybrid models with character-based encoding at
-``/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml`` and
-with sub-word encoding at ``/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml``.
-
-Similar example configs for FastConformer variants of Hybrid models can be found here:
-``/examples/asr/conf/fastconformer/hybrid_transducer_ctc/``
-``/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/``
-
-Note that Hybrid models are exported as RNNT (encoder and decoder+joint parts) by default.
-To export as CTC (single encoder+decoder graph), `model.set_export_config({'decoder_type' : 'ctc'})` should be called before export.
-Or, if ``/scripts/export.py`` is being used:
-`python export.py hybrid_transducer.nemo hybrid_transducer.onnx --export-config decoder_type=ctc`
-
-.. _Conformer-HAT_model:
-
-Conformer-HAT (Hybrid Autoregressive Transducer)
-------------------------------------------------
-The Conformer HAT model (not to be confused with Hybrid-Transducer-CTC) is a modification of the Conformer-Transducer model based on the `Google paper `_.
-The main idea is to separate the label and blank score predictions, which makes it possible to estimate the internal LM probabilities during decoding.
-When an external LM is available for inference, the internal LM can be subtracted from the HAT model prediction in beam-search decoding to make the external LM fusion more effective.
-This can be helpful in the case of text-only adaptation to new domains.
-
-The only difference from the standard Conformer-Transducer model (RNNT) is the use of the `"HATJoint" `_
-class (instead of "RNNTJoint") for the joint module. All the HAT logic is implemented in the "HATJoint" class.
-
-    .. image:: images/hat.png
-        :align: center
-        :alt: HAT Model
-        :scale: 50%
-
-You may find the example config files of the Conformer-HAT model with character-based encoding at
-``/examples/asr/conf/conformer/hat/conformer_hat_char.yaml`` and
-with sub-word encoding at ``/examples/asr/conf/conformer/hat/conformer_hat_bpe.yaml``.
-
-By default, decoding for the HAT model works in the same way as for Conformer-Transducer.
-In the case of external n-gram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``.
-To enable HAT internal LM subtraction, set ``hat_subtract_ilm=True`` and tune the pair of ``beam_alpha`` and ``hat_ilm_weight`` values for the best recognition accuracy.
-
-
-.. _Hybrid-ASR-TTS_model:
-
-Hybrid ASR-TTS Model
---------------------
-
-Hybrid ASR-TTS Model (``ASRWithTTSModel``) is a transparent wrapper for the ASR model with a frozen pretrained text-to-spectrogram model. The approach is described in the paper
-`Text-only domain adaptation for end-to-end ASR using integrated text-to-mel-spectrogram generator `_.
-This allows using text-only data for training and finetuning, mixing it with audio-text pairs if necessary.
-
-The model consists of three parts:
-
-* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
-* Frozen TTS Mel Spectrogram Generator (currently, only the :ref:`FastPitch ` model is supported)
-* Optional frozen :ref:`Spectrogram Enhancer model ` trained to mitigate the mismatch between real and generated mel spectrograms
-
-    .. image:: images/hybrid_asr_tts_model.png
-        :align: center
-        :alt: Hybrid ASR-TTS Model
-        :scale: 50%
-
-For detailed information see:
-
-* :ref:`Text-only dataset ` preparation
-* :ref:`Configs and training `
-
-
-.. _Confidence-Ensembles:
-
-Confidence-based Ensembles
---------------------------
-
-A confidence-based ensemble is a simple way to combine multiple models into a single system by only retaining the
-output of the most confident model. Below is a schematic illustration of how such ensembles work.
-
-    .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.19.0/conf-ensembles-overview.png
-        :align: center
-        :alt: confidence-based ensembles
-        :scale: 50%
-
-For more details about this model, see the `paper `_
-or read our `tutorial `_.
-
-NeMo supports Confidence-based Ensembles through the
-:ref:`nemo.collections.asr.models.confidence_ensembles.ConfidenceEnsembleModel ` class.
-
-A typical workflow to create and use an ensemble is as follows:
-
-1. Run the `scripts/confidence_ensembles/build_ensemble.py `_
-   script to create an ensemble from existing models. See the documentation inside the script for usage examples
-   and a description of all the supported functionality.
-2. The script outputs a checkpoint that combines all the models in an ensemble. It can be used directly to transcribe
-   speech by calling the ``.transcribe()`` method or by using
-   `examples/asr/transcribe_speech.py `_.
-
-Note that the ensemble cannot be modified after construction (e.g. it does not support finetuning) and only
-transcribe functionality is supported (e.g., ``.forward()`` is not properly defined).
-
-
-References
-----------
-
-.. bibliography:: asr_all.bib
-    :style: plain
-    :labelprefix: ASR-MODELS
-    :keyprefix: asr-models-
diff --git a/SoundScribe/SpeakerID/docs/source/asr/results.rst b/SoundScribe/SpeakerID/docs/source/asr/results.rst
deleted file mode 100644
index 3dca110d89e41b17fe29c64947c0f08602b7fdc9..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/docs/source/asr/results.rst
+++ /dev/null
@@ -1,323 +0,0 @@
-Checkpoints
-===========
-
-There are two main ways to load pretrained checkpoints in NeMo:
-
-* Using the :code:`restore_from()` method to load a local checkpoint file (``.nemo``), or
-* Using the :code:`from_pretrained()` method to download and set up a checkpoint from NGC.
-
-Refer to the following sections for instructions and examples for each.
-
-Note that these instructions are for loading fully trained checkpoints for evaluation or fine-tuning. To resume an unfinished
-training experiment, use the Experiment Manager by setting the ``resume_if_exists`` flag to ``True``.
-
-Loading Local Checkpoints
--------------------------
-
-NeMo automatically saves checkpoints of a model being trained in the ``.nemo`` format. Alternatively, to manually save the model at any
-point, issue :code:`model.save_to(.nemo)`.
-
-If there is a local ``.nemo`` checkpoint that you'd like to load, use the :code:`restore_from()` method:
-
-..
code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models..restore_from(restore_path="") - -Where the model base class is the ASR model class of the original checkpoint, or the general ``ASRModel`` class. - - -Hybrid ASR-TTS Models Checkpoints ---------------------------------- - -:ref:`Hybrid ASR-TTS model ` is a transparent wrapper for the ASR model, text-to-mel-spectrogram generator, and optional enhancer. -The model is saved as a solid ``.nemo`` checkpoint containing all these parts. -Due to transparency, the ASR model can be extracted after training/finetuning separately by using the ``asr_model`` attribute (NeMo submodel) -:code:`hybrid_model.asr_model.save_to(.nemo)` or by using a wrapper -made for convenience purpose :code:`hybrid_model.save_asr_model_to(.nemo)` - - -NGC Pretrained Checkpoints --------------------------- - -The ASR collection has checkpoints of several models trained on various datasets for a variety of tasks. These checkpoints are -obtainable via NGC `NeMo Automatic Speech Recognition collection `_. -The model cards on NGC contain more information about each of the checkpoints available. - -The tables below list the ASR models available from NGC. The models can be accessed via the :code:`from_pretrained()` method inside -the ASR Model class. In general, you can load any of these models with code in the following format: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models.ASRModel.from_pretrained(model_name="") - -Where the model name is the value under "Model Name" entry in the tables below. - -For example, to load the base English QuartzNet model for speech recognition, run: - -.. code-block:: python - - model = nemo_asr.models.ASRModel.from_pretrained(model_name="QuartzNet15x5Base-En") - -You can also call :code:`from_pretrained()` from the specific model class (such as :code:`EncDecCTCModel` -for QuartzNet) if you need to access a specific model functionality. - -If you would like to programmatically list the models available for a particular base class, you can use the -:code:`list_available_models()` method. - -.. code-block:: python - - nemo_asr.models..list_available_models() - -Transcribing/Inference -^^^^^^^^^^^^^^^^^^^^^^ - -To perform inference and transcribe a sample of speech after loading the model, use the ``transcribe()`` method: - -.. code-block:: python - - model.transcribe(paths2audio_files=[list of audio files], batch_size=BATCH_SIZE, logprobs=False) - -Setting the argument ``logprobs`` to ``True`` returns the log probabilities instead of transcriptions. For more information, see `nemo.collections.asr.modules <./api.html#modules>`__. -The audio files should be 16KHz mono-channel wav files. - -Inference on long audio -^^^^^^^^^^^^^^^^^^^^^^ - -In some cases the audio is too long for standard inference, especially if you're using a model such as Conformer, where the time and memory costs of the attention layers scale quadratically with the duration. - -There are two main ways of performing inference on long audio files in NeMo: - -The first way is to use buffered inference, where the audio is divided into chunks to run on, and the output is merged afterwards. -The relevant scripts for this are contained in `this folder `_. - -The second way, specifically for models with the Conformer/Fast Conformer encoder, is to use local attention, which changes the costs to be linear. 
-You can train Fast Conformer models with Longformer-style (https://arxiv.org/abs/2004.05150) local+global attention using one of the following configs: CTC config at -``/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml`` and transducer config at ``/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml``. -You can also convert any model trained with full context attention to local, though this may result in lower WER in some cases. You can switch to local attention when running the -`transcribe `_ or `evaluation `_ -scripts in the following way: - -.. code-block:: python - - python speech_to_text_eval.py \ - (...other parameters...) \ - ++model_change.conformer.self_attention_model="rel_pos_local_attn" \ - ++model_change.conformer.att_context_size=[128, 128] - -Alternatively, you can change the attention model after loading a checkpoint: - -.. code-block:: python - - asr_model = ASRModel.from_pretrained('stt_en_conformer_ctc_large') - asr_model.change_attention_model( - self_attention_model="rel_pos_local_attn", - att_context_size=[128, 128] - ) - - -Inference on Apple M-Series GPU -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To perform inference on Apple Mac M-Series GPU (``mps`` PyTorch device), use PyTorch 2.0 or higher (see :ref:`mac-installation` section). Environment variable ``PYTORCH_ENABLE_MPS_FALLBACK=1`` should be set, since not all operations in PyTorch are currently implemented on ``mps`` device. - -If ``allow_mps=true`` flag is passed to ``speech_to_text_eval.py``, the ``mps`` device will be selected automatically. - -.. code-block:: python - - PYTORCH_ENABLE_MPS_FALLBACK=1 python speech_to_text_eval.py \ - (...other parameters...) \ - allow_mps=true - - -Fine-tuning on Different Datasets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -There are multiple ASR tutorials provided in the :ref:`Tutorials ` section. Most of these tutorials explain how to instantiate a pre-trained model, prepare the model for fine-tuning on some dataset (in the same language) as a demonstration. - -Inference Execution Flow Diagram --------------------------------- - -When preparing your own inference scripts, please follow the execution flow diagram order for correct inference, found at the `examples directory for ASR collection `_. - -Automatic Speech Recognition Models ------------------------------------ - -Below is a list of all the ASR models that are available in NeMo for specific languages, as well as auxiliary language models for certain languages. - -Language Models for ASR -^^^^^^^^^^^^^^^^^^^^^^^ - -.. csv-table:: - :file: data/asrlm_results.csv - :align: left - :widths: 30, 30, 40 - :header-rows: 1 - -| - - -.. _asr-checkpoint-list-by-language: - -Speech Recognition (Languages) ------------------------------- - -English -^^^^^^^ -.. csv-table:: - :file: data/benchmark_en.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Mandarin -^^^^^^^^ -.. csv-table:: - :file: data/benchmark_zh.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -German -^^^^^^ -.. csv-table:: - :file: data/benchmark_de.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -French -^^^^^^ -.. csv-table:: - :file: data/benchmark_fr.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Polish -^^^^^^ -.. 
csv-table:: - :file: data/benchmark_pl.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Italian -^^^^^^^ -.. csv-table:: - :file: data/benchmark_it.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Russian -^^^^^^^ -.. csv-table:: - :file: data/benchmark_ru.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Spanish -^^^^^^^ -.. csv-table:: - :file: data/benchmark_es.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - - ------------------------------ - -Catalan -^^^^^^^ -.. csv-table:: - :file: data/benchmark_ca.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Hindi -^^^^^^^ -.. csv-table:: - :file: data/benchmark_hi.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Marathi -^^^^^^^ -.. csv-table:: - :file: data/benchmark_mr.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Kinyarwanda -^^^^^^^^^^^ -.. csv-table:: - :file: data/benchmark_rw.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Belarusian -^^^^^^^^^^^ -.. csv-table:: - :file: data/benchmark_by.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Ukrainian -^^^^^^^^^^^ -.. csv-table:: - :file: data/benchmark_ua.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Multilingual -^^^^^^^^^^^ -.. csv-table:: - :file: data/benchmark_multilingual.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 - ------------------------------ - -Code-Switching -^^^^^^^^^^^ -.. csv-table:: - :file: data/benchmark_code_switching.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/scores.rst b/SoundScribe/SpeakerID/docs/source/asr/scores.rst deleted file mode 100644 index d008a26700ec0fb59877adc610f32b9da36cd9a5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/scores.rst +++ /dev/null @@ -1,436 +0,0 @@ -.. - AUTOGENERATED DOC: DO NOT EDIT MANUALLY ! - -Scores ------- - -EN -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/citrinet_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/conformer_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/contextnet_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/fastconformer_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/jasper10x5dr_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/quartznet15x5_en.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/en/squeezeformer_en.csv - --------------------- - -BE -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/be/conformer_be.csv - --------------------- - -BY -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/by/fastconformer_by.csv - --------------------- - -CA -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/ca/conformer_ca.csv - --------------------- - -.. 
csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/ca/quartznet15x5_ca.csv - --------------------- - -DE -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/de/citrinet_de.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/de/conformer_de.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/de/contextnet_de.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/de/fastconformer_de.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/de/quartznet15x5_de.csv - --------------------- - -ENES -^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/enes/conformer_enes.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/enes/contextnet_enes.csv - --------------------- - -EO -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/eo/conformer_eo.csv - --------------------- - -ES -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/es/citrinet_es.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/es/conformer_es.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/es/contextnet_es.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/es/fastconformer_es.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/es/quartznet15x5_es.csv - --------------------- - -FR -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/fr/citrinet_fr.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/fr/conformer_fr.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/fr/contextnet_fr.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/fr/quartznet15x5_fr.csv - --------------------- - -HR -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/hr/conformer_hr.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/hr/fastconformer_hr.csv - --------------------- - -IT -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/it/conformer_it.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/it/fastconformer_it.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/it/quartznet15x5_it.csv - --------------------- - -KAB -^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/kab/conformer_kab.csv - --------------------- - -PL -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/pl/fastconformer_pl.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/pl/quartznet15x5_pl.csv - --------------------- - -RU -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/ru/conformer_ru.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/ru/quartznet15x5_ru.csv - --------------------- - -RW -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/rw/conformer_rw.csv - --------------------- - -UA -^^ - -.. 
csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/ua/fastconformer_ua.csv - --------------------- - -ZH -^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/zh/citrinet_zh.csv - --------------------- - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores/zh/conformer_zh.csv - --------------------- - - - -Scores with Punctuation and Capitalization ------------------------------------------- - -EN with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/en/fastconformer_en.csv - --------------------- - -BY with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/by/fastconformer_by.csv - --------------------- - -DE with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/de/fastconformer_de.csv - --------------------- - -ES with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/es/fastconformer_es.csv - --------------------- - -HR with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/hr/fastconformer_hr.csv - --------------------- - -IT with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/it/fastconformer_it.csv - --------------------- - -PL with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/pl/fastconformer_pl.csv - --------------------- - -UA with P&C -^^^^^^^^^^^ - -.. csv-table:: - :header-rows: 1 - :align: left - :file: data/scores_pc/ua/fastconformer_ua.csv - --------------------- - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/api.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/api.rst deleted file mode 100644 index 37feabaed9f8dc1c082d6b5e6be4f06011c1e75e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/api.rst +++ /dev/null @@ -1,20 +0,0 @@ -NeMo Speaker Diarization API -============================= - - -Model Classes -------------- -.. autoclass:: nemo.collections.asr.models.ClusteringDiarizer - :show-inheritance: - :members: - -.. autoclass:: nemo.collections.asr.models.EncDecDiarLabelModel - :show-inheritance: - :members: add_speaker_model_config, _init_segmentation_info, _init_speaker_model, setup_training_data, setup_validation_data, setup_test_data, get_ms_emb_seq, get_cluster_avg_embs_model, get_ms_mel_feat, forward, forward_infer, training_step, validation_step, compute_accuracies - -Mixins ------- -.. autoclass:: nemo.collections.asr.parts.mixins.mixins.DiarizationMixin - :show-inheritance: - :members: - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/configs.rst deleted file mode 100644 index f95bf18025c84925ae5b79f5fd2992420a5e3b23..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/configs.rst +++ /dev/null @@ -1,253 +0,0 @@ -NeMo Speaker Diarization Configuration Files -============================================ - -Both training and inference of speaker diarization is configured by ``.yaml`` files. The diarizer section will generally require information about the dataset(s) being used, models used in this pipeline, as well as inference related parameters such as post processing of each models. The sections on this page cover each of these in more detail. - -.. 
note::
-    For model details and a deeper understanding of the configs, training, fine-tuning and evaluation,
-    please refer to ``/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb`` and ``/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb``;
-    for other applications such as possible integration with ASR, have a look at ``/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb``.
-
-
-Hydra Configurations for Diarization Training
-=============================================
-
-Currently, NeMo supports the Multi-scale Diarization Decoder (MSDD) as a neural diarizer model. MSDD is a speaker diarization model based on an initializing clustering step and multi-scale segmentation input. Example configuration files for MSDD model training can be found in ``/examples/speaker_tasks/diarization/conf/neural_diarizer/``.
-
-* Model name convention for MSDD: msdd_scl___Povl_xxx
-* Example: ``msdd_5scl_15_05_50Povl_256x3x32x2.yaml`` has 5 scales, the longest scale is 1.5 sec, the shortest scale is 0.5 sec, with 50 percent overlap, hidden layer size is 256, 3 LSTM layers, 32 CNN channels, 2 repeated Conv layers
-
-The MSDD model checkpoint (.ckpt) and NeMo file (.nemo) contain the speaker embedding model (TitaNet), and the speaker model is loaded along with the standalone MSDD module. Note that MSDD models require more than one scale. Thus, the parameters in ``diarizer.speaker_embeddings.parameters`` should have more than one scale to function as an MSDD model.
-
-
-General Diarizer Configuration
-------------------------------
-
-The items (OmegaConf keys) directly under ``model`` determine segmentation and clustering related parameters. Multi-scale parameters (``window_length_in_sec``, ``shift_length_in_sec`` and ``multiscale_weights``) are specified here. ``max_num_of_spks``, ``scale_n``, ``soft_label_thres`` and ``emb_batch_size`` are also set here and then assigned to the dataset configurations.
-
-.. code-block:: yaml
-
-  diarizer:
-    out_dir: null
-    oracle_vad: True # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
-    speaker_embeddings:
-      model_path: ??? # .nemo local model path or pretrained model name (titanet_large is recommended)
-      parameters:
-        window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.0,0.5]
-        shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.5,0.25]
-        multiscale_weights: [1,1,1,1,1] # Weight for each scale. Should be null (for single scale) or a list matched with window/shift scale count. Ex) [0.33,0.33,0.33]
-        save_embeddings: True # Save embeddings as pickle file for each audio input.
-
-
-  num_workers: ${num_workers} # Number of workers used for data-loading.
-  max_num_of_spks: 2 # Number of speakers per model. This is currently fixed at 2.
-  scale_n: 5 # Number of scales for MSDD model and initializing clustering.
-  soft_label_thres: 0.5 # Threshold for creating discretized speaker label from continuous speaker label in RTTM files.
-  emb_batch_size: 0 # If this value is bigger than 0, corresponding number of embedding vectors are attached to torch graph and trained.
-
-Dataset Configuration
----------------------
-
-Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and
-``test_ds`` sections in the configuration YAML file, respectively.
The items such as ``num_spks``, ``soft_label_thres`` and ``emb_batch_size`` follow the settings in ``model`` key. You may also leave fields such as the ``manifest_filepath`` or ``emb_dir`` blank, and then specify it via command-line interface. Note that ``test_ds`` is not used during training and only used for speaker diarization inference. - -.. code-block:: yaml - - train_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: ${batch_size} - emb_batch_size: ${model.emb_batch_size} - shuffle: True - - validation_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - emb_batch_size: ${model.emb_batch_size} - shuffle: False - - test_ds: - manifest_filepath: null - emb_dir: null - sample_rate: 16000 - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - shuffle: False - seq_eval_mode: False - - -Pre-processor Configuration ---------------------------- - -In the MSDD configuration, pre-processor configuration follows the pre-processor of the embedding extractor model. - -.. code-block:: yaml - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: ${sample_rate} - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - -Model Architecture Configurations ---------------------------------- - -The hyper-parameters for MSDD models are under the ``msdd_module`` key. The model architecture can be changed by setting up the ``weighting_scheme`` and ``context_vector_type``. The detailed explanation for architecture can be found in the :doc:`Models <./models>` page. - -.. code-block:: yaml - - msdd_module: - _target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module - num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 2. - hidden_size: 256 # Hidden layer size for linear layers in MSDD module - num_lstm_layers: 3 # Number of stacked LSTM layers - dropout_rate: 0.5 # Dropout rate - cnn_output_ch: 32 # Number of filters in a conv-net layer. - conv_repeat: 2 # Determines the number of conv-net layers. Should be greater or equal to 1. - emb_dim: 192 # Dimension of the speaker embedding vectors - scale_n: ${model.scale_n} # Number of scales for multiscale segmentation input - weighting_scheme: 'conv_scale_weight' # Type of weighting algorithm. Options: ('conv_scale_weight', 'attn_scale_weight') - context_vector_type: 'cos_sim' # Type of context vector: options. Options: ('cos_sim', 'elem_prod') - -Loss Configurations -------------------- - -Neural diarizer uses a binary cross entropy (BCE) loss. A set of weights for negative (absence of the speaker's speech) and positive (presence of the speaker's speech) can be provided to the loss function. - -.. code-block:: yaml - - loss: - _target_: nemo.collections.asr.losses.bce_loss.BCELoss - weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5]) - - -Hydra Configurations for Diarization Inference -============================================== - -Example configuration files for speaker diarization inference can be found in ``/examples/speaker_tasks/diarization/conf/inference/``. 
Choose a yaml file that fits your targeted domain. For example, if you want to diarize audio recordings of telephonic speech, choose ``diar_infer_telephonic.yaml``. - -The configurations for all the components of diarization inference are included in a single file named ``diar_infer_.yaml``. Each ``.yaml`` file has a few different sections for the following modules: VAD, Speaker Embedding, Clustering and ASR. - -In speaker diarization inference, the datasets provided in manifest format denote the data that you would like to perform speaker diarization on. - -Diarizer Configurations ------------------------ - -An example ``diarizer`` Hydra configuration could look like: - -.. code-block:: yaml - - diarizer: - manifest_filepath: ??? - out_dir: ??? - oracle_vad: False # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps - collar: 0.25 # Collar value for scoring - ignore_overlap: True # Consider or ignore overlap segments while scoring - -Under ``diarizer`` key, there are ``vad``, ``speaker_embeddings``, ``clustering`` and ``asr`` keys containing configurations for the inference of the corresponding modules. - -Configurations for Voice Activity Detector ------------------------------------------- - -Parameters for VAD model are provided as in the following Hydra config example. - -.. code-block:: yaml - - vad: - model_path: null # .nemo local model path or pretrained model name or none - external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set - - parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set) - window_length_in_sec: 0.15 # Window length in sec for VAD context input - shift_length_in_sec: 0.01 # Shift length in sec for generate frame level VAD prediction - smoothing: "median" # False or type of smoothing method (eg: median) - overlap: 0.875 # Overlap ratio for overlapped mean/median smoothing filter - onset: 0.4 # Onset threshold for detecting the beginning and end of a speech - offset: 0.7 # Offset threshold for detecting the end of a speech - pad_onset: 0.05 # Adding durations before each speech segment - pad_offset: -0.1 # Adding durations after each speech segment - min_duration_on: 0.2 # Threshold for small non_speech deletion - min_duration_off: 0.2 # Threshold for short speech segment deletion - filter_speech_first: True - -Configurations for Speaker Embedding in Diarization ---------------------------------------------------- - -Parameters for speaker embedding model are provided in the following Hydra config example. Note that multiscale parameters either accept list or single floating point number. - -.. code-block:: yaml - - speaker_embeddings: - model_path: ??? # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet) - parameters: - window_length_in_sec: 1.5 # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.25,1.0,0.75,0.5] - shift_length_in_sec: 0.75 # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.625,0.5,0.375,0.25] - multiscale_weights: null # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. Ex) [1,1,1,1,1] - save_embeddings: False # Save embeddings as pickle file for each audio input. 
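-
-As a usage illustration (this snippet is not taken from the NeMo examples), the multi-scale keys above can also be loaded and overridden programmatically with OmegaConf before the config is handed to a diarizer class; the config path below is the telephonic example mentioned earlier, and the override values mirror the example values documented on this page.
-
-.. code-block:: python
-
-    # Illustrative sketch: load an inference config and adjust the speaker
-    # embedding parameters documented above. Paths are example locations.
-    from omegaconf import OmegaConf
-
-    cfg = OmegaConf.load("examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml")
-
-    emb = cfg.diarizer.speaker_embeddings
-    emb.model_path = "titanet_large"
-    emb.parameters.window_length_in_sec = [1.5, 1.25, 1.0, 0.75, 0.5]
-    emb.parameters.shift_length_in_sec = [0.75, 0.625, 0.5, 0.375, 0.25]
-    emb.parameters.multiscale_weights = [1, 1, 1, 1, 1]
-
-    # The resulting cfg can then be passed to a diarizer class, e.g. ClusteringDiarizer(cfg=cfg).
-    print(OmegaConf.to_yaml(emb))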
- -Configurations for Clustering in Diarization --------------------------------------------- - -Parameters for clustering algorithm are provided in the following Hydra config example. - -.. code-block:: yaml - - clustering: - parameters: - oracle_num_speakers: False # If True, use num of speakers value provided in the manifest file. - max_num_speakers: 20 # Max number of speakers for each recording. If oracle_num_speakers is passed, this value is ignored. - enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated. - max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. - sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. - -Configurations for Diarization with ASR ---------------------------------------- - -The following configuration needs to be appended under ``diarizer`` to run ASR with diarization to get a transcription with speaker labels. - -.. code-block:: yaml - - asr: - model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes. - parameters: - asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference. - asr_based_vad_threshold: 50 # threshold (multiple of 10ms) for ignoring the gap between two words when generating VAD timestamps using ASR based VAD. - asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null. - lenient_overlap_WDER: True # If true, when a word falls into speaker-overlapped regions, consider the word as a correctly diarized word. - decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model. - word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2]. - word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'. - fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature. - colored_text: False # If True, use colored text to distinguish speakers in the output transcript. - print_time: True # If True, the start of the end time of each speaker turn is printed in the output transcript. - break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars) - - ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode) - pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file. - beam_width: 32 - alpha: 0.5 - beta: 2.5 - - realigning_lm_parameters: # Experimental feature - arpa_language_model: null # Provide a KenLM language model in .arpa format. - min_number_of_words: 3 # Min number of words for the left context. - max_number_of_words: 10 # Max number of words for the right context. - logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses. 
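-
-Finally, as a usage illustration (not taken from the NeMo examples), the dot-path keys documented on this page can be merged into a loaded inference config to mimic Hydra-style command-line overrides; the config path, manifest path and output directory below are placeholders, and ``stt_en_conformer_ctc_large`` is simply one of the recommended ``stt_en_conformer_ctc_*`` models.
-
-.. code-block:: python
-
-    # Illustrative sketch: emulate Hydra command-line overrides with OmegaConf.
-    from omegaconf import OmegaConf
-
-    base = OmegaConf.load("examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml")
-    overrides = OmegaConf.from_dotlist([
-        "diarizer.manifest_filepath=/path/to/input_manifest.json",
-        "diarizer.out_dir=/path/to/output_dir",
-        "diarizer.asr.model_path=stt_en_conformer_ctc_large",
-        "diarizer.asr.parameters.asr_based_vad=true",
-    ])
-    cfg = OmegaConf.merge(base, overrides)
-
-    print(OmegaConf.to_yaml(cfg.diarizer.asr))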
diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/data/diarization_results.csv b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/data/diarization_results.csv
deleted file mode 100644
index fc3594520ea4e1b1ebfbeeab8f8cd534c4954682..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/data/diarization_results.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-Model Name,Model Base Class,Model Card
-vad_multilingual_marblenet,EncDecClassificationModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenet"
-vad_marblenet,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_marblenet"
-vad_telephony_marblenet,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_telephony_marblenet"
-titanet_large,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_large"
-ecapa_tdnn,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn"
-diar_msdd_telephonic,EncDecDiarLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:diar_msdd_telephonic"
diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/datasets.rst
deleted file mode 100644
index 9f1a43a58f11bdba80b43508adeed922b5a92278..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/datasets.rst
+++ /dev/null
@@ -1,264 +0,0 @@
-Datasets
-========
-
-This page covers how to format a dataset for diarization training and inference. To train or fine-tune the speaker diarization system, you can either train/fine-tune the speaker embedding extractor model separately, or train/fine-tune the speaker embedding extractor and the neural diarizer at the same time.
-
-* To train or fine-tune a speaker embedding extractor model separately, please check out these pages: :doc:`Speech Classification Datasets <../speech_classification/datasets>` and :doc:`Speaker Recognition Datasets <../speaker_recognition/datasets>` for preparing datasets for training and validating VAD and speaker embedding models respectively.
-
-
-* To train or fine-tune the speaker embedding extractor and the neural diarizer together, please follow the dataset preparation process on this page.
-
-Data Preparation for Training
------------------------------
-
-.. image:: images/msdd_train_and_infer.png
-    :align: center
-    :width: 800px
-    :alt: MSDD training and inference
-
-As shown in the figure above, a full-fledged speaker diarization process runs through a speaker embedding extractor, a clustering algorithm and a neural diarizer. Note that only the speaker embedding extractor and the neural diarizer are trainable models, and they can be trained/fine-tuned together on diarization datasets. We recommend using a speaker embedding extractor model that was trained on a large amount of single-speaker data and using it for training a neural diarizer model.
-
-Speaker diarization training is also managed by Hydra configurations based on ``.yaml`` files, just as in other NeMo neural models. See :doc:`NeMo Speaker Diarization Configuration Files <./configs>` for setting up the input Hydra configuration file for speaker diarization. Input data should be provided in line-delimited JSON format as below:
-
-* Create a manifest file for speaker diarization
-
-Speaker diarization training and inference both require the same type of manifest files.
This manifest file can be created by using the script in ``/scripts/speaker_tasks/pathfiles_to_diarize_manifest.py``. The following example shows how to run ``pathfiles_to_diarize_manifest.py`` by providing path list files.
-
-.. code-block:: shell-session
-
-   python NeMo/scripts/speaker_tasks/pathfiles_to_diarize_manifest.py \
-      --paths2audio_files='/path/to/audio_file_path_list.txt' \
-      --paths2rttm_files='/path/to/rttm_file_list.txt' \
-      --manifest_filepath='/path/to/manifest_filepath/train_manifest.json'
-
-
-All three arguments are required. Note that we need to maintain consistent, unique filenames for every field (key), changing only the filename extensions. For example, if there is an audio file named ``abcd01.wav``, the RTTM file should be named ``abcd01.rttm`` and the transcription file should be named ``abcd01.txt``.
-
-- Example audio file path list ``audio_file_path_list.txt``
-
-.. code-block:: bash
-
-  /path/to/abcd01.wav
-  /path/to/abcd02.wav
-
-
-To train a diarization model, one needs to provide Rich Transcription Time Marked (RTTM) files as ground truth label files. Here is one line from an RTTM file as an example:
-
-.. code-block:: bash
-
-  SPEAKER TS3012d.Mix-Headset 1 331.573 0.671 MTD046ID
-
-
-Make a list of RTTM files for the audio files you have in ``audio_file_path_list.txt``.
-
-- Example RTTM file path list ``rttm_file_path_list.txt``
-
-.. code-block:: bash
-
-  /path/to/abcd01.rttm
-  /path/to/abcd02.rttm
-
-.. note::
-  We expect all the provided files (e.g. audio, rttm, text) to have the same base name and the name should be unique (uniq-id).
-
-As an output file, ``train_manifest.json`` will have the following line for each audio file:
-
-.. code-block:: bash
-
-  {"audio_filepath": "/path/to/abcd01.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": "/path/to/rttm/abcd01.rttm"}
-
-
-* Manifest files for MSDD training
-
-After generating a session-wise manifest file, we need to break down each session-wise manifest file into a split manifest file containing the start time and duration of the split samples, in order to fit within memory capacity. More importantly, since MSDD only uses pairwise (two-speaker) models and data samples, we need to split RTTM files if there are more than two speakers.
-
-Note that you should specify the window length and shift length of the base scale of your MSDD model when you generate the manifest file for training samples. More importantly, ``step_count`` determines how many steps (i.e., base-scale segments) are in a split data sample. If ``step_count`` is too large, you might not be able to load a single sample in a batch.
-
-.. code-block:: bash
-
-  python NeMo/scripts/speaker_tasks/create_msdd_train_dataset.py \
-    --input_manifest_path='path/to/train_manifest.json' \
-    --output_manifest_path='path/to/train_manifest.50step.json' \
-    --pairwise_rttm_output_folder='path/to/rttm_output_folder' \
-    --window=0.5 \
-    --shift=0.25 \
-    --step_count=50
-
-All arguments are required to generate a new manifest file. Specify a session-wise diarization manifest file with ``--input_manifest_path`` and specify an output file name with ``--output_manifest_path``. In the folder that is specified for ``--pairwise_rttm_output_folder``, the script will create multiple two-speaker RTTM files from the given RTTM file and create a manifest file that only contains the two speakers in the specified RTTM range.
-
-
-For example, if ``abcd01.wav`` has three speakers (``1911,1988,192``), three RTTM files will be created: ``abcd01.1911_1988.rttm``, ``abcd01.1911_192.rttm`` and ``abcd01.1988_192.rttm``. Subsequently, the segments will only be generated from the newly created two-speaker RTTM files.
-
-
-Specify the ``window`` and ``shift`` of the base scale in your MSDD model. In this example, we use the default setting of ``window=0.5``, ``shift=0.25`` and ``step_count=50``. Here are example lines in the output file ``/path/to/train_manifest.50step.json``.
-
-- Example manifest file ``train_manifest.50step.json``.
-
-.. code-block:: bash
-
-  {"audio_filepath": "/path/to/abcd01.wav", "offset": 0.007, "duration": 14.046, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": "simulated_train/abcd01.1911_1988.rttm"}
-  {"audio_filepath": "/path/to/abcd01.wav", "offset": 13.553, "duration": 16.429, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": "simulated_train/abcd01.1911_1988.rttm"}
-  {"audio_filepath": "/path/to/abcd02.wav", "offset": 0.246, "duration": 15.732, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": "path/to/rttm_output_folder/abcd02.777_5694.rttm"}
-  {"audio_filepath": "/path/to/abcd02.wav", "offset": 15.478, "duration": 14.47, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": "path/to/rttm_output_folder/abcd02.777_5694.rttm"}
-
-
-Prepare the MSDD training dataset for both the train and validation splits. After the training dataset is prepared, you can train an MSDD model with the following script:
-
-.. code-block:: bash
-
-  python ./multiscale_diar_decoder.py --config-path='../conf/neural_diarizer' --config-name='msdd_5scl_15_05_50Povl_256x3x32x2.yaml' \
-    trainer.devices=1 \
-    trainer.max_epochs=20 \
-    model.base.diarizer.speaker_embeddings.model_path="titanet_large" \
-    model.train_ds.manifest_filepath="" \
-    model.validation_ds.manifest_filepath="" \
-    model.train_ds.emb_dir="" \
-    model.validation_ds.emb_dir="" \
-    exp_manager.name='sample_train' \
-    exp_manager.exp_dir='./msdd_exp'
-
-In the above example training session, we use the ``titanet_large`` model as the pretrained speaker embedding model.
-
-Data Preparation for Inference
-------------------------------
-
-As with dataset preparation for diarization training, diarization inference is configured with Hydra ``.yaml`` files. See :doc:`NeMo Speaker Diarization Configuration Files <./configs>` for setting up the input Hydra configuration file for speaker diarization inference. Input data should be provided in line-delimited JSON format as below:
-
-.. code-block:: bash
-
-  {"audio_filepath": "/path/to/abcd.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/abcd.rttm", "uem_filepath": "/path/to/uem/abcd.uem"}
-
-In each line of the input manifest file, the ``audio_filepath`` item is mandatory, while the rest of the items are optional and can be passed for the desired diarization setting. We refer to this file as a manifest file. This manifest file can be created by using the script in ``/scripts/speaker_tasks/pathfiles_to_diarize_manifest.py``. The following example shows how to run ``pathfiles_to_diarize_manifest.py`` by providing path list files.
-
-..
code-block:: bash - - python pathfiles_to_diarize_manifest.py --paths2audio_files /path/to/audio_file_path_list.txt \ - --paths2txt_files /path/to/transcript_file_path_list.txt \ - --paths2rttm_files /path/to/rttm_file_path_list.txt \ - --paths2uem_files /path/to/uem_file_path_list.txt \ - --paths2ctm_files /path/to/ctm_file_path_list.txt \ - --manifest_filepath /path/to/manifest_output/input_manifest.json - -The ``--paths2audio_files`` and ``--manifest_filepath`` are required arguments. Note that we need to maintain consistency on unique filenames for every field (key) by only changing the filename extensions. For example, if there is an audio file named ``abcd.wav``, the rttm file should be named as ``abcd.rttm`` and the transcription file should be named as ``abcd.txt``. - -- Example audio file path list ``audio_file_path_list.txt`` - -.. code-block:: bash - - /path/to/abcd01.wav - /path/to/abcd02.wav - -- Example RTTM file path list ``rttm_file_path_list.txt`` - -.. code-block:: bash - - /path/to/abcd01.rttm - /path/to/abcd02.rttm - - -The path list files containing the absolute paths to these WAV, RTTM, TXT, CTM and UEM files should be provided as in the above example. ``pathsfiles_to_diarize_manifest.py`` script will match each file using the unique filename (e.g. ``abcd``). Finally, the absolute path of the created manifest file should be provided through Hydra configuration as shown below: - -.. code-block:: yaml - - diarizer.manifest_filepath="path/to/manifest/input_manifest.json" - -The following are descriptions about each field in an input manifest JSON file. - -.. note:: - We expect all the provided files (e.g. audio, rttm, text) to have the same base name and the name should be unique (uniq-id). - -``audio_filepath`` (Required): - - a string containing absolute path to the audio file. - -``num_speakers`` (Optional): - - If the number of speakers is known, provide the integer number or assign null if not known. - -``rttm_filepath`` (Optional): - - To evaluate a diarization system with known rttm files, one needs to provide Rich Transcription Time Marked (RTTM) files as ground truth label files. If RTTM files are provided, the diarization evaluation will be initiated. Here is one line from a RTTM file as an example: - -.. code-block:: bash - - SPEAKER TS3012d.Mix-Headset 1 331.573 0.671 MTD046ID - -``text`` (Optional): - - Ground truth transcription for diarization with ASR inference. Provide the ground truth transcription of the given audio file in string format - -.. code-block:: bash - - {"text": "this is an example transcript"} - -``uem_filepath`` (Optional): - - The UEM file is used for specifying the scoring regions to be evaluated in the given audio file. - UEMfile follows the following convention: `` ``. ```` is set to 1. - - Example lines of UEM file: - -.. code-block:: bash - - TS3012d.Mix-Headset 1 12.31 108.98 - TS3012d.Mix-Headset 1 214.00 857.09 - -``ctm_filepath`` (Optional): - - CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. CTM file follows the following convention: `` `` Since confidence is not required for evaluating diarization results, it can have any value. Note that the ```` should be exactly matched with speaker IDs in RTTM. - - Example lines of CTM file: - -.. 
code-block:: bash - - TS3012d.Mix-Headset MTD046ID 12.879 0.32 okay 0 - TS3012d.Mix-Headset MTD046ID 13.203 0.24 yeah 0 - - -Evaluation on Benchmark Datasets --------------------------------- - -The following instructions can help one to reproduce the expected diarization performance on two benchmark English dialogue datasets. The following results are evaluations based on 0.25 second collar without evaluating overlapped speech. The evaluation is based on oracle VAD results from RTTM files. Therefore, diarization error rate (DER) is equal to confusion error rate since oracle VAD has no miss detection or false alarm. - -AMI Meeting Corpus -~~~~~~~~~~~~~~~~~~ - -The following are the suggested parameters for reproducing the diarization performance for `AMI `_ test set. This setting is based on meeting domain configuration in ``/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml`` - -.. code-block:: bash - - diarizer.manifest_filepath="/path/to/AMItest_input_manifest.json" - diarizer.oracle_num_speakers=null # Performing unknown number of speaker case - diarizer.oracle_vad=True # Use oracle VAD extracted from RTTM files. - diarizer.collar=0.25 - diarizer.ignore_overlap=True - diarizer.speaker_embeddings.model_path="titanet_large" - -We provide a helper script to download the dataset and format it into a NeMo manifest. - -.. code-block:: bash - - python scripts/data_processing/speaker_tasks/get_ami_data.py --manifest_filepath AMItest_input_manifest.json - - -CallHome American English Speech (CHAES), LDC97S42 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We use the CH109 set which is a subset of the CHAES dataset which has only two speakers in one session. -The following are the suggested parameters for reproducing the diarization performance for the CH109 set and this setting is based on telephonic domain configuration in ``/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml`` - -.. code-block:: bash - - diarizer.manifest_filepath="/path/to/ch109_input_manifest.json" - diarizer.oracle_vad=True # Use oracle VAD extracted from RTTM files. - diarizer.collar=0.25 - diarizer.ignore_overlap=True - diarizer.speaker_embeddings.model_path="titanet_large" - - -To evaluate the performance on AMI Meeting Corpus, the following instructions can help. - - Download CHAES Meeting Corpus at LDC website `LDC97S42 `_ (CHAES is not publicly available). - - Download the CH109 filename list (whitelist) from `CH109 whitelist `_. - - Download RTTM files for CH109 set from `CH109 RTTM files `_. 
- - Generate an input manifest file using ``/scripts/speaker_tasks/pathfiles_to_diarize_manifest.py`` - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/asr_sd_diagram.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/asr_sd_diagram.png deleted file mode 100644 index ba7613fcc75cfc636ab35018ffd5d8b3642a1939..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/asr_sd_diagram.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/data_flow.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/data_flow.png deleted file mode 100644 index 5d7a878eb61a06601b53ad061fe48678c2445fe3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/data_flow.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/ms_trade_off.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/ms_trade_off.png deleted file mode 100644 index a9e4e18c00055b3aa1e5f48a29258680bb7edff1..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/ms_trade_off.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png deleted file mode 100644 index 6af539c9f5ffd2f9a54a55e75fe82656af744706..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/scale_weight_cnn.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/scale_weight_cnn.png deleted file mode 100644 index 4620ca0ad04595c4fa1d90e0a9b9d13d2c11165b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/scale_weight_cnn.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sd_pipeline.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sd_pipeline.png deleted file mode 100644 index 47b16cc8c1b7393b34b1d627cd02ae92f8f23fb0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sd_pipeline.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sequence_model.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sequence_model.png deleted file mode 100644 index 9f00218289f352e15af8b2c382b3ab2781dd7bdb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/sequence_model.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/weighted_sum.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/weighted_sum.png deleted file mode 100644 index 72fcd094d9c0901c65f5eee576e33a1cc6a5be52..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/images/weighted_sum.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/intro.rst 
b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/intro.rst deleted file mode 100644 index bd8dae2936148c8c81945f2db78b9cefdc072bca..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/intro.rst +++ /dev/null @@ -1,49 +0,0 @@ -Speaker Diarization -=================== - -Speaker diarization is the process of segmenting audio recordings by speaker labels and aims to answer the question “who spoke when?”. Speaker diarization is clearly distinct from speech recognition: as shown in the figure below, before we perform speaker diarization we know “what is spoken” yet we do not know “who spoke it”. Therefore, speaker diarization is an essential feature for a speech recognition system to enrich the transcription with speaker labels. - -.. image:: images/asr_sd_diagram.png - :align: center - :width: 800px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - - - -To figure out "who spoke when", speaker diarization systems need to capture the characteristics of unseen speakers and tell apart which regions in the audio recording belong to which speaker. To achieve this, speaker diarization systems extract voice characteristics, count the number of speakers, then assign the audio segments to the corresponding speaker index. - -The following figure shows the overall data flow of the NeMo speaker diarization pipeline. - -.. image:: images/sd_pipeline.png - :align: center - :width: 800px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - -The NeMo speaker diarization system consists of the following modules: - -- **Voice Activity Detector (VAD)**: A trainable model which detects the presence or absence of speech to generate timestamps for speech activity from the given audio recording. - -- **Speaker Embedding Extractor**: A trainable model that extracts speaker embedding vectors containing voice characteristics from the raw audio signal. - -- **Clustering Module**: A non-trainable module that groups speaker embedding vectors into a number of clusters. - -- **Neural Diarizer**: A trainable model that estimates speaker labels from the given features. - -Speaker diarization evaluation can be done in two different modes depending on the VAD settings: - -- **oracle VAD**: Speaker diarization based on ground-truth VAD timestamps -- **system VAD**: Speaker diarization based on the results from a VAD model - -The full documentation tree is as follows: - -.. toctree:: - :maxdepth: 8 - - models - datasets - results - configs - api - resources - -.. include:: resources.rst diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/models.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/models.rst deleted file mode 100644 index 033b63a16267769dbf2690a8323913b9c94197d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/models.rst +++ /dev/null @@ -1,110 +0,0 @@ -Models -====== - -This section gives a brief overview of the supported speaker diarization models in NeMo's ASR collection. - -Currently, the speaker diarization pipeline in NeMo uses the `MarbleNet <../speech_classification/models.html#marblenet-vad>`__ model for Voice Activity Detection (VAD), `TitaNet <../speaker_recognition/models.html#titanet>`__ models for speaker embedding extraction, and the `Multi-scale Diarization Decoder` as the neural diarizer, which is explained on this page. - -.. 
_Multi_Scale_Diarization_Decoder: - -Multi-Scale Diarization Decoder -------------------------------- - -.. image:: images/sd_pipeline.png - :align: center - :width: 800px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - -Speaker diarization system needs to produce very accurate timestamps since speaker turns can be extremely short in conversational settings. Human conversation often involves very short back-channel words such as “yes”, “uh-huh” or “oh” and these words are very challenging for machines to transcribe and tell the speaker. Therefore, while segmenting audio recordings in terms of speaker identity, speaker diarization requires fine-grained decisions on relatively short segments, ranging from a few tenths of a second to several seconds. Making accurate, fine-grained decisions on such short audio segments is challenging because it is less likely to capture reliable speaker traits from the very short audio segments. We will discuss how this problem can be addressed by introducing a new technique called the multi-scale approach and multiscale diarization decoder to handle multi-scale inputs. - -Extracting long audio segments is desirable in terms of the quality of speaker characteristics. However, the length of audio segments also limits the granularity, which leads to a coarse unit length for speaker label decisions. Therefore, speaker diarization systems are challenged by a trade-off between temporal resolution and the fidelity of the speaker representation, as depicted in the curve shown in the figure below. During the speaker feature extraction process in the speaker diarization pipeline, the temporal resolution is inevitably sacrificed by taking a long speech segment to obtain high-quality speaker representation vectors. In plain and simple language, if we try to be very accurate on voice characteristics then we need to look into a longer span of time. However, at the same time, if we look into a longer span of time, we have to make a decision on a fairly long span of time and this leads to coarse decisions (temporal resolution is low). This can be easily understood if we think about the fact that even human listeners cannot accurately tell who is speaking if only half a second of recorded speech is given. - -In traditional diarization systems, an audio segment length ranges from 1.5~3.0 seconds since such numbers make a good compromise between the quality of speaker characteristics and temporal resolution. We refer to this type of segmentation method as a single-scale approach. Even with an overlap technique, the single-scale segmentation limits the temporal resolution to 0.75~1.5 seconds, which leaves room for improvement in terms of temporal accuracy. Having a coarse temporal resolution not only deteriorates the performance of diarization but also decreases speaker counting accuracy since short speech segments are not captured properly. More importantly, such coarse temporal resolution in the speaker timestamps makes the matching between the decoded ASR text and speaker diarization result more error-prone. - -.. image:: images/ms_trade_off.png - :align: center - :width: 800px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - -To tackle the problem, the multi-scale approach is proposed to cope with such a trade-off by extracting speaker features from multiple segment lengths and then combining the results from multiple scales. 
The multi-scale approach is fulfilled by employing multi-scale segmentation and extracting speaker embeddings from each scale. The left side of the above figure shows how four different scales in a multi-scale segmentation approach are performed. During the segment affinity calculation process, all the information from the longest scale to the shortest scale is combined, yet a decision is made only for the shortest segment range. When combining the features from each scale, the weight of each scale largely affects the speaker diarization performance. - -Since scale weights largely determine the accuracy of the speaker diarization system, they should be set so that diarization performance is maximized. Hence, we came up with a novel multi-scale diarization system called the multi-scale diarization decoder :cite:`sd-models-park2022multi` that dynamically determines the importance of each scale at each timestep. - -The multi-scale diarization decoder takes speaker embedding vectors from multiple scales and then estimates desirable scale weights. Based on the estimated scale weights, speaker labels are generated. Hence, the proposed system puts more weight on a scale if the input signal is considered to carry more accurate speaker information at that scale. - -.. image:: images/data_flow.png - :align: center - :width: 800px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - -The data flow of the multi-scale speaker diarization system is shown in the above figure. Multi-scale segments are extracted from the audio input, and the corresponding speaker embedding vectors for the multi-scale audio input are generated using the speaker embedding extractor (TitaNet). Subsequently, the extracted multi-scale embeddings are processed by a clustering algorithm to provide an initial clustering result to the MSDD module. The MSDD module compares cluster-average speaker embedding vectors with the input speaker embedding sequences. The scale weights for each step are estimated to weigh the importance of each scale. Finally, the sequence model is trained to output speaker label probabilities for each speaker. - - -.. image:: images/scale_weight_cnn.png - :align: center - :width: 800px - :alt: A figure explaining CNN based scale weighting mechanism - - -A neural network model named the multi-scale diarization decoder (MSDD) is trained to take advantage of the multi-scale approach by dynamically calculating the weight of each scale. MSDD takes the initial clustering results and compares the extracted speaker embeddings with the cluster-average speaker representation vectors. - -Most importantly, the weight of each scale at each time step is determined through a scale weighting mechanism where the scale weights are calculated by 1-D convolutional neural networks (CNNs) applied to the multi-scale speaker embedding inputs and the cluster-average embeddings, as described in the above figure. - -.. image:: images/weighted_sum.png - :align: center - :width: 800px - :alt: A figure explaining weighted sum of cosine similarity values - -The estimated scale weights are applied to the cosine similarity values calculated for each speaker and each scale. The above figure shows the process of calculating the context vector by applying the estimated scale weights to the cosine similarity values computed between the cluster-average speaker embeddings and the input speaker embeddings.
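To make the weighted-sum step concrete, the sketch below combines per-scale cosine similarities between one step's input embeddings and the cluster-average embeddings using estimated scale weights. This is only an illustrative sketch of the idea described above, not the MSDD implementation; array shapes, sizes, and variable names are assumptions.

.. code-block:: python

    # Illustrative sketch of the scale-weighted cosine similarity described above.
    import numpy as np

    num_scales, emb_dim, num_speakers = 5, 192, 2                     # assumed sizes
    input_embs = np.random.randn(num_scales, emb_dim)                 # one embedding per scale for the current step
    cluster_avg = np.random.randn(num_scales, num_speakers, emb_dim)  # cluster-average embedding per scale and speaker
    scale_weights = np.full(num_scales, 1.0 / num_scales)             # estimated per-step scale weights (sum to 1)

    def cosine(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

    # Context value per speaker: weighted sum of per-scale cosine similarities.
    context = np.array([
        sum(w * cosine(input_embs[s], cluster_avg[s, spk]) for s, w in enumerate(scale_weights))
        for spk in range(num_speakers)
    ])
    print(context)  # one weighted similarity value per speaker for this step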
- -Aside from CNN-based weighting scheme, MSDD implementation in NeMo toolkit allows multiple options for calculating scale weights ``model.msdd_module.weighting_scheme``: - - -- ``conv_scale_weight``: Default setting. Use 1-D CNN filters to calculate scale weights. - -- ``attn_scale_weight``: Calculate the scale weights by applying an attention mechanism between cluster-average embeddings and input embeddings. This can be viewed as attention values for scale at each timestep. - -Finally, each context vector for each step is fed to a multi-layer LSTM model that generates per-speaker speaker existence probability. The figure below shows how speaker label sequences are estimated by LSTM model and context vector input. - -.. image:: images/sequence_model.png - :align: center - :width: 400px - :alt: Speaker diarization pipeline- VAD, segmentation, speaker embedding extraction, clustering - -In NeMo toolkit, MSDD implementation has multiple options for the context vector by specifying ``model.msdd_module.context_vector_type``: - - -- ``cos_sim``: As described in this document, scale weights are applied to cosine similarity values between cluster-average embedding vectors and input embedding vectors. Default is ``cos_sim``. - - -- ``elem_prod``: The scale weights are directly applied to speaker embedding vectors then a weighted speaker embedding vector is calculated for both cluster-average embedding vectors and input embedding vectors. Finally, elementwise product between the cluster-average weighted speaker embedding vector and input multi-scale embedding vector are calculated and fed to LSTMs as a context vector for each step. - - -MSDD is designed with the following aspects in mind: - -* **Flexible number of speakers**: MSDD employs pairwise inference to diarize conversation with arbitrary numbers of speakers. For example, if there are 4 speakers, 6 pairs will be extracted, and inference results from MSDD are averaged to obtain results for each of the 4 speakers. - - -* **Overlap-aware diarization**: MSDD independently estimates the probability of two speaker labels of two speakers at each step. This enables overlap detection where two speakers are speaking at the same time. - - -* **Pretrained speaker embedding model**: MSDD is based on the pretrained embedding extractor (TitaNet) model. By using a pretrained speaker model, we can leverage the neural network weights learned from a relatively large amount of single-speaker speech data. In addition, MSDD is designed to be optimized with a pretrained speaker to fine-tune the entire speaker diarization system on a domain-specific diarization dataset. - - -* **End-to-end training of diarization model**: Since all the arithmetic operations in MSDD support gradient calculation, a speaker embedding model can be attached to the computational graph of an MSDD model and can be jointly trained from the loss calculated from speaker label outputs. - - -* **Superior temporal resolution for uniform segmentation approach**: While single-scale clustering diarizer shows the best performance at 1.5-second segment length where unit decision length is 0.75 second (half-overlap), the multi-scale approach has unit decision length of 0.25 second. The temporal resolution can be even more enhanced by using shorter shift length which requires more steps and resources. Note that merely applying 0.5-second segment length to a single-scale diarizer significantly drops the diarization performance due to the degraded fidelity of speaker features. 
- - -* **Performance improvement from clustering diarizer**: Diarization Error Rate (DER) is calculated by comparing hypothesis timestamps and ground-truth timestamps. MSDD can reduce the diarization error rate up to 60% on two speaker datasets when compared to the single-scale clustering diarizer.  - -References ------------ - -.. bibliography:: ../asr_all.bib - :style: plain - :labelprefix: SD-MODELS - :keyprefix: sd-models- - - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/resources.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/resources.rst deleted file mode 100644 index ee38217b8d8d6caeab916f31ea8638e28ceb7a71..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/resources.rst +++ /dev/null @@ -1,24 +0,0 @@ - -Resource and Documentation Guide --------------------------------- - -Hands-on speaker diarization tutorial notebooks can be found under ``/tutorials/speaker_tasks``. - -There are tutorials for performing speaker diarization inference using :ref:`MarbleNet_model`, :ref:`TitaNet_model`, and :ref:`Multi_Scale_Diarization_Decoder`. -We also provide tutorials about getting ASR transcriptions combined with speaker labels along with voice activity timestamps with NeMo ASR collections. - -Most of the tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab. - -If you are looking for information about a particular model used for speaker diarization inference, or would like to find out more about the model -architectures available in the `nemo_asr` collection, check out the :doc:`Models <./models>` page. - -Documentation on dataset preprocessing can be found on the :doc:`Datasets <./datasets>` page. -NeMo includes preprocessing scripts for several common ASR datasets, and this page contains instructions on running -those scripts. -It also includes guidance for creating your own NeMo-compatible dataset, if you have your own data. - -Information about how to load model checkpoints (either local files or pretrained ones from NGC), perform inference, as well as a list -of the checkpoints available on NGC are located on the :doc:`Checkpoints <./results>` page. - -Documentation for configuration files specific to the ``nemo_asr`` models can be found on the -:doc:`Configuration Files <./configs>` page. diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/results.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/results.rst deleted file mode 100644 index 87af559d12c58d6040a294268df3c3e415726810..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_diarization/results.rst +++ /dev/null @@ -1,87 +0,0 @@ -Checkpoints -=========== - -There are two main ways to load pretrained checkpoints in NeMo as introduced in `loading ASR checkpoints <../results.html#checkpoints>`__. -In speaker diarization, the diarizer loads checkpoints that are passed through the config file. For example: - -Loading Local Checkpoints ---------------------------- - -Load VAD models - -.. code-block:: bash - - pretrained_vad_model='/path/to/vad_multilingual_marblenet.nemo' # local .nemo or pretrained vad model name - ... - # pass with hydra config - config.diarizer.vad.model_path=pretrained_vad_model - - -Load speaker embedding models - -.. code-block:: bash - - pretrained_speaker_model='/path/to/titanet-l.nemo' # local .nemo or pretrained speaker embedding model name - ... 
- # pass with hydra config - config.diarizer.speaker_embeddings.model_path=pretrained_speaker_model - -Load neural diarizer models - -.. code-block:: bash - - pretrained_neural_diarizer_model='/path/to/diarizer_msdd_telephonic.nemo' # local .nemo or pretrained neural diarizer model name - ... - # pass with hydra config - config.diarizer.msdd_model.model_path=pretrained_neural_diarizer_model - - -NeMo will automatically save checkpoints of a model you are training in the `.nemo` format. -You can also manually save your models at any point using :code:`model.save_to('<checkpoint_path>.nemo')`. - - -Inference --------- - -.. note:: - For details and a deeper understanding, please refer to ``/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb``. - -Check out :doc:`Datasets <./datasets>` for preparing audio files and optional label files. - -Run and evaluate the speaker diarizer with the command below: - -.. code-block:: bash - - # Have a look at the instructions inside the script and pass the arguments you might need. - python /examples/speaker_tasks/diarization/offline_diarization.py - - -NGC Pretrained Checkpoints -------------------------- - -The ASR collection has checkpoints of several models trained on various datasets for a variety of tasks. -These checkpoints are obtainable via the NGC `NeMo Automatic Speech Recognition collection `_. -The model cards on NGC contain more information about each of the available checkpoints. - -In general, you can load models using the model name, in the following format: - -.. code-block:: python - - pretrained_vad_model='vad_multilingual_marblenet' - pretrained_speaker_model='titanet_large' - pretrained_neural_diarizer_model='diar_msdd_telephonic' - ... - config.diarizer.vad.model_path=pretrained_vad_model \ - config.diarizer.speaker_embeddings.model_path=pretrained_speaker_model \ - config.diarizer.msdd_model.model_path=pretrained_neural_diarizer_model - -where the model name is the value in the "Model Name" column of the tables below. - -Models for Speaker Diarization Pipeline -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. csv-table:: - :file: data/diarization_results.csv - :align: left - :widths: 30, 30, 40 - :header-rows: 1 diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/api.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/api.rst deleted file mode 100644 index 0f95cb281145aab44508a7e02b7decfb6a940510..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/api.rst +++ /dev/null @@ -1,11 +0,0 @@ -NeMo Speaker Recognition API -============================= - - -Model Classes -------------- -.. autoclass:: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel - :show-inheritance: - :members: setup_finetune_model, get_embedding, verify_speakers - - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/configs.rst deleted file mode 100644 index a6fcf6f7582d528847c7d0c91c55c3357e648bc9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/configs.rst +++ /dev/null @@ -1,116 +0,0 @@ -NeMo Speaker Recognition Configuration Files -============================================ - -This page covers NeMo configuration file setup that is specific to speaker recognition models. -For general information about how to set up and run experiments that is common to all NeMo models (e.g. 
-experiment manager and PyTorch Lightning trainer parameters), see the :doc:`../../core/core` page. - -The model section of NeMo speaker recognition configuration files will generally require information about the dataset(s) being -used, the preprocessor for audio files, parameters for any augmentation being performed, as well as the -model architecture specification. -The sections on this page cover each of these in more detail. - -Example configuration files for all of the Speaker related scripts can be found in the -config directory of the examples ``{NEMO_ROOT/examples/speaker_tasks/recognition/conf}``. - - -Dataset Configuration ---------------------- - -Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and -``test_ds`` sections of your configuration file, respectively. -Depending on the task, you may have arguments specifying the sample rate of your audio files, max time length to consider for each audio file , whether or not to shuffle the dataset, and so on. -You may also decide to leave fields such as the ``manifest_filepath`` blank, to be specified via the command line -at run time. - -Any initialization parameters that are accepted for the Dataset class used in your experiment -can be set in the config file. - -An example TitaNet train and validation configuration could look like (``{NEMO_ROOT}examples/speaker_tasks/recognition/conf/titanet-large.yaml``): - -.. code-block:: yaml - - model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: None # finds labels based on manifest file - batch_size: 32 - trim_silence: False - shuffle: True - - validation_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: None # Keep None, to match with labels extracted during training - batch_size: 32 - shuffle: False # No need to shuffle the validation data - - -If you would like to use tarred dataset, have a look at `Datasets Configuration <../configs.html#dataset-configuration>`__. - - -Preprocessor Configuration --------------------------- -Preprocessor helps to compute MFCC or mel spectrogram features that are given as inputs to model. -For details on how to write this section, refer to `Preprocessor Configuration <../configs.html#preprocessor-configuration>`__ - - -Augmentation Configurations ---------------------------- - -For TitaNet training we use on-the-fly augmentations with MUSAN and RIR impulses using ``noise`` augmentor section - -The following example sets up musan augmentation with audio files taken from manifest path and -minimum and maximum SNR specified with min_snr and max_snr respectively. This section can be added to -``train_ds`` part in model - -.. code-block:: yaml - - model: - ... - train_ds: - ... - augmentor: - noise: - manifest_path: /path/to/musan/manifest_file - prob: 0.2 # probability to augment the incoming batch audio with augmentor data - min_snr_db: 5 - max_snr_db: 15 - - -See the :class:`nemo.collections.asr.parts.preprocessing.perturb.AudioAugmentor` API section for more details. - - -Model Architecture Configurations ---------------------------------- - -Each configuration file should describe the model architecture being used for the experiment. -Models in the NeMo ASR collection need a ``encoder`` section and a ``decoder`` section, with the ``_target_`` field -specifying the module to use for each. - -The following sections go into more detail about the specific configurations of each model architecture. 
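Before going into the individual architecture sections, the sketch below shows one way such a configuration is typically consumed end to end: load the YAML with OmegaConf, override the dataset paths, and hand the ``model`` section to the model class together with a PyTorch Lightning trainer. This is a hedged, illustrative sketch rather than a prescribed recipe; the file path and trainer settings are placeholders.

.. code-block:: python

    # Illustrative sketch: load an example config, override dataset paths, and build the model.
    import pytorch_lightning as pl
    from omegaconf import OmegaConf
    import nemo.collections.asr as nemo_asr

    cfg = OmegaConf.load("titanet-large.yaml")                       # assumed local copy of the example config
    cfg.model.train_ds.manifest_filepath = "train_manifest.json"     # placeholder manifest paths
    cfg.model.validation_ds.manifest_filepath = "dev_manifest.json"

    trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=1)  # placeholder trainer settings
    model = nemo_asr.models.EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(model)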
- -For more information about the TitaNet encoder models, see the :doc:`Models <./models>` page. - -Decoder Configurations ------------------------ - -After features have been computed by the TitaNet encoder, they are passed to the decoder to compute embeddings and then log probabilities for training the model. - -.. code-block:: yaml - - model: - ... - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: 7205 # Total number of classes in the voxceleb1,2 training manifest file - pool_mode: attention # xvector, attention - emb_sizes: 192 # number of intermediate emb layers. can be comma separated for additional layers like 512,512 - angular: true # if true, the loss is changed to angular softmax (using scale and margin from the loss section); otherwise the model is trained with cross-entropy loss - - loss: - scale: 30 - margin: 0.2
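For reference, when ``angular: true`` is set, the ``scale`` (:math:`s`) and ``margin`` (:math:`m`) values above enter the loss as an additive angular margin applied to the target-class logit. The block below is a sketch of the usual additive-angular-margin softmax formulation, not a line-by-line transcription of the NeMo loss code; here :math:`\theta_{y_i}` denotes the angle between the :math:`i`-th embedding and the weight vector of its true class.

.. math::

    L = -\frac{1}{N}\sum_{i=1}^{N}
        \log\frac{e^{s\,\cos(\theta_{y_i} + m)}}
                 {e^{s\,\cos(\theta_{y_i} + m)} + \sum_{j \neq y_i} e^{s\,\cos\theta_j}}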
diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/data/speaker_results.csv b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/data/speaker_results.csv deleted file mode 100644 index c92c971e49396ddbfa5bfe0825c5f3c60c67b96b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/data/speaker_results.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model Name,Model Base Class,Model Card -titanet_large,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large" -titanet_small,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_small" -speakerverification_speakernet,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet" -ecapa_tdnn,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/datasets.rst deleted file mode 100644 index 88c600b3c5237109c8391746e43e10281179fe4c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/datasets.rst +++ /dev/null @@ -1,62 +0,0 @@ -Datasets -======== - -.. _HI-MIA: - -HI-MIA -------- - -Run the script to download and process the ``hi-mia`` dataset in order to generate files in the supported format of ``nemo_asr``. You should set the data folder of -hi-mia using ``--data_root``. These scripts are present in ``/scripts`` - -.. code-block:: bash - - python get_hi-mia_data.py --data_root= - -After download and conversion, your `data` folder should contain directories with the following set of files: - -* `data//train.json` -* `data//dev.json` -* `data//{set}_all.json` -* `data//utt2spk` - - -All-other Datasets ------------------- - -These methods can be applied to any dataset to get similar training or inference manifest files. - -The `filelist_to_manifest.py` script in the `$/scripts/speaker_tasks/` folder generates a manifest file from a text file containing paths to audio files. - -Sample `filelist.txt` file contents: - -.. code-block:: bash - - /data/datasets/voxceleb/data/dev/aac_wav/id00179/Q3G6nMr1ji0/00086.wav - /data/datasets/voxceleb/data/dev/aac_wav/id00806/VjpQLxHQQe4/00302.wav - /data/datasets/voxceleb/data/dev/aac_wav/id01510/k2tzXQXvNPU/00132.wav - -This list file is used to generate the manifest file. This script has optional arguments to split the whole manifest file into train and dev, and also to segment audio files into smaller segments for robust training (for testing, we don't need to create segments for each utterance). - -Sample usage: - -.. code-block:: bash - - python filelist_to_manifest.py --filelist=filelist.txt --id=-3 --out=speaker_manifest.json - -This would create a manifest containing file contents as shown below: - -.. code-block:: json - - {"audio_filepath": "/data/datasets/voxceleb/data/dev/aac_wav/id00179/Q3G6nMr1ji0/00086.wav", "offset": 0, "duration": 4.16, "label": "id00179"} - {"audio_filepath": "/data/datasets/voxceleb/data/dev/aac_wav/id00806/VjpQLxHQQe4/00302.wav", "offset": 0, "duration": 12.288, "label": "id00806"} - {"audio_filepath": "/data/datasets/voxceleb/data/dev/aac_wav/id01510/k2tzXQXvNPU/00132.wav", "offset": 0, "duration": 4.608, "label": "id01510"} - -For other optional arguments, such as splitting the manifest file into train and dev or creating segments from each utterance, refer to the arguments -described in the script. - -Tarred Datasets --------------- - -Similarly to ASR, you can tar your audio files and use the ASR dataset class ``TarredAudioToSpeechLabelDataset`` (corresponding to the ``AudioToSpeechLabelDataset``) for this case. - -If you want to use a tarred dataset, have a look at `ASR Tarred Datasets <../datasets.html#tarred-datasets>`__. diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png deleted file mode 100644 index ecdccccb69ba94f2af56a627113a11ff0768a43e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/titanet_network.png b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/titanet_network.png deleted file mode 100644 index 08668e5b6d08989b9b7a165e0f5526799ca96ab5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/images/titanet_network.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/intro.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/intro.rst deleted file mode 100644 index 9242fdb1a9b2261870b9aa23f8ab36e35b0eca50..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/intro.rst +++ /dev/null @@ -1,23 +0,0 @@ -Speaker Recognition (SR) -======================== - -Speaker recognition is a broad research area which solves two major tasks: speaker identification (what is the identity of the speaker?) and speaker verification (is the speaker who they claim to be?). We focus on text-independent speaker recognition, where the identity of the speaker is based on how the speech is spoken, not necessarily on what is being said. Typically such speaker recognition systems operate on unconstrained speech utterances, which are converted into vectors of fixed length, called speaker embeddings. Speaker embeddings can also be used in automatic speech recognition (ASR) and speech synthesis. - -The goal of most speaker recognition systems is to obtain good speaker-level representations that help distinguish one speaker from other speakers. 
To achieve this, we first train a neural network model in an end-to-end manner, optimizing the encoder using cross-entropy or angular softmax loss. We modify the decoder to produce fixed-size embeddings irrespective of the length of the audio input, and employ a pooling strategy, such as mean-and-variance-based statistics pooling or an attention-based method, to generate these embeddings. - -In speaker identification, we typically train on a larger training set with cross-entropy loss and later fine-tune on a preferred set of labels where one wants to classify only a known set of speakers. -On the other hand, in speaker verification, we train an embedding extractor with angular softmax loss and compare the embeddings from one audio file coming from a single speaker with embeddings from an unknown speaker. To quantify the similarity of the embeddings, we use scoring techniques such as cosine similarity. - -The full documentation tree: - -.. toctree:: - :maxdepth: 8 - - models - configs - datasets - results - api - resources - -.. include:: resources.rst \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/models.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/models.rst deleted file mode 100644 index e568f8523c919e7d82cb1ba54867a4db167b4a20..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/models.rst +++ /dev/null @@ -1,72 +0,0 @@ -Models -====== - -Examples of config files for all the below models can be found in the ``/examples/speaker_recognition/conf`` directory. - -For more information about the config files and how they should be structured, see the :doc:`./configs` page. - -Pretrained checkpoints for all of these models, as well as instructions on how to load them, can be found on the :doc:`./results` page. -You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. -The Checkpoints page also contains benchmark results for the available speaker recognition models. - -.. _TitaNet_model: - -TitaNet ----------- - -The TitaNet model :cite:`sr-models-koluguri2021titanet` is based on the ContextNet architecture :cite:`sr-models-han2020contextnet` for extracting speaker representations. -We employ 1D depth-wise separable convolutions with Squeeze-and-Excitation (SE) layers with global context, followed by a channel-attention-based statistics pooling layer to map -variable-length utterances to a fixed-length embedding (t-vector). TitaNet is a scalable architecture and achieves state-of-the-art performance on speaker verification and diarization tasks. - - .. image:: images/titanet_network.png - :align: center - :alt: speakernet model - :scale: 50% - -TitaNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecSpeakerLabelModel` class. - -SpeakerNet ----------- - -The model is based on the QuartzNet ASR architecture :cite:`sr-models-koluguri2020speakernet`, -comprising an encoder and decoder structure. We use the encoder of the QuartzNet model as a top-level feature extractor and feed the output to the statistics pooling layer, where -we compute the mean and variance across channel dimensions to capture the time-independent utterance-level speaker features. - -The QuartzNet encoder used for speaker embeddings, shown in the figure below, has the following structure: a QuartzNet BxR -model has B blocks, each with R sub-blocks. Each sub-block applies the following operations: a 1D convolution, batch norm, ReLU, and dropout. All sub-blocks in a block have the same number of output channels. These blocks are connected with residual connections. We use QuartzNet with 3 blocks, 2 sub-blocks, and 512 channels as the encoder for speaker embeddings. All conv layers have stride 1 and dilation 1. - - - .. image:: images/ICASPP_SpeakerNet.png - :align: center - :alt: speakernet model - :scale: 40% - -Top-level acoustic features, obtained from the output of -the encoder, are used to compute intermediate features that are -then passed to the decoder to obtain utterance-level speaker -embeddings. The intermediate time-independent features are -computed using a statistics pooling layer, where we compute the mean and standard deviation of features across -time-channels, to get a time-independent feature representation S of size Batch_size × 3000. -The intermediate features S are passed through the decoder, consisting of two layers each of output size 512, for a -linear transformation from S to the final number of classes -N for the larger (L) model, and a single linear layer of output size 256 to the final number of classes N for the medium -(M) model. We extract q-vectors after the final linear layer -of fixed size 512 and 256 for the SpeakerNet-L and SpeakerNet-M -models respectively. 
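As a simplified illustration of the statistics pooling step described above (not the exact NeMo module), the mean and standard deviation over the time axis are concatenated to turn a variable-length encoder output into a fixed-size, time-independent representation:

.. code-block:: python

    # Illustrative sketch of mean/std statistics pooling over the time axis.
    import torch

    batch, channels, time = 4, 1500, 237          # assumed encoder output shape [B, C, T]
    encoder_out = torch.randn(batch, channels, time)

    mean = encoder_out.mean(dim=-1)               # [B, C]
    std = encoder_out.std(dim=-1)                 # [B, C]
    pooled = torch.cat([mean, std], dim=-1)       # [B, 2C]; e.g. 2 * 1500 = 3000 features per utterance
    print(pooled.shape)                           # torch.Size([4, 3000])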
SpeakerNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecSpeakerLabelModel` class. - -ECAPA_TDNN ---------- - -The model is based on the paper "ECAPA-TDNN Embeddings for Speaker Diarization" :cite:`sr-models-Dawalatabad_2021`, comprising an encoder of time-dilation layers based on Emphasized Channel Attention, Propagation, and Aggregation. The ECAPA-TDNN model employs a channel- and context-dependent attention mechanism, Multi-layer Feature Aggregation (MFA), as well as Squeeze-and-Excitation (SE) and residual blocks; for faster training and inference, we replace the residual blocks with group convolution blocks of single dilation. These models have shown good performance across various speaker tasks. - -ECAPA-TDNN models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecSpeakerLabelModel` class. - -References ----------- - -.. bibliography:: ../asr_all.bib - :style: plain - :labelprefix: SR-MODELS - :keyprefix: sr-models- diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/resources.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/resources.rst deleted file mode 100644 index 55e83bb598b7fcdc7e3f0a62958335bfc50e0362..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/resources.rst +++ /dev/null @@ -1,22 +0,0 @@ - -Resource and Documentation Guide -------------------------------- - -Hands-on speaker recognition tutorial notebooks can be found under -`the speaker recognition tutorials folder `_. This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab. - -If you are looking for information about a particular SpeakerNet model, or would like to find out more about the model -architectures available in the ``nemo_asr`` collection, check out the :doc:`Models <./models>` page. - -Documentation on dataset preprocessing can be found on the :doc:`Datasets <./datasets>` page. -NeMo includes preprocessing and other scripts for speaker recognition tasks, and this page contains instructions on running -those scripts. 
It also includes guidance for creating your own NeMo-compatible dataset, if you have your own data. - -Information about how to load model checkpoints (either local files or pretrained ones from NGC), perform inference, as well as a list -of the checkpoints available on NGC are located on the :doc:`Checkpoints <./results>` page. - -Documentation for configuration files specific to the ``nemo_asr`` models can be found on the -:doc:`Configuration Files <./configs>` page. - - -For a clear step-by-step tutorial we advise you to refer to the tutorials found in `folder `_. diff --git a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/results.rst b/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/results.rst deleted file mode 100644 index a6029595823fd05165f804fb682dd0bbcae7be1e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speaker_recognition/results.rst +++ /dev/null @@ -1,138 +0,0 @@ -Checkpoints -=========== - -There are two main ways to load pretrained checkpoints in NeMo: - -* Using the :code:`restore_from()` method to load a local checkpoint file (`.nemo`), or -* Using the :code:`from_pretrained()` method to download and set up a checkpoint from NGC. - -See the following sections for instructions and examples for each. - -Note that these instructions are for loading fully trained checkpoints for evaluation or fine-tuning. -For resuming an unfinished training experiment, please use the experiment manager to do so by setting the -``resume_if_exists`` flag to True. - -Loading Local Checkpoints -------------------------- - -NeMo will automatically save checkpoints of a model you are training in a `.nemo` format. -You can also manually save your models at any point using :code:`model.save_to(.nemo)`. - -If you have a local ``.nemo`` checkpoint that you'd like to load, simply use the :code:`restore_from()` method: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models..restore_from(restore_path="") - -Where the model base class is the ASR model class of the original checkpoint, or the general `ASRModel` class. - -Speaker Label Inference ------------------------- - -The goal of speaker label inference is to infer speaker labels using a speaker model with known speaker labels from enrollment set. We provide `speaker_identification_infer.py` script for this purpose under `/examples/speaker_tasks/recognition` folder. -Currently supported backends are cosine_similarity and neural classifier. - -The audio files should be 16KHz mono channel wav files. - -The script takes two manifest files: - -* enrollment_manifest : This manifest contains enrollment data with known speaker labels. -* test_manifest: This manifest contains test data for which we map speaker labels captured from enrollment manifest using one of provided backend - -sample format for each of these manifests is provided in `/examples/speaker_tasks/recognition/conf/speaker_identification_infer.yaml` config file. - -To infer speaker labels using cosine_similarity backend - -.. code-block:: bash - - python speaker_identification_infer.py data.enrollment_manifest= data.test_manifest= backend.backend_model=cosine_similarity - - -Speaker Embedding Extraction ------------------------------ -Speaker Embedding Extraction, is to extract speaker embeddings for any wav file (from known or unknown speakers). 
We provide two ways to do this: - -* a single Python one-liner for extracting embeddings from a single file -* a Python script for extracting embeddings from a set of files provided through a manifest file - -For extracting embeddings from a single file: - -.. code-block:: python - - speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="") - embs = speaker_model.get_embedding('') - -For extracting embeddings from a set of files: - -The audio files should be 16 kHz mono-channel WAV files. - -Write the audio files to a ``manifest.json`` file with lines in the following format: - -.. code-block:: json - - {"audio_filepath": "/audio_file.wav", "duration": "duration of file in sec", "label": "speaker_id"} - -This Python call will download the best pretrained model from NGC and write an embeddings pickle file to the current working directory: - -.. code-block:: bash - - python examples/speaker_tasks/recognition/extract_speaker_embeddings.py --manifest=manifest.json - -Alternatively, you can run `batch_inference()` to perform inference on the manifest with a selected batch_size to get embeddings: - -.. code-block:: python - - speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name="") - embs, logits, gt_labels, trained_labels = speaker_model.batch_inference(manifest, batch_size=32) - -Speaker Verification Inference ------------------------------- - -Speaker verification is the task of verifying whether two utterances are from the same speaker. - -We provide a helper function that compares two audio files and returns True if they are from the same speaker, False otherwise. - -The audio files should be 16 kHz mono-channel WAV files. - -.. code-block:: python - - speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large") - decision = speaker_model.verify_speakers('path/to/one/audio_file','path/to/other/audio_file') - - -NGC Pretrained Checkpoints -------------------------- - -The SpeakerNet-ASR collection has checkpoints of several models trained on various datasets for a variety of tasks. -The `TitaNet `_, `ECAPA_TDNN `_ and `Speaker_Verification `_ model cards on NGC contain more information about each of the available checkpoints. - -The tables below list the speaker embedding extractor models available from NGC; the models can be accessed via the -:code:`from_pretrained()` method inside the EncDecSpeakerLabelModel class. - -In general, you can load any of these models with code in the following format: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models..from_pretrained(model_name="") - -where the model name is the value in the "Model Name" column of the tables below. - -If you would like to programmatically list the models available for a particular base class, you can use the -:code:`list_available_models()` method. - -.. code-block:: python - - nemo_asr.models..list_available_models() - - -Speaker Recognition Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
csv-table:: - :file: data/speaker_results.csv - :align: left - :widths: 30, 30, 40 - :header-rows: 1 - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/configs.rst deleted file mode 100644 index d67706a46617dddd65659eeb33c96eadad67d059..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/configs.rst +++ /dev/null @@ -1,115 +0,0 @@ -NeMo Speech Classification Configuration Files -================================================ - -This page covers NeMo configuration file setup that is specific to models in the Speech Classification collection. -For general information about how to set up and run experiments that is common to all NeMo models (e.g. -experiment manager and PyTorch Lightning trainer parameters), see the :doc:`../../core/core` page. - -The model section of NeMo Speech Classification configuration files will generally require information about the dataset(s) being -used, the preprocessor for audio files, parameters for any augmentation being performed, as well as the -model architecture specification. -The sections on this page cover each of these in more detail. - -Example configuration files for all of the NeMo ASR scripts can be found in the -``/examples/asr/conf>``. - - -Dataset Configuration ---------------------- - -Training, validation, and test parameters are specified using the ``train_ds``, ``validation_ds``, and -``test_ds`` sections of your configuration file, respectively. -Depending on the task, you may have arguments specifying the sample rate of your audio files, labels, whether or not to shuffle the dataset, and so on. -You may also decide to leave fields such as the ``manifest_filepath`` blank, to be specified via the command line -at runtime. - -Any initialization parameters that are accepted for the Dataset class used in your experiment -can be set in the config file. -See the `Datasets <../api.html#Datasets>`__ section of the API for a list of Datasets and their respective parameters. - -An example Speech Classification train and validation configuration could look like: - -.. code-block:: yaml - - model: - sample_rate: 16000 - repeat: 2 # number of convolutional sub-blocks within a block, R in _[BxRxC] - dropout: 0.0 - kernel_size_factor: 1.0 - labels: ['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', - 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', - 'wow', 'yes', 'zero'] - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - labels: ${model.labels} # Uses the labels above - batch_size: 128 - shuffle: True - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - labels: ${model.labels} # Uses the labels above - batch_size: 128 - shuffle: False # No need to shuffle the validation data - -If you would like to use tarred dataset, have a look at `Datasets Configuration <../configs.html#dataset-configuration>`__. - - -Preprocessor Configuration --------------------------- -Preprocessor helps to compute MFCC or mel spectrogram features that are given as inputs to model. -For details on how to write this section, refer to `Preprocessor Configuration <../configs.html#preprocessor-configuration>`__ - -Check config yaml files in ``/examples/asr/conf`` to find the processors been used by speech classification models. 
- - -Augmentation Configurations ---------------------------- - -There are a few on-the-fly spectrogram augmentation options for NeMo ASR, which can be specified by the -configuration file using the ``augmentor`` and ``spec_augment`` section. -For details on how to write this section, refer to `Augmentation Configuration <../configs.html#augmentation-configurations>`__ - -Check config yaml files in ``/tutorials/asr/conf`` to find the processors been used by speech classification models. - - -Model Architecture Configurations ---------------------------------- - -Each configuration file should describe the model architecture being used for the experiment. -Models in the NeMo ASR collection need a ``encoder`` section and a ``decoder`` section, with the ``_target_`` field -specifying the module to use for each. - -The following sections go into more detail about the specific configurations of each model architecture. - -The `MatchboxNet <./models.html#matchboxnet-speech-commands>`__ and `MarbleNet <./models.html#marblenet-vad>`__ models are very similar, and they are based on `QuartzNet <../models.html#quartznet>`__ and as such the components in their -configs are very similar as well. - -Decoder Configurations ------------------------- - -After features have been computed from ConvASREncoder, we pass the features to decoder to compute embeddings and then to compute log_probs -for training models. - -.. code-block:: yaml - - model: - ... - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderClassification - feat_in: *enc_final_filters - return_logits: true # return logits if true, else return softmax output - pooling_type: 'avg' # AdaptiveAvgPool1d 'avg' or AdaptiveMaxPool1d 'max' - - -Fine-tuning Execution Flow Diagram ----------------------------------- - -When preparing your own training or fine-tuning scripts, please follow the execution flow diagram order for correct inference. 
- -Depending on the type of model, there may be extra steps that must be performed - - -* Speech Classification models - `Examples directory for Classification Models `_ - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/data/classification_results.csv b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/data/classification_results.csv deleted file mode 100644 index 06de98a2c4330135185b980932b2fe06cfab8321..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/data/classification_results.csv +++ /dev/null @@ -1,11 +0,0 @@ -Model Name,Model Base Class,Model Card -langid_ambernet,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/langid_ambernet" -vad_multilingual_marblenet,EncDecClassificationModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenet" -vad_marblenet,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_marblenet" -vad_telephony_marblenet,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_telephony_marblenet" -commandrecognition_en_matchboxnet3x1x64_v1,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v1" -commandrecognition_en_matchboxnet3x2x64_v1,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v1" -commandrecognition_en_matchboxnet3x1x64_v2,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2" -commandrecognition_en_matchboxnet3x2x64_v2,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2" -commandrecognition_en_matchboxnet3x1x64_v2_subset_task,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2_subset_task" -commandrecognition_en_matchboxnet3x2x64_v2_subset_task,EncDecClassificationModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2_subset_task" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/datasets.rst deleted file mode 100644 index 09739c55ecb9c6ef71884fe87635f95a15ab7efd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/datasets.rst +++ /dev/null @@ -1,149 +0,0 @@ -Datasets -======== - -NeMo has scripts to convert several common ASR datasets into the format expected by the `nemo_asr` collection. -You can get started with those datasets by following the instructions to run those scripts in the section appropriate to each dataset below. - -If you have your own data and want to preprocess it to use with NeMo ASR models, check out the `Preparing Custom Speech Classification Data`_ section at the bottom of the page. - -.. _Freesound-dataset: - -Freesound ------------ - -`Freesound `_ is a website that aims to create a huge open collaborative database of audio snippets, samples, recordings, bleeps. -Most audio samples are released under Creative Commons licenses that allow their reuse. -Researchers and developers can access Freesound content using the Freesound API to retrieve meaningful sound information such as metadata, analysis files, and the sounds themselves. 
-
-**Instructions**
-
-Go to ``/scripts/freesound_download_resample`` and follow the steps below to download and convert Freesound data into the format expected by the `nemo_asr` collection.
-
-1. Install the required libraries, including freesound, requests, requests_oauthlib, joblib, librosa and sox. If they are not installed, please run `pip install -r freesound_requirements.txt`
-2. Create an API key for freesound.org at https://freesound.org/help/developers/
-3. Create a python file called `freesound_private_apikey.py` and add lines setting `api_key` and `client_id` to your Freesound API key and client id
-4. Authorize by running `python freesound_download.py --authorize`, then visit the website it points to and paste the response code
-5. Feel free to change any arguments in `download_resample_freesound.sh`, such as max_samples and max_filesize
-6. Run `bash download_resample_freesound.sh` with the desired arguments. For example:
-
-.. code-block:: bash
-
-    bash download_resample_freesound.sh 4000 ./freesound ./freesound_resampled_background
-
-Note that downloading this dataset may take hours. Change the categories in download_resample_freesound.sh to include audio files from other (e.g. speech) categories.
-Then, you should have 16 kHz mono wav files in the resampled data directory.
-
-
-.. _Google-Speech-Commands-Dataset:
-
-Google Speech Commands Dataset
-------------------------------
-
-Google released two versions of the dataset, with the first version containing 65k samples over 30 classes and the second containing 110k samples over 35 classes.
-We refer to these datasets as `v1` and `v2` respectively.
-
-Run the script `process_speech_commands_data.py` (found in ``/scripts/dataset_processing/``) to process the Google Speech Commands dataset and generate files in the format supported by `nemo_asr`.
-You should set the data folder of Speech Commands using :code:`--data_root` and the version of the dataset using :code:`--data_version` as an int.
-
-You can further rebalance the train set by randomly oversampling files inside the manifest by passing the `--rebalance` flag.
-
-.. code-block:: bash
-
-    python process_speech_commands_data.py --data_root= --data_version=<1 or 2> {--rebalance}
-
-
-Then, you should have `train_manifest.json`, `validation_manifest.json` and `test_manifest.json`
-in the directory `{data_root}/google_speech_recognition_v{1/2}`.
-
-.. note::
-    You should have at least 4GB or 6GB of disk space available if you use v1 or v2 respectively.
-    Also, it will take some time to download and process, so go grab a coffee.
-
-Each line is a training example:
-
-.. code-block:: bash
-
-    {"audio_filepath": "/two/8aa35b0c_nohash_0.wav", "duration": 1.0, "label": "two"}
-    {"audio_filepath": "/two/ec5ab5d5_nohash_2.wav", "duration": 1.0, "label": "two"}
-
-
-
-Speech Command & Freesound for VAD
-------------------------------------
-
-The Speech Command & Freesound (SCF) dataset is used to train MarbleNet in the `paper `_. Here we show how to download and process it.
-This script assumes that you already have the Freesound dataset; if not, have a look at :ref:`Freesound-dataset`.
-We will use the open-source :ref:`Google-Speech-Commands-Dataset` as our speech data (we use v2 of the dataset for the SCF dataset, but only minor changes are required to support v1).
-
-The scripts below will download the Google Speech Commands v2 dataset and convert the speech and background data to a format suitable for use with nemo_asr.
-
-.. note::
-    You may additionally pass a :code:`--test_size` or :code:`--val_size` flag for splitting the train, val and test data.
- - You may additionally pass :code:`--window_length_in_sec` flag for indicating the segment/window length. Default is 0.63s. - - You may additionally pass a :code:`-rebalance_method='fixed|over|under'` at the end of the script to rebalance the class samples in the manifest. - - - -* `'fixed'`: Fixed number of sample for each class. Train 5000, val 1000, and test 1000. (Change number in script if you want) -* `'over'`: Oversampling rebalance method -* `'under'`: Undersampling rebalance method - - -.. code-block:: bash - - mkdir './google_dataset_v2' - python process_vad_data.py --out_dir='./manifest/' --speech_data_root='./google_dataset_v2'--background_data_root= --log --rebalance_method='fixed' - - -After download and conversion, your `manifest` folder should contain a few json manifest files: - -* `(balanced_)background_testing_manifest.json` -* `(balanced_)background_training_manifest.json` -* `(balanced_)background_validation_manifest.json` -* `(balanced_)speech_testing_manifest.json` -* `(balanced_)speech_training_manifest.json` -* `(balanced_)speech_validation_manifest.json` - -Each line is a training example. `audio_filepath` contains path to the wav file, `duration` is duration in seconds, `offset` is offset in seconds, and `label` is label (class): - -.. code-block:: bash - - {"audio_filepath": "/two/8aa35b0c_nohash_0.wav", "duration": 0.63, "label": "speech", "offset": 0.0} - {"audio_filepath": "/Emergency_vehicle/id_58368 simambulance.wav", "duration": 0.63, "label": "background", "offset": 4.0} - - -.. _Voxlingua107: - -Voxlingua107 ------------------------------- - -VoxLingua107 consists of short speech segments automatically extracted from YouTube videos. -It contains 107 languages. The total amount of speech in the training set is 6628 hours, and 62 hours per language on average but it's highly imbalanced. -It also includes separate evaluation set containing 1609 speech segments from 33 languages, validated by at least two volunteers. - -You could download dataset from its `official website `_. - -Each line is a training example. - -.. code-block:: bash - - {"audio_filepath": "/ln/lFpWXQYseo4__U__S113---0400.650-0410.420.wav", "offset": 0, "duration": 3.0, "label": "ln"} - {"audio_filepath": "/lt/w0lp3mGUN8s__U__S28---0352.170-0364.770.wav", "offset": 8, "duration": 4.0, "label": "lt"} - - -Preparing Custom Speech Classification Data --------------------------------------------- - -Preparing Custom Speech Classification Data is almost identical to `Preparing Custom ASR Data <../datasets.html#preparing-custom-asr-data>`__. - -Instead of :code:`text` entry in manifest, you need :code:`label` to determine class of this sample - - -Tarred Datasets ---------------- - -Similarly to ASR, you can tar your audio files and use ASR Dataset class ``TarredAudioToClassificationLabelDataset`` (corresponding to the ``AudioToClassificationLabelDataset``) for this case. - -If you would like to use tarred dataset, have a look at `ASR Tarred Datasets <../datasets.html#tarred-datasets>`__. 
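-
-To make the custom speech classification manifest format described above concrete, the sketch below writes a small manifest in Python. The audio paths, durations, and class labels are placeholders for your own data; compute durations with whatever audio tooling you prefer.
-
-.. code-block:: python
-
-    import json
-
-    # (audio path, duration in seconds, class label) entries for your own data;
-    # the values below are placeholders.
-    examples = [
-        ("/data/yes/utt_0001.wav", 1.0, "yes"),
-        ("/data/no/utt_0002.wav", 1.0, "no"),
-    ]
-
-    with open("train_manifest.json", "w") as fout:
-        for audio_filepath, duration, label in examples:
-            # One JSON object per line, with a "label" entry instead of the "text" entry used for ASR.
-            entry = {"audio_filepath": audio_filepath, "duration": duration, "label": label}
-            fout.write(json.dumps(entry) + "\n")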
\ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/marblenet_vertical.png b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/marblenet_vertical.png deleted file mode 100644 index 3174d1ef5ea56ca36159d7525f2f6acf6c2367b0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/marblenet_vertical.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/matchboxnet_vertical.png b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/matchboxnet_vertical.png deleted file mode 100644 index 2d69c94ecdb820b9b0859137873674995138ac14..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/images/matchboxnet_vertical.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/intro.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/intro.rst deleted file mode 100644 index 6a7126b8e73389a3d83decba494c47f626b389e3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/intro.rst +++ /dev/null @@ -1,31 +0,0 @@ -Speech Classification -================================== -Speech Classification refers to a set of tasks or problems of getting a program to automatically classify input utterance or audio segment into categories, -such as Speech Command Recognition (multi-class), Voice Activity Detection (binary or multi-class), and Audio Sentiment Classification (typically multi-class), etc. - -**Speech Command Recognition** is the task of classifying an input audio pattern into a discrete set of classes. -It is a subset of Automatic Speech Recognition (ASR), sometimes referred to as Key Word Spotting, in which a model is constantly analyzing speech patterns to detect certain "command" classes. -Upon detection of these commands, a specific action can be taken by the system. -It is often the objective of command recognition models to be small and efficient so that they can be deployed onto low-power sensors and remain active for long durations of time. - - -**Voice Activity Detection (VAD)** also known as speech activity detection or speech detection, is the task of predicting which parts of input audio contain speech versus background noise. -It is an essential first step for a variety of speech-based applications including Automatic Speech Recognition. -It serves to determine which samples to be sent to the model and when to close the microphone. - -**Spoken Language Identification (Lang ID)** also known as spoken language recognition, is the task of recognizing the language of the spoken utterance automatically. -It typically serves as the prepossessing of ASR, determining which ASR model would be activate based on the language. - - -The full documentation tree is as follows: - -.. toctree:: - :maxdepth: 8 - - models - datasets - results - configs - resources.rst - -.. 
include:: resources.rst diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/models.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/models.rst deleted file mode 100644 index 50919336fd93d51176ad2abd17aa6f1270e10443..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/models.rst +++ /dev/null @@ -1,84 +0,0 @@ -Models -====== - -This page gives a brief overview of the models that NeMo's Speech Classification collection currently supports. -For Speech Classification, we support Speech Command (Keyword) Detection and Voice Activity Detection (VAD). - -Each of these models can be used with the example ASR scripts (in the ``/examples/asr`` directory) by -specifying the model architecture in the config file used. -Examples of config files for each model can be found in the ``/examples/asr/conf`` directory. - -For more information about the config files and how they should be structured, see the :doc:`./configs` page. - -Pretrained checkpoints for all of these models, as well as instructions on how to load them, can be found on the :doc:`./results` page. -You can use the available checkpoints for immediate inference, or fine-tune them on your own datasets. -The Checkpoints page also contains benchmark results for the available ASR models. - -.. _MatchboxNet_model: - -MatchboxNet (Speech Commands) ------------------------------- - -MatchboxNet :cite:`sc-models-matchboxnet` is an end-to-end neural network for speech command recognition based on `QuartzNet <../models.html#QuartzNet>`__. - -Similarly to QuartzNet, the MatchboxNet family of models are denoted as MatchBoxNet_[BxRxC] where B is the number of blocks, and R is the number of convolutional sub-blocks within a block, and C is the number of channels. Each sub-block contains a 1-D *separable* convolution, batch normalization, ReLU, and dropout: - - .. image:: images/matchboxnet_vertical.png - :align: center - :alt: MatchboxNet model - :scale: 50% - -It can reach state-of-the art accuracy on the Google Speech Commands dataset while having significantly fewer parameters than similar models. -The `_v1` and `_v2` are denoted for models trained on `v1` (30-way classification) and `v2` (35-way classification) datasets; -And we use _subset_task to represent (10+2)-way subset (10 specific classes + other remaining classes + silence) classification task. - -MatchboxNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecClassificationModel` class. - -.. note:: - For model details and deep understanding about Speech Command Detedction training, inference, finetuning and etc., - please refer to ``/tutorials/asr/Speech_Commands.ipynb`` and ``/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb``. - - - -.. _MarbleNet_model: - -MarbleNet (VAD) ------------------- - -MarbleNet :cite:`sc-models-marblenet` an end-to-end neural network for speech command recognition based on :ref:`MatchboxNet_model`, - -Similarly to MatchboxNet, the MarbleNet family of models are denoted as MarbleNet_[BxRxC] where B is the number of blocks, and R is the number of convolutional sub-blocks within a block, and C is the number of channels. Each sub-block contains a 1-D *separable* convolution, batch normalization, ReLU, and dropout: - - .. 
image:: images/marblenet_vertical.png - :align: center - :alt: MarbleNet model - :scale: 30% - -It can reach state-of-the art performance on the difficult `AVA speech dataset `_ while having significantly fewer parameters than similar models even training on simple data. -MarbleNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecClassificationModel` class. - -.. note:: - For model details and deep understanding about VAD training, inference, postprocessing, threshold tuning and etc., - please refer to ``/tutorials/asr/06_Voice_Activiy_Detection.ipynb`` and ``/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb``. - - - -.. _AmberNet_model: - -AmberNet (Lang ID) ------------------- - -AmberNet is an end-to-end neural network for language identification moden based on `TitaNet <../speaker_recognition/models.html#titanet>`__. - -It can reach state-of-the art performance on the `Voxlingua107 dataset `_ while having significantly fewer parameters than similar models. -AmberNet models can be instantiated using the :class:`~nemo.collections.asr.models.EncDecSpeakerLabelModel` class. - - - -References ----------------- - -.. bibliography:: ../asr_all.bib - :style: plain - :labelprefix: SC-MODELS - :keyprefix: sc-models- diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/resources.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/resources.rst deleted file mode 100644 index eea302c1b94b009b9a19c5b2e1baae5c6d5d75f7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/resources.rst +++ /dev/null @@ -1,20 +0,0 @@ -Resource and Documentation Guide --------------------------------- - -Hands-on speech classification tutorial notebooks can be found under ``/tutorials/asr/``. -There are training and offline & online microphone inference tutorials for Speech Command Detection and Voice Activity Detection tasks. -This and most other tutorials can be run on Google Colab by specifying the link to the notebooks' GitHub pages on Colab. - -If you are looking for information about a particular Speech Classification model or would like to find out more about the model -architectures available in the `nemo_asr` collection, check out the :doc:`Models <./models>` page. - -Documentation on dataset preprocessing can be found on the :doc:`Datasets <./datasets>` page. -NeMo includes preprocessing scripts for several common ASR datasets, and this page contains instructions on running -those scripts. -It also includes guidance for creating your own NeMo-compatible dataset, if you have your own data. - -Information about how to load model checkpoints (either local files or pretrained ones from NGC), perform inference, as well as a list -of the checkpoints available on NGC are located on the :doc:`Checkpoints <./results>` page. - -Documentation for configuration files specific to the ``nemo_asr`` models can be found on the -:doc:`Configuration Files <./configs>` page. 
diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/results.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_classification/results.rst deleted file mode 100644 index 9eeca4a4036b3103ffc944ace33c66ac00e0d5b6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_classification/results.rst +++ /dev/null @@ -1,138 +0,0 @@ -Checkpoints -=========== - -There are two main ways to load pretrained checkpoints in NeMo: - -* Using the :code:`restore_from()` method to load a local checkpoint file (`.nemo`), or -* Using the :code:`from_pretrained()` method to download and set up a checkpoint from NGC. - -See the following sections for instructions and examples for each. - -Note that these instructions are for loading fully trained checkpoints for evaluation or fine-tuning. -For resuming an unfinished training experiment, please use the experiment manager to do so by setting the -``resume_if_exists`` flag to True. - -Loading Local Checkpoints -------------------------- - -NeMo will automatically save checkpoints of a model you are training in a `.nemo` format. -You can also manually save your models at any point using :code:`model.save_to(.nemo)`. - -If you have a local ``.nemo`` checkpoint that you'd like to load, simply use the :code:`restore_from()` method: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models..restore_from(restore_path="") - -Where the model base class is the ASR model class of the original checkpoint, or the general `ASRModel` class. - - -Transcribing/Inference ------------------------ - -The audio files should be 16KHz monochannel wav files. - -`Transcribe speech command segment:` - -You may perform inference and transcribe a sample of speech after loading the model by using its 'transcribe()' method: - -.. code-block:: python - - mbn_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(model_name="") - mbn_model.transcribe([list of audio files], batch_size=BATCH_SIZE, logprobs=False) - -Setting argument ``logprobs`` to True would return the log probabilities instead of transcriptions. You may find more details in `Modules <../api.html#modules>`__. - -Learn how to fine tune on your own data or on subset classes in ``/tutorials/asr/Speech_Commands.ipynb`` - - -`Run VAD inference:` - -.. code-block:: bash - - python /examples/asr/speech_classification/vad_infer.py --config-path="../conf/vad" --config-name="vad_inference_postprocessing.yaml" dataset= - - -This script will perform vad frame-level prediction and will help you perform postprocessing and generate speech segments as well if needed. - -Have a look at configuration file ``/examples/asr/conf/vad/vad_inference_postprocessing.yaml`` and scripts under ``/scripts/voice_activity_detection`` for details regarding posterior processing, postprocessing and threshold tuning. - -Posterior processing includes generating predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. - -For VAD postprocessing we introduce - -Binarization: - - ``onset`` and ``offset`` threshold for detecting the beginning and end of a speech. 
- - padding durations ``pad_onset`` before and padding duarations ``pad_offset`` after each speech segment; - -Filtering: - - ``min_duration_on`` threshold for short speech segment deletion, - - ``min_duration_on`` threshold for small silence deletion, - - ``filter_speech_first`` to control whether to perform short speech segment deletion first. - - -`Identify language of utterance` - -You may load the model and identify the language of an audio file by using `get_label()` method: - -.. code-block:: python - - langid_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name="") - lang = langid_model.get_label('') - -or you can run `batch_inference()` to perform inference on a manifest with seleted batch_size to get trained model labels and gt_labels with logits - -.. code-block:: python - - langid_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name="") - lang_embs, logits, gt_labels, trained_labels = langid_model.batch_inference(manifest_filepath, batch_size=32) - - -NGC Pretrained Checkpoints --------------------------- - -The Speech Classification collection has checkpoints of several models trained on various datasets for a variety of tasks. -These checkpoints are obtainable via NGC `NeMo Automatic Speech Recognition collection `_. -The model cards on NGC contain more information about each of the checkpoints available. - -The tables below list the Speech Classification models available from NGC, and the models can be accessed via the -:code:`from_pretrained()` method inside the ASR Model class. - -In general, you can load any of these models with code in the following format. - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models.EncDecClassificationModel.from_pretrained(model_name="") - -Where the model name is the value under "Model Name" entry in the tables below. - -For example, to load the MatchboxNet3x2x64_v1 model for speech command detection, run: - -.. code-block:: python - - model = nemo_asr.models.EncDecClassificationModel.from_pretrained(model_name="commandrecognition_en_matchboxnet3x2x64_v1") - -You can also call :code:`from_pretrained()` from the specific model class (such as :code:`EncDecClassificationModel` -for MatchboxNet and MarbleNet) if you will need to access specific model functionality. - -If you would like to programatically list the models available for a particular base class, you can use the -:code:`list_available_models()` method. - -.. code-block:: python - - nemo_asr.models..list_available_models() - - -Speech Classification Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. tabularcolumns:: 30 30 40 - -.. csv-table:: - :file: data/classification_results.csv - :header-rows: 1 - :class: longtable - :widths: 1 1 1 - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/api.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/api.rst deleted file mode 100644 index 735c583f911549e8e69a76c70c5da5b965958df3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/api.rst +++ /dev/null @@ -1,22 +0,0 @@ -NeMo Speech Intent Classification and Slot Filling collection API -================================================================= - - -Model Classes -------------- -.. autoclass:: nemo.collections.asr.models.SLUIntentSlotBPEModel - :show-inheritance: - :members: - - -Mixins ------- - -.. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin - :show-inheritance: - :members: - -.. 
autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin - :show-inheritance: - :members: - diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/configs.rst deleted file mode 100644 index 48f85d0233324dc85fa22e70eaed4e778f4b49d7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/configs.rst +++ /dev/null @@ -1,170 +0,0 @@ -NeMo Speech Intent Classification and Slot Filling Configuration Files -======================================================================= - -This page covers NeMo configuration file setup that is specific to models in the Speech Intent Classification and Slot Filling collection. -For general information about how to set up and run experiments that is common to all NeMo models (e.g. -experiment manager and PyTorch Lightning trainer parameters), see the :doc:`../../core/core` page. - -Dataset Configuration ---------------------- - -Dataset configuration for Speech Intent Classification and Slot Filling model is mostly the same as for standard ASR training, -covered `here <../configs.html#Dataset Configuration>`__. One exception is that ``use_start_end_token`` must be set to ``True``. - -An example of train and validation configuration should look similar to the following: - -.. code-block:: yaml - - model: - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: false - use_start_end_token: true - trim_silence: false - max_duration: 11.0 - min_duration: 0.0 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "synced_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: true - min_duration: 8.0 - - -Preprocessor Configuration --------------------------- - -Preprocessor helps to compute MFCC or mel spectrogram features that are given as inputs to model. -For details on how to write this section, refer to `Preprocessor Configuration <../configs.html#preprocessor-configuration>`__ - -Augmentation Configurations ---------------------------- - - -There are a few on-the-fly spectrogram augmentation options for NeMo ASR, which can be specified by the -configuration file using the ``augmentor`` and ``spec_augment`` section. -For details on how to write this section, refer to `Augmentation Configuration <../configs.html#augmentation-configurations>`__ - - -Model Architecture Configurations ---------------------------------- - -The ``encoder`` of the model is a `Conformer-large <./models.html#Conformer-CTC>`__ model without the text decoder, and can be initialized with pretrained checkpoints. The ``decoder`` is a Transforemr model, with additional ``embedding`` and ``classifier`` modules. - -An example config for the model can be: - -.. code-block:: yaml - - pretrained_encoder: - name: stt_en_conformer_ctc_large # which model use to initialize the encoder, set to null if not using any. Only used to initialize training, not used in resuming from checkpoint. - freeze: false # whether to freeze the encoder during training. 
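-
-    # Note: the model.encoder settings below are expected to match the architecture of the
-    # checkpoint named in pretrained_encoder.name (e.g. d_model, n_layers, n_heads);
-    # otherwise the pretrained weights may not load cleanly.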
- - model: - sample_rate: 16000 - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 # SSL conformer-large have only 17 layers - d_model: 512 - - # Sub-sampling params - subsampling: striding # vggnet or striding, vggnet may give better results but needs more memory - subsampling_factor: 4 # must be power of 2 - subsampling_conv_channels: -1 # -1 sets it to d_model - - # Reduction parameters: Can be used to add another subsampling layer at a given position. - # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. - # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. - reduction: null # pooling, striding, or null - reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder - reduction_factor: 1 - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [-1, -1] # -1 means unlimited context - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 31 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - embedding: - _target_: nemo.collections.asr.modules.transformer.TransformerEmbedding - vocab_size: -1 - hidden_size: ${model.encoder.d_model} - max_sequence_length: 512 - num_token_types: 1 - embedding_dropout: 0.0 - learn_positional_encodings: false - - decoder: - _target_: nemo.collections.asr.modules.transformer.TransformerDecoder - num_layers: 3 - hidden_size: ${model.encoder.d_model} - inner_size: 2048 - num_attention_heads: 8 - attn_score_dropout: 0.0 - attn_layer_dropout: 0.0 - ffn_dropout: 0.0 - - classifier: - _target_: nemo.collections.common.parts.MultiLayerPerceptron - hidden_size: ${model.encoder.d_model} - num_classes: -1 - num_layers: 1 - activation: 'relu' - log_softmax: true - - -Loss Configurations ---------------------------------- - -The loss function by default is the negative log-likelihood loss, where optional label-smoothing can be applied by using the following config (default is 0.0): - -.. code-block:: yaml - - loss: - label_smoothing: 0.0 - - -Inference Configurations ---------------------------------- -During inference, three types of sequence generation strategies can be applied: ``greedy search``, ``beam search`` and ``top-k search``. - -.. 
code-block:: yaml - - sequence_generator: - type: greedy # choices=[greedy, topk, beam] - max_sequence_length: ${model.embedding.max_sequence_length} - temperature: 1.0 # for top-k sampling - beam_size: 1 # K for top-k sampling, N for beam search - len_pen: 0 # for beam-search diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/data/benchmark_sis.csv b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/data/benchmark_sis.csv deleted file mode 100644 index 397e890486b721b2286eca53538b13175b884635..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/data/benchmark_sis.csv +++ /dev/null @@ -1,2 +0,0 @@ -Model Name,Model Base Class,Model Card -slu_conformer_transformer_large_slurp,SLUIntentSlotBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:slu_conformer_transformer_large_slurp" diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/datasets.rst deleted file mode 100644 index 3a0168a850a18231614ace194051fd07723dbd77..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/datasets.rst +++ /dev/null @@ -1,10 +0,0 @@ -Datasets -======== - -Input data should be provided in line delimited JSON format as below: - -.. code-block:: bash - - {"audio_filepath": "/path/to/abcd.wav", "offset": 0, "duration": 10.1, "text": "{'scenario': 'Calendar', 'action': 'Create_entry', 'entities': [{'type': 'event_name', 'filler': 'brunch'}, {'type': 'date', 'filler': 'Saturday'}, {'type': 'timeofday', 'filler': 'morning'}, {'type': 'person', 'filler': 'Aronson'}]}"} - -The semantics annotation is a Python dictionary flattened as a string, and indexed by the "text" key in the manifest. For a semantics annotation, there are three mandatory keys: "scenario", "action" and "entities". The values for "scenario" and "action" are strings, where the value for "entities" is a Python list of dictionary. Each item in "entities" is also a Python dictionary, with two keys "type" (entity slot) and "filler" (slot filler). diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/example.png b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/example.png deleted file mode 100644 index 1094247bb96a910d068ddbfa30f76381c7657e19..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/example.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/framework.png b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/framework.png deleted file mode 100644 index 3b64ae9b5f6555bda1f3e7e16ce19d109a50c76c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/images/framework.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/intro.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/intro.rst deleted file mode 100644 index 785df41e57b6019f7e019814f76c74a9490a5d46..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/intro.rst +++ /dev/null @@ -1,29 +0,0 @@ -Speech Intent Classification and Slot Filling -============================================== - -**Intent Classification and Slot Filling** aims to not only recognize the user's intention, but also detect entity slots and their corresponding lexical fillers. 
Below is an example: - - -.. image:: images/example.png - :align: center - :scale: 50% - :alt: slurp_example - - -Different from its counterpart in Natural Language Understanding (NLU) that takes text as input, here the model predicts the semantics directly from audio input. - - - -The full documentation tree is as follows: - -.. toctree:: - :maxdepth: 8 - - models - datasets - results - configs - api - resources - -.. include:: resources.rst diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/models.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/models.rst deleted file mode 100644 index bc628dd36ba56cb409e9ed90a8808260c1b432da..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/models.rst +++ /dev/null @@ -1,15 +0,0 @@ -Models -====== - -There are mainly two approaches in Speech Intent Classification and Slot Filling, where we can either use an End-to-End (E2E) model that directly predicts sematics from audio, or use a cascading model composed of an ASR model followed by an NLU model. E2E methods are preferred over cascading models, since it avoids error propagation from ASR to NLU and thus have better performance. - -Our E2E model in NeMo is based on an **Encoder-Decoder** framework, where a Conformer-large module is used as the encoder to extract features, and a Transformer Decoder is applied on top of the features to predict the semantics. - -.. image:: images/framework.png - :align: center - :scale: 70% - :alt: sis_framework - -The output is a Python dictionary object flattened as a string representation, so that the problem can be formulated as a sequence-to-sequence (audio-to-text) problem. - -The model is trained by Negative Log-Likelihood (NLL) Loss with teacher forcing. diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/resources.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/resources.rst deleted file mode 100644 index 017c184e243679f2d3bb4a11b56be7137592648c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/resources.rst +++ /dev/null @@ -1,11 +0,0 @@ -Resources and Documentation ---------------------------- - -Example of Speech Intent Classification and Slot Filling can be found `here `_. - -Information about how to load model checkpoints (either local files or pretrained ones from NGC), -as well as a list of the checkpoints available on NGC are located on the `Checkpoints <./results.html>`__ -page. - -Documentation regarding the configuration files specific to SLU can be found in the -`Configuration Files <./configs.html>`__ page. diff --git a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/results.rst b/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/results.rst deleted file mode 100644 index b22f99b31cd83734d9530c2a558b6e565e410515..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/speech_intent_slot/results.rst +++ /dev/null @@ -1,56 +0,0 @@ -Checkpoints -=========== - -There are two main ways to load pretrained checkpoints in NeMo: - -* Using the :code:`restore_from()` method to load a local checkpoint file (`.nemo`), or -* Using the :code:`from_pretrained()` method to download and set up a checkpoint from NGC. - -See the following sections for instructions and examples for each. - -Note that these instructions are for loading fully trained checkpoints for evaluation or fine-tuning. 
-For resuming an unfinished training experiment, please use the experiment manager to do so by setting the -``resume_if_exists`` flag to True. - -Loading Local Checkpoints -------------------------- - -NeMo will automatically save checkpoints of a model you are training in a `.nemo` format. -You can also manually save your models at any point using :code:`model.save_to(.nemo)`. - -If you have a local ``.nemo`` checkpoint that you'd like to load, simply use the :code:`restore_from()` method: - -.. code-block:: python - - import nemo.collections.asr as nemo_asr - model = nemo_asr.models..restore_from(restore_path="") - -Where the model base class is the ASR model class of the original checkpoint, or the general `ASRModel` class. - - -Inference ------------------------ - -The audio files should be 16KHz monochannel wav files. - -**Transcribe Audios to Semantics:** - -You may perform inference on a sample of speech after loading the model by using its 'transcribe()' method: - -.. code-block:: python - - slu_model = nemo_asr.models.SLUIntentSlotBPEModel.from_pretrained(model_name="") - predictions = slu_model.transcribe([list of audio files], batch_size="") - - -SLU Models ------------------------------------ - -Below is a list of all the Speech Intent Classification and Slot Filling models that are available in NeMo. - - -.. csv-table:: - :file: data/benchmark_sis.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/api.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/api.rst deleted file mode 100644 index 7103243a4b20c093ecd96f4c7cff64131321ff50..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/api.rst +++ /dev/null @@ -1,24 +0,0 @@ -NeMo SSL collection API -============================= - - -Model Classes -------------- -.. autoclass:: nemo.collections.asr.models.SpeechEncDecSelfSupervisedModel - :show-inheritance: - :members: - - -Mixins ------- - -.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin - :show-inheritance: - :members: - -.. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin - :show-inheritance: - :members: - - - diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/configs.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/configs.rst deleted file mode 100644 index f0194c10638cebb1b18ef2c56f751ec31efb389e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/configs.rst +++ /dev/null @@ -1,413 +0,0 @@ -NeMo SSL Configuration Files -============================ - -This page covers NeMo configuration file setup that is specific to models in the Speech Self-Supervised Pre-training collection. -For general information about how to set up and run experiments that is common to all NeMo models (e.g. -experiment manager and PyTorch Lightning trainer parameters), see the :doc:`../../core/core` page. - -Dataset Configuration ---------------------- - -Dataset configuration for self-supervised model is mostly the same as for standard ASR training, -covered `here <../configs.html#Dataset Configuration>`__. The main difference is that in order to perform contrastive loss, -we will need to mask an equivalent amount of patches for all utterances in a batch. This means that we want to avoid -the durations varying too significantly within a single batch. 
There are several ways you can achieve this in NeMo: - -1) The simplest way is to use the ``min_duration`` parameter in the dataset config, which will simply -discard all utterances below the specified length. This is a viable option if removing these utterances will not -significantly impact the total amount of hours of your dataset. - -2) If your dataset contains many long utterances (longer than ~16 seconds) with varying length, then you may instead -want to use the ``random_segment`` perturbation, which will sample segments of a certain length from the full sample at -runtime (samples below the provided segment length will be padded). You can enable this by adding the following to your -dataset config: - -.. code-block:: yaml - - augmentor: - random_segment: - prob: 1.0 - duration_sec: 16 # specify the duration you want - -3) You can also use bucketing to ensure similar utterance lengths within batches. -See `Bucketing documentation <../datasets.html#bucketing-datasets>`__. - -An example of SSL train and validation configuration should look similar to the following: - -.. code-block:: yaml - - model: - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: true - num_workers: 8 - pin_memory: false - use_start_end_token: true - trim_silence: false - max_duration: 16.7 - min_duration: 8.0 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "synced_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false - min_duration: 8.0 - - -Preprocessor Configuration --------------------------- - -Preprocessor helps to compute MFCC or mel spectrogram features that are given as inputs to model. -For details on how to write this section, refer to `Preprocessor Configuration <../configs.html#preprocessor-configuration>`__ - -Augmentation Configurations ---------------------------- - -For self-supervised pre-training, we recommend using the ``MaskedPatchAugmentation`` class for spectrogram masking. -This augmentation divides utterances into fixed size patches, and then masks a fixed amount/fraction of them. You can -also add ``freq_masks`` and ``freq_width`` to apply masking to frequency bands. - -If you are using contrastive loss with negatives sampled from masked steps in same utterance only, -make sure that the total amount of masked steps in each utterance will be big enough for the number of sampled negatives. -For example, if you are using 4x stride and want to sample 100 negatives, then you will need more than 400 masked steps. -If you are using the default ``patch_size`` of 48, then this means you will need to set ``mask_patches`` to at least 9. -When using a fraction of the total amount of patches instead of a fixed amount, you will need to make sure that the -minimum duration of your samples in large enough for the number of negatives to sample. - -.. code-block:: yaml - - spec_augment: - _target_: nemo.collections.asr.modules.MaskedPatchAugmentation - patch_size: 48 # size of a single patch - mask_patches: 0.5 # fraction of patches to mask (can be fixed int amount instead) - freq_masks: 3 # Cut three frequency bands - freq_width: 20 # ... 
of width 20 at maximum - - -Model Architecture Configurations ---------------------------------- - -Each configuration file should describe the model architecture being used for the experiment. For self-supervised pre-training, -we will typically train the encoder of the model and then re-use it for fine-tuning, so the encoder can be configured in the same way -as you would for an ASR model. Note that any ASR model encoder can be used with any of the available pre-training methods, -though, given the same model sizes, we find the best downstream results when using `Conformer <./models.html#Conformer-Transducer>`__. - -Unlike the encoders, the decoders and corresponding losses will be specific to the self-supervised pre-training, and are small enough that -you can discard them when transferring the model to fine-tuning. - -The most basic method of pre-training we can use is to have the model solve a contrastive task -(this is the approach used in wav2vec 2.0 :cite:`ssl-models-wav2vec2`) -We can define the corresponding decoder and loss configs in the following way for an encoder with stride 4x. - -.. code-block:: yaml - - decoder_out: 128 - - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.encoder.d_model} - feat_hidden: 128 - feat_out: ${model.decoder_out} - stride_layers: 0 - # if loss.combine_time_steps is less than the encoder stride, then a corresponding amount of stride_layers needs to - # be added to the decoder (here stride and combine_time_steps are both 4) - non_stride_layers: 0 - - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: ${model.preprocessor.features} - proj_dim: ${model.decoder_out} - combine_time_steps: 4 # how many spectrogram time steps are used for one target/representation for contrastive task - quantized_targets: true # should quantizer or linear layer be used - codebook_size: 300 # size of a single codebook for quantizer - num_groups: 2 # number of codebooks to use for quantizer - num_negatives: 100 # number of sampled negatives for each target - sample_from_same_utterance_only: true # should negatives be sampled only from the same utterance - sample_from_non_masked: false # should negatives be sampled from non-masked steps - -Note that in the above example we combine 4 steps from the input spectrogram into a single "token" for the loss, -which corresponds to the encoder stride 4x. We might want to use different values for "combine_time_steps" and encoder stride. -In that case, we will need to add stride layers to decoders to match the strides. We can use the following example config -for a Citrinet encoder with stride 8x. In order to go from stride 8x to 4x, we use a single ``stride_layer`` in the decoder -with ``stride_transpose`` set to True. - -.. 
code-block:: yaml - - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.model_defaults.enc_final} - feat_hidden: 128 - feat_out: ${model.model_defaults.decoder_out_channels} - stride_layers: 1 - #if loss.combine_time_steps is less than the encoder stride, then a corresponding amount of stride_layers needs to - #be added to the decoder (here stride is 8 and combine_time_steps is 4, so 1 stride layer is added) - non_stride_layers: 0 - stride_tranpose: true # whether to use transposed convolution for stride layers or not - - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: *n_mels - proj_dim: ${model.model_defaults.decoder_out_channels} - combine_time_steps: 4 #how many spectrogram time steps are used for one target/representation for contrastive task - quantized_targets: false #should quantizer or linear layer be used - sample_from_same_utterance_only: true #should negatives be sampled only from the same utterance - sample_from_non_masked: false #should negatives be sampled from non-masked steps - - -It can be beneficial to combine contrastive loss with other losses, such as a masked language modeling (mlm) loss -(similar approach to W2V-Bert :cite:`ssl-models-w2v_bert`). -In order to do this, instead of specifying a single ``decoder`` and ``loss`` in the config, we can specify a ``loss_list``, -which can contain any amount of corresponding decoders and losses. For each decoder-loss pair, -we can specify a separate named sub-config, which contains the following fields: - -1. ``decoder`` - The decoder config, specifying a ``target`` class and parameters. -2. ``loss`` - The corresponding loss config, specifying a ``target`` class and parameters. -3. ``loss_alpha`` - A multiplier on this loss (1.0 by default). -4. ``targets_from_loss`` - This parameter specifies which contrastive loss we should extract labels from. It is necessary for any loss which requires labels, if labels aren't present in your manifest. -5. ``transpose_encoded`` - This parameter is used to optionally transpose the encoded features before passing them into this loss. -6. ``start_step`` - The training step at which we should start using this decoder+loss. -7. ``output_from_layer`` - This parameter can be used to specify the name of the layer that we should extract encoded features from to pass into this decoder. If it's not specified or set to null, the final encoder layer is used. - - -The following is an example of a `loss_list` for a combination of contrastive+mlm losses, -where the mlm loss uses targets from the quantization module of the contrastive loss. - - -.. 
code-block:: yaml - - decoder_out: 128 - - loss_list: - contrastive: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.encoder.d_model} - feat_hidden: 128 - # features in hidden layer of decoder - feat_out: ${model.decoder_out} - stride_layers: 0 - # if loss.combine_time_steps is less than the encoder stride, then a corresponding amount of stride_layers needs to - # be added to the decoder (here stride and combine_time_steps are both 4) - non_stride_layers: 0 - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: ${model.preprocessor.features} - proj_dim: ${model.decoder_out} - combine_time_steps: 4 # how many spectrogram time steps are used for one target/representation for contrastive task - quantized_targets: true # should quantizer or linear layer be used - # (quantizer is required to extract pseudo-labels for other losses) - codebook_size: 300 - num_groups: 2 - sample_from_same_utterance_only: true # should negatives be sampled only from the same utterance - sample_from_non_masked: false # should negatives be sampled from non-masked steps - - mlm: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.encoder.d_model} - num_classes: 90000 - # set this to be equal to codebook_size^groups in the contrastive loss - loss: - _target_: nemo.collections.asr.losses.MLMLoss - combine_time_steps: 4 - targets_from_loss: "contrastive" - # since this loss requires targets, we can either get them from a manifest or from a quantized contrastive loss - loss_alpha: 1000. - # multiplier applied to this loss relative to others - transpose_encoded: false - # transposing input may be necessary depending on which layer is used as input to decoder - start_step: 0 - # determines what global step this loss starts being used at; - # this can be set to a higher number if your training is long enough, - # which may increase early training stability - output_from_layer: null - # if we wanted to use outputs from non-final encoder layer as input to this decoder, - # the layer name should be specified here - - -We can also use other losses which require labels instead of mlm, such as ctc or rnnt loss. Since these losses, unlike mlm, -don't require our targets to have a direct alignment with our steps, we may also want to use set the ``reduce_ids`` parameter of the -contrastive loss to true, to convert any sequence of consecutive equivalent ids to a single occurrence of that id. - -An example of a ``loss_list`` consisting of contrastive+ctc loss can look like this: - -.. code-block:: yaml - - decoder_out: 128 - - loss_list: - contr: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.encoder.d_model} - feat_hidden: 128 - feat_out: ${model.decoder_out} - stride_layers: 0 - non_stride_layers: 0 - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: ${model.preprocessor.features} - proj_dim: ${model.decoder_out} - combine_time_steps: 4 - quantized_targets: true - codebook_size: 300 - num_groups: 2 - sample_from_same_utterance_only: true - sample_from_non_masked: false - reduce_ids: true - - ctc: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.encoder.d_model} - num_classes: 90000 - loss: - _target_: nemo.collections.asr.losses.CTCLossForSSL - num_classes: 90000 - targets_from_loss: "contr" - start_step: 3000 - -An example of contrastive+rnnt can look like this: - -.. 
code-block:: yaml - - decoder_out: 128 - - loss_list: - contr: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.encoder.d_model} - feat_hidden: 128 - feat_out: ${model.decoder_out} - stride_layers: 0 - non_stride_layers: 0 - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: ${model.preprocessor.features} - proj_dim: ${model.decoder_out} - combine_time_steps: 4 - quantized_targets: true - codebook_size: 24 - sample_from_same_utterance_only: true - sample_from_non_masked: false - reduce_ids: true - - rnnt: - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoderJointSSL - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - vocab_size: 576 - prednet: - pred_hidden: 640 - pred_rnn_layers: 1 - t_max: null - dropout: 0.1 - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - experimental_fuse_loss_wer: false - jointnet: - encoder_hidden: 512 - pred_hidden: 640 - joint_hidden: 640 - activation: "relu" - dropout: 0.1 - num_classes: 576 - loss: - _target_: nemo.collections.asr.losses.RNNTLossForSSL - num_classes: 576 - targets_from_loss: "contr" - start_step: 1000 - - -We can also use multiple losses, which use features from different intermediate layers of the encoder as input :cite:`ssl-models-ssl_inter`. -In the following config example, we use contrastive loss + three different mlm losses, which use encoder outputs -respectively from 6th, 12th and final layer. - -.. code-block:: yaml - - decoder_out: 128 - - loss_list: - contr: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction - feat_in: ${model.encoder.d_model} - feat_hidden: 128 - feat_out: ${model.decoder_out} - stride_layers: 0 - non_stride_layers: 0 - loss: - _target_: nemo.collections.asr.losses.ContrastiveLoss - in_dim: ${model.preprocessor.features} - proj_dim: ${model.decoder_out} - combine_time_steps: 4 - quantized_targets: true - codebook_size: 300 - sample_from_same_utterance_only: true - sample_from_non_masked: false - loss_alpha: 5. - - mlm: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.encoder.d_model} - num_classes: 90000 - loss: - _target_: nemo.collections.asr.losses.MLMLoss - combine_time_steps: 4 - targets_from_loss: "contr" - loss_alpha: 1000. - - mlm_2: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.encoder.d_model} - num_classes: 90000 - loss: - _target_: nemo.collections.asr.losses.MLMLoss - combine_time_steps: 4 - targets_from_loss: "contr" - loss_alpha: 300. - output_from_layer: "layers.5" - transpose_encoded: true - - mlm_3: - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.encoder.d_model} - num_classes: 90000 - loss: - _target_: nemo.collections.asr.losses.MLMLoss - combine_time_steps: 4 - targets_from_loss: "contr" - loss_alpha: 300. - output_from_layer: "layers.11" - transpose_encoded: true - -References ------------ - -.. 
bibliography:: ../asr_all.bib - :style: plain - :labelprefix: SSL-MODELS - :keyprefix: ssl-models- \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/data/benchmark_ssl.csv b/SoundScribe/SpeakerID/docs/source/asr/ssl/data/benchmark_ssl.csv deleted file mode 100644 index 6085a5f3fd4f51d6bbb8dbfc24830f729e09d06b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/data/benchmark_ssl.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model Name,Model Base Class,Model Card -ssl_en_conformer_large,SpeechEncDecSelfSupervisedModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_large" -ssl_en_conformer_xlarge,SpeechEncDecSelfSupervisedModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_xlarge" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/datasets.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/datasets.rst deleted file mode 100644 index b597dcf4b7b3101b716197633a4aae88bb98de4c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/datasets.rst +++ /dev/null @@ -1,8 +0,0 @@ -Datasets -======== - -Any dataset available in NeMo for ASR (`ASR datasets <../datasets.html>`__) can be used for SSL. -To create your own NeMo compatible datasets, refer to -`Preparing Custom ASR Data <../datasets.html#preparing-custom-asr-data>`__ -section. Note that explicit labels (transcriptions) are not utilized in SSL and hence are optional -when creating datasets for SSL. \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/intro.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/intro.rst deleted file mode 100644 index d1a7366164d85a0576a0849b1002c403dc89a523..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/intro.rst +++ /dev/null @@ -1,34 +0,0 @@ -Self-Supervised Learning -================================= - -Self-Supervised Learning (SSL) refers to the problem of learning without explicit labels. As -any learning process require feedback, without explit labels, SSL derives supervisory signals from -the data itself. The general ideal of SSL is to predict any hidden part (or property) of the input -from observed part of the input (e.g., filling in the blanks in a sentence or predicting whether -an image is upright or inverted). - -SSL for speech/audio understanding broadly falls into either contrastive or reconstruction -based approaches. In contrastive methods, models learn by distinguishing between true and distractor -tokens (or latents). Examples of contrastive approaches are Contrastive Predictive Coding (CPC), -Masked Language Modeling (MLM) etc. In reconstruction methods, models learn by directly estimating -the missing (intentionally leftout) portions of the input. Masked Reconstruction, Autoregressive -Predictive Coding (APC) are few examples. - -In the recent past, SSL has been a major benefactor in improving Acoustic Modeling (AM), i.e., the -encoder module of neural ASR models. Here too, majority of SSL effort is focused on improving AM. -While it is common that AM is the focus of SSL in ASR, it can also be utilized in improving other parts of -ASR models (e.g., predictor module in transducer based ASR models). - -The full documentation tree is as follows: - -.. toctree:: - :maxdepth: 8 - - models - datasets - results - configs - api - resources - -.. 
include:: resources.rst diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/models.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/models.rst deleted file mode 100644 index 1713aff6472c3dac448389d0cf70838c9b325ce0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/models.rst +++ /dev/null @@ -1,12 +0,0 @@ -Models -====== - -End-to-End ASR models are typically of encoder-decoder style, where the encoder does acoustic -modeling i.e., converting speech wavform into features, and the decoder converts those features into -text. Encoder contains the bulk of trainable parameters and is usually the focus of SSL in ASR. -Thus, any architecture that can be used as encoder in ASR models can be pre-trained using SSL. For an -overview of model architectures that are currently supported in NeMo's ASR's collection, refer -to `ASR Models <../models.html>`__. Note that SSL also uses encoder-decoder style of models. During -down-stream fine-tuning, the encoder is retained where as the decoder (used during SSL) is replaced -with down-stream task specific module. Refer to `checkpoints <./results.html>`__ to see how this is -accomplished in NeMo. diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/resources.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/resources.rst deleted file mode 100644 index 416c37ceb6f09fc0d05360fce7fc361d5bc91af8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/resources.rst +++ /dev/null @@ -1,23 +0,0 @@ -Resources and Documentation ---------------------------- - -Refer to `SSL-for-ASR notebook `_ -for a hands-on tutorial. If you are a beginner to NeMo, consider trying out the -`ASR with NeMo `_ -tutorial. This and most other tutorials can be run on Google Colab by specifying the link to the -notebooks' GitHub pages on Colab. - -If you are looking for information about a particular ASR model, or would like to find out more -about the model architectures available in the ``nemo_asr`` collection, refer to the -`ASR Models <../models.html>`__ page. - -NeMo includes preprocessing scripts for several common ASR datasets. The `ASR Datasets <../datasets.html>`__ -page contains instructions on running those scripts. It also includes guidance for creating your -own NeMo-compatible dataset, if you have your own data. - -Information about how to load model checkpoints (either local files or pretrained ones from NGC), -as well as a list of the checkpoints available on NGC are located on the `Checkpoints <./results.html>`__ -page. - -Documentation regarding the configuration files specific to the SSL can be found in the -`Configuration Files <./configs.html>`__ page. diff --git a/SoundScribe/SpeakerID/docs/source/asr/ssl/results.rst b/SoundScribe/SpeakerID/docs/source/asr/ssl/results.rst deleted file mode 100644 index adc14b15285b85cb86e9ff662fc13a248e98d9fc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/asr/ssl/results.rst +++ /dev/null @@ -1,106 +0,0 @@ -Checkpoints -=========== - -Pre-trained SSL checkpoints available in NeMo need to be further fine-tuned on down-stream task. -There are two main ways to load pretrained checkpoints in NeMo: - -* Using the :code:`restore_from()` method to load a local checkpoint file (``.nemo``), or -* Using the :code:`from_pretrained()` method to download and set up a checkpoint from NGC. - -Refer to the following sections for instructions and examples for each. - -Note that these instructions are for fine-tuning. 
To resume an unfinished training experiment,
-use the Experiment Manager to do so by setting the ``resume_if_exists`` flag to ``True``.
-
-Loading Local Checkpoints
--------------------------
-
-NeMo automatically saves checkpoints of a model that is trained in the ``.nemo`` format. Alternatively, to manually save the model at any
-point, issue :code:`model.save_to("<checkpoint_path>.nemo")`.
-
-If there is a local ``.nemo`` checkpoint that you'd like to load, use the :code:`restore_from()` method:
-
-.. code-block:: python
-
-    import nemo.collections.asr as nemo_asr
-    ssl_model = nemo_asr.models.<model_base_class>.restore_from(restore_path="<path/to/model.nemo>")
-
-Where the model base class is the ASR model class of the original checkpoint, or the general ``ASRModel`` class.
-
-Loading NGC Pretrained Checkpoints
-----------------------------------
-
-The SSL collection has checkpoints of several models trained on various datasets. These checkpoints are
-obtainable via the NGC `NeMo Automatic Speech Recognition collection `_.
-The model cards on NGC contain more information about each of the available checkpoints.
-
-The table at the end of this page lists the SSL models available from NGC. The models can be accessed via the :code:`from_pretrained()` method inside
-the ASR Model class. In general, you can load any of these models with code in the following format:
-
-.. code-block:: python
-
-    import nemo.collections.asr as nemo_asr
-    ssl_model = nemo_asr.models.ASRModel.from_pretrained(model_name="<model_name>")
-
-Where ``model_name`` is the value under the "Model Name" entry in the table below.
-
-For example, to load the Conformer Large SSL checkpoint, run:
-
-.. code-block:: python
-
-    ssl_model = nemo_asr.models.ASRModel.from_pretrained(model_name="ssl_en_conformer_large")
-
-You can also call :code:`from_pretrained()` from the specific model class (such as :code:`SpeechEncDecSelfSupervisedModel`
-for Conformer) if you need to access specific model functionality.
-
-If you would like to programmatically list the models available for a particular base class, you can use the
-:code:`list_available_models()` method.
-
-.. code-block:: python
-
-    nemo_asr.models.<model_base_class>.list_available_models()
-
-
-Loading SSL checkpoint into Down-stream Model
----------------------------------------------
-After loading an SSL checkpoint as shown above, its ``state_dict`` needs to be copied to a
-down-stream model for fine-tuning.
-
-For example, to load an SSL checkpoint for an ASR down-stream task using ``EncDecRNNTBPEModel``, run:
-
-.. code-block:: python
-
-    # define down-stream model
-    asr_model = nemo_asr.models.EncDecRNNTBPEModel(cfg=cfg.model, trainer=trainer)
-
-    # load ssl checkpoint
-    asr_model.load_state_dict(ssl_model.state_dict(), strict=False)
-
-    # discard ssl model
-    del ssl_model
-
-Refer to `SSL configs <./configs.html>`__ to do this automatically via config files.
-
-
-Fine-tuning on Downstream Datasets
------------------------------------
-
-After loading the SSL checkpoint into the down-stream model, refer to the ASR tutorials provided in the :ref:`Tutorials ` section.
-Most of these tutorials explain how to fine-tune on some dataset as a demonstration.
-
-Inference Execution Flow Diagram
---------------------------------
-
-When preparing your own inference scripts after downstream fine-tuning, please follow the execution flow diagram order for correct inference, found in the `examples directory for the ASR collection `_.
-
-SSL Models
------------------------------------
-
-Below is a list of all the SSL models that are available in NeMo.
-
-
-..
csv-table:: - :file: data/benchmark_ssl.csv - :align: left - :widths: 40, 10, 50 - :header-rows: 1 diff --git a/SoundScribe/SpeakerID/docs/source/common/callbacks.rst b/SoundScribe/SpeakerID/docs/source/common/callbacks.rst deleted file mode 100644 index a627e0dd2ca25032cbedece07a2eaf8e4ed3ecc3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/callbacks.rst +++ /dev/null @@ -1,51 +0,0 @@ -********* -Callbacks -********* - -Exponential Moving Average (EMA) -================================ - -During training, EMA maintains a moving average of the trained parameters. -EMA parameters can produce significantly better results and faster convergence for a variety of different domains and models. - -EMA is a simple calculation. EMA Weights are pre-initialized with the model weights at the start of training. - -Every training update, the EMA weights are updated based on the new model weights. - -.. math:: - ema_w = ema_w * decay + model_w * (1-decay) - -Enabling EMA is straightforward. We can pass the additional argument to the experiment manager at runtime. - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \ - model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \ - trainer.devices=2 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - exp_manager.ema.enable=True # pass this additional argument to enable EMA - -To change the decay rate, pass the additional argument. - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - ... - exp_manager.ema.enable=True \ - exp_manager.ema.decay=0.999 - -We also offer other helpful arguments. - -.. list-table:: - :header-rows: 1 - - * - Argument - - Description - * - `exp_manager.ema.validate_original_weights=True` - - Validate the original weights instead of EMA weights. - * - `exp_manager.ema.every_n_steps=2` - - Apply EMA every N steps instead of every step. - * - `exp_manager.ema.cpu_offload=True` - - Offload EMA weights to CPU. May introduce significant slow-downs. diff --git a/SoundScribe/SpeakerID/docs/source/common/data.rst b/SoundScribe/SpeakerID/docs/source/common/data.rst deleted file mode 100644 index 4c2f38cbba833f01caf0000d0f3d12277349a0f8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/data.rst +++ /dev/null @@ -1,13 +0,0 @@ -Data ----- - -.. autoclass:: nemo.collections.common.data.dataset.ConcatDataset - :show-inheritance: - :members: - :undoc-members: - - -.. autoclass:: nemo.collections.common.data.dataset.ConcatMapDataset - :show-inheritance: - :members: - :undoc-members: diff --git a/SoundScribe/SpeakerID/docs/source/common/intro.rst b/SoundScribe/SpeakerID/docs/source/common/intro.rst deleted file mode 100644 index fadbd9528485e1cd268e7ee05569cb0571603c20..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/intro.rst +++ /dev/null @@ -1,13 +0,0 @@ -Common Collection -================= - -The common collection contains things that could be used across all collections. - -.. 
toctree:: - :maxdepth: 8 - - callbacks - losses - metrics - tokenizers - data diff --git a/SoundScribe/SpeakerID/docs/source/common/losses.rst b/SoundScribe/SpeakerID/docs/source/common/losses.rst deleted file mode 100644 index 006746face297a696099a3b1151c30096debb92d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/losses.rst +++ /dev/null @@ -1,16 +0,0 @@ -Losses ------- -.. autoclass:: nemo.collections.common.losses.AggregatorLoss - :special-members: __init__ - -.. autoclass:: nemo.collections.common.losses.CrossEntropyLoss - :special-members: __init__ - -.. autoclass:: nemo.collections.common.losses.MSELoss - :special-members: __init__ - -.. autoclass:: nemo.collections.common.losses.SmoothedCrossEntropyLoss - :special-members: __init__ - -.. autoclass:: nemo.collections.common.losses.SpanningLoss - :special-members: __init__ diff --git a/SoundScribe/SpeakerID/docs/source/common/metrics.rst b/SoundScribe/SpeakerID/docs/source/common/metrics.rst deleted file mode 100644 index a47bd9f6f09b7e66751bcad60453586157d61134..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/metrics.rst +++ /dev/null @@ -1,7 +0,0 @@ -Metrics -------- - -.. autoclass:: nemo.collections.common.metrics.Perplexity - :show-inheritance: - :members: - :undoc-members: diff --git a/SoundScribe/SpeakerID/docs/source/common/tokenizers.rst b/SoundScribe/SpeakerID/docs/source/common/tokenizers.rst deleted file mode 100644 index 5c7336e8d603bd31202223fb1f8479f3b4f58186..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/common/tokenizers.rst +++ /dev/null @@ -1,8 +0,0 @@ -Tokenizers ----------- -.. autoclass:: nemo.collections.common.tokenizers.AutoTokenizer - :special-members: __init__ -.. autoclass:: nemo.collections.common.tokenizers.SentencePieceTokenizer - :special-members: __init__ -.. autoclass:: nemo.collections.common.tokenizers.TokenizerSpec - :special-members: __init__ diff --git a/SoundScribe/SpeakerID/docs/source/conf.py b/SoundScribe/SpeakerID/docs/source/conf.py deleted file mode 100644 index 952e25332ca43257f1a172e9495fd66c1e883410..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/conf.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import sys -import glob - -import sphinx_book_theme - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
- -sys.path.insert(0, os.path.abspath("../..")) -sys.path.insert(0, os.path.abspath("../../nemo")) - -from package_info import __version__ - -templates_path = ["_templates"] - -autodoc_mock_imports = [ - 'torch', - 'torch.nn', - 'torch.utils', - 'torch.optim', - 'torch.utils.data', - 'torch.utils.data.sampler', - 'torchtext', - 'torchvision', - 'ruamel.yaml', # ruamel.yaml has ., which is troublesome for this regex - 'hydra', # hydra-core in requirements, hydra during import - 'dateutil', # part of core python - 'transformers.tokenization_bert', # has ., troublesome for this regex - 'sklearn', # scikit_learn in requirements, sklearn in import - 'nemo_text_processing.inverse_text_normalization', # Not installed automatically - 'nemo_text_processing.text_normalization', # Not installed automatically - 'attr', # attrdict in requirements, attr in import - 'torchmetrics', # inherited from PTL - 'lightning_utilities', # inherited from PTL - 'lightning_fabric', - 'apex', - 'megatron.core', - 'transformer_engine', - 'joblib', # inherited from optional code - 'IPython', - 'ipadic', - 'psutil', - 'regex', -] - -_skipped_autodoc_mock_imports = ['wrapt', 'numpy'] - -for req_path in sorted(list(glob.glob("../../requirements/*.txt"))): - if "docs.txt" in req_path: - continue - - req_file = os.path.abspath(os.path.expanduser(req_path)) - with open(req_file, 'r') as f: - for line in f: - line = line.replace("\n", "") - req = re.search(r"([a-zA-Z0-9-_]*)", line) - if req: - req = req.group(1) - req = req.replace("-", "_") - - if req not in autodoc_mock_imports: - if req in _skipped_autodoc_mock_imports: - print(f"Skipping req : `{req}` (lib {line})") - continue - - autodoc_mock_imports.append(req) - print(f"Adding req : `{req}` to autodoc mock requirements (lib {line})") - else: - print(f"`{req}` already added to autodoc mock requirements (lib {line})") - -# -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. - -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.todo", - "sphinx.ext.coverage", - "sphinx.ext.mathjax", - "sphinx.ext.ifconfig", - "sphinx.ext.viewcode", - "sphinx.ext.napoleon", - "sphinx.ext.githubpages", - "sphinxcontrib.bibtex", - "sphinx.ext.inheritance_diagram", - "sphinx.ext.intersphinx", - "sphinx.ext.autosectionlabel", - "sphinxcontrib.bibtex", - "sphinx_copybutton", - "sphinxext.opengraph", -] - -bibtex_bibfiles = [ - 'asr/asr_all.bib', - 'nlp/nlp_all.bib', - 'nlp/text_normalization/tn_itn_all.bib', - 'tools/tools_all.bib', - 'tts/tts_all.bib', - 'text_processing/text_processing_all.bib', - 'core/adapters/adapter_bib.bib', -] - -intersphinx_mapping = { - 'pytorch': ('https://pytorch.org/docs/stable', None), - 'pytorch-lightning': ('https://pytorch-lightning.readthedocs.io/en/latest/', None), -} - -# Set default flags for all classes. -autodoc_default_options = {'members': None, 'undoc-members': None, 'show-inheritance': True} - -locale_dirs = ['locale/'] # path is example but recommended. -gettext_compact = False # optional. - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# General information about the project. 
-project = "NVIDIA NeMo" -copyright = "© 2021-2023 NVIDIA Corporation & Affiliates. All rights reserved." -author = "NVIDIA CORPORATION" - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. - - -# The short X.Y version. -# version = "0.10.0" -version = __version__ -# The full version, including alpha/beta/rc tags. -# release = "0.9.0" -release = __version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "default" - -### Previous NeMo theme -# # NVIDIA theme settings. -# html_theme = 'nvidia_theme' - -# html_theme_path = ["."] - -# html_theme_options = { -# 'display_version': True, -# 'project_version': version, -# 'project_name': project, -# 'logo_path': None, -# 'logo_only': True, -# } -# html_title = 'Introduction' - -# html_logo = html_theme_options["logo_path"] - -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = "nemodoc" - -### from TLT conf.py -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# - -# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -html_theme = "sphinx_book_theme" -html_logo = os.path.join('nv_logo.png') -html_title = 'NVIDIA NeMo' - -html_theme_options = { - 'logo_only': True, - 'display_version': True, - # 'prev_next_buttons_location': 'bottom', - # 'style_external_links': False, - # 'style_nav_header_background': '#000000', - # Toc options - 'collapse_navigation': False, - # 'sticky_navigation': False, - 'navigation_depth': 10, - # 'includehidden': False, - # 'titles_only': False, - # Sphinx Book theme, - 'repository_url': 'https://github.com/NVIDIA/NeMo', - 'use_repository_button': True, - 'show_navbar_depth': 1, - 'show_toc_level': 10, -} - - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
- -html_favicon = 'favicon.ico' - -html_static_path = ['_static'] - -html_last_updated_fmt = '' - - -def setup(app): - app.add_css_file('css/custom.css') - app.add_js_file('js/pk_scripts.js') - - -# html_css_files = [ -# './custom.css', -# ] - -# html_js_files = [ -# './pk_scripts.js', -# ] - -# OpenGraph settings -ogp_site_url = 'https://nvidia.github.io/NeMo/' -ogp_image = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/_static/nv_logo.png' - -# MathJax CDN -mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/mml-chtml.min.js" diff --git a/SoundScribe/SpeakerID/docs/source/core/adapters/adapter_bib.bib b/SoundScribe/SpeakerID/docs/source/core/adapters/adapter_bib.bib deleted file mode 100644 index 9a04d876f162657f83cda03a228672efd08a323e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/adapters/adapter_bib.bib +++ /dev/null @@ -1,21 +0,0 @@ - - -@inproceedings{houlsby2019adapter, - title={Parameter-efficient transfer learning for NLP}, - author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain}, - booktitle={International Conference on Machine Learning}, - pages={2790--2799}, - year={2019}, - organization={PMLR} -} - -@misc{Junxian2021unified, - doi = {10.48550/ARXIV.2110.04366}, - url = {https://arxiv.org/abs/2110.04366}, - author = {He, Junxian and Zhou, Chunting and Ma, Xuezhe and Berg-Kirkpatrick, Taylor and Neubig, Graham}, - keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, - title = {Towards a Unified View of Parameter-Efficient Transfer Learning}, - publisher = {arXiv}, - year = {2021}, - copyright = {arXiv.org perpetual, non-exclusive license} -} diff --git a/SoundScribe/SpeakerID/docs/source/core/adapters/api.rst b/SoundScribe/SpeakerID/docs/source/core/adapters/api.rst deleted file mode 100644 index b0f2a8e13610f5dc6120fc8e32fbf3402b7981da..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/adapters/api.rst +++ /dev/null @@ -1,65 +0,0 @@ -Adapters API -============ - -Core ----- - -.. autoclass:: nemo.core.adapter_mixins.AdapterModuleMixin - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - -.. autoclass:: nemo.core.adapter_mixins.AdapterModelPTMixin - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - -Adapter Networks ----------------- - - -.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter - :show-inheritance: - :members: - :member-order: bysource - ------ - -Adapter Strategies ------------------- - - -.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - -.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.ReturnResultAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - -.. 
autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names diff --git a/SoundScribe/SpeakerID/docs/source/core/adapters/components.rst b/SoundScribe/SpeakerID/docs/source/core/adapters/components.rst deleted file mode 100644 index cc2ea0b525df206981adc98608983404b7c95c2c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/adapters/components.rst +++ /dev/null @@ -1,90 +0,0 @@ -Adapter Components -================== - -Adapters can be considered as any set of parameters that are added to a pre-existing module/model. In our case, we currently support the standard adapter in literature, more advanced adapter modules are being researched and can potentially be supported by NeMo. - -An adapter module can be any pytorch module, but it must follow certain straightforward requirements - - -1) The model accepts an input of some input dimension, and its output must match this dimension. -2) Ideally, the module is initialized such that the output of the adapter when initialized is such that it does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added. - -According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider an adapter being represented as three components - - -1) Functional form - the trainable parameters that will modify the input -2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the modules forward pass). -3) Composition function - How the adapters outputs are integrated with the inputs. It can be as simple as residual addition connection, or concatenation, or point-wise multiplication etc. - -Functional Form - Adapter Networks -================================== - -Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`. - -.. note:: - - All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AdapterModuleUtil` and should ideally have an equivalent DataClass config for easy instantiation ! - - -.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter - :show-inheritance: - :members: - :member-order: bysource - - -Insertion Form - Module Adapters --------------------------------- - -Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers. 
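-
-As a purely schematic sketch (not the NeMo implementation), the two most common insertion forms differ only in
-which tensor is passed to the adapter and where its output is added back:
-
-.. code-block:: python
-
-    # Schematic only: ``module`` is an arbitrary torch.nn.Module and ``adapter`` is an
-    # adapter network whose input/output dimensions match the tensors shown.
-
-    def sequential_adapter(module, adapter, x):
-        # The adapter consumes the module's output (e.g. the final layer's activations).
-        h = module(x)
-        return h + adapter(h)
-
-    def parallel_adapter(module, adapter, x):
-        # The adapter operates on the module's *input*, in parallel to the module's forward pass.
-        return module(x) + adapter(x)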
- -On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``. - -``Module Adapters`` are very simply defined when adding an adapter - by specifying the module that the adapter should be inserted into. - -.. code-block:: python - - # Get the list of supported modules / locations in a adapter compatible Model - print(model.adapter_module_names) # assume ['', 'encoder', 'decoder'] - - # When calling add_adapter, specify the module name in the left of the colon symbol, and the adapter name afterwords. - # The adapter is then directed to the decoder module instead of the default / encoder module. - model.add_adapter("decoder:first_adapter", cfg=...) - -You might note that ``model.adapter_module_names`` can sometimes return ``''`` as one of the supported module names - this refers to the "default module". Generally we try to provide the default as the most commonly used adapter in literature - for example, Encoder adapters in NLP/NMT/ASR. - -Composition Function - Adapter Strategies ------------------------------------------ - -Finally, we discuss how to compose the input and output of adapter modules. In order to generalize this step, we construct ``Adapter Strategies``. -A strategy is any class (not torch.nn.Module!) that extends :class:`~nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy`, and provides a ``forward()`` method that accepts a specific signature of the inputs and produces an output tensor which combines the input and output with some specific method. - -We discuss a simple residual additional connection strategy below - that accepts an input to the adapter and an adapters output and simply adds them together. It also supports ``stochastic_depth`` which enables adapters to be dynamically switched off during training, making training more robust. - -.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - -.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - ------ - - -References ----------- - -.. bibliography:: ./adapter_bib.bib - :style: plain - :keyprefix: adapters- diff --git a/SoundScribe/SpeakerID/docs/source/core/adapters/intro.rst b/SoundScribe/SpeakerID/docs/source/core/adapters/intro.rst deleted file mode 100644 index fd94c8d2344660cb93942011748bce0165855ee0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/adapters/intro.rst +++ /dev/null @@ -1,147 +0,0 @@ -Adapters -======== - -In NeMo, we often train models and fine-tune them for a specific task. This is a reasonable approach when the models are just a few million parameters. However, this approach quickly becomes infeasible when approaching hundreds of millions or even billions of parameters. As a potential solution to such a scenario, where fine-tuning a massive model is no longer feasible, we look to `Adapters `_ :cite:`adapters-houlsby2019adapter` to specialize our model on a specific domain or task. 
Adapters require a fraction of the total number of parameters as the original model and are much more efficient to fine-tune. - -.. note:: - - For a detailed tutorial on adding ``Adapter`` support to any PyTorch module, please refer to the `Tutorials for NeMo Adapters <../../starthere/tutorials.html>`_. - - -What are Adapters? ------------------- - -Adapters are a straightforward concept - one formulation can be shown by the diagram below. At their simplest, they are residual Feedforward layers that compress the input dimension (:math:`D`) to a small bottleneck dimension (:math:`H`), such that :math:`R^D \text{->} R^H`, compute an activation (such as ReLU), finally mapping :math:`R^H \text{->} R^D` with another Feedforward layer. This output is then added to the input via a simple residual connection. - -.. raw:: html - -
-    <!-- Diagram of the residual adapter (bottleneck feedforward) block; image not included in this copy. -->
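-
-A minimal sketch of such a residual adapter, written in plain PyTorch, is shown below. It is purely
-illustrative (the class name and arguments are hypothetical) and is not the NeMo ``LinearAdapter``
-implementation:
-
-.. code-block:: python
-
-    import torch
-    import torch.nn as nn
-
-    class TinyResidualAdapter(nn.Module):
-        """Illustrative bottleneck adapter: maps R^D -> R^H -> R^D with a residual connection."""
-
-        def __init__(self, dim: int, hidden: int):
-            super().__init__()
-            self.down = nn.Linear(dim, hidden)   # compress D -> H
-            self.activation = nn.ReLU()
-            self.up = nn.Linear(hidden, dim)     # expand H -> D
-            # Zero-init the up-projection so the adapter initially acts as an identity mapping.
-            nn.init.zeros_(self.up.weight)
-            nn.init.zeros_(self.up.bias)
-
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            return x + self.up(self.activation(self.down(x)))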
- -Adapter modules such as this are usually initialized such that the initial output of the adapter will always be zeros so as to prevent degradation of the original model's performance due to addition of such modules. - -``torch.nn.Module`` with Adapters ---------------------------------- - -In NeMo, Adapters are supported via a ``Mixin`` class that can be attached to any ``torch.nn.Module``. Such a module will -have multiple additional methods which will enable adapter capabilities in that module. - -.. code-block:: python - - # Import the adapter mixin from NeMo - from nemo.core import adapter_mixins - - # NOTE: See the *two* classes being inherited here ! - class MyModule(torch.nn.Module, adapter_mixins.AdapterModuleMixin): - pass - - -AdapterModuleMixin ------------------- -Let's look into what :class:`~nemo.core.adapter_mixins.AdapterModuleMixin` adds to the general PyTorch module. Some of the most important methods that are required are listed below : - -1) ``add_adapter``: Used to add an adapter with a unique name to the module. -2) ``get_enabled_adapters``: Returns a list of names of all enabled adapter modules. -3) ``set_enabled_adapters``: Sets whether a single (or all) adapters are enabled or disabled. -4) ``is_adapter_available``: Check if any adapter is available and enabled or not. - -Modules that extend this mixin usually can directly use these methods without extending them, but we will cover a case below -where you may wish to extend these methods. - -.. autoclass:: nemo.core.adapter_mixins.AdapterModuleMixin - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - - -Using the Adapter Module ------------------------- - -Now that ``MyModule`` supports adapters, we can easily add adapters, set their state, check if they are available and -perform their forward pass. Note that if multiple adapters are enabled, they are called in a chain, the output of the previous adapter is passed as input to the next adapter and so on. - -.. code-block:: python - - # Import the adapter mixin and modules from NeMo - import torch - from nemo.core import adapter_mixins - from nemo.collections.common.parts import adapter_modules - - class MyModule(torch.nn.Module, adapter_mixins.AdapterModuleMixin): - - def forward(self, x: torch.Tensor) -> torch.Tensor: - output = self.layers(x) # assume self.layers is some Sequential() module - - if self.is_adapter_available(): # check if adapters were added or not - output = self.forward_enabled_adapters() # perform the forward of all enabled adapters in a chain - - return output - - # Now let us create a module, add an adapter and do a forward pass with some random inputs - module = MyModule(dim) # assume dim is some input and output dimension of the module. - - # Add an adapter - module.add_adapter("first_adapter", cfg=adapter_modules.LinearAdapter(in_features=dim, dim=5)) - - # Check if adapter is available - module.is_adapter_available() # returns True - - # Check the name(s) of the enabled adapters - module.get_enabled_adapters() # returns ['first_adapter'] - - # Set the state of the adapter (by name) - module.set_enabled_adapters(name="first_adapter", enabled=True) - - # Freeze all the weights of the original module (equivalent to calling module.freeze() for a NeuralModule) - for param in module.parameters(): - param.requires_grad = False - - # Unfreeze only the adapter weights (so that we finetune only the adapters and not the original weights !) 
- module.unfreeze_enabled_adapters() - - # Now you can train this model's adapters ! - input_data = torch.randn(4, dim) # assume dim is the input-output dim of the module - outputs_with_adapter = module(input_data) - - # Compute loss and backward ... - - -Adapter Compatible Models -------------------------- - -If the goal was to support adapters in a single module, then the goal has been accomplished. In the real world however, we -build large composite models out of multiple modules and combine them to build a final model that we then train. We do this using the -:class:`~nemo.core.adapter_mixins.AdapterModelPTMixin`. - -.. note:: - - For an in-depth guide to supporting hierarchical adapter modules, please refer to the `Tutorials for NeMo Adapters <../../starthere/tutorials.html>`_. - -.. autoclass:: nemo.core.adapter_mixins.AdapterModelPTMixin - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: adapter_module_names - -Below, we will discuss some useful functionality of Adapter Compatible Models. - -1) ``Save and restore a Model with adapter capability``: Any NeMo model that implements this class correctly can save and restore NeMo models with adapter capabilities, thereby allowing sharing of adapters. -2) ``save_adapters`` and ``load_adapters``: Adapters are usually a very small number of parameters, there is no need for the entire model to be duplicated for each adapter. This method allows storing just the adapter module(s) separately from the Model, so that you can use the same "base" model, and share just the Adapter modules. - - -.. toctree:: - :maxdepth: 8 - :caption: Adapters - - components - api - - -References ----------- - -.. bibliography:: ./adapter_bib.bib - :style: plain - :keyprefix: adapters- diff --git a/SoundScribe/SpeakerID/docs/source/core/api.rst b/SoundScribe/SpeakerID/docs/source/core/api.rst deleted file mode 100644 index aca0d5bec9c7590881351c18a29fa6b13f9f56ea..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/api.rst +++ /dev/null @@ -1,119 +0,0 @@ - -Core APIs -========= - -Base class for all NeMo models ------------------------------- - -.. autoclass:: nemo.core.ModelPT - :show-inheritance: - :members: - :member-order: bysource - :undoc-members: cfg, num_weights - :exclude-members: set_eff_save, use_eff_save, teardown - -Base Neural Module class ------------------------- - -.. autoclass:: nemo.core.NeuralModule - :show-inheritance: - :members: - :member-order: bysource - -Base Mixin classes ------------------- - -.. autoclass:: nemo.core.Typing - :show-inheritance: - :members: - :member-order: bysource - :private-members: - :exclude-members: _abc_impl - :noindex: - ------ - -.. autoclass:: nemo.core.Serialization - :show-inheritance: - :members: - :member-order: bysource - :noindex: - ------ - -.. autoclass:: nemo.core.FileIO - :show-inheritance: - :members: - :member-order: bysource - :noindex: - - -Base Connector classes ----------------------- - -.. autoclass:: nemo.core.connectors.save_restore_connector.SaveRestoreConnector - :show-inheritance: - :members: - :member-order: bysource - -Neural Type checking --------------------- - -.. autoclass:: nemo.core.classes.common.typecheck - :show-inheritance: - :members: - :member-order: bysource - - .. automethod:: __call__ - -Neural Type classes -------------------- - -.. autoclass:: nemo.core.neural_types.NeuralType - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. 
autoclass:: nemo.core.neural_types.axes.AxisType - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.core.neural_types.elements.ElementType - :show-inheritance: - :members: - :member-order: bysource - ------ - -.. autoclass:: nemo.core.neural_types.comparison.NeuralTypeComparisonResult - :show-inheritance: - :members: - :member-order: bysource - -Experiment manager ------------------- - -.. autoclass:: nemo.utils.exp_manager.exp_manager - :show-inheritance: - :members: - :member-order: bysource - -.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig - :show-inheritance: - :members: - :member-order: bysource - - -Exportable ----------- - -.. autoclass:: nemo.core.classes.exportable.Exportable - :show-inheritance: - :members: - :member-order: bysource - diff --git a/SoundScribe/SpeakerID/docs/source/core/core.rst b/SoundScribe/SpeakerID/docs/source/core/core.rst deleted file mode 100644 index 7b2edfa0f5c419f02aa45bd416ef37bb8b4fab0f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/core.rst +++ /dev/null @@ -1,785 +0,0 @@ -NeMo Models -=========== - -Basics ------- - -NeMo models contain everything needed to train and reproduce Conversational AI models: - -- neural network architectures -- datasets/data loaders -- data preprocessing/postprocessing -- data augmentors -- optimizers and schedulers -- tokenizers -- language models - -NeMo uses `Hydra `_ for configuring both NeMo models and the PyTorch Lightning Trainer. - -.. note:: Every NeMo model has an example configuration file and training script that can be found `here `_. - -The end result of using NeMo, `Pytorch Lightning `_, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem. - -Pretrained ----------- - -NeMo comes with many pretrained models for each of our collections: ASR, NLP, and TTS. Every pretrained NeMo model can be downloaded -and used with the ``from_pretrained()`` method. - -As an example, we can instantiate QuartzNet with the following: - -.. code-block:: Python - - import nemo.collections.asr as nemo_asr - - model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En") - -To see all available pretrained models for a specific NeMo model, use the ``list_available_models()`` method. - -.. code-block:: Python - - nemo_asr.models.EncDecCTCModel.list_available_models() - -For detailed information on the available pretrained models, refer to the collections documentation: - -- :ref:`Automatic Speech Recognition (ASR)` -- :doc:`Natural Language Processing (NLP) <../nlp/models>` -- :doc:`Text-to-Speech Synthesis (TTS) <../tts/intro>` - -Training --------- - -NeMo leverages `PyTorch Lightning `_ for model training. PyTorch Lightning lets NeMo decouple the -conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and -build complex AI applications without having to rewrite boiler plate code for PyTorch training. - -When using PyTorch Lightning, NeMo users can automatically train with: - -- multi-GPU/multi-node -- mixed precision -- model checkpointing -- logging -- early stopping -- and more - -The two main aspects of the Lightning API are the `LightningModule `_ -and the `Trainer `_. - -PyTorch Lightning ``LightningModule`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Every NeMo model is a ``LightningModule`` which is an ``nn.module``. 
This means that NeMo models are compatible with the PyTorch -ecosystem and can be plugged into existing PyTorch workflows. - -Creating a NeMo model is similar to any other PyTorch workflow. We start by initializing our model architecture, then define the forward pass: - -.. code-block:: python - - class TextClassificationModel(NLPModel, Exportable): - ... - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """Initializes the BERTTextClassifier model.""" - ... - super().__init__(cfg=cfg, trainer=trainer) - - # instantiate a BERT based encoder - self.bert_model = get_lm_model( - config_file=cfg.language_model.config_file, - config_dict=cfg.language_model.config, - vocab_file=cfg.tokenizer.vocab_file, - trainer=trainer, - cfg=cfg, - ) - - # instantiate the FFN for classification - self.classifier = SequenceClassifier( - hidden_size=self.bert_model.config.hidden_size, - num_classes=cfg.dataset.num_classes, - num_layers=cfg.classifier_head.num_output_layers, - activation='relu', - log_softmax=False, - dropout=cfg.classifier_head.fc_dropout, - use_transformer_init=True, - idx_conditioned_on=0, - ) - -.. code-block:: python - - def forward(self, input_ids, token_type_ids, attention_mask): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - hidden_states = self.bert_model( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask - ) - logits = self.classifier(hidden_states=hidden_states) - return logits - -The ``LightningModule`` organizes PyTorch code so that across all NeMo models we have a similar look and feel. -For example, the training logic can be found in ``training_step``: - -.. code-block:: python - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - input_ids, input_type_ids, input_mask, labels = batch - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - train_loss = self.loss(logits=logits, labels=labels) - - lr = self._optimizer.param_groups[0]['lr'] - - self.log('train_loss', train_loss) - self.log('lr', lr, prog_bar=True) - - return { - 'loss': train_loss, - 'lr': lr, - } - -While validation logic can be found in ``validation_step``: - -.. code-block:: python - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. - """ - if self.testing: - prefix = 'test' - else: - prefix = 'val' - - input_ids, input_type_ids, input_mask, labels = batch - logits = self.forward(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) - - val_loss = self.loss(logits=logits, labels=labels) - - preds = torch.argmax(logits, axis=-1) - - tp, fn, fp, _ = self.classification_report(preds, labels) - - return {'val_loss': val_loss, 'tp': tp, 'fn': fn, 'fp': fp} - -PyTorch Lightning then handles all of the boiler plate code needed for training. Virtually any aspect of training can be customized -via PyTorch Lightning `hooks `_, -`Plugins `_, -`callbacks `_, or by overriding `methods `_. 
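-
-For instance, because every NeMo model is a ``LightningModule``, a standard Lightning ``Callback`` can be
-attached to the ``Trainer`` used to fit it. The callback below is only a toy sketch (the class and message are
-made up for illustration); ``pytorch_lightning.Callback`` and the ``callbacks`` argument of the ``Trainer`` are
-standard Lightning APIs:
-
-.. code-block:: python
-
-    import pytorch_lightning as pl
-
-    class PrintEpochCallback(pl.Callback):
-        """Toy callback that prints a message at the end of every training epoch."""
-
-        def on_train_epoch_end(self, trainer, pl_module):
-            print(f"Finished epoch {trainer.current_epoch}")
-
-    # NeMo models are LightningModules, so standard Lightning callbacks work as-is:
-    trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=10, callbacks=[PrintEpochCallback()])
-    # trainer.fit(model)  # where `model` is any NeMo model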
- -For more domain-specific information, see: - -- :ref:`Automatic Speech Recognition (ASR) <../asr/intro>` -- :ref:`Natural Language Processing (NLP) <../nlp/models>` -- :ref:`Text-to-Speech Synthesis (TTS) <../tts/intro>` - -PyTorch Lightning Trainer -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Since every NeMo model is a ``LightningModule``, we can automatically take advantage of the PyTorch Lightning ``Trainer``. Every NeMo -`example `_ training script uses the ``Trainer`` object to fit the model. - -First, instantiate the model and trainer, then call ``.fit``: - -.. code-block:: python - - # We first instantiate the trainer based on the model configuration. - # See the model configuration documentation for details. - trainer = pl.Trainer(**cfg.trainer) - - # Then pass the model configuration and trainer object into the NeMo model - model = TextClassificationModel(cfg.model, trainer=trainer) - - # Now we can train with by calling .fit - trainer.fit(model) - - # Or we can run the test loop on test data by calling - trainer.test(model=model) - -All `trainer flags `_ can be set from from the -NeMo configuration. - - -Configuration -------------- - -Hydra is an open-source Python framework that simplifies configuration for complex applications that must bring together many different -software libraries. Conversational AI model training is a great example of such an application. To train a conversational AI model, we -must be able to configure: - -- neural network architectures -- training and optimization algorithms -- data pre/post processing -- data augmentation -- experiment logging/visualization -- model checkpointing - -For an introduction to using Hydra, refer to the `Hydra Tutorials `_. - -With Hydra, we can configure everything needed for NeMo with three interfaces: - -- Command Line (CLI) -- Configuration Files (YAML) -- Dataclasses (Python) - -YAML -~~~~ - -NeMo provides YAML configuration files for all of our `example `_ training scripts. -YAML files make it easy to experiment with different model and training configurations. - -Every NeMo example YAML has the same underlying configuration structure: - -- trainer -- exp_manager -- model - -Model configuration always contain ``train_ds``, ``validation_ds``, ``test_ds``, and ``optim``. Model architectures vary across -domains, therefore, refer to the ASR, NLP, and TTS Collections documentation for more detailed information on Model architecture configuration. - -A NeMo configuration file should look similar to the following: - -.. 
code-block:: yaml - - # PyTorch Lightning Trainer configuration - # any argument of the Trainer object can be set here - trainer: - devices: 1 # number of gpus per node - accelerator: gpu - num_nodes: 1 # number of nodes - max_epochs: 10 # how many training epochs to run - val_check_interval: 1.0 # run validation after every epoch - - # Experiment logging configuration - exp_manager: - exp_dir: /path/to/my/nemo/experiments - name: name_of_my_experiment - create_tensorboard_logger: True - create_wandb_logger: True - - # Model configuration - # model network architecture, train/val/test datasets, data augmentation, and optimization - model: - train_ds: - manifest_filepath: /path/to/my/train/manifest.json - batch_size: 256 - shuffle: True - validation_ds: - manifest_filepath: /path/to/my/validation/manifest.json - batch_size: 32 - shuffle: False - test_ds: - manifest_filepath: /path/to/my/test/manifest.json - batch_size: 32 - shuffle: False - optim: - name: novograd - lr: .01 - betas: [0.8, 0.5] - weight_decay: 0.001 - # network architecture can vary greatly depending on the domain - encoder: - ... - decoder: - ... - -More specific details about configuration files for each collection can be found on the following pages: - -:ref:`NeMo ASR Configuration Files` - -CLI -~~~ - -With NeMo and Hydra, every aspect of model training can be modified from the command-line. This is extremely helpful for running lots -of experiments on compute clusters or for quickly testing parameters while developing. - -All NeMo `examples `_ come with instructions on how to -run the training/inference script from the command-line (see `here `_ -for an example). - -With Hydra, arguments are set using the ``=`` operator: - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \ - model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \ - trainer.devices=2 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 - -We can use the ``+`` operator to add arguments from the CLI: - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \ - model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \ - trainer.devices=2 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - +trainer.fast_dev_run=true - -We can use the ``~`` operator to remove configurations: - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \ - model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \ - ~model.test_ds \ - trainer.devices=2 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - +trainer.fast_dev_run=true - -We can specify configuration files using the ``--config-path`` and ``--config-name`` flags: - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - --config-path=conf/quartznet \ - --config-name=quartznet_15x5 \ - model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \ - model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \ - ~model.test_ds \ - trainer.devices=2 \ - trainer.accelerator='gpu' \ - trainer.max_epochs=50 \ - +trainer.fast_dev_run=true - -Dataclasses -~~~~~~~~~~~ - -Dataclasses allow NeMo to ship model configurations as part of the NeMo library and also enables pure Python configuration of NeMo models. 
-With Hydra, dataclasses can be used to create `structured configs `_ for the conversational AI application. - -As an example, refer to the code block below for an *Attenion is All You Need* machine translation model. The model configuration can -be instantiated and modified like any Python `Dataclass `_. - -.. code-block:: Python - - from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import AAYNBaseConfig - - cfg = AAYNBaseConfig() - - # modify the number of layers in the encoder - cfg.encoder.num_layers = 8 - - # modify the training batch size - cfg.train_ds.tokens_in_batch = 8192 - -.. note:: Configuration with Hydra always has the following precedence CLI > YAML > Dataclass - -.. _optimization-label: - -Optimization ------------- - -Optimizers and learning rate schedules are configurable across all NeMo models and have their own namespace. Here is a sample YAML -configuration for a Novograd optimizer with Cosine Annealing learning rate schedule. - -.. code-block:: yaml - - optim: - name: novograd - lr: 0.01 - - # optimizer arguments - betas: [0.8, 0.25] - weight_decay: 0.001 - - # scheduler setup - sched: - name: CosineAnnealing - - # Optional arguments - max_steps: -1 # computed at runtime or explicitly set here - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_steps: 1000 - warmup_ratio: null - min_lr: 1e-9: - -.. note:: `NeMo Examples `_ has optimizer and scheduler configurations for every NeMo model. - -Optimizers can be configured from the CLI as well: - -.. code-block:: bash - - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - --config-path=conf/quartznet \ - --config-name=quartznet_15x5 \ - ... - # train with the adam optimizer - model.optim=adam \ - # change the learning rate - model.optim.lr=.0004 \ - # modify betas - model.optim.betas=[.8, .5] - -.. _optimizers-label: - -Optimizers -~~~~~~~~~~ - -``name`` corresponds to the lowercase name of the optimizer. To view a list of available optimizers, run: - -.. code-block:: Python - - from nemo.core.optim.optimizers import AVAILABLE_OPTIMIZERS - - for name, opt in AVAILABLE_OPTIMIZERS.items(): - print(f'name: {name}, opt: {opt}') - -.. code-block:: bash - - name: sgd opt: - name: adam opt: - name: adamw opt: - name: adadelta opt: - name: adamax opt: - name: adagrad opt: - name: rmsprop opt: - name: rprop opt: - name: novograd opt: - -Optimizer Params -~~~~~~~~~~~~~~~~ - -Optimizer params can vary between optimizers but the ``lr`` param is required for all optimizers. To see the available params for an -optimizer, we can look at its corresponding dataclass. - -.. code-block:: python - - from nemo.core.config.optimizers import NovogradParams - - print(NovogradParams()) - -.. code-block:: bash - - NovogradParams(lr='???', betas=(0.95, 0.98), eps=1e-08, weight_decay=0, grad_averaging=False, amsgrad=False, luc=False, luc_trust=0.001, luc_eps=1e-08) - -``'???'`` indicates that the lr argument is required. - -Register Optimizer -~~~~~~~~~~~~~~~~~~ - -To register a new optimizer to be used with NeMo, run: - -.. autofunction:: nemo.core.optim.optimizers.register_optimizer - -.. _learning-rate-schedulers-label: - -Learning Rate Schedulers -~~~~~~~~~~~~~~~~~~~~~~~~ - -Learning rate schedulers can be optionally configured under the ``optim.sched`` namespace. - -``name`` corresponds to the name of the learning rate schedule. To view a list of available schedulers, run: - -.. 
code-block:: Python - - from nemo.core.optim.lr_scheduler import AVAILABLE_SCHEDULERS - - for name, opt in AVAILABLE_SCHEDULERS.items(): - print(f'name: {name}, schedule: {opt}') - -.. code-block:: bash - - name: WarmupPolicy, schedule: - name: WarmupHoldPolicy, schedule: - name: SquareAnnealing, schedule: - name: CosineAnnealing, schedule: - name: NoamAnnealing, schedule: - name: WarmupAnnealing, schedule: - name: InverseSquareRootAnnealing, schedule: - name: SquareRootAnnealing, schedule: - name: PolynomialDecayAnnealing, schedule: - name: PolynomialHoldDecayAnnealing, schedule: - name: StepLR, schedule: - name: ExponentialLR, schedule: - name: ReduceLROnPlateau, schedule: - name: CyclicLR, schedule: - -Scheduler Params -~~~~~~~~~~~~~~~~ - -To see the available params for a scheduler, we can look at its corresponding dataclass: - -.. code-block:: Python - - from nemo.core.config.schedulers import CosineAnnealingParams - - print(CosineAnnealingParams()) - -.. code-block:: bash - - CosineAnnealingParams(last_epoch=-1, warmup_steps=None, warmup_ratio=None, min_lr=0.0) - -Register scheduler -~~~~~~~~~~~~~~~~~~ - -To register a new scheduler to be used with NeMo, run: - -.. autofunction:: nemo.core.optim.lr_scheduler.register_scheduler - -Save and Restore ----------------- - -NeMo models all come with ``.save_to`` and ``.restore_from`` methods. - -Save -~~~~ - -To save a NeMo model, run: - -.. code-block:: Python - - model.save_to('/path/to/model.nemo') - -Everything needed to use the trained model is packaged and saved in the ``.nemo`` file. For example, in the NLP domain, ``.nemo`` files -include the necessary tokenizer models and/or vocabulary files, etc. - -.. note:: A ``.nemo`` file is simply an archive like any other ``.tar`` file. - -Restore -~~~~~~~ - -To restore a NeMo model, run: - -.. code-block:: Python - - # Here, you should usually use the class of the model, or simply use ModelPT.restore_from() for simplicity. - model.restore_from('/path/to/model.nemo') - -When using the PyTorch Lightning Trainer, a PyTorch Lightning checkpoint is created. These are mainly used within NeMo to auto-resume -training. Since NeMo models are ``LightningModules``, the PyTorch Lightning method ``load_from_checkpoint`` is available. Note that -``load_from_checkpoint`` won't necessarily work out-of-the-box for all models as some models require more artifacts than just the -checkpoint to be restored. For these models, the user will have to override ``load_from_checkpoint`` if they want to use it. - -It's highly recommended to use ``restore_from`` to load NeMo models. - -Restore with Modified Config -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Sometimes, there may be a need to modify the model (or it's sub-components) prior to restoring a model. A common case is when -the model's internal config must be updated due to various reasons (such as deprecation, newer versioning, support a new feature). -As long as the model has the same parameters as compared to the original config, the parameters can once again be restored safely. - -In NeMo, as part of the .nemo file, the model's internal config will be preserved. This config is used during restoration, and -as shown below we can update this config prior to restoring the model. - -.. 
code-block:: - - # When restoring a model, you should generally use the class of the model - # Obtain the config (as an OmegaConf object) - config = model_class.restore_from('/path/to/model.nemo', return_config=True) - # OR - config = model_class.from_pretrained('name_of_the_model', return_config=True) - - # Modify the config as needed - config.x.y = z - - # Restore the model from the updated config - model = model_class.restore_from('/path/to/model.nemo', override_config_path=config) - # OR - model = model_class.from_pretrained('name_of_the_model', override_config_path=config) - -Register Artifacts ------------------- - -Conversational AI models can be complicated to restore as more information is needed than just the checkpoint weights in order to use the model. -NeMo models can save additional artifacts in the .nemo file by calling ``.register_artifact``. -When restoring NeMo models using ``.restore_from`` or ``.from_pretrained``, any artifacts that were registered will be available automatically. - -As an example, consider an NLP model that requires a trained tokenizer model. -The tokenizer model file can be automatically added to the .nemo file with the following: - -.. code-block:: python - - self.encoder_tokenizer = get_nmt_tokenizer( - ... - tokenizer_model=self.register_artifact(config_path='encoder_tokenizer.tokenizer_model', - src='/path/to/tokenizer.model', - verify_src_exists=True), - ) - -By default, ``.register_artifact`` will always return a path. If the model is being restored from a .nemo file, -then that path will be to the artifact in the .nemo file. Otherwise, ``.register_artifact`` will return the local path specified by the user. - -``config_path`` is the artifact key. It usually corresponds to a model configuration but does not have to. -The model config that is packaged with the .nemo file will be updated according to the ``config_path`` key. -In the above example, the model config will have - -.. code-block:: YAML - - encoder_tokenizer: - ... - tokenizer_model: nemo:4978b28103264263a03439aaa6560e5e_tokenizer.model - -``src`` is the path to the artifact and the base-name of the path will be used when packaging the artifact in the .nemo file. -Each artifact will have a hash prepended to the basename of ``src`` in the .nemo file. This is to prevent collisions with basenames -base-names that are identical (say when there are two or more tokenizers, both called `tokenizer.model`). -The resulting .nemo file will then have the following file: - -.. code-block:: bash - - 4978b28103264263a03439aaa6560e5e_tokenizer.model - -If ``verify_src_exists`` is set to ``False``, then the artifact is optional. This means that ``.register_artifact`` will return ``None`` -if the ``src`` cannot be found. - -Nested NeMo Models ------------------- - -In some cases, it may be helpful to use NeMo models inside other NeMo models. For example, we can incorporate language models into ASR models to use in a decoding process to improve accuracy or use hybrid ASR-TTS models to generate audio from the text on the fly to train or finetune the ASR model. - -There are 3 ways to instantiate child models inside parent models: - -- use subconfig directly -- use the ``.nemo`` checkpoint path to load the child model -- use a pretrained NeMo model - -To register a child model, use the ``register_nemo_submodule`` method of the parent model. 
This method will add the child model to a provided model attribute and, in the serialization process, will handle child artifacts correctly and store the child model config in the parent model config in ``config_field``. - -.. code-block:: python - - from nemo.core.classes import ModelPT - - class ChildModel(ModelPT): - ... # implement necessary methods - - class ParentModel(ModelPT): - def __init__(self, cfg, trainer=None): - super().__init__(cfg=cfg, trainer=trainer) - - # optionally annotate type for IDE autocompletion and type checking - self.child_model: Optional[ChildModel] - if cfg.get("child_model") is not None: - # load directly from config - # either if config provided initially, or automatically - # after model restoration - self.register_nemo_submodule( - name="child_model", - config_field="child_model", - model=ChildModel(self.cfg.child_model, trainer=trainer), - ) - elif cfg.get('child_model_path') is not None: - # load from .nemo model checkpoint - # while saving, config will be automatically assigned/updated - # in cfg.child_model - self.register_nemo_submodule( - name="child_model", - config_field="child_model", - model=ChildModel.restore_from(self.cfg.child_model_path, trainer=trainer), - ) - elif cfg.get('child_model_name') is not None: - # load from pretrained model - # while saving, config will be automatically assigned/updated - # in cfg.child_model - self.register_nemo_submodule( - name="child_model", - config_field="child_model", - model=ChildModel.from_pretrained(self.cfg.child_model_name, trainer=trainer), - ) - else: - self.child_model = None - - -Neural Modules -============== - -NeMo is built around Neural Modules, conceptual blocks of neural networks that take typed inputs and produce typed outputs. Such -modules typically represent data layers, encoders, decoders, language models, loss functions, or methods of combining activations. -NeMo makes it easy to combine and re-use these building blocks while providing a level of semantic correctness checking via its neural -type system. - -.. note:: *All Neural Modules inherit from ``torch.nn.Module`` and are therefore compatible with the PyTorch ecosystem.* - -There are 3 types on Neural Modules: - - - Regular modules - - Dataset/IterableDataset - - Losses - -Every Neural Module in NeMo must inherit from `nemo.core.classes.module.NeuralModule` class. - -.. autoclass:: nemo.core.classes.module.NeuralModule - -Every Neural Modules inherits the ``nemo.core.classes.common.Typing`` interface and needs to define neural types for its inputs and outputs. -This is done by defining two properties: ``input_types`` and ``output_types``. Each property should return an ordered dictionary of -"port name"->"port neural type" pairs. Here is the example from :class:`~nemo.collections.asr.modules.ConvASREncoder` class: - -.. code-block:: python - - @property - def input_types(self): - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - @typecheck() - def forward(self, audio_signal, length=None): - ... 
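-        # Illustrative note: with @typecheck() applied, inputs whose attached
-        # neural types conflict with input_types raise a type-checking error at
-        # call time; untyped tensors pass through unchecked, and the returned
-        # tensors carry the neural types declared in output_types.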
- -The code snippet above means that ``nemo.collections.asr.modules.conv_asr.ConvASREncoder`` expects two arguments: - * First one, named ``audio_signal`` of shape ``[batch, dimension, time]`` with elements representing spectrogram values. - * Second one, named ``length`` of shape ``[batch]`` with elements representing lengths of corresponding signals. - -It also means that ``.forward(...)`` and ``__call__(...)`` methods each produce two outputs: - * First one, of shape ``[batch, dimension, time]`` but with elements representing encoded representation (``AcousticEncodedRepresentation`` class). - * Second one, of shape ``[batch]``, corresponding to their lengths. - -.. tip:: It is a good practice to define types and add ``@typecheck()`` decorator to your ``.forward()`` method after your module is ready for use by others. - -.. note:: The outputs of ``.forward(...)`` method will always be of type ``torch.Tensor`` or container of tensors and will work with any other Pytorch code. The type information is attached to every output tensor. If tensors without types is passed to your module, it will not fail, however the types will not be checked. Thus, it is recommended to define input/output types for all your modules, starting with data layers and add ``@typecheck()`` decorator to them. - -.. note:: To temporarily disable typechecking, you can enclose your code in ```with typecheck.disable_checks():``` statement. - - -Dynamic Layer Freezing ----------------------- - -You can selectively freeze any modules inside a Nemo model by specifying a freezing schedule in the config yaml. Freezing stops any gradient updates -to that module, so that its weights are not changed for that step. This can be useful for combatting catastrophic forgetting, for example -when finetuning a large pretrained model on a small dataset. - -The default approach is to freeze a module for the first N training steps, but you can also enable freezing for a specific range of steps, -for example, from step 20 - 100, or even activate freezing from some N until the end of training. You can also freeze a module for the entire training run. -Dynamic freezing is specified in training steps, not epochs. - -To enable freezing, add the following to your config: - -.. code-block:: yaml - - model: - ... - freeze_updates: - enabled: true # set to false if you want to disable freezing - - modules: # list all of the modules you want to have freezing logic for - encoder: 200 # module will be frozen for the first 200 training steps - decoder: [50, -1] # module will be frozen at step 50 and will remain frozen until training ends - joint: [10, 100] # module will be frozen between step 10 and step 100 (step >= 10 and step <= 100) - transcoder: -1 # module will be frozen for the entire training run - diff --git a/SoundScribe/SpeakerID/docs/source/core/exp_manager.rst b/SoundScribe/SpeakerID/docs/source/core/exp_manager.rst deleted file mode 100644 index 6ef47b24b32f3e16c4854cce5c97e7995e2f7978..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/exp_manager.rst +++ /dev/null @@ -1,361 +0,0 @@ - -.. _exp-manager-label: - -Experiment Manager -================== - -NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, Weights and Biases, DLLogger and MLFlow logging. The -Experiment Manager is included by default in all NeMo example scripts. 
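-
-In every NeMo example script the overall pattern looks roughly like the sketch below (a simplified illustration, assuming a Hydra config with ``trainer``, ``exp_manager`` and ``model`` sections; ``MyNeMoModel`` and the config paths are placeholders), and the rest of this page describes the individual pieces.
-
-.. code-block:: python
-
-    import pytorch_lightning as pl
-
-    from nemo.core.config import hydra_runner
-    from nemo.utils.exp_manager import exp_manager
-
-
-    @hydra_runner(config_path="conf", config_name="my_config")  # placeholder paths
-    def main(cfg):
-        trainer = pl.Trainer(**cfg.trainer)
-
-        # Sets up the log directory, checkpoint callback and any loggers
-        # declared under cfg.exp_manager, and returns the resolved log dir.
-        exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
-
-        model = MyNeMoModel(cfg.model, trainer=trainer)  # placeholder model class
-        trainer.fit(model)
-
-
-    if __name__ == '__main__':
-        main()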
- -To use the experiment manager simply call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``. - -.. code-block:: python - - exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - -And is configurable via YAML with Hydra. - -.. code-block:: bash - - exp_manager: - exp_dir: /path/to/my/experiments - name: my_experiment_name - create_tensorboard_logger: True - create_checkpoint_callback: True - -Optionally, launch TensorBoard to view the training results in ``./nemo_experiments`` (by default). - -.. code-block:: bash - - tensorboard --bind_all --logdir nemo_experiments - -.. - -If ``create_checkpoint_callback`` is set to ``True``, then NeMo automatically creates checkpoints during training -using PyTorch Lightning's `ModelCheckpoint `_. -We can configure the ``ModelCheckpoint`` via YAML or CLI. - -.. code-block:: yaml - - exp_manager: - ... - # configure the PyTorch Lightning ModelCheckpoint using checkpoint_call_back_params - # any ModelCheckpoint argument can be set here - - # save the best checkpoints based on this metric - checkpoint_callback_params.monitor=val_loss - - # choose how many total checkpoints to save - checkpoint_callback_params.save_top_k=5 - -Resume Training ---------------- - -We can auto-resume training as well by configuring the ``exp_manager``. Being able to auto-resume is important when doing long training -runs that are premptible or may be shut down before the training procedure has completed. To auto-resume training, set the following -via YAML or CLI: - -.. code-block:: yaml - - exp_manager: - ... - # resume training if checkpoints already exist - resume_if_exists: True - - # to start training with no existing checkpoints - resume_ignore_no_checkpoint: True - - # by default experiments will be versioned by datetime - # we can set our own version with - exp_manager.version: my_experiment_version - - -Experiment Loggers ------------------- - -Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow and DLLogger. To use these loggers, simply set the following -via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. - - -Weights and Biases (WandB) -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _exp_manager_weights_biases-label: - -.. code-block:: yaml - - exp_manager: - ... - create_checkpoint_callback: True - create_wandb_logger: True - wandb_logger_kwargs: - name: ${name} - project: ${project} - entity: ${entity} - - - -MLFlow -~~~~~~ - -.. _exp_manager_mlflow-label: - -.. code-block:: yaml - - exp_manager: - ... - create_checkpoint_callback: True - create_mlflow_logger: True - mlflow_logger_kwargs: - experiment_name: ${name} - tags: - - save_dir: './mlruns' - prefix: '' - artifact_location: None - # provide run_id if resuming a previously started run - run_id: Optional[str] = None - -DLLogger -~~~~~~~~ - -.. _exp_manager_dllogger-label: - -.. code-block:: yaml - - exp_manager: - ... - create_checkpoint_callback: True - create_dllogger_logger: True - dllogger_logger_kwargs: - verbose: False - stdout: False - json_file: "./dllogger.json" - -ClearML -~~~~~~~ - -.. _exp_manager_clearml-label: - -.. code-block:: yaml - - exp_manager: - ... 
- create_checkpoint_callback: True - create_clearml_logger: True - clearml_logger_kwargs: - project: None # name of the project - task: None # optional name of task - connect_pytorch: False - model_name: None # optional name of model - tags: None # Should be a list of str - log_model: False # log model to clearml server - log_cfg: False # log config to clearml server - log_metrics: False # log metrics to clearml server - -Exponential Moving Average --------------------------- - -.. _exp_manager_ema-label: - -NeMo supports using exponential moving average (EMA) for model parameters. This can be useful for improving model generalization -and stability. To use EMA, simply set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`. - -.. code-block:: yaml - - exp_manager: - ... - # use exponential moving average for model parameters - ema: - enabled: True # False by default - decay: 0.999 # decay rate - cpu_offload: False # If EMA parameters should be offloaded to CPU to save GPU memory - every_n_steps: 1 # How often to update EMA weights - validate_original_weights: False # Whether to use original weights for validation calculation or EMA weights - -Support for Preemption ----------------------- - -.. _exp_manager_preemption_support-label: - -NeMo adds support for a callback upon preemption while running the models on clusters. The callback takes care of saving the current state of training via the ``.ckpt`` -file followed by a graceful exit from the run. The checkpoint saved upon preemption has the ``*last.ckpt`` suffix and replaces the previously saved last checkpoints. -This feature is useful to increase utilization on clusters. -The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file. - - -.. _nemo_multirun-label: - - -Hydra Multi-Run with NeMo -------------------------- - -When training neural networks, it is common to perform hyper parameter search in order to improve the performance of a model -on some validation data. However, it can be tedious to manually prepare a grid of experiments and management of all checkpoints -and their metrics. In order to simplify such tasks, NeMo integrates with `Hydra Multi-Run support `_ in order to provide a unified way to run a set of experiments all -from the config. - -There are certain limitations to this framework, which we list below: - -* All experiments are assumed to be run on a single GPU, and multi GPU for single run (model parallel models are not supported as of now). -* NeMo Multi-Run supports only grid search over a set of hyper-parameters, but we will eventually add support for advanced hyper parameter search strategies. -* **NeMo Multi-Run only supports running on one or more GPUs** and will not work if no GPU devices are present. - -Config Setup -~~~~~~~~~~~~ - -In order to enable NeMo Multi-Run, we first update our YAML configs with some information to let Hydra know we expect to run multiple experiments from this one config - - -.. code-block:: yaml - - # Required for Hydra launch of hyperparameter search via multirun - defaults: - - override hydra/launcher: nemo_launcher - - # Hydra arguments necessary for hyperparameter optimization - hydra: - # Helper arguments to ensure all hyper parameter runs are from the directory that launches the script. - sweep: - dir: "." - subdir: "." 
- - # Define all the hyper parameters here - sweeper: - params: - # Place all the parameters you wish to search over here (corresponding to the rest of the config) - # NOTE: Make sure that there are no spaces between the commas that separate the config params ! - model.optim.lr: 0.001,0.0001 - model.encoder.dim: 32,64,96,128 - model.decoder.dropout: 0.0,0.1,0.2 - - # Arguments to the process launcher - launcher: - num_gpus: -1 # Number of gpus to use. Each run works on a single GPU. - jobs_per_gpu: 1 # If each GPU has large memory, you can run multiple jobs on the same GPU for faster results (until OOM). - - -Next, we will setup the config for ``Experiment Manager``. When we perform hyper parameter search, each run may take some time to complete. -We want to therefore avoid the case where a run ends (say due to OOM or timeout on the machine) and we need to redo all experiments. -We therefore setup the experiment manager config such that every experiment has a unique "key", whose value corresponds to a single -resumable experiment. - -Let us see how to setup such a unique "key" via the experiment name. Simply attach all the hyper parameter arguments to the experiment -name as shown below - - -.. code-block:: yaml - - exp_manager: - exp_dir: null # Can be set by the user. - - # Add a unique name for all hyper parameter arguments to allow continued training. - # NOTE: It is necessary to add all hyperparameter arguments to the name ! - # This ensures successful restoration of model runs in case HP search crashes. - name: ${name}-lr-${model.optim.lr}-adim-${model.adapter.dim}-sd-${model.adapter.adapter_strategy.stochastic_depth} - - ... - checkpoint_callback_params: - ... - save_top_k: 1 # Dont save too many .ckpt files during HP search - always_save_nemo: True # saves the checkpoints as nemo files for fast checking of results later - ... - - # We highly recommend use of any experiment tracking took to gather all the experiments in one location - create_wandb_logger: True - wandb_logger_kwargs: - project: "" - - # HP Search may crash due to various reasons, best to attempt continuation in order to - # resume from where the last failure case occurred. - resume_if_exists: true - resume_ignore_no_checkpoint: true - - -Running a Multi-Run config -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once the config has been updated, we can now run it just like any normal Hydra script -- with one special flag (``-m``) ! - -.. code-block:: bash - - python script.py --config-path=ABC --config-name=XYZ -m \ - trainer.max_steps=5000 \ # Any additional arg after -m will be passed to all the runs generated from the config ! - ... - -Tips and Tricks -~~~~~~~~~~~~~~~ - -* Preserving disk space for large number of experiments - -Some models may have a large number of parameters, and it may be very expensive to save a large number of checkpoints on -physical storage drives. For example, if you use Adam optimizer, each PyTorch Lightning ".ckpt" file will actually be 3x the -size of just the model parameters - per ckpt file ! This can be exhorbitant if you have multiple runs. - -In the above config, we explicitly set ``save_top_k: 1`` and ``always_save_nemo: True`` - what this does is limit the number of -ckpt files to just 1, and also save a NeMo file (which will contain just the model parameters without optimizer state) and -can be restored immediately for further work. 
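-
-To put concrete numbers on the difference between a ``.nemo`` file and a full ``.ckpt`` file, here is a rough sketch. It assumes fp32 weights and plain Adam (which keeps two extra fp32 buffers per parameter) and ignores config files and other small artifacts; the parameter count is only an example.
-
-.. code-block:: python
-
-    num_params = 120_000_000   # example model size, purely illustrative
-    bytes_per_param = 4        # fp32
-
-    weights_gb = num_params * bytes_per_param / 1e9
-
-    # Adam stores exp_avg and exp_avg_sq alongside the weights, so a full
-    # training .ckpt holds roughly weights + 2x optimizer state, while the
-    # .nemo file holds only the weights (plus config and artifacts).
-    ckpt_gb = 3 * weights_gb
-
-    print(f".nemo ~= {weights_gb:.2f} GB, .ckpt ~= {ckpt_gb:.2f} GB per checkpoint")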
- -We can further reduce the storage space by utilizing some utility functions of NeMo to automatically delete either -ckpt or NeMo files after a training run has finished. This is sufficient in case you are collecting results in some experiment -tracking tool and can simply rerun the best config after the search is finished. - -.. code-block:: python - - # Import `clean_exp_ckpt` along with exp_manager - from nemo.utils.exp_manager import clean_exp_ckpt, exp_manager - - @hydra_runner(...) - def main(cfg): - ... - - # Keep track of the experiment directory - exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - - ... add any training code here as needed ... - - # Add following line to end of the training script - # Remove PTL ckpt file, and potentially also remove .nemo file to conserve storage space. - clean_exp_ckpt(exp_log_dir, remove_ckpt=True, remove_nemo=False) - - -* Debugging Multi-Run Scripts - -When running hydra scripts, you may sometimes face config issues which crash the program. In NeMo Multi-Run, a crash in -any one run will **not** crash the entire program, we will simply take note of it and move onto the next job. Once all -jobs are completed, we then raise the error in the order that it occurred (it will crash the program with the first error's -stack trace). - -In order to debug Muti-Run, we suggest to comment out the full hyper parameter config set inside ``sweep.params`` -and instead run just a single experiment with the config - which would immediately raise the error. - - -* Experiment name cannot be parsed by Hydra - -Sometimes our hyper parameters include PyTorch Lightning ``trainer`` arguments - such as number of steps, number of epochs -whether to use gradient accumulation or not etc. When we attempt to add these as keys to the expriment manager's ``name``, -Hydra may complain that ``trainer.xyz`` cannot be resolved. - -A simple solution is to finalize the hydra config before you call ``exp_manager()`` as follows - - -.. code-block:: python - - @hydra_runner(...) - def main(cfg): - # Make any changes as necessary to the config - cfg.xyz.abc = uvw - - # Finalize the config - cfg = OmegaConf.resolve(cfg) - - # Carry on as normal by calling trainer and exp_manager - trainer = pl.Trainer(**cfg.trainer) - exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - ... - - -ExpManagerConfig ----------------- - -.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig - :show-inheritance: - :members: - :member-order: bysource diff --git a/SoundScribe/SpeakerID/docs/source/core/export.rst b/SoundScribe/SpeakerID/docs/source/core/export.rst deleted file mode 100644 index 202099b13d66121429d67454aa00bd5cada4b3c7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/export.rst +++ /dev/null @@ -1,223 +0,0 @@ -Exporting NeMo Models -===================== - -Exporting Models ----------------- - -Most of the NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Riva or Triton Inference Server. -Export interface is provided by the :class:`~nemo.core.classes.exportable.Exportable` mix-in class. If a model extends :class:`~nemo.core.classes.exportable.Exportable`, it can be exported by: - -.. code-block:: Python - - from nemo.core.classes import ModelPT, Exportable - # deriving from Exportable - class MyExportableModel(ModelPT, Exportable): - ... 
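-        # Besides the usual ModelPT methods, an Exportable subclass typically
-        # provides input_example (used to generate example inputs for tracing)
-        # and input_types / output_types (used to infer input/output names and
-        # dynamic axes for the exported graph).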
- - mymodel = MyExportableModel.from_pretrained(model_name="MyModelName") - model.eval() - model.to('cuda') # or to('cpu') if you don't have GPU - - # exporting pre-trained model to ONNX file for deployment. - mymodel.export('mymodel.onnx', [options]) - - -How to Use Model Export ------------------------ -The following arguments are for :meth:`~nemo.core.classes.exportable.Exportable.export`. In most cases, you should only supply the name of the output file and use all defaults: - -.. code-block:: Python - - def export( - self, - output: str, - input_example=None, - verbose=False, - do_constant_folding=True, - onnx_opset_version=None, - check_trace: Union[bool, List[torch.Tensor]] = False, - dynamic_axes=None, - check_tolerance=0.01, - export_modules_as_functions=False, - keep_initializers_as_inputs=None, - ): - -The ``output``, ``input_example``, ``verbose``, ``do_constant_folding``, ``onnx_opset_version`` options have the same semantics as in Pytorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about Pytorch's``onnx.export()``, refer to the `torch.onnx functions documentation -`_. Note that if ``input_example`` is None, ``Exportable.input_example()`` is called. - -The file extension of the ``output`` parameter determines export format: - -* ``.onnx->ONNX`` -* ``.pt`` or ``.ts`` -> ``TorchScript``. - -**TorchScript-specific**: By default, the module will undergo ``jit.trace()``. You may require to explicitly pass some modules under ``jit.script()`` so that they are correctly traced.The ``check_trace`` arg is passed through to ``jit.trace()``. - -**ONNX-specific**: If ``use_dynamic_axes`` is True, ``onnx.export()`` is called with dynamic axes. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (batch dimension is dynamic, and so is duration etc). - -If ``check_trace`` is ``True``, the resulting ONNX also runs on ``input_example`` and the results compared to the exported model's output, using the ``check_tolerance`` argument. Note the higher tolerance default. - - -How to Make Model Exportable ----------------------------- - -If you are simply using NeMo models, the previous example is all you need to know. -If you write your own models, this section highlights the things you need to be aware of after extending ``Exportable``. - -Exportable Hooks and Overrides -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You should not normally need to override ``Exportable`` default methods. However, ``Exportable.export()`` relies on the assumptions that certain methods are available in your class. - -.. code-block:: Python - - @property - def input_example(self) # => Tuple(input, [(input, ...], [Dict]) - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - -This function should return a tuple of (normally) Tensors - one per each of model inputs (args to ``forward()``). The last element may be a ``Dict`` to specify non-positional arguments by name, as per Torch ``export()`` convention. For more information, refer to the `Using dictionaries to handle Named Arguments as model inputs -`_. - -.. Note: ``Dict`` currently does not work with Torchscript ``trace()``. - -.. code-block:: Python - - @property - def input_types(self): - @property - def output_types(self): - -Those are needed for inferring in/out names and dynamic axes. If your model derives from ``ModulePT``, those are already there. 
Another common scenario is that your model contains one or more modules that processes input and generates output. Then, you should override ``Exportable`` methods ``input_module()`` and ``output_module()`` to point to them, like in this example: - -.. code-block:: Python - - @property - def input_module(self): - return self.fastpitch - - @property - def output_module(self): - return self.fastpitch - -Your model should also have an export-friendly ``forward()`` method - that can mean different things for ONNX ant TorchScript. For ONNX, you can't have forced named parameters without default, like ``forward(self, *, text)``. For TorchScript, you should avoid ``None`` and use ``Optional`` instead. The criteria are highly volatile and may change with every PyTorch version, so it's a trial-and-error process. There is also the general issue that in many cases, ``forward()`` for inference can be simplified and even use less inputs/outputs. To address this, ``Exportable`` looks for ``forward_for_export()`` method in your model and uses that instead of ``forward()`` to export: - -.. code-block:: Python - - # Uses forced named args, many default parameters. - def forward( - self, - *, - text, - durs=None, - pitch=None, - speaker=0, - pace=1.0, - spec=None, - attn_prior=None, - mel_lens=None, - input_lens=None, - ): - # Passes through all self.fastpitch outputs - return self.fastpitch( - text=text, - durs=durs, - pitch=pitch, - speaker=speaker, - pace=pace, - spec=spec, - attn_prior=attn_prior, - mel_lens=mel_lens, - input_lens=input_lens, - ) - - - # Uses less inputs, no '*', returns less outputs: - def forward_for_export(self, text): - ( - spect, - durs_predicted, - log_durs_predicted, - pitch_predicted, - attn_soft, - attn_logprob, - attn_hard, - attn_hard_dur, - pitch, - ) = self.fastpitch(text=text) - return spect, durs_predicted, log_durs_predicted, pitch_predicted - -To stay consistent with input_types()/output_types(), there are also those hooks in ``Exportable`` that let you exclude particular inputs/outputs from the export process: - -.. code-block:: Python - - @property - def disabled_deployment_input_names(self): - """Implement this method to return a set of input names disabled for export""" - return set(["durs", "pitch", "speaker", "pace", "spec", "attn_prior", "mel_lens", "input_lens"]) - - @property - def disabled_deployment_output_names(self): - - -Another common requirement for models that are being exported is to run certain net modifications for inference efficiency before exporting - like disabling masks in some convolutions or removing batch normalizations. A better style is to make those happen on ``ModelPT.eval()`` (and reversed on ``.train()``), but it's not always feasible so the following hook is provided in ``Exportable`` to run those: - -.. code-block:: Python - - def _prepare_for_export(self, **kwargs): - """ - Override this method to prepare module for export. This is in-place operation. - Base version does common necessary module replacements (Apex etc) - """ - # do graph modifications specific for this model - replace_1D_2D = kwargs.get('replace_1D_2D', False) - replace_for_export(self, replace_1D_2D) - # call base method for common set of modifications - Exportable._prepare_for_export(self, **kwargs) - -Some models that require control flow, need to be exported in multiple parts. Typical examples are RNNT nets. -To facilitate that, the hooks below are provided. 
To export, for example, 'encoder' and 'decoder' subnets of the model, overload list_export_subnets to return ['encoder', 'decoder']. - -.. code-block:: Python - - def get_export_subnet(self, subnet=None): - """ - Returns Exportable subnet model/module to export - """ - - - def list_export_subnets(self): - """ - Returns default set of subnet names exported for this model - First goes the one receiving input (input_example) - """ - -Some nertworks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export: - -.. code-block:: Python - def set_export_config(self, args): - """ - Sets/updates export_config dictionary - """ -Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one. -An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``. - -Here is example on now `set_export_config()` call is being tied to command line arguments in ``/scripts/export.py`` : - -.. code-block:: Python - python scripts/export.py hybrid_conformer.nemo hybrid_conformer.onnx --export-config decoder_type=ctc - -Exportable Model Code -~~~~~~~~~~~~~~~~~~~~~ - -Most importantly, the actual Torch code in your model should be ONNX or TorchScript - compatible (ideally, both). -#. Ensure the code is written in Torch - avoid bare `Numpy or Python operands `_. -#. Create your model ``Exportable`` and add an export unit test, to catch any operation/construct not supported in ONNX/TorchScript, immediately. - -For more information, refer to the PyTorch documentation: - - `List of supported operators `_ - - `Tracing vs. scripting `_ - - `AlexNet example `_ - diff --git a/SoundScribe/SpeakerID/docs/source/core/neural_types.rst b/SoundScribe/SpeakerID/docs/source/core/neural_types.rst deleted file mode 100644 index 9003b9ca520328a0257bdade1660162eb1630bac..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/core/neural_types.rst +++ /dev/null @@ -1,178 +0,0 @@ - -Neural Types -============ - -Motivation ----------- - -Neural Types describe the semantics, axis order, and dimensions of a tensor. The purpose of this type system is to catch semantic and -dimensionality errors during model creation and facilitate module re-use. - -.. image:: whyntypes.gif - :width: 900 - :alt: Neural Types Motivation - -``NeuralType`` class --------------------- - -Neural Types perform semantic checks for modules and models inputs/outputs. They contain information about: - - - Semantics of what is stored in the tensors. For example, logits, logprobs, audiosignal, embeddings, etc. - - Axes layout, semantic and (optionally) dimensionality. For example: ``[Batch, Time, Channel]`` - -Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you instantiate an instance of this class, you -are expected to include both *axes* information and *element type* information. - -.. autoclass:: nemo.core.neural_types.NeuralType - -Type Comparison Results ------------------------ - -When comparing two neural types, the following comparison results are generated. - -.. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult - -Examples --------- - -Long vs short notation -~~~~~~~~~~~~~~~~~~~~~~ - -NeMo's ``NeuralType`` class allows you to express axis semantics information in long and short form. 
Consider these two equivalent types. Both encoder 3 dimensional tensors and both contain elements of type ``AcousticEncodedRepresentation`` (this type is a typical output of ASR encoders). - -.. code-block:: python - - long_version = NeuralType( - axes=(AxisType(AxisKind.Batch, None), AxisType(AxisKind.Dimension, None), AxisType(AxisKind.Time, None)), - elements_type=AcousticEncodedRepresentation(), - ) - short_version = NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()) - assert long_version.compare(short_version) == NeuralTypeComparisonResult.SAME - -Transpose same -~~~~~~~~~~~~~~ - -Often it is useful to know if a simple transposition will solve type incompatibility. This is the case if the comparison result of two types equals ``nemo.core.neural_types.NeuralTypeComparisonResult.TRANSPOSE_SAME``. - -.. code-block:: python - - type1 = NeuralType(axes=('B', 'T', 'C')) - type2 = NeuralType(axes=('T', 'B', 'C')) - assert type1.compare(type2) == NeuralTypeComparisonResult.TRANSPOSE_SAME - assert type2.compare(type1) == NeuralTypeComparisonResult.TRANSPOSE_SAME - -Note that in this example, we dropped ``elements_type`` argument of ``NeuralType`` constructor. If not supplied, the element type is ``VoidType``. - -``VoidType`` for elements -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Sometimes it is useful to express that elements' types don't matter but axes layout do. ``VoidType`` for elements can be used to express this. - -.. note:: ``VoidType`` is compatible with every other elements' type but not the other way around. See the following code snippet below for details. - -.. code-block:: python - - btc_spctr = NeuralType(('B', 'T', 'C'), SpectrogramType()) - btc_spct_bad = NeuralType(('B', 'T'), SpectrogramType()) - # Note the VoidType for elements here - btc_void = NeuralType(('B', 'T', 'C'), VoidType()) - - # This is true because VoidType is compatible with every other element type (SpectrogramType in this case) - # And axes layout between btc_void and btc_spctr is the same - assert btc_void.compare(btc_spctr) == NeuralTypeComparisonResult.SAME - # These two types are incompatible because even though VoidType is used for elements on one side, - # the axes layout is different - assert btc_void.compare(btc_spct_bad) == NeuralTypeComparisonResult.INCOMPATIBLE - # Note that even though VoidType is compatible with every other type, other types are not compatible with VoidType! - # It is one-way compatibility - assert btc_spctr.compare(btc_void) == NeuralTypeComparisonResult.INCOMPATIBLE - -Element type inheritance -~~~~~~~~~~~~~~~~~~~~~~~~ - -Neural types in NeMo support Python inheritance between element types. Consider an example where you want to develop a Neural Module which performs data augmentation for all kinds of spectrograms. -In ASR, two types of spectrograms are frequently used: mel and mfcc. To express this, we will create 3 classes to express -element's types: ``SpectrogramType``, ``MelSpectrogramType(SpectrogramType)``, ``MFCCSpectrogramType(SpectrogramType)``. - -.. code-block:: python - - input = NeuralType(('B', 'D', 'T'), SpectrogramType()) - out1 = NeuralType(('B', 'D', 'T'), MelSpectrogramType()) - out2 = NeuralType(('B', 'D', 'T'), MFCCSpectrogramType()) - - # MelSpectrogram and MFCCSpectrogram are not interchangeable. - assert out1.compare(out2) == NeuralTypeComparisonResult.INCOMPATIBLE - assert out2.compare(out1) == NeuralTypeComparisonResult.INCOMPATIBLE - # Type comparison detects that MFCC/MelSpectrogramType is a kind of SpectrogramType and can be accepted. 
- assert input.compare(out1) == NeuralTypeComparisonResult.GREATER - assert input.compare(out2) == NeuralTypeComparisonResult.GREATER - -Custom element types -~~~~~~~~~~~~~~~~~~~~ - -It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class - -.. autoclass:: nemo.core.neural_types.elements.ElementType - -Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz. - -.. code-block:: python - - audio16K = NeuralType(axes=('B', 'T'), elements_type=AudioSignal(16000)) - audio8K = NeuralType(axes=('B', 'T'), elements_type=AudioSignal(8000)) - - assert audio8K.compare(audio16K) == NeuralTypeComparisonResult.SAME_TYPE_INCOMPATIBLE_PARAMS - assert audio16K.compare(audio8K) == NeuralTypeComparisonResult.SAME_TYPE_INCOMPATIBLE_PARAMS - -Enforcing dimensions -~~~~~~~~~~~~~~~~~~~~ - -In addition to specifying tensor layout and elements' semantics, neural types also allow you to enforce tensor dimensions. -The user will have to use long notations to specify dimensions. Short notations only allows you to specify axes semantics and assumes -arbitrary dimensions. - -.. code-block:: python - - type1 = NeuralType( - (AxisType(AxisKind.Batch, 64), AxisType(AxisKind.Time, 10), AxisType(AxisKind.Dimension, 128)), - SpectrogramType(), - ) - type2 = NeuralType(('B', 'T', 'C'), SpectrogramType()) - - # type2 will accept elements of type1 because their axes semantics match and type2 does not care about dimensions - assert type2.compare(type1), NeuralTypeComparisonResult.SAME - # type1 will not accept elements of type2 because it need dimensions to match strictly. - assert type1.compare(type2), NeuralTypeComparisonResult.DIM_INCOMPATIBLE - -Generic Axis kind -~~~~~~~~~~~~~~~~~ - -Sometimes (especially in the case of loss modules) it is useful to be able to specify a "generic" axis kind which will make it -compatible with any other kind of axis. This is easy to express with Neural Types by using ``nemo.core.neural_types.axes.AxisKind.Any`` for axes. - -.. code-block:: python - - type1 = NeuralType(('B', 'Any', 'Any'), SpectrogramType()) - type2 = NeuralType(('B', 'T', 'C'), SpectrogramType()) - type3 = NeuralType(('B', 'C', 'T'), SpectrogramType()) - - # type1 will accept elements of type2 and type3 because it only cares about element kind (SpectrogramType) - # number of axes (3) and that first one corresponds to batch - assert type1.compare(type2) == NeuralTypeComparisonResult.SAME - assert type1.compare(type3) == NeuralTypeComparisonResult.INCOMPATIBLE - -Container types -~~~~~~~~~~~~~~~ - -The NeMo-type system understands Python containers (lists). If your module returns a nested list of typed tensors, the way to express it is by -using Python list notation and Neural Types together when defining your input/output types. - -The example below shows how to express that your module returns single output ("out") which is list of lists of two dimensional tensors of shape ``[batch, dimension]`` containing logits. - -.. 
code-block:: python - - @property - def output_types(self): - return { - "out": [[NeuralType(('B', 'D'), LogitsType())]], - } diff --git a/SoundScribe/SpeakerID/docs/source/core/whyntypes.gif b/SoundScribe/SpeakerID/docs/source/core/whyntypes.gif deleted file mode 100644 index 56b5ab154391c95d393e5b6d48f2170cba260f07..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/core/whyntypes.gif and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/favicon.ico b/SoundScribe/SpeakerID/docs/source/favicon.ico deleted file mode 100644 index 424df87200c706460f9ad1c7722ef0d35f286f2b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/favicon.ico and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/index.rst b/SoundScribe/SpeakerID/docs/source/index.rst deleted file mode 100644 index 86ad55d1709b8d93f7db847b6463f5c23a6b8057..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/index.rst +++ /dev/null @@ -1,80 +0,0 @@ -NVIDIA NeMo User Guide -====================== - -.. toctree:: - :maxdepth: 2 - :caption: Getting Started - :name: starthere - - starthere/intro - starthere/tutorials - starthere/best-practices - starthere/migration-guide - -.. toctree:: - :maxdepth: 2 - :caption: NeMo Core - :name: core - - core/core - core/exp_manager - core/neural_types - core/export - core/adapters/intro - core/api - - -.. toctree:: - :maxdepth: 2 - :caption: Speech Processing - :name: Speech Processing - - asr/intro - asr/speech_classification/intro - asr/speaker_recognition/intro - asr/speaker_diarization/intro - asr/ssl/intro - asr/speech_intent_slot/intro - -.. toctree:: - :maxdepth: 3 - :caption: Natural Language Processing - :name: Natural Language Processing - - nlp/nemo_megatron/intro - nlp/machine_translation/machine_translation - nlp/text_normalization/intro - nlp/api - nlp/megatron_onnx_export - nlp/models - - -.. toctree:: - :maxdepth: 1 - :caption: Text To Speech (TTS) - :name: Text To Speech - - tts/intro - -.. toctree:: - :maxdepth: 2 - :caption: Common - :name: Common - - text_processing/intro - -.. toctree:: - :maxdepth: 2 - :caption: Text Processing - :name: Text Processing - - text_processing/g2p/g2p - common/intro - - -.. toctree:: - :maxdepth: 3 - :caption: Tools - :name: Tools - - tools/intro diff --git a/SoundScribe/SpeakerID/docs/source/nlp/api.rst b/SoundScribe/SpeakerID/docs/source/nlp/api.rst deleted file mode 100644 index 33709bd05a193ffd7c0c1a9596bcc4785ff80c2c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/api.rst +++ /dev/null @@ -1,162 +0,0 @@ -NeMo Megatron API -======================= - -Pretraining Model Classes -------------------------- - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_base_model.MegatronBaseModel - :show-inheritance: - :no-members: - :members: __init__, configure_optimizers - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel - :show-inheritance: - :no-members: - :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bert_model.MegatronBertModel - :show-inheritance: - :no-members: - :members: training_step, validation_step, build_train_valid_test_datasets, build_LDDL_data, setup, on_save_checkpoint, on_load_checkpoint - -.. 
autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bart_model.MegatronBARTModel - :show-inheritance: - :no-members: - :members: training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_retrieval_model.MegatronRetrievalModel - :show-inheritance: - :no-members: - :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model - :show-inheritance: - :no-members: - :members: complete, encode, decode, add_special_tokens_to_tokenizer, training_step, validation_step, build_train_valid_test_datasets, setup - -Customization Model Classes ---------------------------- - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model.MegatronGPTSFTModel - :show-inheritance: - :no-members: - :members: generate, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTAdapterLearningModel - :show-inheritance: - :no-members: - :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTInfusedAdapterModel - :show-inheritance: - :no-members: - :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model.MegatronGPTPromptLearningModel - :show-inheritance: - :no-members: - :members: built_virtual_prompt_dataset, generate, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel - :show-inheritance: - :no-members: - :members: __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel - :show-inheritance: - :no-members: - :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5InfusedAdapterModel - :show-inheritance: - :no-members: - :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup - -Modules -------- - -.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.MegatronModule - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.Float16Module - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.gpt_model.GPTModel - :show-inheritance: - :no-members: - :members: forward - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert_model.BertModel - :show-inheritance: - :no-members: - :members: forward - -.. autoclass:: nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder.MegatronTokenLevelEncoderDecoderModule - :show-inheritance: - :no-members: - :members: forward - -.. 
autoclass:: nemo.collections.nlp.modules.common.megatron.retrieval_token_level_encoder_decoder.MegatronRetrievalTokenLevelEncoderDecoderModule - :show-inheritance: - :no-members: - :members: forward - - -Datasets --------- - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset.BlendableDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.GPTDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset.MockGPTDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.bert_dataset.BertDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.base_prompt_learning_dataset.BasePromptLearningDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset.GPTSFTDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset.GPTSFTChatDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.retro_dataset.RETRODataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_dataset.T5Dataset - :show-inheritance: - :exclude-members: MAX_SEQ_LENGTH_DELTA - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.t5_prompt_learning_dataset.T5PromptLearningDataset - :show-inheritance: - -.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset.UL2Dataset - :show-inheritance: - - -Adapter Mixin Class -------------------------- - -.. autoclass:: nemo.collections.nlp.parts.mixins.nlp_adapter_mixins.NLPAdapterModelMixin - :show-inheritance: - :members: add_adapter, load_adapters, merge_cfg_with, merge_inference_cfg - :exclude-members: first_stage_of_pipeline, tie_weights, get_peft_state_dict, state_dict, sharded_state_dict, load_state_dict, on_load_checkpoint - :member-order: bysource - - -Exportable Model Classes -------------------------- - -.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTExportableModel - :show-inheritance: - -.. toctree:: - :maxdepth: 1 - - megatron_onnx_export \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/bert_pretraining.rst b/SoundScribe/SpeakerID/docs/source/nlp/bert_pretraining.rst deleted file mode 100644 index 8c7fd376268f17780215d007bbec0162847b2fb0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/bert_pretraining.rst +++ /dev/null @@ -1,134 +0,0 @@ -.. _bert_pretraining: - -BERT -==== - -BERT is an autoencoding language model with a final loss composed of: - -- masked language model loss -- next sentence prediction - -The model architecture is published in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__ :cite:`nlp-bert-devlin2018bert`. -The model is originally trained on English Wikipedia and BookCorpus. BERT is often used as a language model encoder for downstream tasks, for example, :ref:`token_classification`, :ref:`text_classification`, :ref:`question_answering`, etc. -Domain-specific BERT models can be advantageous for a wide range of applications. One notable application is the domain-specific BERT in a biomedical setting, -e.g. BioBERT :cite:`nlp-bert-lee2019biobert` or its improved derivative BioMegatron :cite:`nlp-bert-shin2020biomegatron`. 
For the latter, refer to :ref:`megatron_finetuning`. - -Quick Start Guide ------------------ - -.. code-block:: python - - from nemo.collections.nlp.models import BERTLMModel - - # to get the list of pre-trained models - BERTLMModel.list_available_models() - - # Download and load the pre-trained BERT-based model - model = BERTLMModel.from_pretrained("bertbaseuncased") - -Available Models -^^^^^^^^^^^^^^^^ - -.. list-table:: *Pretrained Models* - :widths: 5 10 - :header-rows: 1 - - * - Model - - Pretrained Checkpoint - * - BERT-base uncased - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:bertbaseuncased - * - BERT-large uncased - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:bertlargeuncased - -.. _dataset_bert_pretraining: - -Data Input for the BERT model ------------------------------ - -Data preprocessing can be either done on-the-fly during training or offline before training. The latter is optimized and recommended -for large text corpora. This was also used in the original paper to train the model on Wikipedia and BookCorpus. For on-the-fly data -processing, provide text files with sentences for training and validation, where words are separated by spaces, i.e.: ``[WORD] [SPACE] [WORD] [SPACE] [WORD]``. -To use this pipeline in training, use the dedicated configuration file `NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_preprocessed_config.yaml`. - -To process data offline in advance, refer to the `BERT Quick Start Guide `__. -To recreate the original Wikipedia and BookCorpus datasets, follow steps 1-5 in the Quick Start Guide and run the script ``./data/create_datasets_from_start.sh`` inside the Docker container. -The ``downloaded`` folder should include two sub folders ``lower_case_[0,1]_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5`` -and ``lower_case_[0,1]_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5``, containing sequences of length 128 with a maximum of 20 masked tokens -and sequences of length 512 with a maximum of 80 masked tokens respectively. To use this pipeline in training, use the dedicated configuration file ``NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml`` -and specify the path to the created hd5f files. - - -Training the BERT model ------------------------ - -Example of model configuration for on-the-fly data preprocessing: `NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml `__. -Example of model configuration for offline data preprocessing: `NeMo/examples/nlp/language_modeling/conf/bert_pretraining_from_preprocessed_config.yaml `__. - -The specification can be grouped into three categories: - -- Parameters that describe the training process: **trainer** -- Parameters that describe the datasets: **model.train_ds**, **model.validation_ds** -- Parameters that describe the model: **model**, **model.tokenizer**, **model.language_model** - -More details about parameters in the config file can be found below: - -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Description** | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.only_mlm_loss** | bool | Only uses masked language model without next sentence prediction. 
| -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **train_ds.data_file** | string | Name of the text file or hdf5 data directory. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **train_ds.num_samples** | integer | Number of samples to use from the training dataset, ``-1`` - to use all. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ - -More details about parameters for offline data preprocessing can be found below: - -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Description** | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **train_ds.max_predictions_per_seq** | integer | Maximum number of masked tokens in a sequence in the preprocessed data. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ - -More details about parameters for online data preprocessing can be found below: - -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Description** | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.max_seq_length** | integer | The maximum total input sequence length after tokenization. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.mask_prob** | float | Probability of masking a token in the input text during data processing. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.short_seq_prob** | float | Probability of having a sequence shorter than the maximum sequence length. | -+-------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ - -.. note:: - - For offline data preprocessing, **model.tokenizer** is null. For downstream task, use the same tokenizer that was used for - offline preprocessing. For online data preprocessing, **model.tokenizer** needs to be specified. See also :ref:`nlp_model` for - details. - -Example of the command for training the model: - -.. code:: - - python bert_pretraining.py \ - model.train_ds.data_file= \ - trainer.max_epochs= \ - trainer.devices=[] \ - trainer.accelerator='gpu' - - -Fine-tuning on Downstream Tasks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To use a trained BERT model checkpoint on a NeMo NLP downstream task, e.g. 
:ref:`question_answering`, specify -:code:`model.language_model.lm_checkpoint=`. - -References ----------- - -.. bibliography:: nlp_all.bib - :style: plain - :labelprefix: NLP-BERT - :keyprefix: nlp-bert- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/dialogue.rst b/SoundScribe/SpeakerID/docs/source/nlp/dialogue.rst deleted file mode 100644 index 157aaa714b16998a0a5008b7e0648b89d3d0ff11..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/dialogue.rst +++ /dev/null @@ -1,143 +0,0 @@ -.. _dialogue: - -Dialogue tasks -====================================== - -This module consists of various tasks that are related to dialogue. - -**Module Design** - -We decided to group dialogue tasks into a common module instead of having a module for each because they share many things in common, meaning that there can be more re-use of code. -This design can also support easier extension of this module, as developers can work on components of their interest while utilizing other components of dialogue pipeline. -In particular, we wanted to decouple the task-dependent, model-independent components of DataProcessor and InputExample from the model-dependent, task-independent components of Model and Dataset. - -.. image:: dialogue_UML.png - :alt: Dialogue-UML - :width: 800px - -**Supported Tasks** - -Supported tasks fall into broad categories of intent / domain classification with slot filling, intent classification as well as sequence generation. - -For each category of tasks, there exists several Data Processors to convert raw data from various sources into a common format as well as Dialogue Models that approachs the task in various ways. - -Currently, the supported task categories are: - -+----------------------------------------------------------+----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| **Task Category** | **Tasks** | **Models** | **Supported Options for model.language_model.pretrained_model_name** | **Supported options for model.library** | -+----------------------------------------------------------+----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| Domain / Intent Classification | Schema Guided Dialogue | Dialogue GPT Classification Model | gpt2, gpt2-{medium, large, xl}, microsoft/DialoGPT-{small, medium} | Huggingface, Megatron | -+ with slot filling +----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| | Assistant | SGDQA (BERT-Based Schema Guided Dialogue Question Answering model) | bert-base-cased | Megatron | -+ +----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| | | Intent Slot Classification Model | bert-base-uncased | Megatron | 
-+----------------------------------------------------------+----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| Intent Classification | Zero Shot Food Ordering | Dialogue GPT Classification Model | gpt2, gpt2-{medium, large, xl}, microsoft/DialoGPT-{small, medium} | Huggingface, Megatron | -+ +----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| | Omniverse Design | Dialogue Nearest Neighbour Model | sentence-transformers/* | Huggingface | -+ +----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| | | Dialogue Zero Shot Intent Model (Based on MNLI pretraining) | bert-base-uncased | Huggingface, Megatron | -+----------------------------------------------------------+----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| Sequence Generation | Schema Guided Dialogue Generation| Dialogue GPT Generation Model | gpt2, gpt2-{medium, large, xl}, microsoft/DialoGPT-{small, medium} | Huggingface, Megatron | -+ +----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ -| | MS Marco NLGen | Dialogue S2S Generation Model | facebook/bart-{base, large}, t5-{small, base, large, 3b, 11b} | Huggingface, Megatron | -+----------------------------------------------------------+----------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------+ - -**Configuration** - -Example of model configuration file for training the model can be found at: `NeMo/examples/nlp/dialogue/conf/dialogue_config.yaml `__. - -Because the Dialogue module contains a wide variety of models and tasks, there are a large number of configuration parameters to adjust (some of which only applies to some models/some tasks) - -In the configuration file, define the parameters of the training and the model, although most of the default values will work well. -For various task-model combination, only a restricted set of config args will apply. Please read the configuration file for comments on which config args you would need for each model and task. 
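As an illustration, the sketch below (not part of the example scripts) loads the dialogue config with OmegaConf and overrides a few of the task- and model-specific fields documented on this page; the relative file path and the override values are placeholders only.

.. code-block:: python

    # Minimal sketch: load the dialogue config and override a few of the
    # task/model-specific fields documented on this page. The path and the
    # values below are placeholders, not a prescribed workflow.
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("examples/nlp/dialogue/conf/dialogue_config.yaml")

    cfg.model.dataset.task = "sgd"
    cfg.model.library = "huggingface"
    cfg.model.language_model.pretrained_model_name = "gpt2"

    # Review only the block that is relevant to the chosen task
    print(OmegaConf.to_yaml(cfg.model.dataset))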
The configuration can be roughly grouped into a few categories:

- Parameters that describe the training process, such as how many GPUs to use: **trainer**
- Parameters that describe the model: **model**
- Parameters that describe optimization: **model.optim**
- Parameters that describe the task: **model.dataset**
- Parameters that describe the dataloaders: **model.train_ds**, **model.validation_ds**, **model.test_ds**
- Parameters that describe the experiment manager that logs the training process: **exp_manager**

Arguments that most commonly need to be edited for all models and tasks:

- :code:`do_training`: perform training or only testing
- :code:`trainer.devices`: number of GPUs (int) or list of GPUs, e.g. [0, 1, 3]
- :code:`model.dataset.task`: task to work on [sgd, assistant, zero_shot, ms_marco, sgd_generation, design, mellon_qa]
- :code:`model.dataset.data_dir`: the dataset directory
- :code:`model.dataset.dialogues_example_dir`: the directory to store prediction files
- :code:`model.dataset.debug_mode`: whether to run in debug mode with a very small number of samples [True, False]
- :code:`model.language_model.pretrained_model_name`: language model to use, which determines which Dialogue Model is loaded (see the table above for the options in each model class)
- :code:`model.library`: library to load the language model from [huggingface or megatron]
- :code:`model.language_model.lm_checkpoint`: path to a trained checkpoint (.bin / .ckpt / .nemo). The only exception is DialogueZeroShotIntentModel, which is configured through :code:`model.original_nemo_checkpoint` instead. For trained checkpoints, see :code:`list_available_models()` for each model class and then download the file to a local directory.

**Obtaining data**

Task: Schema Guided Dialogue (SGD) / SGD Generation

:code:`git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git`

Task: MS Marco

Please download the files below and unzip them into a common folder (used for :code:`model.dataset.data_dir`):

https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz
https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz
https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz

Then remove unused samples (optional, but otherwise this step requires significantly more CPU RAM, ~25 GB):

:code:`python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json`
:code:`python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json`

Task: Assistant

:code:`git clone https://github.com/xliuhw/NLU-Evaluation-Data`

Then unzip it.

Finally, convert the dataset into the required format:

.. code::

    python examples/nlp/intent_slot_classification/data/import_datasets.py \
        --source_data_dir=`source_data_dir` \
        --target_data_dir=`target_data_dir` \
        --dataset_name='assistant'

- :code:`source_data_dir`: the directory location of your dataset
- :code:`target_data_dir`: the directory location where the converted dataset should be saved

Unfortunately, the other datasets are currently not publicly available.

**Training/Testing a model**

Please try the example Dialogue model in a Jupyter notebook (can run on `Google's Colab `__).

Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator).
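As a quick sanity check before training in the notebook (not part of the original tutorial), you can confirm that the GPU runtime is active:

.. code-block:: python

    # Quick sanity check that the Colab GPU runtime is active before training.
    import torch

    assert torch.cuda.is_available(), "No GPU found - switch the runtime type to GPU"
    print(torch.cuda.get_device_name(0))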
- -An example script on how to train the model can be found here: `NeMo/examples/nlp/dialogue/dialogue.py `__. - -The following is an example of the command for training the model: - - -Code for training a model with three public datasets (from above) are available in the Jupyter/Colab notebook `Google's Colab `__) - - -.. code:: - - python examples/nlp/dialogue/dialogue.py \ - do_training=True \ - model.dataset.task=sgd \ - model.dataset.debug_mode=True \ - model.language_model.pretrained_model_name=gpt2 \ - model.data_dir= \ - model.dataset.dialogues_example_dir= \ - trainer.devices=[0] \ - trainer.accelerator='gpu' diff --git a/SoundScribe/SpeakerID/docs/source/nlp/dialogue_UML.png b/SoundScribe/SpeakerID/docs/source/nlp/dialogue_UML.png deleted file mode 100644 index 0a76aae7c5fa2136950c13bb3781b9b947f28cbc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/dialogue_UML.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6f89e670db61728ca6c2a7c01da6017d299e18d5f8bc1334ca0694c0266ab5a -size 1682802 diff --git a/SoundScribe/SpeakerID/docs/source/nlp/entity_linking.rst b/SoundScribe/SpeakerID/docs/source/nlp/entity_linking.rst deleted file mode 100644 index 2c326fb8560338aa4dcb6f035c5d6fb211b857fe..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/entity_linking.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. _entity_linking: - -Entity Linking -==================================== - -Entity linking is the process of matching concepts mentioned in natural language to their unique IDs and canonical forms stored -in a knowledge base. For example, an entity linking model might match the phrase ``blood thinners`` mentioned in conversation -to the knowledge base concept UID45623 anticoagulant. Entity linking applications range from helping automate ingestion of -large amounts of data to assisting in real time concept normalization. - -Within NeMo we use the entity linking approach described in Liu et. al's NAACL 2021 "`Self-alignment Pre-training for Biomedical Entity Representations `_" :cite:`nlp-entity_linking-liu2021selfalignment`. -The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are -pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then -be used to build a knowledge base embedding index. - -.. image:: entity_linking_overview.jpg - :alt: Entity-Linking-Overview - :width: 800px - -Our BERT-base + Self Alignment Pretraining implementation allows you to train an entity linking encoder. We also provide example code -on building an index with `Medical UMLS `_ concepts `NeMo/examples/nlp/entity_linking/build_index.py `__. - -Please try the example Entity Linking model in a Jupyter notebook (can run on `Google's Colab `__). - -Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). - -An example script on how to train the model can be found here: `NeMo/examples/nlp/entity_linking `__. - - -References ----------- - -.. 
bibliography:: nlp_all.bib - :style: plain - :labelprefix: nlp-entity_linking - :keyprefix: nlp-entity_linking- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/entity_linking_overview.jpg b/SoundScribe/SpeakerID/docs/source/nlp/entity_linking_overview.jpg deleted file mode 100644 index e28711a6e34588b3e9ec08844be3c53771bebec1..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/nlp/entity_linking_overview.jpg and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/nlp/glue_benchmark.rst b/SoundScribe/SpeakerID/docs/source/nlp/glue_benchmark.rst deleted file mode 100644 index 849a49ea1a123e853d2c92be636bb8dabbe7b0f1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/glue_benchmark.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _glue_benchmark: - -GLUE Benchmark -============== - -We recommend you try the GLUE Benchmark model in a Jupyter notebook (can run on `Google's Colab `_): `NeMo/tutorials/nlp/GLUE_Benchmark.ipynb `__. - -Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). - -An example script on how to train the model can be found here: `NeMo/examples/nlp/glue_benchmark/glue_benchmark.py `__. diff --git a/SoundScribe/SpeakerID/docs/source/nlp/information_retrieval.rst b/SoundScribe/SpeakerID/docs/source/nlp/information_retrieval.rst deleted file mode 100644 index 3c71ffcfcd129710ce770cc6c1a005bb3bc3cc2b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/information_retrieval.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _information_retrieval: - -Information Retrieval -===================== - -We recommend you try the Information Retrieval model in a Jupyter notebook (can run on `Google's Colab `_): `NeMo/tutorials/nlp/Information_Retrieval_MSMARCO.ipynb `__. - -Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for hardware the accelerator), - -An example script on how to train the model can be found here: `NeMo/examples/nlp/information_retrieval `__. diff --git a/SoundScribe/SpeakerID/docs/source/nlp/joint_intent_slot.rst b/SoundScribe/SpeakerID/docs/source/nlp/joint_intent_slot.rst deleted file mode 100644 index 30f0962a241fef1f83ceb9fc5bd098582053ef94..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/joint_intent_slot.rst +++ /dev/null @@ -1,244 +0,0 @@ -.. _intent_slot: - -Joint Intent and Slot Classification -==================================== - -Joint Intent and Slot classification is a NLU task for classifying an intent and detecting all -relevant slots (Entities) for the intent in a query. For example, in the query ``What is the weather in Santa Clara tomorrow morning?``, -we would like to classify the query as a ``weather intent``, detect ``Santa Clara`` as a `location slot`, -and ``tomorrow morning`` as a ``date_time slot``. Intent and Slot names are usually task-specific and -defined as labels in the training data. This is a fundamental step that is executed in any -task-driven conversational assistant. - -Our BERT-based model implementation allows you to train and detect both of these tasks together. - -.. note:: - - We recommend you try the Joint Intent and Slot Classification model in a Jupyter notebook (can run on `Google's Colab `_.): `NeMo/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb `__. 
- - Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). - - An example script on how to train the model can be found here: `NeMo/examples/nlp/intent_slot_classification `__. - - -NeMo Data Format ----------------- - -When training the model, the dataset should be first converted to the required data format, which requires the following files: - -- :code:`dict.intents.csv` - A list of all intent names in the data. One line per an intent name. The index of the intent line - (starting from ``0``) is used to identify the appropriate intent in ``train.tsv`` and ``test.tsv`` files. - -.. code:: - - weather - alarm - meeting - ... - -- :code:`dict.slots.csv` - A list of all slot names in the data. One line per slot name. The index of the slot line - (starting from ``0``) is used to identify the appropriate slots in the queries in ``train_slot.tsv`` and ``test_slot.tsv`` files. - In the last line of this dictionary ``O`` slot name is used to identify all ``out of scope`` slots, which are usually the majority of the tokens - in the queries. - -.. code:: - - date - time - city - ... - O - -- :code:`train.tsv/test.tsv` - A list of original queries, one per line, with the intent number - separated by a tab (e.g. "what alarms do i have set right now 0"). Intent numbers are - set according to the intent line in the intent dictionary file (:code:`dict.intents.csv`), - starting from ``0``. The first line in these files should contain the header line ``sentence - label``. - -- :code:`train_slot.tvs/test_slot.tsv` - A list that contains one line per query, when each word from the original text queries - is replaced by a token number from the slots dictionary file (``dict.slots.csv``), counted starting from ``0``. All the words - which do not contain a relevant slot are replaced by ``out-of scope`` token number, which is also a part of the slot dictionary file, - usually as the last entry there. For example a line from these files should look similar to: "54 0 0 54 54 12 12" (the numbers are - separated by a space). These files do not contain a header line. - - -Dataset Conversion ------------------- - -To convert to the format of the model data, use the ``import_datasets`` utility, which implements -the conversion for the Assistant dataset. Download the dataset `here `_ or you can -write your own converter for the format that you are using for data annotation. - -For a dataset that follows your own annotation format, we recommend using one text file for all -samples of the same intent, with the name of the file as the name of the intent. Use one line per -query, with brackets to define slot names. This is very similar to the assistant format, and you can -adapt this converter utility or your own format with small changes: - -:: - - did i set an alarm to [alarm_type : wake up] in the [timeofday : morning] - -Run the ``dataset_converter`` command: - -.. code:: - - python examples/nlp/intent_slot_classification/data/import_datasets.py - --source_data_dir=`source_data_dir` \ - --target_data_dir=`target_data_dir` \ - --dataset_name=['assistant'|'snips'|'atis'] - -- :code:`source_data_dir`: the directory location of the your dataset -- :code:`target_data_dir`: the directory location where the converted dataset should be saved -- :code:`dataset_name`: one of the implemented dataset names - -After conversion, ``target_data_dir`` should contain the following files: - -.. code:: - - . 
- |--target_data_dir - |-- dict.intents.csv - |-- dict.slots.csv - |-- train.tsv - |-- train_slots.tsv - |-- test.tsv - |-- test_slots.tsv - -Model Training --------------- - -This is a pretrained BERT based model with 2 linear classifier heads on the top of it, one for classifying an intent of the query and -another for classifying slots for each token of the query. This model is trained with the combined loss function on the Intent and Slot -classification task on the given dataset. The model architecture is based on the paper `BERT for Joint Intent Classification and Slot Filling `__:cite:`nlp-jis-chen2019bert`. - -For each query, the model classifies it as one the intents from the intent dictionary and for each word of the query it will classify -it as one of the slots from the slot dictionary, including out of scope slot for all the remaining words in the query which does not -fall in another slot category. Out of scope slot (``O``) is a part of slot dictionary that the model is trained on. - -Example of model configuration file for training the model can be found at: `NeMo/examples/nlp/intent_slot_classification/conf/intent_slot_classification.yaml `__. -In the configuration file, define the parameters of the training and the model, although most of the default values will work well. - -The specification can be roughly grouped into three categories: - -- Parameters that describe the training process: **trainer** -- Parameters that describe the model: **model** -- Parameters that describe the datasets: **model.train_ds**, **model.validation_ds**, **model.test_ds**, - -More details about parameters in the spec file can be found below: - -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.data_dir** | string | -- | The path of the data converted to the specified format. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.class_balancing** | string | ``null`` | Choose from ``[null, weighted_loss]``. The ``weighted_loss`` enables weighted class balancing of the loss. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.intent_loss_weight** | float | ``0.6`` | The elation of intent-to-slot loss in the total loss. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.pad_label** | integer | ``-1`` | A value to pad the inputs. 
| -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.ignore_extra_tokens** | boolean | ``false`` | A flag that specifies whether to ignore extra tokens. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.ignore_start_end** | boolean | ``true`` | A flag that specifies whether to not use the first and last token for slot training. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.head.num_output_layers** | integer | ``2`` | The number of fully connected layers of the classifier on top of the BERT model. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **model.head.fc_dropout** | float | ``0.1`` | The dropout ratio of the fully connected layers. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **training_ds.prefix** | string | ``train`` | A prefix for the training file names. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **validation_ds.prefix** | string | ``dev`` | A prefix for the validation file names. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ -| **test_ds.prefix** | string | ``test`` | A prefix for the test file names. | -+-------------------------------------------+-----------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ - -For additional config parameters common to all NLP models, refer to the `nlp_model doc `__. - -The following is an example of the command for training the model: - -.. code:: - - python examples/nlp/intent_slot_classification/intent_slot_classification.py - model.data_dir= \ - trainer.max_epochs= \ - trainer.devices=[] \ - trainer.accelerator='gpu' - - -Required Arguments for Training -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- :code:`model.data_dir`: the dataset directory - - -Optional Arguments -^^^^^^^^^^^^^^^^^^ - -Most of the default parameters in the existing configuration file are already set appropriately, however, there are some parameters -you may want to experiment with. 
- -- ``trainer.max_epochs``: the number of training epochs (reasonable to be between 10 to 100) -- ``model.class_balancing``: value ``weighted_loss`` may help to train the model when there is unbalanced set of classes -- ``model.intent_loss_weight``: a number between 0 to 1 that defines a weight of the intent lost versus a slot loss during training. A default value 0.6 gives a slight preference for the intent lose optimization. - -Training Procedure -^^^^^^^^^^^^^^^^^^ - -At the start of evaluation, NeMo will print out a log of the experiment specification, a summary of the training dataset, and the -model architecture. - -As the model starts training, you should see a progress bar per epoch. During training, after each epoch, NeMo will display accuracy -metrics on the validation dataset for every intent and slot separately, as well as the total accuracy. You can expect these numbers -to grow up to 50-100 epochs, depending on the size of the trained data. Since this is a joint iIntent and slot training, usually -intent's accuracy will grow first for the initial 10-20 epochs, and after that, slot's accuracy will start improving as well. - -At the end of training, NeMo saves the best checkpoint on the validation dataset at the path specified by the experiment spec file -before finishing. - -.. code:: - - GPU available: True, used: True - TPU available: None, using: 0 TPU cores - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2] - [NeMo W 2021-01-28 14:52:19 exp_manager:299] There was no checkpoint folder at checkpoint_dir :results/checkpoints. Training from scratch. - [NeMo I 2021-01-28 14:52:19 exp_manager:186] Experiments will be logged at results - ... - label precision recall f1 support - weather.weather (label_id: 0) 0.00 0.00 0.00 128 - weather.temperature (label_id: 1) 0.00 0.00 0.00 0 - weather.temperature_yes_no (label_id: 2) 0.00 0.00 0.00 0 - weather.rainfall (label_id: 3) 0.00 0.00 0.00 0 - weather.rainfall_yes_no (label_id: 4) 0.00 0.00 0.00 0 - weather.snow (label_id: 5) 0.00 0.00 0.00 0 - weather.snow_yes_no (label_id: 6) 0.00 0.00 0.00 0 - weather.humidity (label_id: 7) 0.00 0.00 0.00 0 - weather.humidity_yes_no (label_id: 8) 0.00 0.00 0.00 0 - weather.windspeed (label_id: 9) 0.00 0.00 0.00 0 - weather.sunny (label_id: 10) 0.00 0.00 0.00 0 - weather.cloudy (label_id: 11) 0.00 0.00 0.00 0 - weather.alert (label_id: 12) 0.00 0.00 0.00 0 - context.weather (label_id: 13) 0.00 0.00 0.00 0 - context.continue (label_id: 14) 0.00 0.00 0.00 0 - context.navigation (label_id: 15) 0.00 0.00 0.00 0 - context.rating (label_id: 16) 0.00 0.00 0.00 0 - context.distance (label_id: 17) 0.00 0.00 0.00 0 - ------------------- - micro avg 0.00 0.00 0.00 128 - macro avg 0.00 0.00 0.00 128 - weighted avg 0.00 0.00 0.00 128 - -Model Evaluation and Inference ------------------------------- - -There is no separate script for the evaluation and inference of this model in NeMo, however, inside of the example file `examples/nlp/intent_slot_classification/intent_slot_classification.py` -after the training part is finished, you can see the code that evaluates the trained model on an evaluation test set and then an example of doing inference using a list of given queries. - -For the deployment in the production environment, refer to `NVIDIA Riva `__ and `NVIDIA TLT documentation `__. - -References ----------- - -.. 
bibliography:: nlp_all.bib - :style: plain - :labelprefix: NLP-JIS - :keyprefix: nlp-jis- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/language_modeling.rst b/SoundScribe/SpeakerID/docs/source/nlp/language_modeling.rst deleted file mode 100644 index 854afe6ac6eac7369bd6493c5fe3213ed7c938cf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/language_modeling.rst +++ /dev/null @@ -1,283 +0,0 @@ -.. _language_modeling: - -Language Modeling -================= - -A language model (LM) estimates the joint probability of a given text corpus :math:`(x_1,\dots,x_T)` by factorizing it with a chain rule :math:`P(x_1,\dots,x_T) = \prod_{t=1}^T P(x_t|x_1,\dots,x_{t-1})` and sequentially modeling each conditional term in the product. To simplify modeling, it is often assumed that the context size (a number of preceding words) necessary to predict each word :math:`x_t` in the corpus is limited to :math:`N:\;P(x_t|x_1,\dots,x_{t-1}) \approx P(x_t|x_{t-N},\dots,x_{t-1})`. This approximation is commonly referred to as N-gram LM. - -Currently, we mainly support sentence-level LMs which do not consider long-term dependencies and model all sentences independently of each other. Our models are based on the Transformer sequence-to-sequence architecture :cite:`nlp-language_modeling-vaswani2017attention`. - -| An example script on how to train the model can be found here: `NeMo/examples/nlp/language_modeling/transformer_lm.py `_. -| The default configuration file for the model can be found at: `NeMo/examples/nlp/language_modeling/conf/transformer_lm_config.yaml `_. - - -Data Format ------------ - -Unsupervised LMs require the corpus which comprises many examples of sentences from a particular domain (Wikipedia, news, Pubmed abstracts, etc). We assume that the data is formatted as a text file where each line corresponds to a separate sentence: - -.. list-table:: - :widths: 100 - :header-rows: 1 - - * - Sentence-level LM coprus - * - in a silver cake basket as the panins had at their party - * - let us pretermit that long comparison - * - poverty contempt and sickness treading on my heels i easily resolve not to be affrighted - -It is common practice to apply data cleaning, normalization, and tokenization to the data prior to training LM and -NeMo expects already cleaned, normalized, and tokenized data. The only data pre-processing NeMo does is subword tokenization with BPE :cite:`nlp-language_modeling-sennrich2015neural`. - -.. note:: - If LM is intended to be used in a conjunction with another model (e.g. :ref:`re-scoring of ASR `, shallow fusion with NMT), make sure that the training data is preprocessed accordingly (lower-case no punctuation for ASR, Moses tokenization/normalization for NMT). Otherwise, it might introduce inadequate LM scores. - - -Tokenizer Training ------------------- - -Our LMs support all tokenizers available in NeMo, but require special beginning-of-string ```` and end-of-string ```` tokens. - -Below is the example of training `YouTokenToMe `__ BPE tokenizer: - -.. code-block:: python - - import youtokentome as yttm - data = # string, path to file with training data - model = # string, path to where the trained model will be saved - vocab_size = # int, number of tokens in the final vocabulary - yttm.BPE.train(data, model, vocab_size) - - -Sentence Dataset Construction ------------------------------ - -Given BPE tokenizer and a cleaned sentence-level text corpus, the following steps are applied to create a `SentenceDataset `__ object. - -#. 
Text to IDs - Performs tokenization with the specified tokenizer model on an input sentence and maps it to a sequence of tokens. - -#. Bucketing - Sentences vary in length and when creating minibatches, we'd like sentences in them to have roughly the same length to minimize the number of ```` tokens and to maximize computational efficiency. This step groups sentences of roughly the same length into buckets. - -#. Batching and padding - Creates minibatches with a maximum number of tokens specified by ``model.{train_ds,validation_ds,test_ds}.tokens_in_batch`` from buckets and pads, so they can be packed into a tensor. - -To use ``SentenceDataset``, specify path to the training data in ``file_name`` in the experiment config file. Below is the list of all available configuration options: - -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.file_name** | str | ``null`` | Path to the file with sentences. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.tokens_in_batch** | int | ``512`` | Maximum number of tokens per minibatch. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.max_seq_length** | int | ``512`` | Maximum sequence length, to be used with the ``clean`` argument below. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.clean** | bool | ``true`` | Whether to clean the dataset by discarding examples that are greater than ``max_seq_length``. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.shuffle** | bool | ``true`` | Whether to shuffle minibatches in the PyTorch DataLoader. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.num_samples** | int | ``-1`` | Number of samples to use. ``-1`` for the entire dataset. 
| -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.pin_memory** | bool | ``false`` | Whether to pin memory in the PyTorch DataLoader. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.num_workers** | int | ``8`` | Number of workers for the PyTorch DataLoader. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ - - -Model Configuration and Training --------------------------------- - -The overall model consists of an encoder and a classification head with the following configuration options: - -.. list-table:: *Transformer Encoder Network* - :widths: 30 5 5 60 - :header-rows: 1 - - * - Parameter - - Data Type - - Default - - Description - * - **model.encoder.max_sequence_length** - - int - - ``512`` - - Maximum allowed sequence length. - * - **model.encoder.learn_positional_encodings** - - bool - - ``false`` - - If ``true``, this is a regular learnable embedding layer. If ``false``, fixes position encodings to sinusoidal. - * - **model.encoder.hidden_size** - - int - - ``512`` - - Size of the transformer hidden states. - * - **model.encoder.num_layers** - - int - - ``6`` - - Number of transformer layers. - * - **model.encoder.inner_size** - - int - - ``2048`` - - Size of the hidden states within the feedforward layers. - * - **model.encoder.num_attention_heads** - - int - - ``8`` - - Number of attention heads. - * - **model.encoder.embedding_dropout** - - float - - ``0.1`` - - Dropout probability of the embedding layer. - * - **model.encoder.ffn_dropout** - - float - - ``0.1`` - - Dropout probability within the feedforward layers. - * - **model.encoder.attn_score_dropout** - - float - - ``0.1`` - - Dropout probability of the attention scores before softmax normalization. - * - **model.encoder.attn_layer_dropout** - - float - - ``0.1`` - - Dropout probability of the attention query, key, and value projection activations. - * - **model.encoder.hidden_act** - - str - - ``relu`` - - Activation function throughout the network. - * - **model.encoder.mask_future** - - bool - - ``true`` - - Whether to mask future timesteps for attention. Defaults to ``true`` for the standard left-to-right LM. - * - **model.encoder.pre_ln** - - bool - - ``false`` - - Whether to apply layer-normalization before (``true``) or after (``false``) a sub-layer. - -.. list-table:: *Head Network (multilayer perceptron)* - :widths: 30 5 5 60 - :header-rows: 1 - - * - Parameter - - Data Type - - Default - - Description - * - **model.head.num_layers** - - int - - ``1`` - - Number of layers in the head network. - * - **model.head.activation** - - str - - ``relu`` - - Activation function used after each layer. - * - **model.head.log_softmax** - - bool - - ``true`` - - Whether to apply ``log_softmax`` to the final layer output. - * - **model.head.dropout** - - float - - ``0.0`` - - Dropout probability after each layer. 
Our pre-trained models are optimized with Adam, with a maximum learning rate of 0.001, betas of (0.9, 0.98), and an inverse square root learning rate schedule. The **model.optim** section sets the optimization parameters.

The following script trains a 6-layer Transformer LM:

.. code::

    python examples/nlp/language_modeling/transformer_lm.py \
      -cn transformer_lm_config \
      trainer.devices=2 \
      trainer.accelerator='gpu' \
      +exp_manager.exp_dir=/path/to/store/results \
      +exp_manager.create_checkpoint_callback=True \
      +exp_manager.checkpoint_callback_params.monitor=val_PPL \
      +exp_manager.checkpoint_callback_params.mode=min \
      +exp_manager.checkpoint_callback_params.save_top_k=5 \
      model.train_ds.file_name=/path/to/train.txt \
      model.validation_ds.file_name=/path/to/valid.txt \
      model.tokenizer.tokenizer_model=/path/to/yttm_tokenizer_model

The trainer keeps track of the LM perplexity (PPL) on the provided validation set and saves the checkpoints that have the top 5 (by default) PPL. At the end of training, a ``.nemo`` file is written to the result directory, which allows you to run inference on a test set.

Tarred Datasets for Large Corpora
---------------------------------

When training with ``DistributedDataParallel``, each process has its own copy of the dataset. For large datasets, this may not always fit in CPU memory. `Webdatasets `__ circumvents this problem by efficiently iterating over tar files stored on disk. Each tar file can contain hundreds to thousands of pickle files, each containing a single minibatch. We recommend using this method when working with datasets of more than 5 million sentences.

To use an existing ``TarredSentenceDataset`` instead of a non-tarred ``SentenceDataset``, set ``is_tarred: true`` in
the experiment config file. Then, pass in the path to the metadata file in ``metadata_file`` and paths to all of the text tarballs in ``tar_files``, either as a list
of filepaths, e.g. ``['/data/shard1.tar', '/data/shard2.tar']``, or in a single brace-expandable string, e.g.
``'/data/shard_{1..64}.tar'`` or ``'/data/shard__OP_1..64_CL_'`` (recommended, see the note below).

.. note::
    For brace expansion, there may be cases where ``{x..y}`` syntax cannot be used due to shell interference. This occurs most commonly
    inside SLURM scripts. Therefore, we provide a few equivalent replacements. Supported opening braces (equivalent to ``{``) are ``(``,
    ``[``, ``<`` and the special tag ``_OP_``. Supported closing braces (equivalent to ``}``) are ``)``, ``]``, ``>`` and the special
    tag ``_CL_``. For SLURM-based tasks, we suggest the use of the special tags for ease of use.

Tarred datasets for sentence-level LMs can be created with the following script:

.. code::

    python examples/nlp/machine_translation/create_tarred_monolingual_dataset.py \
        --pkl_file_prefix lm \
        --tokenizer_model /path/to/tokenizer_model \
        --fname /path/to/training_data \
        --out_dir /path/to/tarred_dataset \
        --tokens_in_batch 2048 \
        --num_batches_per_tarfile 250

For example, if your dataset contains 10000 batches, the script above will create 40 tarballs and the output directory will look similar to the following:

.. code::

    /path/to/tarred_dataset
    ├── lm-batches.tokens.2048.1.tar
    ├── lm-batches.tokens.2048.2.tar
    ├── ...
    ├── lm-batches.tokens.2048.40.tar
    └── metadata.json

To train the model on this dataset, the following parameters have to be specified in the **model.train_ds** section:

.. code::

    use_tarred_dataset: true
    tar_files: /path/to/tarred_dataset/lm-batches.2048._OP_1..40_CL_
    metadata_file: /path/to/tarred_dataset/metadata.json

Below is the full list of available configuration options for ``TarredSentenceDataset``:

.. list-table::
   :widths: 30 5 5 60
   :header-rows: 1

   * - Parameter
     - Data Type
     - Default
     - Description
   * - **model.{train_ds,validation_ds,test_ds}.use_tarred_dataset**
     - bool
     - ``false``
     - Whether to use tarred datasets.
   * - **model.{train_ds,validation_ds,test_ds}.tar_files**
     - str
     - ``null``
     - Path to all tar files. Either a list or a single brace-expandable string.
   * - **model.{train_ds,validation_ds,test_ds}.metadata_file**
     - str
     - ``null``
     - Path to the JSON metadata file that contains only a single entry for the total number of batches in the dataset.
   * - **model.{train_ds,validation_ds,test_ds}.tar_shuffle_n**
     - int
     - ``100``
     - How many samples to look ahead and load to be shuffled.
   * - **model.{train_ds,validation_ds,test_ds}.shard_strategy**
     - str
     - ``scatter``
     - How the shards are distributed between multiple workers. Either ``scatter`` (each node gets a unique subset of shards) or ``replicate`` (each node gets all of the shards available in the tarred dataset).

References
----------

.. bibliography:: nlp_all.bib
    :style: plain
    :labelprefix: nlp-language_modeling
    :keyprefix: nlp-language_modeling-
diff --git a/SoundScribe/SpeakerID/docs/source/nlp/machine_translation/machine_translation.rst b/SoundScribe/SpeakerID/docs/source/nlp/machine_translation/machine_translation.rst
deleted file mode 100644
index 190ac5b07da9a733b1abdd006fe0abed97278688..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/docs/source/nlp/machine_translation/machine_translation.rst
+++ /dev/null
@@ -1,783 +0,0 @@
.. _machine_translation:

Machine Translation Models
==========================

Machine translation is the task of translating text from one language to another, for example, from English to Spanish. Models are
based on the Transformer sequence-to-sequence architecture :cite:`nlp-machine_translation-vaswani2017attention`.

An example script on how to train the model can be found here: `NeMo/examples/nlp/machine_translation/enc_dec_nmt.py `__.
The default configuration file for the model can be found at: `NeMo/examples/nlp/machine_translation/conf/aayn_base.yaml `__.

Quick Start Guide
-----------------

.. code-block:: python

    from nemo.collections.nlp.models import MTEncDecModel

    # To get the list of pre-trained models
    MTEncDecModel.list_available_models()

    # Download and load a pre-trained model that translates from English to Spanish
    model = MTEncDecModel.from_pretrained("nmt_en_es_transformer24x6")

    # Translate a sentence or list of sentences
    translations = model.translate(["Hello!"], source_lang="en", target_lang="es")

Available Models
^^^^^^^^^^^^^^^^

..
list-table:: *Pretrained Models* - :widths: 5 10 - :header-rows: 1 - - * - Model - - Pretrained Checkpoint - * - *New Checkppoints* - - - * - English -> German - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_de_transformer24x6 - * - German -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_de_en_transformer24x6 - * - English -> Spanish - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_es_transformer24x6 - * - Spanish -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_es_en_transformer24x6 - * - English -> French - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_fr_transformer24x6 - * - French -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_fr_en_transformer24x6 - * - English -> Russian - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_ru_transformer24x6 - * - Russian -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_ru_en_transformer24x6 - * - English -> Chinese - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_zh_transformer24x6 - * - Chinese -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_zh_en_transformer24x6 - * - *Old Checkppoints* - - - * - English -> German - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_de_transformer12x2 - * - German -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_de_en_transformer12x2 - * - English -> Spanish - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_es_transformer12x2 - * - Spanish -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_es_en_transformer12x2 - * - English -> French - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_fr_transformer12x2 - * - French -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_fr_en_transformer12x2 - * - English -> Russian - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_ru_transformer6x6 - * - Russian -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_ru_en_transformer6x6 - * - English -> Chinese - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_en_zh_transformer6x6 - * - Chinese -> English - - https://ngc.nvidia.com/catalog/models/nvidia:nemo:nmt_zh_en_transformer6x6 - -Data Format ------------ - -Supervised machine translation models require parallel corpora which comprises many examples of sentences in a source language and -their corresponding translation in a target language. We use parallel data formatted as separate text files for source and target -languages where sentences in corresponding files are aligned like in the table below. - -.. list-table:: *Parallel Coprus* - :widths: 10 10 - :header-rows: 1 - - * - train.english.txt - - train.spanish.txt - * - Hello . - - Hola . - * - Thank you . - - Gracias . - * - You can now translate from English to Spanish in NeMo . - - Ahora puedes traducir del inglés al español en NeMo . - -It is common practice to apply data cleaning, normalization, and tokenization to the data prior to training a translation model and -NeMo expects already cleaned, normalized, and tokenized data. The only data pre-processing NeMo does is subword tokenization with BPE -:cite:`nlp-machine_translation-sennrich2015neural`. - -Data Cleaning, Normalization & Tokenization -------------------------------------------- - -We recommend applying the following steps to clean, normalize, and tokenize your data. All pre-trained models released, apply these data pre-processing steps. - -#. 
Please take a look at a detailed notebook on best practices to pre-process and clean your datasets - NeMo/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb - -#. Language ID filtering - This step filters out examples from your training dataset that aren't in the correct language. For example, - many datasets contain examples where source and target sentences are in the same language. You can use a pre-trained language ID - classifier from `fastText `__. Install fastText and then you can then run our script using the - ``lid.176.bin`` model downloaded from the fastText website. - - .. code :: - - python NeMo/scripts/neural_machine_translation/filter_langs_nmt.py \ - --input-src train.en \ - --input-tgt train.es \ - --output-src train_lang_filtered.en \ - --output-tgt train_lang_filtered.es \ - --source-lang en \ - --target-lang es \ - --removed-src train_noise.en \ - --removed-tgt train_noise.es \ - --fasttext-model lid.176.bin - -#. Length filtering - We filter out sentences from the data that are below a minimum length (1) or exceed a maximum length (250). We - also filter out sentences where the ratio between source and target lengths exceeds 1.3 except for English <-> Chinese models. - `Moses `__ is a statistical machine translation toolkit that contains many useful - pre-processing scripts. - - .. code :: - - perl mosesdecoder/scripts/training/clean-corpus-n.perl -ratio 1.3 train en es train.filter 1 250 - -#. Data cleaning - While language ID filtering can sometimes help with filtering out noisy sentences that contain too many punctuations, - it does not help in cases where the translations are potentially incorrect, disfluent, or incomplete. We use `bicleaner `__ - a tool to identify such sentences. It trains a classifier based on many features included pre-trained language model fluency, word - alignment scores from a word-alignment model like `Giza++ `__ etc. We use their available - pre-trained models wherever possible and train models ourselves using their framework for remaining languages. The following script - applies a pre-trained bicleaner model to the data and pick sentences that are clean with probability > 0.5. - - .. code :: - - awk '{print "-\t-"}' train.en \ - | paste -d "\t" - train.filter.en train.filter.es \ - | bicleaner-classify - -
> train.en-es.bicleaner.score - -#. Data deduplication - We use `bifixer `__ (which uses xxHash) to hash the source and target - sentences based on which we remove duplicate entries from the file. You may want to do something similar to remove training examples - that are in the test dataset. - - .. code :: - - cat train.en-es.bicleaner.score \ - | parallel -j 25 --pipe -k -l 30000 python bifixer.py --ignore-segmentation -q - - en es \ - > train.en-es.bifixer.score - - awk -F awk -F "\t" '!seen[$6]++' train.en-es.bifixer.score > train.en-es.bifixer.dedup.score - -#. Filter out data that bifixer assigns probability < 0.5 to. - - .. code :: - - awk -F "\t" '{ if ($5>0.5) {print $3}}' train.en-es.bifixer.dedup.score > train.cleaned.en - awk -F "\t" '{ if ($5>0.5) {print $4}}' train.en-es.bifixer.dedup.score > train.cleaned.es - -#. Punctuation Normalization - Punctuation, especially things like quotes can be written in different ways. - It's often useful to normalize the way they appear in text. We use the moses punctuation normalizer on all languages except Chinese. - - .. code :: - - perl mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l es < train.cleaned.es > train.normalized.es - perl mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l en < train.cleaned.en > train.normalized.en - - For example: - - .. code :: - - Before - Aquí se encuentran joyerías como Tiffany`s entre negocios tradicionales suizos como la confitería Sprüngli. - After - Aquí se encuentran joyerías como Tiffany's entre negocios tradicionales suizos como la confitería Sprüngli. - -#. Tokenization and word segmentation for Chinese - Naturally written text often contains punctuation markers like commas, full-stops - and apostrophes that are attached to words. Tokenization by just splitting a string on spaces will result in separate token IDs for - very similar items like ``NeMo`` and ``NeMo.``. Tokenization splits punctuation from the word to create two separate tokens. In the - previous example ``NeMo.`` becomes ``NeMo .`` which when split by space, results in two tokens and addresses the earlier problem. - - For example: - - .. code :: - - Before - Especialmente porque se enfrentará "a Mathieu (Debuchy), Yohan (Cabaye) y Adil (Rami) ", recuerda. - After - Especialmente porque se enfrentará " a Mathieu ( Debuchy ) , Yohan ( Cabaye ) y Adil ( Rami ) " , recuerda . - - We use the Moses tokenizer for all languages except Chinese. - - .. code :: - - perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es -no-escape < train.normalized.es > train.tokenized.es - perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l en -no-escape < train.normalized.en > train.tokenized.en - - For languages like Chinese where there is no explicit marker like spaces that separate words, we use `Jieba `__ to segment a string into words that are space separated. - - For example: - - .. code :: - - Before - 同时,卫生局认为有必要接种的其他人员,包括公共部门,卫生局将主动联络有关机构取得名单后由卫生中心安排接种。 - After - 同时 , 卫生局 认为 有 必要 接种 的 其他 人员 , 包括 公共部门 , 卫生局 将 主动 联络 有关 机构 取得 名单 后 由 卫生 中心 安排 接种 。 - -Training a BPE Tokenization ---------------------------- - -Byte-pair encoding (BPE) :cite:`nlp-machine_translation-sennrich2015neural` is a sub-word tokenization algorithm that is commonly used -to reduce the large vocabulary size of datasets by splitting words into frequently occuring sub-words. Currently, Machine translation -only supports the `YouTokenToMe `__ BPE tokenizer. 
One can set the tokenization configuration -as follows: - -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.{encoder_tokenizer,decoder_tokenizer}.tokenizer_name** | str | ``yttm`` | BPE library name. Only supports ``yttm`` for now. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.{encoder_tokenizer,decoder_tokenizer}.tokenizer_model** | str | ``null`` | Path to an existing YTTM BPE model. If ``null``, will train one from scratch on the provided data. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.{encoder_tokenizer,decoder_tokenizer}.vocab_size** | int | ``null`` | Desired vocabulary size after BPE tokenization. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.{encoder_tokenizer,decoder_tokenizer}.bpe_dropout** | float | ``null`` | BPE dropout probability. :cite:`nlp-machine_translation-provilkov2019bpe`. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.{encoder_tokenizer,decoder_tokenizer}.vocab_file** | str | ``null`` | Path to pre-computed vocab file if exists. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ -| **model.shared_tokenizer** | bool | ``True`` | Whether to share the tokenizer between the encoder and decoder. | -+-----------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------+ - - -Applying BPE Tokenization, Batching, Bucketing and Padding ----------------------------------------------------------- - -Given BPE tokenizers, and a cleaned parallel corpus, the following steps are applied to create a `TranslationDataset `__ object. - -#. Text to IDs - This performs subword tokenization with the BPE model on an input string and maps it to a sequence of tokens for the - source and target text. - -#. Bucketing - Sentences vary in length and when creating minibatches, we'd like sentences in them to have roughly the same length to - minimize the number of ```` tokens and to maximize computational efficiency. This step groups sentences roughly the same length - into buckets. - -#. Batching and padding - Creates minibatches with a maximum number of tokens specified by ``model.{train_ds,validation_ds,test_ds}.tokens_in_batch`` - from buckets and pads, so they can be packed into a tensor. 
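As a concrete illustration of the Text-to-IDs step above, the snippet below encodes a sentence with a trained YouTokenToMe BPE model (the tokenizer path is a placeholder); NeMo applies the same kind of mapping internally when it builds the dataset.

.. code-block:: python

    # Illustration of the Text-to-IDs step with a trained YouTokenToMe BPE model.
    # The model path is a placeholder for a tokenizer trained as described above.
    import youtokentome as yttm

    bpe = yttm.BPE(model="/path/to/yttm_tokenizer_model")

    ids = bpe.encode(
        ["You can now translate from English to Spanish in NeMo ."],
        output_type=yttm.OutputType.ID,
    )
    print(ids)  # one list of subword IDs per input sentence; values depend on the trained vocabulary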
- -Datasets can be configured as follows: - -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.src_file_name** | str | ``null`` | Path to the source language file. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.tgt_file_name** | str | ``null`` | Path to the target language file. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.tokens_in_batch** | int | ``512`` | Maximum number of tokens per minibatch. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.clean** | bool | ``true`` | Whether to clean the dataset by discarding examples that are greater than ``max_seq_length``. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.max_seq_length** | int | ``512`` | Maximum sequence to be used with the ``clean`` argument above. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.shuffle** | bool | ``true`` | Whether to shuffle minibatches in the PyTorch DataLoader. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.num_samples** | int | ``-1`` | Number of samples to use. ``-1`` for the entire dataset. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.drop_last** | bool | ``false`` | Drop last minibatch if it is not of equal size to the others. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.pin_memory** | bool | ``false`` | Whether to pin memory in the PyTorch DataLoader. 
| -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.num_workers** | int | ``8`` | Number of workers for the PyTorch DataLoader. | -+-------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------------+ - - -Tarred Datasets for Large Corpora ---------------------------------- - -When training with ``DistributedDataParallel``, each process has its own copy of the dataset. For large datasets, this may not always -fit in CPU memory. `Webdatasets `__ circumvents this problem by efficiently iterating over -tar files stored on disk. Each tar file can contain hundreds to thousands of pickle files, each containing a single minibatch. - -We recommend using this method when working with datasets with > 1 million sentence pairs. - -Tarred datasets can be configured as follows: - -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.use_tarred_dataset** | bool | ``false`` | Whether to use tarred datasets. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.tar_files** | str | ``null`` | String specifying path to all tar files. Example with 100 tarfiles ``/path/to/tarfiles._OP_1..100_CL_.tar``. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.metadata_file** | str | ``null`` | Path to JSON metadata file that contains only a single entry for the total number of batches in the dataset. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.lines_per_dataset_fragment** | int | ``1000000`` | Number of lines to consider for bucketing and padding. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.num_batches_per_tarfile** | int | ``100`` | Number of batches (pickle files) within each tarfile. 
| -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.tar_shuffle_n** | int | ``100`` | How many samples to look ahead and load to be shuffled. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.{train_ds,validation_ds,test_ds}.shard_strategy** | str | ``scatter`` | How the shards are distributed between multiple workers. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ -| **model.preproc_out_dir** | str | ``null`` | Path to folder that contains processed tar files or directory where new tar files are written. | -+-----------------------------------------------------------------------+-----------------+----------------+----------------------------------------------------------------------------------------------------------------+ - -Tarred datasets can be created in two ways: - -#. Using the Hydra config and `training script `__. - - For example: - - .. code :: - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - -cn aayn_base \ - do_training=false \ - model.preproc_out_dir=/path/to/preproc_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.lines_per_dataset_fragment=1000000 \ - model.train_ds.num_batches_per_tarfile=200 \ - model.train_ds.src_file_name=train.tokenized.en \ - model.train_ds.tgt_file_name=train.tokenized.es \ - model.validation_ds.src_file_name=validation.tokenized.en \ - model.validation_ds.tgt_file_name=validation.tokenized.es \ - model.encoder_tokenizer.vocab_size=32000 \ - model.decoder_tokenizer.vocab_size=32000 \ - ~model.test_ds \ - trainer.devices=[0,1,2,3] \ - trainer.accelerator='gpu' \ - +trainer.fast_dev_run=true \ - exp_manager=null \ - - The above script processes the parallel tokenized text files into tarred datasets that are written to ``/path/to/preproc_dir``. Since - ``do_training`` is set to ``False``, the above script only creates tarred datasets and then exits. If ``do_training`` is set ``True``, - then one of two things happen: - - (a) If no tar files are present in ``model.preproc_out_dir``, the script first creates those files and then commences training. - (b) If tar files are already present in ``model.preproc_out_dir``, the script starts training from the provided tar files. - -#. Using a separate script without Hydra. - - Tarred datasets for parallel corpora can also be created with a script that doesn't require specifying a configs via Hydra and - just uses Python argparse. - - For example: - - .. code :: - - python examples/nlp/machine_translation/create_tarred_parallel_dataset.py \ - --shared_tokenizer \ - --clean \ - --bpe_dropout 0.1 \ - --src_fname train.tokenized.en \ - --tgt_fname train.tokenized.es \ - --out_dir /path/to/preproc_dir \ - --vocab_size 32000 \ - --max_seq_length 512 \ - --min_seq_length 1 \ - --tokens_in_batch 8192 \ - --lines_per_dataset_fragment 1000000 \ - --num_batches_per_tarfile 200 - - You can then set `model.preproc_out_dir=/path/to/preproc_dir` and `model.train_ds.use_tarred_dataset=true` to train with this data. 
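If the tar files and metadata were created ahead of time (for example, by either of the two methods above), the table above suggests they can also be referenced directly instead of being regenerated. A minimal sketch (the paths, the ``_OP_1..100_CL_`` pattern, and the metadata file name are illustrative placeholders):

.. code ::

    model.train_ds.use_tarred_dataset=true \
    model.train_ds.tar_files=/path/to/preproc_dir/tarfiles._OP_1..100_CL_.tar \
    model.train_ds.metadata_file=/path/to/preproc_dir/metadata.json \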
- -Model Configuration and Training --------------------------------- - -The overall model consists of an encoder, decoder, and classification head. Encoders and decoders have the following configuration -options: - -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.max_sequence_length** | int | ``512`` | Maximum sequence length of positional encodings. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.embedding_dropout** | float | ``0.1`` | Path to JSON metadata file that contains only a single entry for the total number of batches in the dataset. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.learn_positional_encodings** | bool | ``false`` | If ``True``, this is a regular learnable embedding layer. If ``False``, fixes position encodings to sinusoidal. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.hidden_size** | int | ``512`` | Size of the transformer hidden states. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.num_layers** | int | ``6`` | Number of transformer layers. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.inner_size** | int | ``2048`` | Size of the hidden states within the feedforward layers. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.num_attention_heads** | int | ``8`` | Number of attention heads. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.ffn_dropout** | float | ``0.1`` | Dropout probability within the feedforward layers. 
| -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.attn_score_dropout** | float | ``0.1`` | Dropout probability of the attention scores before softmax normalization. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.attn_layer_dropout** | float | ``0.1`` | Dropout probability of the attention query, key, and value projection activations. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.hidden_act** | str | ``relu`` | Activation function throughout the network. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.mask_future** | bool | ``false``, ``true`` | Whether to mask future timesteps for attention. Defaults to ``True`` for decoder and ``False`` for encoder. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ -| **model.{encoder,decoder}.pre_ln** | bool | ``false`` | Whether to apply layer-normalization before (``true``) or after (``false``) a sub-layer. | -+-------------------------------------------------------------------+-----------------+-----------------------+-----------------------------------------------------------------------------------------------------------------+ - -Our pre-trained models are optimized with Adam, with a maximum learning of 0.0004, beta of (0.9, 0.98), and inverse square root learning -rate schedule from :cite:`nlp-machine_translation-vaswani2017attention`. The **model.optim** section sets the optimization parameters. - -The following script creates tarred datasets based on the provided parallel corpus and trains a model based on the ``base`` configuration -from :cite:`nlp-machine_translation-vaswani2017attention`. - -.. 
code :: - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - -cn aayn_base \ - do_training=true \ - trainer.devices=8 \ - trainer.accelerator='gpu' \ - ~trainer.max_epochs \ - +trainer.max_steps=100000 \ - +trainer.val_check_interval=1000 \ - +exp_manager.exp_dir=/path/to/store/results \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_top_k=5 \ - model.preproc_out_dir=/path/to/preproc_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.lines_per_dataset_fragment=1000000 \ - model.train_ds.num_batches_per_tarfile=200 \ - model.train_ds.src_file_name=train.tokenized.en \ - model.train_ds.tgt_file_name=train.tokenized.es \ - model.validation_ds.src_file_name=validation.tokenized.en \ - model.validation_ds.tgt_file_name=validation.tokenized.es \ - model.encoder_tokenizer.vocab_size=32000 \ - model.decoder_tokenizer.vocab_size=32000 \ - ~model.test_ds \ - -The trainer keeps track of the sacreBLEU score :cite:`nlp-machine_translation-post2018call` on the provided validation set and saves -the checkpoints that have the top 5 (by default) sacreBLEU scores. - -At the end of training, a ``.nemo`` file is written to the result directory which allows to run inference on a test set. - -Multi-Validation ----------------- - -To run validation on multiple datasets, specify ``validation_ds.src_file_name`` and ``validation_ds.tgt_file_name`` with a list of file paths: - -.. code-block:: bash - - model.validation_ds.src_file_name=[/data/wmt13-en-de.src,/data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/data/wmt13-en-de.ref,/data/wmt14-en-de.ref] \ - -When using ``val_loss`` or ``val_sacreBLEU`` for the ``exp_manager.checkpoint_callback_params.monitor`` -then the 0th indexed dataset will be used as the monitor. - -To use other indexes, append the index: - -.. code-block:: bash - - exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU_dl_index_1 - -Multiple test datasets work exactly the same way as validation datasets, simply replace ``validation_ds`` by ``test_ds`` in the above examples. - -Bottleneck Models and Latent Variable Models (VAE, MIM) -------------------------------------------------------- - -NMT with bottleneck encoder architecture is also supported (i.e., fixed size bottleneck), along with the training of Latent Variable Models (currently VAE, and MIM). - -1. Supported learning frameworks (**model.model_type**): - * NLL - Conditional cross entropy (the usual NMT loss) - * VAE - Variational Auto-Encoder (`paper `_) - * MIM - Mutual Information Machine (`paper `_) -2. 
Supported encoder architectures (**model.encoder.arch**): - * seq2seq - the usual transformer encoder without a bottleneck - * bridge - attention bridge bottleneck (`paper `_) - * perceiver - Perceiver bottleneck (`paper `_) - - -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Default** | **Description** | -+========================================+================+==============+=======================================================================================================+ -| **model.model_type** | str | ``nll`` | Learning (i.e., loss) type: nll (i.e., cross-entropy/auto-encoder), mim, vae (see description above) | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.min_logv** | float | ``-6`` | Minimal allowed log variance for mim | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.latent_size** | int | ``-1`` | Dimension of latent (projected from hidden) -1 will take value of hidden size | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model. non_recon_warmup_batches** | bool | ``200000`` | Warm-up steps for mim, and vae losses (anneals non-reconstruction part) | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model. recon_per_token** | bool | ``true`` | When false reconstruction is computed per sample, not per token | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.encoder.arch** | str | ``seq2seq`` | Supported architectures: ``seq2seq``, ``bridge``, ``perceiver`` (see description above). | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.encoder.hidden_steps** | int | ``32`` | Fixed number of hidden steps | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.encoder.hidden_blocks** | int | ``1`` | Number of repeat blocks (see classes for description) | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ -| **model.encoder. 
hidden_init_method** | str | ``default`` | See classes for available values | -+----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+ - - -Detailed description of config parameters: - -* **model.encoder.arch=seq2seq** - * *model.encoder.hidden_steps is ignored* - * *model.encoder.hidden_blocks is ignored* - * *model.encoder.hidden_init_method is ignored* -* **model.encoder.arch=bridge** - * *model.encoder.hidden_steps:* input is projected to the specified fixed steps - * *model.encoder.hidden_blocks:* number of encoder blocks to repeat after attention bridge projection - * *model.encoder.hidden_init_method:* - * enc_shared (default) - apply encoder to inputs, than attention bridge, followed by hidden_blocks number of the same encoder (pre and post encoders share parameters) - * identity - apply attention bridge to inputs, followed by hidden_blocks number of the same encoder - * enc - similar to enc_shared but the initial encoder has independent parameters -* **model.encoder.arch=perceiver** - * *model.encoder.hidden_steps:* input is projected to the specified fixed steps - * *model.encoder.hidden_blocks:* number of cross-attention + self-attention blocks to repeat after initialization block (all self-attention and cross-attention share parameters) - * *model.encoder.hidden_init_method:* - * params (default) - hidden state is initialized with learned parameters followed by cross-attention with independent parameters - * bridge - hidden state is initialized with an attention bridge - - -Training requires the use of the following script (instead of ``enc_dec_nmt.py``): - -.. code :: - - python -- examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py \ - --config-path=conf \ - --config-name=aayn_bottleneck \ - ... - model.model_type=nll \ - model.non_recon_warmup_batches=7500 \ - model.encoder.arch=perceiver \ - model.encoder.hidden_steps=32 \ - model.encoder.hidden_blocks=2 \ - model.encoder.hidden_init_method=params \ - ... - - -Model Inference ---------------- - -To generate translations on a test set and compute sacreBLEU scores, run the inference script: - -.. code :: - - python examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model /path/to/model.nemo \ - --srctext test.en \ - --tgtout test.en-es.translations \ - --batch_size 128 \ - --source_lang en \ - --target_lang es - -The ``--srctext`` file must be provided before tokenization and normalization. The resulting ``--tgtout`` file is detokenized and -can be used to compute sacreBLEU scores. - -.. code :: - - cat test.en-es.translations | sacrebleu test.es - -Inference Improvements ----------------------- - -In practice, there are a few commonly used techniques at inference to improve translation quality. NeMo implements: - -1) Model Ensembling -2) Shallow Fusion decoding with transformer language models :cite:`nlp-machine_translation-gulcehre2015using` -3) Noisy-channel re-ranking :cite:`nlp-machine_translation-yee2019simple` - -(a) Model Ensembling - Given many models trained with the same encoder and decoder tokenizer, it is possible to ensemble their predictions (by averaging probabilities at each step) to generate better translations. - -.. math:: - - P(y_t|y_{`__ -or `Megatron-LM `__ -can be used to to train NeMo NMT models. - -The ``library`` flag takes values: ``huggingface``, ``megatron``, and ``nemo``. - -The ``model_name`` flag is used to indicate a *named* model architecture. 
-For example, we can use ``bert_base_cased`` from HuggingFace or ``megatron-bert-345m-cased`` from Megatron-LM. - -The ``pretrained`` flag indicates whether or not to download the pretrained weights (``pretrained=True``) or -instantiate the same model architecture with random weights (``pretrained=False``). - -To use a custom model architecture from a specific library, use ``model_name=null`` and then add the -custom configuration under the ``encoder`` configuration. - -HuggingFace -^^^^^^^^^^^ - -We have provided a `HuggingFace config file `__ -to use with HuggingFace encoders. - -To use the config file from CLI: - -.. code :: - - --config-path=conf \ - --config-name=huggingface \ - -As an example, we can configure the NeMo NMT encoder to use ``bert-base-cased`` from HuggingFace -by using the ``huggingface`` config file and setting - -.. code :: - - model.encoder.pretrained=true \ - model.encoder.model_name=bert-base-cased \ - -To use a custom architecture from HuggingFace we can use - -.. code :: - - +model.encoder._target_=transformers.BertConfig \ - +model.encoder.hidden_size=1536 \ - -Note the ``+`` symbol is needed if we're not adding the arguments to the YAML config file. - -Megatron -^^^^^^^^ - -We have provided a `Megatron config file `__ -to use with Megatron encoders. - -To use the config file from CLI: - -.. code :: - - --config-path=conf \ - --config-name=megatron \ - -The ``checkpoint_file`` should be the path to Megatron-LM checkpoint: - -.. code :: - - /path/to/your/megatron/checkpoint/model_optim_rng.pt - -In case your megatron model requires model parallelism, then ``checkpoint_file`` should point to the directory containing the -standard Megatron-LM checkpoint format: - -.. code :: - - 3.9b_bert_no_rng - ├── mp_rank_00 - │ └── model_optim_rng.pt - ├── mp_rank_01 - │ └── model_optim_rng.pt - ├── mp_rank_02 - │ └── model_optim_rng.pt - └── mp_rank_03 - └── model_optim_rng.pt - -As an example, to train a NeMo NMT model with a 3.9B Megatron BERT encoder, -we would use the following encoder configuration: - -.. code :: - - model.encoder.checkpoint_file=/path/to/megatron/checkpoint/3.9b_bert_no_rng \ - model.encoder.hidden_size=2560 \ - model.encoder.num_attention_heads=40 \ - model.encoder.num_layers=48 \ - model.encoder.max_position_embeddings=512 \ - -To train a Megatron 345M BERT, we would use - -.. code :: - - model.encoder.model_name=megatron-bert-cased \ - model.encoder.checkpoint_file=/path/to/your/megatron/checkpoint/model_optim_rng.pt \ - model.encoder.hidden_size=1024 \ - model.encoder.num_attention_heads=16 \ - model.encoder.num_layers=24 \ - model.encoder.max_position_embeddings=512 \ - -If the pretrained megatron model used a custom vocab file, then set: - -.. code:: - - model.encoder_tokenizer.vocab_file=/path/to/your/megatron/vocab_file.txt - model.encoder.vocab_file=/path/to/your/megatron/vocab_file.txt - - -Use ``encoder.model_name=megatron_bert_uncased`` for uncased models with custom vocabularies and -use ``encoder.model_name=megatron_bert_cased`` for cased models with custom vocabularies. - - -References ----------- - -.. 
bibliography:: ../nlp_all.bib - :style: plain - :labelprefix: nlp-machine_translation - :keyprefix: nlp-machine_translation- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/megatron.rst b/SoundScribe/SpeakerID/docs/source/nlp/megatron.rst deleted file mode 100644 index 743aa2f84b536fae71c666d653c75ee949c1149d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/megatron.rst +++ /dev/null @@ -1,183 +0,0 @@ -.. _megatron_finetuning: - -NeMo Megatron -============= - -Megatron-LM :cite:`nlp-megatron-shoeybi2019megatron` is a large, powerful transformer developed by the Applied Deep Learning Research -team at NVIDIA. Currently NeMo Megatron supports 3 types of models: - -* GPT-style models (decoder only) -* T5/BART-style models (encoder-decoder) -* BERT-style models (encoder only) - -.. note:: - We recommend using `NeMo Megatron containers `_ for pre-training, tuning and running inference with large (1B and above) Megatrons. - - -Model Parallelism ------------------ - -`Megatron-LM `_ is a highly optimized and efficient library for training large language models. -With Megatron model parallelism, language models can be trained with billions of weights and then used in NeMo for downstream tasks. - -NeMo handles pretrained model parallel checkpoints from Megatron-LM automatically and model parallel models in NeMo have the all -the same features as other NeMo Models. - -.. note:: - - Currently, NeMo only supports tensor model parallelism. - -Training -^^^^^^^^ - -All of the necessary logic to train model parallel models in NeMo with PyTorch Lightning is contained in the ``NLPDDPStrategy``. -The ``NLPDDPStrategy`` subclasses the PyTorch Lightning strategy type ``DDPStrategy``. -See `strategies `_ for more information on PyTorch Lightning Strategies - -To enable model parallel training in NeMo: - -.. code-block:: python - - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - -Megatron-LM checkpoints have a specific format. One checkpoint is saved for each model parallel rank: - -.. code-block:: bash - - iter_0080000/ - ├── mp_rank_00 - │ └── model_optim_rng.pt - └── mp_rank_01 - └── model_optim_rng.pt - - -To start fine-tuning from a Megatron-LM checkpoint, simply pass the path to the Megatron-LM checkpoint -via the language model config: - -.. code-block:: bash - - model.language_model.lm_checkpoint=/raid/megatron/bert/iter_0080000 \ - -We also need to input the model configuration. This can be done via json: - -.. code-block:: json - - { - "hidden-size": 1024, - "num-attention-heads": 16, - "num-layers": 24, - "max-seq-length": 512 - } - -And input via command line: - -.. code-block:: bash - - model.language_model.config_file=/raid/data/megatron/bert/config.json \ - -Or the model configuration can be input via YAML: - -.. code-block:: YAML - - model: - language_model: - config: - hidden_size: 1024 - num_attention_heads: 16 - num_layers: 24 - max_position_embeddings: 512 - -Additionally, Megatron-LM requires a vocab file: - -.. code-block:: bash - - model.tokenizer.vocab_file=/path/to/vocab.txt - -If using the Megatron-LM default tokenizer for training BERT the vocab file can be omitted: - -.. code-block:: bash - - # uncased model - model.tokenizer.tokenizer_name=megatron-bert-uncased - -.. code-block:: bash - - # cased model - model.tokenizer.tokenizer_name=megatron-bert-uncased - -Auto-Resume -^^^^^^^^^^^ - -Resuming training with NeMo experiment manager and PyTorch Lightning works exactly the same as other NeMo models. 
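A minimal sketch of the relevant overrides, assuming the same ``exp_manager`` auto-resume options used in the GPT pretraining examples elsewhere in these docs also apply to model parallel training:

.. code-block:: bash

    exp_manager.resume_if_exists=True \
    exp_manager.resume_ignore_no_checkpoint=True \
    exp_manager.create_checkpoint_callback=True \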
-While training with PTL, model parallel checkpoint will be saved and loaded properly. - -.. code-block:: bash - - checkpoints/ - ├── mp_rank_00 - │ ├── mp_autoresume-last.ckpt - │ ├── mp_autoresume---val_loss=0.35-epoch=0.ckpt - │ ├── mp_autoresume---val_loss=0.38-epoch=1.ckpt - │ └── mp_autoresume---val_loss=0.39-epoch=2.ckpt - └── mp_rank_01 - ├── mp_autoresume-last.ckpt - ├── mp_autoresume---val_loss=0.35-epoch=0.ckpt - ├── mp_autoresume---val_loss=0.38-epoch=1.ckpt - └── mp_autoresume---val_loss=0.39-epoch=2.ckpt - -Save and Restore -^^^^^^^^^^^^^^^^ - -Model parallel .nemo files behave the same as all other .nemo files. Calling ``.save_to`` will save -a checkpoint for each model parallel rank inside the .nemo file: - -.. code-block:: bash - - text_class_350m - ├── megatron-bert-uncased_encoder_config.json - ├── megatron_checkpoint_version.json - ├── model_config.yaml - ├── mp_rank_00 - │ └── model_weights.ckpt - ├── mp_rank_01 - │ └── model_weights.ckpt - ├── tokenizer_vocab_dict.json - └── tokenizer.vocab_file - -When restoring a model parallel .nemo file, we must pass in the ``Trainer`` as model parallel requires DDP: - -.. code-block:: python - - model = TokenClassificationModel.restore_from(cfg.pretrained_model, trainer=trainer) - -Evaluation -^^^^^^^^^^ - -Since model parallel models always require more than one GPU, the ``Trainer`` is needed for evaluation: - -.. code-block:: python - - trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer) - model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=model, ckpt_path=None) - -BioMegatron ------------ - -BioMegatron has the same network architecture as the Megatron-LM, but is pretrained on a different dataset - `PubMed `_, -a large biomedical text corpus, which achieves better performance in biomedical downstream tasks than the original Megatron-LM. - -Examples of using BioMegatron on biomedical downstream tasks can be found at (can be executed with `Google's Colab `_): -`NeMo/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb `__ and `NeMo/tutorials/nlp/Token_Classification-BioMegatron.ipynb `__. - - -References ----------- - -.. bibliography:: nlp_all.bib - :style: plain - :labelprefix: NLP-MEGATRON - :keyprefix: nlp-megatron- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/megatron_onnx_export.rst b/SoundScribe/SpeakerID/docs/source/nlp/megatron_onnx_export.rst deleted file mode 100644 index ee6138d1f912af0b45b66778f8702cc17807322e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/megatron_onnx_export.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. _megatron_onnx_export: - -ONNX Export of Megatron Models -==================================== - -This guide demonstrates the usage of the ONNX export functionality for Megatron models. - -Requirements ------------------ -Set up the development environment by launching the latest `NeMo container `_ - -The minimum version requirements for NeMo and TransformerEngine are below - -.. code-block:: bash - - nemo > 1.19 - transformer_engine > 0.10 - -Export to ONNX ------------------ -The export script supports the ONNX export of models with .nemo and .ckpt file extensions. The script also supports the export of the following types of models: GPT, T5, BERT, BART, NMT, RETRO. -Commands for both file formats are discussed in the following sections. The model type used for the examples is GPT. 
- - -Export using .nemo file -^^^^^^^^^^^^^^^^^^^^^^^^ -A model with .nemo file extension can be exported using the command below - -.. code-block:: bash - - python3 examples/nlp/language_modeling/megatron_export.py \ - model_type=gpt \ - onnx_model_file=gpt_126m.onnx \ - gpt_model_file=gpt_126m.nemo - -Export using .ckpt file -^^^^^^^^^^^^^^^^^^^^^^^^ -A model with .ckpt file extension can be exported using the command below - -.. code-block:: bash - - python3 examples/nlp/language_modeling/megatron_export.py \ - model_type=gpt \ - onnx_model_file=gpt_126m.onnx \ - checkpoint_dir=./gpt_126m/ \ - checkpoint_name=model_weights.ckpt \ - hparams_file=./gpt_126m/hparams.yaml \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/models.rst b/SoundScribe/SpeakerID/docs/source/nlp/models.rst deleted file mode 100644 index ad50d976db9fd822323e84689819c42aed9b3c39..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/models.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _nlp_models: - -Tasks -===== - -NeMo's NLP collection supports provides the following task-specific models: - -.. toctree:: - :maxdepth: 1 - - punctuation_and_capitalization_models - spellchecking_asr_customization - token_classification - joint_intent_slot - text_classification - bert_pretraining - language_modeling - nemo_megatron/prompt_learning - question_answering - dialogue - glue_benchmark - information_retrieval - entity_linking - nlp_model - machine_translation/machine_translation diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/batching.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/batching.rst deleted file mode 100644 index b7d6ea21306771de1eb90bb58f0578c8c8dbeb82..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/batching.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. _batching: - -Batching --------- - -Batch size is one of the first parameters you should play with. For efficiency and convergence reasons we recommend you first try maximizing your batch size per GPU so that your GPU RAM usage is maximized. - -NeMo Megatron uses the following concepts. - -*Micro batch size* is the number of examples per data parallel rank. It is controlled by ``model.micro_batch_size`` parameter. - -*Global batch size* = micro_batch_size * data_parallel_size * gradient_accumulation_steps. For details on ``data_parallel_size`` see :ref:`parallelisms` section, but typically it is equal to the number of GPUs being used. -Global batch size is controlled by ``model.global_batch_size`` parameter. - - -*Gradient Accumulation* - - * Idea: Train with large batch sizes with fixed memory footprint at the cost of additional compute. - * Do k forward and backward passes through the network with different batches, do not perform parameter updates until after k passes. - * Update paramters - diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/flash_attention.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/flash_attention.rst deleted file mode 100644 index b00b7a38d63ab203c862a53f7bfb27878eba53d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/flash_attention.rst +++ /dev/null @@ -1,28 +0,0 @@ -Flash attention ---------------- -Flash Attention :cite:`nlp-megatron-dao2022flashattention` is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as natural language processing. 
Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms. - -GPT -^^^ -To enable Flash Attention while Megatron GPT model training or fine-tuning, modify the following configuration: - -.. code:: - - model.use_flash_attention=True - -T5 -^^ -To enable Flash Attention while Megatron T5 model training, modify the following configuration: - -.. code:: - - model.encoder.use_flash_attention=True - model.decoder.use_flash_attention=True - -References ----------- - -.. bibliography:: ../nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst deleted file mode 100644 index 4c0a09b7f6ea3d97207bb205807fa33836a04523..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst +++ /dev/null @@ -1,232 +0,0 @@ -GPT model training ------------------- - -GPT is a decoder-only Transformer model. - - -Quick start -^^^^^^^^^^^ -Steps below demonstrate training of a GPT style model with NeMo - -Data download & pre-processing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. note:: - Data download, pre-processing and tokenizer training in the example below will take ~3 hours. - -**Step 1: Download data** - -The step below will download Wikipedia data (around 20GB) and can take some several hours. - -.. code-block:: bash - - wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 - -**Step 2: Extract raw data** - -.. code-block:: bash - - pip install wikiextractor - python -m wikiextractor.WikiExtractor enwiki-latest-pages-articles.xml.bz2 --json - find text -name 'wiki_*' -exec cat {} \; > train_data.jsonl - -Now, ``train_data.jsonl`` will contain our training data in the json line format. We are interested in the data under "text" field. - - -**Step 3: Train tokenizer** - -Below we will condider 2 options for training data tokenizers: Using pre-built HuggingFace BPE and training and using your own Google Sentencepiece tokenizer. -Note that only second option allows you to experiment with vocabulary size. - -*Option 1:* Using HuggingFace GPT2 tokenizer files. - -With this option we will just download pre-built vocabulary and merge files for BPE tokenizer. - -.. code-block:: bash - - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt - - -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. - -It comes as dependency with NeMo, so if you have installed NeMo it should already be installed. -Note that training tokenizer model will also take some time. - -.. 
code-block:: bash - - sudo apt install jq - jq .text train_data.jsonl >> text_for_tokenizer.txt - spm_train --input=text_for_tokenizer.txt \ - --model_prefix=spm_32k_wiki \ - --vocab_size=32768 \ - --character_coverage=0.9999 \ - --model_type=bpe \ - --byte_fallback=true \ - --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \ - --split_digits true - -After this is done (will take a while), you'll have two files: ```spm_32k_wiki.model and spm_32k_wiki.vocab`` which correspond to model and vocabulary. - -**Step 4: Convert training data into memory map format** - -This format makes training more efficient, especially with many nodes and GPUs. This step will also tokenize data using tokenizer model from Step 3. - -*Option 1:* Using HuggingFace GPT2 tokenizer files. - -.. code-block:: bash - - python /scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ - --input=train_data.jsonl \ - --json-keys=text \ - --tokenizer-library=megatron \ - --vocab gpt2-vocab.json \ - --dataset-impl mmap \ - --tokenizer-type GPT2BPETokenizer \ - --merge-file gpt2-merges.txt \ - --output-prefix=hfbpe_gpt_training_data \ - --append-eod \ - --workers=32 - -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. - -.. code-block:: bash - - python /scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ - --input=train_data.jsonl \ - --json-keys=text \ - --tokenizer-library=sentencepiece \ - --tokenizer-model=spm_32k_wiki.model \ - --output-prefix=gpt_training_data \ - --append-eod \ - --workers=32 - - -Train GPT-style Model -~~~~~~~~~~~~~~~~~~~~~ - -Once you have prepared training data and tokenizer, you are ready to train the model. -The configuration we present below has about 124M parameters and it should fit on a single 16GB GPU if using float16. -Let's go!!! - -*Option 1:* Using HuggingFace GPT2 tokenizer files. - -.. 
code-block:: bash - - python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \ - --config-name=megatron_gpt_config \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ - trainer.log_every_n_steps=50 \ - trainer.limit_val_batches=50 \ - trainer.limit_test_batches=50 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=16 \ - model.micro_batch_size=6 \ - model.global_batch_size=192 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.max_position_embeddings=1024 \ - model.encoder_seq_length=1024 \ - model.hidden_size=768 \ - model.ffn_hidden_size=3072 \ - model.num_layers=12 \ - model.num_attention_heads=12 \ - model.init_method_std=0.021 \ - model.hidden_dropout=0.1 \ - model.layernorm_epsilon=1e-5 \ - model.tokenizer.vocab_file=gpt2-vocab.json \ - model.tokenizer.merge_file=gpt2-merges.txt \ - model.data.data_prefix=[1.0,hfbpe_gpt_training_data_text_document] \ - model.data.num_workers=2 \ - model.data.seq_length=1024 \ - model.data.splits_string=\'980,10,10\' \ - model.optim.name=fused_adam \ - model.optim.lr=6e-4 \ - model.optim.betas=[0.9,0.95] \ - model.optim.weight_decay=0.1 \ - model.optim.sched.name=CosineAnnealing \ - model.optim.sched.warmup_steps=750 \ - model.optim.sched.constant_steps=80000 \ - model.optim.sched.min_lr=6e-5 \ - exp_manager.resume_if_exists=True \ - exp_manager.resume_ignore_no_checkpoint=True \ - exp_manager.create_checkpoint_callback=True \ - exp_manager.checkpoint_callback_params.monitor=val_loss \ - exp_manager.checkpoint_callback_params.save_top_k=3 \ - exp_manager.checkpoint_callback_params.mode=min \ - exp_manager.checkpoint_callback_params.always_save_nemo=False - - -*Option 2:* Using `Google Sentencepiece `_ tokenizer library. - -.. 
code-block:: bash - - python /home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - --config-path=/home/okuchaiev/repos/NeMo/examples/nlp/language_modeling/conf \ - --config-name=megatron_gpt_config \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=300000 \ - trainer.val_check_interval=300 \ - trainer.log_every_n_steps=50 \ - trainer.limit_val_batches=50 \ - trainer.limit_test_batches=50 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=16 \ - model.micro_batch_size=6 \ - model.global_batch_size=192 \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.max_position_embeddings=1024 \ - model.encoder_seq_length=1024 \ - model.hidden_size=768 \ - model.ffn_hidden_size=3072 \ - model.num_layers=12 \ - model.num_attention_heads=12 \ - model.init_method_std=0.021 \ - model.hidden_dropout=0.1 \ - model.layernorm_epsilon=1e-5 \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=spm_32k_wiki.model \ - model.data.data_prefix=[1.0,gpt_training_data_text_document] \ - model.data.num_workers=2 \ - model.data.seq_length=1024 \ - model.data.splits_string=\'980,10,10\' \ - model.optim.name=fused_adam \ - model.optim.lr=6e-4 \ - model.optim.betas=[0.9,0.95] \ - model.optim.weight_decay=0.1 \ - model.optim.sched.name=CosineAnnealing \ - model.optim.sched.warmup_steps=750 \ - model.optim.sched.constant_steps=80000 \ - model.optim.sched.min_lr=6e-5 \ - exp_manager.resume_if_exists=True \ - exp_manager.resume_ignore_no_checkpoint=True \ - exp_manager.create_checkpoint_callback=True \ - exp_manager.checkpoint_callback_params.monitor=val_loss \ - exp_manager.checkpoint_callback_params.save_top_k=3 \ - exp_manager.checkpoint_callback_params.mode=min \ - exp_manager.checkpoint_callback_params.always_save_nemo=False - - -Next, simply launch Tensorboard to monitor training like so: - -.. code-block:: bash - - tensorboard --logdir nemo_experiments --bind_all - -Next steps -~~~~~~~~~~ - -Please refer to: - -* :ref:`batching` section for batch size adjustments -* :ref:`parallelisms` section for understanding various types of parallelisms -* :ref:`promptlearning` section for details on prompt-tuning and p-tuning - diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/hiddens_module.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/hiddens_module.rst deleted file mode 100644 index 0811252393e8341455545dab5eba7665ad1200fd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/hiddens_module.rst +++ /dev/null @@ -1,199 +0,0 @@ -Hiddens Module -============== - -The hiddens module allows to add **hidden transformations** and **hidden losses** to Megatron encoder-decoder models. -Hidden transformations are transformations that are applied to the output of the encoder. -Hidden losses are losses that are applied to the outputs of the hidden transformations. -A common use case for hidden transformations is to train a Mutual Information Machine (MIM) -or a Variational Auto-Encoder (VAE) models. - -Quick Start ------------ - -Below is an example command of training a MIM model with BART data augmentation (i.e., includes masking the input): - -.. 
code-block:: bash - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/results/megatron_mim \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/data/wiki.txt] \ - model.data.splits_string=\'\"800,100,100\"\' \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=1.0 - -The last 5 lines in the above command enable sampling with reparameterization (`cond_gauss` hidden transformation) -and MIM loss where the hidden part of the loss is weighted by `1.0`. - -The above example will produce the following plots in Weights and Biases: - -.. image:: images/hiddens-wb-logging.png - :align: center - :width: 800px - :alt: MIM training W&B plots - -See below detailed description of usage and configuration format. - -High Level Description ----------------------- - -Megatron encoder-decoder models directly pass the output of the encoder to the decoder. -The hidden transformations provides a mechanism to add transformations and losses to the encoder outputs. -This is achieved my naming the output of the encoder (`hiddens`) and any provided hidden transformation. -Each hidden transformation is defined over expected existing outputs and produces a new set of outputs. -This allows us to define losses on any of the named outputs (i.e., the outputs of the encoder or any of the transformations). - - -Detailed Description --------------------- - -Features -^^^^^^^^ - -1. Hidden transformations and losses can be added to any Megatron encoder-decoder model. -2. Externally implemented transformations and losses can easily be registered and used. -3. Transformations (and losses) order is supported to allow one transformation to use the output of another. -4. All transformations' outputs are named, allowing for easy access in losses or other transformations (encoder raw output defaults to `hiddens`, and respective `hiddens_mask`). -5. All loss outputs are logged, allowing for easy monitoring of the training and validation process. -6. Transformations' outputs can be used in more than one loss. -7. The joint loss supports weighting the terms, and is computed as follows: `loss = hiddens.tokens_loss_weight * tokens_loss_weight + \sum_i hiddens.loss[i].loss_weight * hiddens.loss[i].loss`. -8. Detailed error messages are provided. Please check raised exceptions and log outputs. Errors will be raised if: - - * The same named output is used more than once. - * A loss is expected an undefined named output. - * A Mismatch in a transformation or loss constructor parameters. 
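Written out, the joint loss from item 7 above is simply a weighted sum (reading the first term as the token reconstruction loss scaled by its weight):

.. math::

    \mathcal{L} = w_{\text{tokens}} \, \mathcal{L}_{\text{tokens}} + \sum_i w_i \, \mathcal{L}_i

For example, with ``tokens_loss_weight`` of 1.0, a token loss of 2.31, and a single MIM hidden loss of 0.42 with ``loss_weight`` 1.0, the joint loss would be 2.73 (the numbers are made up for illustration).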
- - -Configuring Hidden Transformations and Losses -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A detailed example can be found in : `NeMo/examples/nlp/language_modeling/conf/megatron_hiddens_base_config.yaml `__. -Below is the content of the config file above: - -.. code-block:: yaml - - # this file main purpose is documentation, and it should not be used directly - enc_output_name: z # name of key in hidden transforms output to pass to decoder (default: hiddens). e.g., z for VAE/MIM. - tokens_loss_weight: 1.0 # weight of tokens loss (if not specified defaults to 1.0) - # the lists below are useful for adding multiple transforms and losses according to order - # if order is not important, you can use a single dictionary in the list with multiple keys - transform: # a list of dictionaries of transforms (or a joint dictionary) to apply to hiddens (list enforces order) - # - : # name of transform - # cls_name: # class name - # : # transform parameters - # ... - - q_z_given_x: # Gaussian posterior with reparameterization - cls_name: cond_gaussian # class name - hidden_size: 512 # hidden size of the encoder - min_logvar: -6.0 # minimum log variance - - logP_cls: # logP classifier logits - cls_name: guided_cls - input_name: hiddens - attr_name: logP - QED_cls: # QED classifier logits - cls_name: guided_cls - input_name: hiddens - attr_name: QED - loss: # a list of dictionaries of loss terms (or a joint dictionary) to add to reconstruction loss (list enforces order) - # - : # name of loss - # cls_name: # class name - # : # loss parameters - # ... - # below is example where order of losses does not matter so a single dictionary is enough - mim: # A-MIM example - cls_name: a_mim - loss_weight: 1.0 # weight of the MIM latent loss - vae: # VAE example - cls_name: vae - min_kl_value: null # minimum KL value if a float is provided - loss_weight: 1e-2 # weight of KL term in loss - logP_cls: # logP classifier loss (cross entropy) - cls_name: guided_cls_loss - input_name: logP - loss_weight: 0.1 - QED_cls: # QED classifier loss (cross entropy) - cls_name: guided_cls_loss - input_name: logP - loss_weight: 0.1 - - -Listing Registered Hidden Transformations and Losses -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The hidden transformations and losses are should be registered in the `hiddens` module. -To check available (i.e., registered) transformation and losses use the following python code: - -.. code-block:: python - - from nemo.collections.nlp.modules.common.hiddens import get_registered_hiddens - - # List all registered hidden transformations and losses - print(get_registered_hiddens()) - # { - # "loss": ["a_mim", "vae"], - # "transform": ["cond_gaussian"], - # } - - -Implementing and Registering a Custom Hidden Transformation or Loss -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Implementing a custom hidden transformation or loss is straightforward. - -* Example for a hidden transformation: `NeMo/nemo/collections/nlp/modules/common/megatron/hiddens/megatron_hidden_transform.py `__. -* Examples for hidden losses: `NeMo/nemo/collections/nlp/modules/common/megatron/hiddens/megatron_hidden_loss.py `__. - -Generally speaking, the custom hidden transformation or loss should inherit from `MegatronBaseHiddenTransform` or `MegatronBaseHiddenLoss` respectively. -Before using the classes, they should be registered in the `hiddens` module as described above. - -.. 
code-block:: python - - from nemo.collections.nlp.modules.common.hiddens import ( - register_hidden_loss, - register_hidden_transform, - MegatronBaseHiddenTransform, - MegatronBaseHiddenLoss, - ) - - class MyTransform(MegatronBaseHiddenTransform): - ... - - class MyLoss(MegatronBaseHiddenLoss): - ... - - # Registering a new hidden transformation MyTransform - # e.g., class_path = "nemo.collections.nlp.modules.common.hiddens.MyTransform" - class_path = MyTransform.__module__ + '.' + MyTransform.__qualname__ - # The command below will allow the use of `my_transform` as a config `cls_name` value for a transformation - register_hidden_transform(cls_name="my_transform", class_path=MyTransform) - - # Registering a new hidden loss MyLoss - # e.g., class_path = "nemo.collections.nlp.modules.common.hiddens.MyLoss" - class_path = MyLoss.__module__ + '.' + MyLoss.__qualname__ - # The command below will allow the use of `my_loss` as a config `cls_name` value for a loss - register_hidden_loss(cls_name="my_loss", class_path=MyLoss) diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/images/hiddens-wb-logging.png b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/images/hiddens-wb-logging.png deleted file mode 100644 index e9016a26d28fbc94a83f1cadf8affc57b8dc7a47..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/hiddens/images/hiddens-wb-logging.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/ddp.gif b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/ddp.gif deleted file mode 100644 index feae2f9445c7d43ef7d7da27fdd295e59e94c75e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/ddp.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1f2c903f2526b5a067c575ceea68c71037779203b8b00fb9cc21732d4c3b8e0 -size 6859866 diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pnom.gif b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pnom.gif deleted file mode 100644 index f2f5ed8cd6d7003a9446b2b0aac8bda6bf13c443..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pnom.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86bd96ef60d79fb69cda823f651dd881d7306579ae7a1e1eaf3b553af15038c7 -size 4178913 diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pp.gif b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pp.gif deleted file mode 100644 index 6d780094beac75a8f73136c3fbf1075962e1d7eb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/pp.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab31418983b038558c9d2c85f256205b961e2afd4fe9812fc1d541a939fe4f48 -size 5736616 diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/sp.gif b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/sp.gif deleted file mode 100644 index 665df2becf8894474b0c98c675757e8dd3d0631a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/sp.gif and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/tp.gif b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/tp.gif deleted file mode 100644 index 
b39bf4b1a3aead65095d2d33cb308da48350fe61..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/images/tp.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea5d43e578159891071dcf93997d863b4908879861452e8d7c11b4d8db799248 -size 2478936 diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/intro.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/intro.rst deleted file mode 100644 index 577291424c805d58d8024d4b063bbc4dbb414025..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/intro.rst +++ /dev/null @@ -1,39 +0,0 @@ -NeMo Megatron -============= - -Megatron :cite:`nlp-megatron-shoeybi2019megatron` is a large, powerful transformer developed by the Applied Deep Learning Research -team at NVIDIA. NeMo Megatron supports several types of models: - -* GPT-style models (decoder only) -* T5/BART/UL2-style models (encoder-decoder) -* BERT-style models (encoder only) -* RETRO model (decoder only) - - - -.. note:: - NeMo Megatron has an Enterprise edition which contains tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition you also get deployment tools. Apply for `early access here `_ . - - -.. toctree:: - :maxdepth: 1 - - mlm_migration - gpt/gpt_training - batching - parallelisms - prompt_learning - retro/retro_model - hiddens/hiddens_module - peft/landing_page - flash_attention - positional_embeddings - - -References ----------- - -.. bibliography:: ../nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/mlm_migration.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/mlm_migration.rst deleted file mode 100644 index ffe9764615b586db98106d058fbe8b9a86ab868c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/mlm_migration.rst +++ /dev/null @@ -1,24 +0,0 @@ -Migrating from Megatron-LM --------------------------- - -NeMo Megatron and Megatron-LM share much of the same underlying technology. You should be able to convert your GPT model checkpoints trained with Megatron-LM into NeMo Megatron. -Example conversion script: - -.. code-block:: bash - - /examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - --checkpoint_folder \ - --checkpoint_name megatron_gpt--val_loss=99.99-step={steps}-consumed_samples={consumed}.0 \ - --nemo_file_path \ - --model_type \ - --tensor_model_parallel_size \ - --pipeline_model_parallel_size \ - --gpus_per_node - - - -To resume training from a converted Megatron-LM checkpoint, make sure to set -`trainer.max_steps=round(lr-warmup-fraction * lr-decay-iters + lr-decay-iters)` -where `lr-warmup-fraction` and `lr-decay-iters` are arguments from the Megatron-LM training run, -so the learning rate scheduler will follow the same curve. For example, with `lr-warmup-fraction=0.01` and `lr-decay-iters=300000`, set `trainer.max_steps=round(0.01 * 300000 + 300000) = 303000`. - diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/parallelisms.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/parallelisms.rst deleted file mode 100644 index 172721a1d2ddd8042126c523dbb4fce853f85ea5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/parallelisms.rst +++ /dev/null @@ -1,49 +0,0 @@ -.. 
_parallelisms: - -Parallelisms ------------- - -NeMo Megatron supports four types of parallelism, which can be mixed together arbitrarily: - -Distributed Data Parallelism -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. image:: images/ddp.gif - :align: center - :width: 800px - :alt: Distributed Data Parallel - - -Tensor Parallelism -^^^^^^^^^^^^^^^^^^ - -.. image:: images/tp.gif - :align: center - :width: 800px - :alt: Tensor Parallel - -Pipeline Parallelism -^^^^^^^^^^^^^^^^^^^^ - -.. image:: images/pp.gif - :align: center - :width: 800px - :alt: Pipeline Parallel - -Sequence Parallelism -^^^^^^^^^^^^^^^^^^^^ - -.. image:: images/sp.gif - :align: center - :width: 800px - :alt: Sequence Parallel - -Parallelism nomenclature -^^^^^^^^^^^^^^^^^^^^^^^^ - -When reading and modifying NeMo Megatron code you will encounter the following terms. - -.. image:: images/pnom.gif - :align: center - :width: 800px - :alt: Parallelism nomenclature diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/landing_page.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/landing_page.rst deleted file mode 100644 index c90dcdfff1c5960a144d226ec81cd67c98db6bd7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/landing_page.rst +++ /dev/null @@ -1,35 +0,0 @@ -Parameter-Efficient Fine-Tuning (PEFT) -====================================== - -PEFT is a popular technique used to efficiently finetune large language -models for use in various downstream tasks. When finetuning with PEFT, -the base model weights are frozen, and a few trainable adapter modules -are injected into the model, resulting in a very small number (<< 1%) of -trainable weights. With carefully chosen adapter modules and injection -points, PEFT achieves comparable performance to full finetuning at a -fraction of the computational and storage costs. - -NeMo supports four PEFT methods, which can be used with various -transformer-based models. - -==================== ===== ===== ========= == -\ GPT 3 NvGPT LLaMa 1/2 T5 -==================== ===== ===== ========= == -Adapters (Canonical) ✅ ✅ ✅ ✅ -LoRA ✅ ✅ ✅ ✅ -IA3 ✅ ✅ ✅ ✅ -P-Tuning ✅ ✅ ✅ ✅ -==================== ===== ===== ========= == - -Learn more about PEFT in NeMo with the :ref:`peftquickstart`, which provides an overview of how PEFT works -in NeMo. Read about the supported PEFT methods -`here `__. For a practical example, take a look at -the `Step-by-step Guide `__. - -The API guide can be found `here <../../api.html#adapter-mixin-class>`__. - -.. toctree:: - :maxdepth: 1 - - quick_start - supported_methods \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/quick_start.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/quick_start.rst deleted file mode 100644 index 000e242b950873da900f3a95ddd49b575a568737..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/quick_start.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. _peftquickstart: - - -Quick Start Guide -================= - -The quick start guide provides an overview of a PEFT workflow in NeMo. - -Terminology: PEFT vs Adapter -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This tutorial uses "PEFT" to describe the overall parameter efficient -finetuning method, and "adapter" to describe the additional module -injected into a frozen base model. Each PEFT model can use one or more -types of adapters. 
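For orientation, the snippet below sketches what "using one or more types of adapters" can look like in practice. It is a minimal sketch only: it relies on the ``add_adapter`` API and the PEFT config classes described later in this guide, and the import paths and checkpoint path are assumptions that may differ between NeMo versions.

.. code-block:: python

    # Minimal sketch (not a complete script): attach two adapter types to one model.
    # Import paths are assumptions and may differ between NeMo versions.
    from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel
    from nemo.collections.nlp.parts.peft_config import PtuningPEFTConfig, CanonicalAdaptersPEFTConfig

    # ``trainer`` and ``model_cfg`` are built as shown in the full-finetuning-vs-PEFT
    # snippet later in this guide; "<base_model.nemo>" is a placeholder path.
    model = MegatronGPTSFTModel.restore_from("<base_model.nemo>", model_cfg, trainer)

    # One PEFT setup can inject more than one adapter type at once.
    model.add_adapter([PtuningPEFTConfig(model_cfg), CanonicalAdaptersPEFTConfig(model_cfg)])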
- -One of the PEFT methods is sometimes referred to as "adapters", because -it was one of the first proposed uses of adapter modules for NLP. This -PEFT method will be called the "canonical" adapters to distinguish the -two usages. - -How PEFT works in NeMo models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Each PEFT method has one or more types of adapters that need to be -injected into the base model. In NeMo models, the adapter logic and -adapter weights are already built into the submodules, but they are -disabled by default for ordinary training and fine-tuning. - -When doing PEFT, the adapter logic path can be enabled when -``model.add_adapter(peft_cfg)`` is called. In this function, the model -scans through each adapter applicable to the current PEFT method with -each of its submodules in order to find adapter logic paths that can be -enabled. Then, the base model's weights are frozen, while newly added -adapter weights are unfrozen and allowed to be updated during -fine-tuning, hence achieving efficiency in the number of parameters -finetuned. - -PEFT config classes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Each PEFT method is specified by a ``PEFTConfig`` class which stores the -types of adapters applicable to the PEFT method, as well as -hyperparameters required to initialize these adapter modules. These four -PEFT methods are currently supported: - -1. Adapters (canonical): ``CanonicalAdaptersPEFTConfig`` -2. LoRA: ``LoraPEFTConfig`` -3. IA3: ``IA3PEFTConfig`` -4. P-Tuning: ``PtuningPEFTConfig`` - -These config classes make experimenting with different adapters as easy -as changing the config class. - -Moreover, it is possible to use a combination of the PEFT methods in -NeMo since they are orthogonal to each other. This can be easily done by -passing in a list of ``PEFTConfig`` objects to ``add_adapter`` instead -of a single one. For example, a common workflow is to combine P-Tuning -and Adapter, and this can be achieved with -``model.add_adapter([PtuningPEFTConfig(model_cfg), CanonicalAdaptersPEFTConfig(model_cfg)])``. - -Base model classes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -PEFT in NeMo is built with a mix-in class that does not belong to any -model in particular. This means that the same interface is available to -different NeMo models. Currently, NeMo supports PEFT for GPT-style -models such as GPT 3, NvGPT, LLaMa 1/2 (``MegatronGPTSFTModel``), as -well as T5 (``MegatronT5SFTModel``). - -Full finetuning vs PEFT -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You can switch between full fine-tuning and PEFT by removing calls to -``add_adapter`` and ``load_adapter``. - -The code snippet below illustrates the core API of full fine-tuning and -PEFT. - -.. 
code:: diff - - trainer = MegatronTrainerBuilder(config).create_trainer() - model_cfg = MegatronGPTSFTModel.merge_cfg_with(config.model.restore_from_path, config) - - model = MegatronGPTSFTModel.restore_from(restore_path, model_cfg, trainer) # restore from pretrained ckpt - + peft_cfg = LoraPEFTConfig(model_cfg) - + model.add_adapter(peft_cfg) - trainer.fit(model) # saves adapter weights only - - # Restore from base then load adapter API - model = MegatronGPTSFTModel.restore_from(restore_path, trainer, model_cfg) - + model.load_adapters(adapter_save_path, peft_cfg) - model.freeze() - trainer.predict(model) diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/supported_methods.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/supported_methods.rst deleted file mode 100644 index 4479565be6aa5ca3c60389be47c8a1e40e03ca20..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/peft/supported_methods.rst +++ /dev/null @@ -1,71 +0,0 @@ - - -Supported PEFT methods ----------------------- - -NeMo supports the following PEFT tuning methods: - -1. **Adapters (Canonical)**: `Parameter-Efficient Transfer Learning for - NLP `__ - - - Adapters (Houlsby setup) is one of the first PEFT methods applied - to NLP. Adapter tuning is more efficient than full fine-tuning - because the base model weights are frozen, while only a small - number of adapter module weights are updated. In this method, two - linear layers with a bottleneck and a non-linear activation are - inserted into each transformer layer via a residual connection. In - each case, the output linear layer is initialized to 0 to ensure - that an untrained adapter does not affect the normal forward pass - of the transformer layer. - -2. **LoRA**: `LoRA: Low-Rank Adaptation of Large Language - Models `__ - - - LoRA makes fine-tuning efficient by representing weight updates - with two low rank decomposition matrices. The original model - weights remain frozen, while the low rank decomposition matrices - are updated to adapt to the new data, so the number of trainable - parameters is kept low. In contrast with adapters, the original - model weights and adapted weights can be combined during - inference, avoiding any architectural change or additional latency - in the model at inference time. - - The matrix decomposition operation can be applied to any linear - layer, but in practice, it is only applied to the K, Q, V - projection matrices (sometimes just applied to the Q,V layers). - Since NeMo's attention implementation fuses KQV into a single - projection, our LoRA implementation learns a single Low-Rank - projection for KQV in a combined fashion. - -3. **IA3**: `Few-Shot Parameter-Efficient Fine-Tuning is Better and - Cheaper than In-Context Learning `__ - - - IA3 makes fine-tuning efficient by rescaling activations with - learned vectors. The rescaling layers are injected in the - attention (for key and value) and feedforward modules in the base - model. Similar to other PEFT methods, only the rescaling vectors - are updated during fine-tuning to adapt to the new data so the - number of updated parameters is low. However, since rescaling - vectors are much smaller than low rank matrices (LoRA) and - bottleneck layers (Adapters), IA3 cuts down the number of - trainable parameters further by an order of magnitude. The - learned rescaling vectors can also be merged with the base - weights, leading to no architectural change and no additional - latency at inference time. - -4. 
**P-Tuning**: `GPT Understands, - Too `__ - - - P-tuning is an example of the prompt learning family of methods, - in which trainable virtual tokens are inserted into the model - input prompt to induce it to perform a task. Virtual tokens (also - called "continuous" or "soft" tokens) are embeddings that have no - concrete mapping to strings or characters within the model’s - vocabulary. They are simply 1D vectors that match the - dimensionality of real tokens which make up the model's - vocabulary. - - In p-tuning, an intermediate LSTM or MLP model is used to generate - virtual token embeddings. We refer to this intermediate model as - our ``prompt_encoder``. The prompt encoder parameters are randomly - initialized at the start of p-tuning. All base model parameters - are frozen, and only the prompt encoder weights are updated at - each training step. diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/positional_embeddings.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/positional_embeddings.rst deleted file mode 100644 index b8dea5280c28549122b1fa2ad0d7935a90f7586e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/positional_embeddings.rst +++ /dev/null @@ -1,111 +0,0 @@ -Positional embeddings ---------------------- - -Positional embeddings are used to give the model information about the position of each element in a sequence. Megatron LLM supports the following positional embedding types: - -GPT -^^^ - -.. list-table:: *Supported positional embeddings in GPT models* - :widths: 10 30 60 - :header-rows: 1 - - * - Parameter value - - How to use - - Description - - * - **learned_absolute** - - .. code:: - - model.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. - - * - **rope** - - .. code:: - - model.position_embedding_type='rope' - model.rotary_percentage=1.0 - - Rotary Position Embedding (RoPE) :cite:`nlp-megatron-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations by leveraging the geometric properties of vectors and complex numbers, applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. - - * - **alibi** - - .. code:: - - model.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. 
- - * - **kerple** - - .. code:: - - model.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. - - * - **xpos** - - .. code:: - - model.position_embedding_type='xpos' - - Extrapolatable Position Embedding (xPos) :cite:`nlp-megatron-sun2022lengthextrapolatable` - - * - **sandwich** - - .. code:: - - model.position_embedding_type='sandwich' - - Sandwich :cite:`nlp-megatron-chi2023dissecting` - -T5 -^^ - -.. list-table:: *Supported positional embeddings in T5 models* - :widths: 10 30 60 - :header-rows: 1 - - * - Parameter value - - How to use - - Description - - * - **learned_absolute** - - .. code:: - - model.encoder.position_embedding_type='learned_absolute' - model.decoder.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. - - * - **relative** - - .. code:: - - model.encoder.position_embedding_type='relative' - model.decoder.position_embedding_type='relative' - - Relative Position Representations :cite:`nlp-megatron-shaw2018selfattention` - - * - **alibi** - - .. code:: - - model.encoder.position_embedding_type='alibi' - model.decoder.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. - - * - **kerple** - - .. code:: - - model.encoder.position_embedding_type='kerple' - model.decoder.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. 
- -Positional interpolation ------------------------- -Position Interpolation (PI) :cite:`nlp-megatron-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. - -Positional Interpolation is supported in Megatron GPT SFT models. Set RoPE Interpolation factor for sequence length :code:`seq_len_interpolation_factor` to enable it. - -.. code:: - - model.position_embedding_type='rope' - model.rotary_percentage=1.0 - model.seq_len_interpolation_factor: 2 - -References ----------- - -.. bibliography:: ../nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/prompt_learning.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/prompt_learning.rst deleted file mode 100644 index 8fe481019a6f127ca899eda39b50842eac844be5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/prompt_learning.rst +++ /dev/null @@ -1,390 +0,0 @@ -.. _promptlearning: - -Prompt Learning ---------------- - -Within NeMo we refer to **p-tuning** and **prompt tuning** methods collectively as prompt learning. Both methods are parameter efficient alternatives to fine-tuning pretrained language models. Our NeMo implementation makes it possible to use one pretrained GPT model on many downstream tasks without needing to tune the model's full set of parameters. It also allows for adding new tasks to your model without overwriting or disrupting previous tasks for which the model has already been p-tuned/prompt-tuned. Because the original model parameters are frozen and never altered by either method, p-tuning/prompt-tuning also avoids catastrophic forgetting issues often encountered when fine-tuning models. - -Instead of selecting discrete text prompts in a manual or automated fashion, prompt tuning and p-tuning utilize virtual prompt embeddings that can be optimized via gradient descent. The only difference between prompt tuning and p-tuning within NeMo-Megatron is the architecture used to tune the soft prompt tokens during training. - -- Our prompt tuning implementation is based off Lester et. al’s EMNLP 2021 paper "`The Power of Scale for Parameter-Efficient Prompt Tuning `_" -- Our p-tuning implementation is based off Liu et al's paper "`GPT Understands, Too `_" - -Our continuous learning capability for combined p-tuning and prompt tuning with GPT style models is a NeMo specific extension of the author's original work. - -Please also checkout our `prompt learning tutorial notebook. `_ - - -Terminology -^^^^^^^^^^^ -We will be using the terms ``continuous``, ``soft``, and ``virtual`` token interchangeably to refer to embeddings inserted into the model prompt that have no concrete mapping to strings or characters within the model’s vocabulary. These virtual token embeddings exist in contrast to the ``discrete``, ``hard``, or ``real`` tokens that do make up the model’s vocabulary. Virtual tokens are purely 1D vectors with dimensionality equal to that of each real token embedding, matching the ``hidden_size`` hyperparameter. In training and inference, continuous token embeddings are inserted among discrete token embeddings according to a template you provide in the model's config. We will demonstrate how to do this below. 
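Purely as a conceptual illustration of that insertion (NeMo performs it internally, and the template mechanics are demonstrated in the Prompt Formatting section below), the sketch uses made-up sizes and is not NeMo code:

.. code-block:: python

    import numpy as np

    hidden_size = 8                                   # matches the model's hidden_size in practice
    real_tokens = np.random.rand(5, hidden_size)      # embeddings of 5 discrete (real) tokens
    virtual_tokens = np.random.rand(3, hidden_size)   # 3 trainable virtual-token embeddings

    # For a template like "<|VIRTUAL_PROMPT_0|> {sentence}", the virtual-token
    # embeddings are placed in front of the real-token embeddings:
    model_input = np.concatenate([virtual_tokens, real_tokens], axis=0)
    print(model_input.shape)  # (8, 8): 3 soft tokens followed by 5 real tokens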
- -When referring to p-tuning and prompt tuning together, we will be using the phrase prompt learning for simplicity. - -Prompt Tuning -^^^^^^^^^^^^^ - -In prompt-tuning a pretrained GPT model, soft prompt embeddings are initialized as a 2D matrix of size ``total_virtual_tokens X hidden_size``. Each task the model is prompt-tuned to perform has its own 2D embedding matrix associated with it. Tasks do not share any parameters during training or inference. All GPT model parameters are frozen and only the embedding parameters for each task are updated during training. - -In prompt tuning you can specify how the embeddings are initialized for each task. You can either - -- Initialize embedding parameters according to some random distribution -- Initialize embedding parameters from existing vocabulary embeddings (recommended) - -If you choose to initialize virtual token embeddings from existing embedding weights, you can provide the string of words you want to use for initialization in the model's config. This string will be tokenized and tiled or truncated to match the specified number of virtual tokens you would like to use (``total_virtual_tokens``). Vocab embeddings are copied and used to initialize the soft prompt embedding matrix for each task. The vocab embeddings themselves are not updated or changed during prompt tuning. - -P-Tuning -^^^^^^^^ - -In p-tuning, an LSTM model is used to predict virtual token embeddings. We refer to this LSTM model as our ``prompt_encoder``. LSTM parameters are randomly initialized at the start of p-tuning. All GPT model parameters are frozen, and only the LSTM weights are updated at each training step. LSTM parameters are shared between all tasks that are p-tuned at the same time, but the LSTM model outputs unique virtual token embeddings for each task. The virtual tokens predicted by the LSTM are inserted among the discrete token input in the exact same manner as with prompt-tuning. You still specify the number of virtual tokens you want to use by setting ``total_virtual_tokens`` and each virtual token embedding is still a 1D vector of size ``hidden_size``. - -Using Both Prompt and P-Tuning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A single pretrained GPT model can use both p-tuning and prompt-tuning. While you must decide to use either p-tuning or prompt-tuning for each task you want your model to perform, you can p-tune your model on a set of tasks *A*, then prompt tune your same model on a different set of tasks *B*, then finally run inference on tasks from both *A* and *B* at the same time. During prompt-tuning or p-tuning, tasks tuned at the same time must use the same number of virtual tokens. During inference, tasks using differing amounts of virtual tokens can be run at the same time. - -When p-tuning completes, prompt tuned virtual tokens from the p-tuning ``prompt_encoder`` are automatically moved to the ``prompt_table`` where all prompt tuned and p-tuned soft prompts are stored. The LSTM ``prompt_encoder`` is then removed from the model. This allows us to preserve previously p-tuned soft prompts while still maintaining the ability to add new p-tuned or prompt-tuned soft prompts in the future. The ``prompt_table`` uses the ``taskname`` as a key to look up the correct virtual tokens for a specified task. The ``prompt_table``'s hash table data structure also makes it possible for each task to flexibly use a different number of virtual tokens. 
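Conceptually, the ``prompt_table`` behaves like a hash table keyed by ``taskname``. The sketch below is only an illustration of that lookup, with made-up task names and sizes; it is not the actual NeMo data structure:

.. code-block:: python

    import numpy as np

    hidden_size = 768
    # Each task stores its own 2D matrix of virtual-token embeddings, so different
    # tasks can keep a different number of virtual tokens.
    prompt_table = {
        "sentiment": np.zeros((100, hidden_size)),       # 100 virtual tokens
        "intent_and_slot": np.zeros((10, hidden_size)),  # 10 virtual tokens
    }
    # At inference time, the taskname in each example selects the right soft prompt.
    virtual_tokens = prompt_table["sentiment"]
    print(virtual_tokens.shape)  # (100, 768)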
- -P-tuning usually requires fewer virtual tokens per task to achieve good results but uses a higher number of parameters compared to prompt-tuning. For example, if you prompt tune a 125M parameter GPT model (with hidden size 768) on two tasks, using 100 virtual tokens per task, the total parameters tuned during prompt tuning would equal 153k (~.1% of the pre-trained model size). If you p-tune the same 125M GPT model on 2 tasks, using an LSTM with two layers and 10 tokens per task, you will be tuning 8.3M parameters (~6.6% of the pre-trained model size). The increased number of parameters used during p-tuning is mitigated by our ``prompt_table``. When p-tuned soft prompts are placed in the prompt table, only the parameters for the predicted virtual tokens are saved. This allows us to keep the benefit of tuning a larger number of parameters during training, while also preserving the parameter efficiency of prompt-tuning during inference and storing of the model. - -Because p-tuning shares parameters between tasks during training, p-tuning your model on multiple tasks that are similar might allow your model to share insight between tasks. In the same vein, p-tuning on many very different tasks at once might perform worse than prompt tuning, which tunes a distinct set of parameters per task. **Generally we recommend using p-tuning over prompt tuning.** - -Users can also optionally tune the model's full parameters in addition to the soft prompt parameters. See ``model.lm_finetune`` in the Prompt Learning Config section for details on how to configure this. - -Dataset Preprocessing -^^^^^^^^^^^^^^^^^^^^^ - -The prompt learning dataset accepts a list of json/dictionary objects or a list of json file names where each json file contains a collection of json objects. Each json object must include the field ``taskname`` which is a string identifier for the task the data example corresponds to. They should also include one or more fields corresponding to different sections of the discrete text prompt. The input data might look like: - -.. code:: - - [ - {"taskname": "squad", "context": [CONTEXT_PARAGRAPH_TEXT1], "question": [QUESTION_TEXT1], "answer": [ANSWER_TEXT1]}, - {"taskname": "squad", "context": [CONTEXT_PARAGRAPH_TEXT2], "question": [QUESTION_TEXT2], "answer": [ANSWER_TEXT2]}, - {"taskname": "intent_and_slot", "utterance": [UTTERANCE_TEXT1], "label": [INTENT_TEXT1][SLOT_TEXT1]}, - {"taskname": "intent_and_slot", "utterance": [UTTERANCE_TEXT2], "label": [INTENT_TEXT2][SLOT_TEXT2]}, - {"taskname": "sentiment", "sentence": [SENTENCE_TEXT1], "label": [SENTIMENT_LABEL1]}, - {"taskname": "sentiment", "sentence": [SENTENCE_TEXT2], "label": [SENTIMENT_LABEL2]}, - ] - -These additional fields can be unlimited in number and will be used to help map different parts of the discrete text input to a prompt template that you define. We show how this mapping works and how to construct your prompt template in the Prompt Formatting section. Data examples for each dataset can all be passed to the dataset class in one file, or in separate ``.jsonl`` files in a list. - -.. _data-example-label: - -Prompt Formatting -^^^^^^^^^^^^^^^^^ - -To customize different prompts for different tasks, we simply need to specify the prompt task template in the config file at ``model.task_templates``. The virtual token markers ``<|VIRTUAL_PROMPT_#|>`` signify where you want virtual tokens to be placed in the template string. 
``<|VIRTUAL_PROMPT_0|>``, ``<|VIRTUAL_PROMPT_1|>``, and ``<|VIRTUAL_PROMPT_2|>`` indicate where a number of virtual tokens matching the values given at ``virtual_token_splits[0]``, ``virtual_token_splits[1]`` and ``virtual_token_splits[2]`` will be placed. The other variable fields ``{var}`` refer to the fields in the data json. - -For example, given: - -- the data json ``{"sentence1": "And he said, Mama, I'm home.", "sentence2": "He didn't say a word."}`` -- virtual token splits set to ``virtual_token_splits = [3, 3, 3]`` -- a prompt template set to ``prompt_template = "<|VIRTUAL_PROMPT_0|> Hypothesis: [sentence1], <|VIRTUAL_PROMPT_1|> Premise: [sentence2] <|VIRTUAL_PROMPT_2|> Answer:"`` - -the input will be translated into ``VVV Hypothesis: And he said, Mama, I'm home. VVV Premise: He didn't say a word. VVV Answer:``, where ``VVV`` are three virtual tokens. - -**We recommend you first try prompt learning by placing all virtual tokens at the very beginning of your prompt template** like we do with the ``sentiment`` task example below. We've found this gives strong performance. -.. code:: - - config.model.task_templates = [ - { - "taskname": "sentiment", - "prompt_template": "<|VIRTUAL_PROMPT_0|> {sentence} sentiment: {label}", - "total_virtual_tokens": 10, - "virtual_token_splits": [10], - "truncate_field": "sentence", - "answer_only_loss": False, - }, - { - "taskname": "intent_and_slot", - "prompt_template": "<|VIRTUAL_PROMPT_0|> Predict intent and slot <|VIRTUAL_PROMPT_1|> :\n{utterance}{label}", - "total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "label" - } - ] - -.. _prompt-formatting-label: - -``model.task_templates`` Config Parameters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. list-table:: - :widths: 15 15 25 - :header-rows: 1 - - * - **Parameter** - - **Data type** - - **Description** - * - **taskname** - - string - - Short string denoting the task, used to lookup task specific virtual tokens from the ``prompt_table``. Refers to the same ``taskname`` in the dataset json objects. - * - **prompt_template** - - string - - a string showing the model where to place virtual tokens and how to map dataset json fields to where they belong in the model prompt - * - **total_virtual_tokens** - - int - - specifies the total number of virtual tokens that will be inserted into the model prompt - * - **virtual_token_splits** - - list of ints - - specifies the number of virtual tokens that belong at each ``<|VIRTUAL_PROMPT_#|>`` marker. ``virtual_token_splits`` values should add up to ``total_virtual_tokens``. The number of ``virtual_token_splits`` should match the number of ``<|VIRTUAL_PROMPT_#|>`` markers. - * - **answer_only_loss** - - bool - - Whether to limit loss calculation to only the answer portion of the prompt during tuning. Strongly recommended for long prompts. - * - **answer_field** - - string - - The field in the data json corresponding to the answer. The loss will only be calculated on this portion of the prompt if ``answer_only_loss`` is ``True``. The answer field must be at the end of the prompt template. - * - **truncate_field** - - string - - specifies which field in the data json to truncate if the length of the input exceeds the maximum sequence length of the model. If ``truncate_field`` is set to ``None``, examples that are too long are simply dropped from the dataset. - -Prompt Learning Specific Config Values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
list-table:: - :widths: 15 15 25 - :header-rows: 1 - - * - **Parameter** - - **Data type** - - **Description** - * - **model.nemo_path** - - string - - Path to where you want to save your model after prompt tuning/p-tuning, must end in `.nemo` - * - **model.virtual_prompt_style** - - string - - one of 'prompt-tuning', 'p-tuning', or 'inference' - * - **model.language_model_path** - - string - - Path to the GPT language model .nemo file you want to use for prompt learning, not needed if ``restore_path`` is set - * - **model.restore_path** - - string - - Path to a .nemo file of an existing ``MegatronGPTPromptLearningModel`` that has already been prompt tuned or p-tuned on at least one task. Tasks p-tuned or prompt-tuned in this training session will be added to this model's `prompt_table`. Should be set to ``null`` if none. - * - **model.new_tasks** - - list of strings - - List of new tasknames to be prompt-tuned or p-tuned. - * - **model.existing_tasks** - - list of strings - - List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given. Should be set to ``[]`` if None. - * - **model.task_templates** - - list - - See the ``model.task_templates`` Config Parameters Table above - * - **model.prompt_tuning.new_prompt_init_methods** - - list of strings - - List of 'text' or 'random', should correspond to the order of tasks listed in ``model.new_tasks``. Only needed if `virtual_prompt_style='prompt-tuning'` - * - **model.prompt_tuning.new_prompt_init_text** - - list of strings - - The text you want to use for soft prompt initialization if ``model.prompt_tuning.new_prompt_init_methods`` is set to 'text' for a task. Should correspond to the order of tasks listed in ``model.new_tasks``. The text is tokenized and clipped or tiled to match ``total_virtual_tokens`` in ``model.task_templates``. The vocab embeddings associated with each token are copied and used to initialize the soft prompts before tuning. - * - **model.p_tuning.dropout** - - float - - LSTM prompt encoder dropout prob - * - **model.p_tuning.num_layers** - - int - - Num layers in LSTM prompt encoder - * - **model.tensor_model_parallel_size** - - int - - intra-layer model parallelism, must match the ``tensor_model_parallel_size`` of the GPT model given at ``language_model_path`` - * - **model.batch_size** - - int - - global batch size - * - **model.data.train_ds** - - list of strings - - list of ``.json`` or ``.jsonl`` training dataset files with json objects that have the dataset format described above - * - **model.data.validation_ds** - - list of strings - - list of ``.json`` or ``.jsonl`` validation dataset files with json objects that have the dataset format described above - * - **model.data.add_eos** - - bool - - Whether to add an EOS token at the end of each training example (recommended). - -An example config file can be found at https://github.com/NVIDIA/NeMo/tree/stable/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml - -Setting New Tasks -^^^^^^^^^^^^^^^^^ - -After you p-tune or prompt-tune your model, you can always go back and p-tune or prompt-tune your model on more tasks without overwriting the virtual prompts that have already been trained. You can also use a different number of ``total_virtual_tokens`` between each training session as long as tasks p-tuned or prompt-tuned at the same time have the same number of ``total_virtual_tokens``. 
For this reason, when you ptune on a new task, you need to tell your model which of your tasks are new and which ones already exist (and thus you don't want to tune them). You do this by setting the ``new_tasks`` and ``existing_tasks`` values in the config file. - -Example Multi-Task Prompt Tuning Config and Command -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -First define a config called ``multitask-prompt-learning.yaml`` demonstrated below. **In the** ``exp_manager`` **portion of the config,** ``save_nemo_on_train_end`` **should be set to** ``False`` **to avoid unnecessarily saving the incorrect model weights.** This is already done in the example `megatron_gpt_prompt_learning_config.yaml config `_ that you should use as your starting point. The correct prompt learning model will be saved at the ``model.nemo_path`` you set. - -.. code:: - - name: multitask_prompt_tuning - trainer: ... - exp_manager: ... - model: - seed: 1234 - nemo_path: ${name}.nemo - virtual_prompt_style: "prompt-tuning" - encoder_seq_length: 2048 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 16 - micro_batch_size: 4 - - restore_path: null - language_model_path: models/megatron_125M_gpt.nemo - existing_tasks: [] - new_tasks: ["sentiment", "intent_and_slot"] - - task_templates: - - taskname: "sentiment" - prompt_template: "<|VIRTUAL_PROMPT_0|> {sentence} sentiment: {label}" - total_virtual_tokens: 100 - virtual_token_splits: [100] - truncate_field: null - answer_only_loss: False - - - taskname: "intent_and_slot" - prompt_template: "<|VIRTUAL_PROMPT_0|> Predict intent and slot <|VIRTUAL_PROMPT_1|> :\n{utterance}{label}" - total_virtual_tokens: 100 - virtual_token_splits: [80, 20] - truncate_field: null - answer_only_loss: True - answer_field: "label" - - prompt_tuning: - new_prompt_init_methods: ["text", "text"] - new_prompt_init_text: ["financial sentiment analysis postive neutral negative", "intent and slot classification virtual assistant task bot please"] - - data: - train_ds: ["data/financial_phrase_bank_train.jsonl", "data/assistent_train.jsonl"] - validation_ds: ["data/financial_phrase_bank_val.jsonl", "data/assistent_val.jsonl"] - add_eos: True - shuffle: True - num_workers: 1 - pin_memory: True - - optim: ... - -(See https://github.com/NVIDIA/NeMo/tree/stable/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml for what should go in the ``trainer``, ``exp_manager``, and ``optim`` sections.) - -Then run the command - -.. code:: - - python megatron_gpt_prompt_learning.py --config-name=multitask-prompt-learning.yaml - - -Example Multi-Task P-Tuning Config and Command After Prompt-Tuning -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Update ``multitask-prompt-learning.yaml`` from the example above with p-tuning parameters for the new task. Be sure to update ``model.existing_tasks`` with the tasknames from previous prompt learning runs and to use the ``.nemo`` file saved at the end of your last prompt learning session. Values different from the config above have stars commented next to them. - -In this example, the SQuAD task includes the question context as part of the prompt. Because the context is long, we recommend setting ``answer_only_loss`` to ``True`` for this task, and any task where a significant portion of the prompt is not a part of the answer. ``answer_only_loss`` tells the model to only calculate the cross-entropy loss on the answer portion of the training example. 
Though we recommend placing all virtual tokens at the beginning of the prompt, we place them throughout the prompt in this example to demonstrate how to do so. - -.. code:: - - name: multitask_p_tuning # *** - trainer: ... - exp_manager: ... - model: - seed: 1234 - nemo_path: ${name}.nemo - virtual_prompt_style: "p-tuning" # *** - encoder_seq_length: 2048 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 16 - micro_batch_size: 4 - - restore_path: multitask_prompt_tuning.nemo # *** - language_model_path: models/megatron_125M_gpt.nemo - existing_tasks: ["sentiment", "intent_and_slot"] # *** - new_tasks: ["squad"] - - task_templates: - - taskname: "sentiment" - prompt_template: "<|VIRTUAL_PROMPT_0|> {sentence} sentiment: {label}" - total_virtual_tokens: 100 - virtual_token_splits: [100] - truncate_field: null - answer_only_loss: False - - - taskname: "intent_and_slot" - prompt_template: "<|VIRTUAL_PROMPT_0|> Predict intent and slot <|VIRTUAL_PROMPT_1|> :\n{utterance}{label}" - total_virtual_tokens: 100 - virtual_token_splits: [80, 20] - truncate_field: null - answer_only_loss: True - answer_field: "label" - - - taskname: "squad" # *** - prompt_template: "<|VIRTUAL_PROMPT_0|> Answer the question from the context {question} {context} Answer: {answer}" # *** - total_virtual_tokens: 9 # *** - virtual_token_splits: [9] # *** - truncate_field: context # *** - answer_only_loss: True # *** - answer_field: "answer" # *** - - p_tuning: # *** - dropout: 0.0 # *** - num_layers: 2 # *** - - data: - train_ds: ["data/squad_train.jsonl"] # *** - validation_ds: ["data/squad_val.jsonl"] # *** - add_eos: True - shuffle: True - num_workers: 1 - pin_memory: True - - optim: ... - -Then run the command again: - -.. code:: - - python megatron_gpt_prompt_learning.py --config-name=multitask-prompt-learning.yaml - - -Example Multi-Task Inference -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The inference file can contain a mix of prompts from all the tasks the model has been prompt tuned on. - -.. code:: - - python megatron_gpt_prompt_learning_eval.py \ - virtual_prompt_model_file=PATH_TO_NEMO_PROMPT_LEARNING_MODEL_FILE \ - gpt_model_file=PATH_TO_FROZEN_GPT_MODEL_FILE \ - inference.greedy=True \ - inference.add_BOS=False \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=1 \ - prompts=[prompt1,prompt2] - -``virtual_prompt_model_file`` should be a path to a .nemo file saved after p-tuning/prompt tuning and ``model_file`` is still the path to the gpt model's .nemo file. - -prompts in this case should be a list of .json or .jsonl files containing json objects similar to the ones used during prompt learning. They should have keys that match the fields specified in the prompt template. Fields can be dropped from the prompt dict and their corresponding section of the prompt template will be automatically removed. - -For example, say the prompt template during p-tuning/prompt-tuning looked like: - -.. code:: - - '<|VIRTUAL_PROMPT_0|> Context: {context} Question: {question} Answer: {answer}' - -but you don't want to include the answer field during inference. Just don't include the answer field in the prompt dict like below: - -.. code:: - - {"taskname": "squad", "context": "some paragraph", "question": "question related to paragraph"} - {"taskname": "squad", "context": "another paragraph", "question": "a different question related to paragraph"} - - -And the dataset class will automatically format your input to have the form: - -.. 
code:: - - [ - '<|VIRTUAL_PROMPT_0|> Context: some paragraph Question: question related to paragraph Answer: ', - '<|VIRTUAL_PROMPT_0|> Context: another paragraph Question: a different question related to paragraph Answer: ' - ] - -Generally, prompt learning inference is just like running inference with a GPT model. The only difference is you need to add ``virtual_prompt_model_file=PATH_TO_NEMO_PROMPT_LEARNING_MODEL_FILE`` to your command if you're using a p-tuned/prompt-tuned model. - -Example prompt learning script: `NeMo/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py `__. - -Example prompt tuned inference script: `NeMo/examples/nlp/language_modeling/megatron_gpt_eval.py `__. diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/images/arch.png b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/images/arch.png deleted file mode 100644 index eca506a12ceaa0a26c317824412987cc22a9f299..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/images/arch.png and /dev/null differ diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/retro_model.rst b/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/retro_model.rst deleted file mode 100644 index ceff1baf857f37ae91f66ce7301b35096ea5694d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nemo_megatron/retro/retro_model.rst +++ /dev/null @@ -1,444 +0,0 @@ -NeMo RETRO Model -================ - -The Retrieval-Enhanced Transformer (RETRO) model is an autoregressive language model that takes into account document chunks retrieved from a large -corpus when making predictions. The RETRO model has a similar architecture to the GPT model, but it includes an encoder that encodes the retrieved -context and cross-attention layers that integrate the context to improve the model's output. Below is a simple diagram of the RETRO model architecture. - -.. image:: images/arch.png - :align: center - :width: 800px - :alt: RETRO model architecture - -For more detailed information on the model, please refer to the `RETRO paper `_ :cite:`nlp-retro-borgeaud2021improving` by DeepMind. -The NeMo RETRO Model is an open-source implementation of the paper, and it has the following differences/features compared to DeepMind's proposed implementation: - -1. The NeMo RETRO Model is built on top of NeMo Megatron code, allowing for efficient training of large language models in a cluster environment. -2. The NeMo RETRO Model uses `Faiss `_ :cite:`nlp-retro-jegou2022faiss` as the KNN search library, which can be accelerated by GPUs. -3. The NeMo RETRO uses `RoPE relative positional encoding `_ :cite:`nlp-retro-su2021roformer`. -4. The NeMo RETRO uses `SentenceTransformers `_ :cite:`nlp-retro-reimers2019sentence` as the retriever encoder. -5. The NeMo RETRO supports `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor`, allowing for scalable training of the RETRO model via Zero-Shot Hyperparameter Transfer. - -Quick start -************ -The steps below demonstrate training and evaluating a NeMo RETRO model. - -Data pre-processing -------------------- - -Step 1: Collect training data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The RETRO model uses two types of data: training data, which typically consists of 64-token chunks, and retrieval data, which typically consists of 128-token chunks. -The training data is used to train the model, while the retrieval data is used to supplement the language model. 
-It's possible to use the same data for both training and retrieval, as long as duplicates are removed properly, as described below. -Both types of data are stored in a loose JSON format, with each line containing a single text sample. For example: - -.. code-block:: json - {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} - {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} -The name of the text field of the json can be changed by using the ``--json-key`` flag in ``preprocess_data_for_megatron.py``. The other metadata are optional and are not used in training. - -Step 2: Convert training data into memory map format -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The loose json is then processed into a binary format for training and retrieval. To convert the json into mmap, cached index file. -Set the ``--dataset-impl`` flag to `retmmap`, which is the memory map format dedicated for RETRO model. - -An example script to prepare data for RETRO training is: - -.. code-block:: bash - python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ - --input=/dataset/pubmed_train.jsonl \ - --json-keys=text \ - --tokenizer-library=megatron \ - --apply-ftfy \ - --dataset-impl=retmmap \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --tokenizer-type=GPT2BPETokenizer \ - --output-prefix=/result/pubmed_train \ - --need-pad-id \ - --append-eod \ - --retrieval-db \ - --chunk_size=64 \ - --workers=48 -The RETRO model processes chunked documents using 64 tokens as the default chunk size. The RETRO memory map dataset will add padding -tokens to the end of each document to make it a multiple of 64. The ``--need-pad-id`` argument adds a padding token to the tokenizer -if it doesn't already have one. The ``--append-eod`` argument controls whether to add ``end-of-document`` tokens to the preprocessed -data, and the ``--retrieval-db`` argument indicates whether to create a retrieval database for the preprocessed data. If ``--retrieval-db`` -is used, it will add an additional 64 padding tokens at the end of the document. The ``--chunk_size`` and ``--workers`` arguments -control the size of the data chunks to be processed and the number of worker processes to use, respectively. - -Following is the retro memory map index data format: - -.. list-table:: - :widths: 25 25 25 25 25 25 - - * - 'MMIDRET\x00\x00' (header 9 bytes) - - 1 (version 8 byte) - - dtype code :sup:`1` (1 byte) - - sentence count (8 byte) - - chunk size (8 byte) - - chunk count (8 byte) - * - retrieved db :sup:`2` (1 byte) - - number of tokens for each of sentences ( int32 array) - - start of sentence address in byte (int64 array) - - start of chunk id (int64 array) - - chunk id address in byte (int64 array) - - - -:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float, 7: np.double, 8: np.uint16 - -:sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer. -The number of tokens for each sentence includes the padded token ids. For retrieval data, there is an extra ``chunk_size`` padding at -the end of each sentence, and the ``retrieved_db`` flag is set to True. However, the number of tokens for each sentence excludes this extra ``chunk_size`` padding. - -Following is the retro memory map binary data format: - -.. 
list-table:: - :widths: 65 - - * - token id array for sentence 0,1, 2 ... (dtype :sup:`3` array) - -:sup:`3` np.uint16 vocab_size < 65500 else np.int32 - -Step 3: Create Faiss index for retrieval data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After creating the memory map retrieval data binary file and index files, we can build a Faiss index that can quickly find the K-nearest neighbors of a given -chunk ID based on a query embedding vector. Because the retrieval data is typically very large, we break this process down into three steps. - -Step 3.1: Train the Faiss index structure -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In this step, it uses a subset of the retrieval data to train a empty Faiss index. An example script is: - -.. code-block:: bash - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --input_file=/result/pubmed_train_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --percent=1.0 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --train_index_size=2000000 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --stage=0 \ - --output_file=/result/pubmed_faiss_learn.index -This command is used to build an empty Faiss index using the 2000000 training data in ``pubmed_train_text_document``. -The ``all-mpnet-base-v2`` sentence transformer model is used to encode the chunk tokens into an embedding vector. -The index will be saved in the result directory as ``pubmed_faiss_learn.index``. This command specifies using 8 GPUs to train the Faiss index. - -Step 3.2: Add retrieval data into sharding index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This step adds all the retrieval data to the empty Faiss index created in the previous step. An example script is: - -.. code-block:: bash - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --input_file=/result/pubmed_train_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --percent=1.0 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --shard_id=0 \ - --total_shards=10 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --stage=1 \ - --learned_index=/result/pubmed_faiss_learn.index \ - --output_file=/result/pubmed_faiss_shard0.save -This command breaks the retrieval data into ``total_shards`` shards and adds the data in the shard specified by ``shard_id``. -The result is saved to a file specified by ``output_file``. In the example above, 10 sharding indexes are created. - -Step 3.3: Merge the sharding indexes into final Faiss index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This step merges all the sharding indexes created in the previous step into the final Faiss index. An example script is: - -.. code-block:: bash - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --stage=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --learned_index=/result/pubmed_faiss_learn.index \ - --shard_index_input=/result/pubmed_faiss_shard \ - --output_file=/result/pubmed_faiss_final.index -Step 4: Build KNN index -^^^^^^^^^^^^^^^^^^^^^^^ - -During training, it is inefficient to run a query to find the K-nearest neighbor chunk IDs for each training data point. -This can be pre-calculated by building a KNN index before training. 
The KNN index maps the training data chunk IDs to the K-nearest neighbor chunk IDs -in the retrieval data. As with building the Faiss index, this process is divided into two steps. - -Following is the KNN index data format: - -.. list-table:: - :widths: 25 25 25 25 45 - - * - 'KNNRETM\x00\x00' (header 9 bytes) - - 1 (version 8 byte) - - K number of neighbors (8 byte) - - Number chunks (8 byte) - - Map to K retrieval data chunk IDs, shape (number_chunks, K) ( int64 array) - -Step 4.1: Build KNN sharding index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The KNN index is built using the memory-mapped training data created by the ``preprocess_data_for_megatron.py`` script and the Faiss index -file for the retrieval data built by the ``build_retrieval_index.py`` script. - -An example script is: - -.. code-block:: bash - python scripts/nlp_language_modeling/build_knn_map_index.py \ - --input_file=/result/pubmed_eval_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --process_chunk_size=10000 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --K_neighbors=50 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --remove_duplicate \ - --dedup_margin=70 \ - --nprobe=100 \ - --shard_id=0 \ - --total_shards=10 \ - --stage=1 \ - --output_file=/dataset/pubmed_knn_shard0.save \ - --faiss_index=/result/pubmed_faiss_final.index -In this example, the training data is broken into ``total_shards`` shards, and the KNN index is calculated for the shard specified by ``shard_id``. -The result is saved to a file specified by ``output_file``. In the example above, 10 KNN sharding indexes are created. - -Use the ``remove_duplicate`` flag if the training data and retrieval data are the same to remove neighbors from the same document. - -Step 4.2: Merge KNN sharding index into final KNN index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -An example script is: - -.. code-block:: bash - python scripts/nlp_language_modeling/build_knn_map_index.py \ - --stage=2 \ - --output_file=pubmed_knn_final.save \ - --shard_index_input=pubmed_knn_shard -Train NeMo RETRO Model ------------------------ - -Once the training data, retrieval data, KNN index, and Faiss index are prepared, we are ready to train the RETRO model. In the NeMo implementation, -the RETRO model can be pre-trained with or without the `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` feature. We will introduce both ways. - - -The table below lists some of the common parameters that can be configured for model pre-training. 
- -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| **Parameter** | **Default** | **Description** | -+==================================+=============+========================================================================================+ -| model.micro_batch_size | 4 | the micro batch size used for training | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.tensor_model_parallel_size | 1 | tensor model parallel size | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.encoder_seq_length | 2048 | token sequence length | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.chunk_size | 64 | the chunk size used to retrieve | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.enc_num_layers | 4 | total number of encoder layers | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.dec_num_layers | 6 | total number of decoder layers | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.enc_cross_attention | [3] | layer numbers for cross attention in encoder | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.dec_cross_attention | [3,4,5] | layer numbers for chunked cross attention in decoder | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.add_position_embedding | FALSE | whether to add the absolute position encoding | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.hidden_size | 768 | model hidden size | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.ffn_hidden_size | 3072 | model FFN hidden size. 
Usually 4 * hidden_size                                   |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-| model.num_attention_heads        | 12          | number of attention heads                                                                |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-| model.init_method_std            | 0.02        | standard deviation of the zero mean normal distribution used for weight initialization  |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-| model.hidden_dropout             | 0.1         | dropout probability for hidden state transformer                                         |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-| model.attention_dropout          | 0.1         | dropout probability in the attention layer                                               |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-| model.ffn_dropout                | 0           | dropout probability in the feed-forward layer                                            |
-+----------------------------------+-------------+------------------------------------------------------------------------------------------+
-
-
-Option 1: Train the NeMo RETRO model *without* mu-Transfer
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-An example RETRO pre-training script is:
-
-.. code-block:: bash
-
-    python examples/nlp/language_modeling/megatron_retro_pretraining.py \
-        trainer.devices=8 \
-        trainer.num_nodes=2 \
-        trainer.accelerator=gpu \
-        trainer.max_steps=800000 \
-        trainer.precision=16 \
-        exp_manager.exp_dir=/result/retro_model \
-        model.apply_query_key_layer_scaling=False \
-        model.tensor_model_parallel_size=8 \
-        model.optim.name=adamw \
-        model.enc_num_layers=2 \
-        model.dec_num_layers=32 \
-        model.enc_cross_attention=[0] \
-        model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \
-        model.hidden_size=4096 \
-        model.ffn_hidden_size=16384 \
-        model.num_attention_heads=32 \
-        model.tokenizer.merge_file=/dataset/gpt2-merges.txt \
-        model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \
-        model.data.data_prefix=[/result/pubmed_eval_text_document] \
-        model.data.knn_index=[dataset/pubmed_knn_final.save] \
-        model.data.retrieval_prefix=/result/pubmed_eval_text_document \
-        model.micro_batch_size=8
-
-During training, launch TensorBoard to monitor progress like so:
-
-.. code-block:: bash
-
-    tensorboard --logdir /result/retro_model --bind_all
-
-.. note:: Weights and Biases (WandB) is supported too. Add ``exp_manager.create_wandb_logger=True`` to the model training arguments to enable it.
-
-After training, the model ``.nemo`` file can be found in the result checkpoint directory.
-
-Option 2: Train the NeMo RETRO model *with* mu-Transfer
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` paper proposed a method to zero-shot transfer hyperparameters to a larger model.
-This can be done in 3 steps in the NeMo RETRO implementation.
-
-
-Step 1. Find optimal hyperparameters for a small base model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use the pre-training code in Option 1 to find, either manually or automatically, a set of optimal hyperparameters for a small base RETRO
-model. This can be done cheaply and quickly due to the small model size.
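-
-A simple grid search is often sufficient at this scale. The snippet below is a minimal sketch (it is not part of NeMo) of how such a sweep could be scripted around the Option 1 pre-training command; the learning-rate values, run length, and experiment directories are placeholders:
-
-.. code-block:: python
-
-    import subprocess
-
-    # Launch one short base-model run per candidate learning rate and compare the
-    # resulting validation losses (e.g., in TensorBoard) to pick the best setting.
-    for lr in (1e-4, 2e-4, 5e-4):
-        subprocess.run(
-            [
-                "python", "examples/nlp/language_modeling/megatron_retro_pretraining.py",
-                "trainer.devices=8",
-                "trainer.max_steps=20000",
-                f"model.optim.lr={lr}",
-                f"exp_manager.exp_dir=/result/retro_base_lr_{lr}",
-            ],
-            check=True,
-        )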
-
-Step 2. Calculate the shape file that can be used to run mu-Transfer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The shape file determines which hyperparameters will be scaled up, allowing the model to adjust the learning rate, weight scaling factor, etc.
-
-Here is an example shape file calculation script:
-
-
-.. code-block:: bash
-
-    python examples/nlp/language_modeling/megatron_retro_cal_shape.py \
-        trainer.devices=8 \
-        trainer.num_nodes=1 \
-        trainer.accelerator=gpu \
-        exp_manager.exp_dir=/result/retro_model \
-        base_model.enc_num_layers=2 \
-        delta_model.enc_num_layers=2 \
-        base_model.dec_num_layers=32 \
-        delta_model.dec_num_layers=32 \
-        base_model.tensor_model_parallel_size=8 \
-        delta_model.tensor_model_parallel_size=8 \
-        base_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \
-        delta_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \
-        base_model.enc_cross_attention=[0] \
-        delta_model.enc_cross_attention=[0] \
-        base_model.hidden_size=768 \
-        base_model.ffn_hidden_size=3072 \
-        delta_model.hidden_size=96 \
-        delta_model.ffn_hidden_size=384 \
-        base_model.num_attention_heads=16 \
-        delta_model.num_attention_heads=16 \
-        model.shape_file=tp8_32depth_o1_rel_shape_info.yaml
-
-In this example, the ``base_model`` refers to the small base model for which an optimal set of hyperparameters has been determined.
-The ``delta_model`` refers to a model with certain hyperparameters that have been scaled up or down. In this case,
-the ``hidden_size`` and ``ffn_hidden_size`` have been changed in the ``delta_model``, allowing these two parameters to be scaled freely later.
-
-Step 3. Pretrain mu-Transfer RETRO model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Once the shape file is created, we can start training a RETRO model. The model training can be scaled up freely using the hyperparameters
-specified by the delta model and the shape file.
-
-An example mu-Transfer pre-training script is:
-
-.. code-block:: bash
-
-    python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \
-        trainer.devices=8 \
-        trainer.num_nodes=2 \
-        trainer.accelerator=gpu \
-        trainer.max_steps=500000 \
-        trainer.precision=16 \
-        exp_manager.exp_dir=/result/retro_model \
-        model.apply_query_key_layer_scaling=False \
-        model.tensor_model_parallel_size=8 \
-        model.optim.name=muadamw \
-        model.enc_num_layers=2 \
-        model.dec_num_layers=32 \
-        model.enc_cross_attention=[0] \
-        model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \
-        model.hidden_size=4096 \
-        model.ffn_hidden_size=16384 \
-        model.num_attention_heads=32 \
-        model.tokenizer.merge_file=/dataset/gpt2-merges.txt \
-        model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \
-        model.data.data_prefix=[/result/pubmed_eval_text_document] \
-        model.data.knn_index=[dataset/pubmed_knn_final.save] \
-        model.data.retrieval_prefix=/result/pubmed_eval_text_document \
-        model.micro_batch_size=8 \
-        model.shape_file=tp8_32depth_o1_rel_shape_info.yaml
-
-.. note:: We have chosen to use ``muadamw`` as the optimizer for use with the mu-Transfer method. Currently, only ``muadam`` and ``muadamw`` are supported.
-
-As with the pre-training in Option 1, the model ``.nemo`` file can be found in the result checkpoint directory after training is complete.
-
-Run NeMo RETRO Model Inference
--------------------------------
-
-Once the NeMo RETRO model has been trained, we can put it into inference mode and experiment with it.
-During inference, we are not limited to the static Faiss index that we built earlier for KNN queries.
-We can feed any external data to the model as retrieval context. NeMo RETRO implementation supports dynamic retrieval service, -allowing users to add, reset, and query new documents on the fly. - -We have built a simple web client that makes it easy for users to play around with the model. Here is an example script to launch the server: - -.. code-block:: bash - python examples/nlp/language_modeling/megatron_retro_eval.py \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.precision=16 \ - retro_model_file=megatron_retro.nemo \ - tensor_model_parallel_size=8 \ - pipeline_model_parallel_size=1 \ - retrieval_service.sentence_bert.devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.0.faiss_devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.1.faiss_devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.0.faiss_index=/result/pubmed_faiss_final.index \ - retrieval_service.services.0.retrieval_index=/result/pubmed_eval_text_document \ - retrieval_service.neighbors=2 \ - retrieval_service.pad_tokens=True \ - retrieval_service.store_retrieved=True \ - server=True \ - web_server=True \ - share=True \ - username=test \ - password=test123 -Set the retro_model_file to use the nemo file generated in the pre-training step. After launching the server, copy-paste the URL from -the terminal into your browser. Use the specified username and password to log in and have fun experimenting with the RETRO model. - -References -************ - -.. bibliography:: ../../nlp_all.bib - :style: plain - :labelprefix: nlp-retro - :keyprefix: nlp-retro- diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nlp_all.bib b/SoundScribe/SpeakerID/docs/source/nlp/nlp_all.bib deleted file mode 100644 index 48f97e929fc232ac5324e690a2389e312de583a0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nlp_all.bib +++ /dev/null @@ -1,308 +0,0 @@ -@article{devlin2018bert, - title={Bert: Pre-training of deep bidirectional transformers for language understanding}, - author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, - journal={arXiv preprint arXiv:1810.04805}, - year={2018} -} - -@article{shoeybi2019megatron, - title={Megatron-lm: Training multi-billion parameter language models using model parallelism}, - author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan}, - journal={arXiv preprint arXiv:1909.08053}, - year={2019} -} - -@InProceedings{maas2011, - author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, - title = {Learning Word Vectors for Sentiment Analysis}, - booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, - month = {June}, - year = {2011}, - address = {Portland, Oregon, USA}, - publisher = {Association for Computational Linguistics}, - pages = {142--150}, - url = {http://www.aclweb.org/anthology/P11-1015} -} - -@inproceedings{socher2013, - title = "Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank", - author = "Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D. 
and Ng, Andrew and Potts, Christopher", - booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", - month = oct, - year = "2013", - address = "Seattle, Washington, USA", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D13-1170", - pages = "1631--1642", -} - -@article{lim2018chemical, - title={Chemical--gene relation extraction using recursive neural network}, - author={Lim, Sangrak and Kang, Jaewoo}, - journal={Database}, - volume={2018}, - year={2018}, - publisher={Oxford Academic} -} - -@inproceedings{li2007scalable, - title={Scalable term selection for text categorization}, - author={Li, Jingyang and Sun, Maosong}, - booktitle={Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}, - pages={774--782}, - year={2007} -} - -@misc{lee2019biobert, - title={BioBERT: a pre-trained biomedical language representation model for biomedical text mining}, - author={Jinhyuk Lee and Wonjin Yoon and Sungdong Kim and Donghyeon Kim and Sunkyu Kim and Chan Ho So and Jaewoo Kang}, - year={2019}, - eprint={1901.08746}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{shin2020biomegatron, - title={BioMegatron: Larger Biomedical Domain Language Model}, - author={Hoo-Chang Shin and Yang Zhang and Evelina Bakhturina and Raul Puri and Mostofa Patwary and Mohammad Shoeybi and Raghav Mani}, - year={2020}, - eprint={2010.06060}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@inproceedings{vaswani2017attention, - title={Attention is all you need}, - author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, - booktitle={Advances in Neural Information Processing Systems}, - pages={6000--6010}, - year={2017} -} - -@article{sennrich2015neural, - title={Neural machine translation of rare words with subword units}, - author={Sennrich, Rico and Haddow, Barry and Birch, Alexandra}, - journal={arXiv preprint arXiv:1508.07909}, - year={2015} -} - -@article{provilkov2019bpe, - title={Bpe-dropout: Simple and effective subword regularization}, - author={Provilkov, Ivan and Emelianenko, Dmitrii and Voita, Elena}, - journal={arXiv preprint arXiv:1910.13267}, - year={2019} -} - -@article{post2018call, - title={A call for clarity in reporting BLEU scores}, - author={Post, Matt}, - journal={arXiv preprint arXiv:1804.08771}, - year={2018} -} - -@misc{zhang2021sgdqa, - title={SGD-QA: Fast Schema-Guided Dialogue State Tracking for Unseen Services}, - author={Yang Zhang and Vahid Noroozi and Evelina Bakhturina and Boris Ginsburg}, - year={2021}, - eprint={2105.08049}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@article{zhang2019neural, - title={Neural Models of Text Normalization for Speech Applications}, - author={Hao Zhang and R. Sproat and Axel H. Ng and Felix Stahlberg and Xiaochang Peng and Kyle Gorman and B. 
Roark}, - journal={Computational Linguistics}, - year={2019}, - pages={293-338} -} - -@misc{liu2021selfalignment, - title={Self-Alignment Pretraining for Biomedical Entity Representations}, - author={Fangyu Liu and Ehsan Shareghi and Zaiqiao Meng and Marco Basaldella and Nigel Collier}, - year={2021}, - eprint={2010.11784}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - } - -@article{gulcehre2015using, - title={On using monolingual corpora in neural machine translation}, - author={Gulcehre, Caglar and Firat, Orhan and Xu, Kelvin and Cho, Kyunghyun and Barrault, Loic and Lin, Huei-Chi and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua}, - journal={arXiv preprint arXiv:1503.03535}, - year={2015} -} - -@article{yee2019simple, - title={Simple and effective noisy channel modeling for neural machine translation}, - author={Yee, Kyra and Ng, Nathan and Dauphin, Yann N and Auli, Michael}, - journal={arXiv preprint arXiv:1908.05731}, - year={2019} -} - -@inproceedings{koehnetal2007moses, - title = "{M}oses: Open Source Toolkit for Statistical Machine Translation", - author = "Koehn, Philipp and - Hoang, Hieu and - Birch, Alexandra and - Callison-Burch, Chris and - Federico, Marcello and - Bertoldi, Nicola and - Cowan, Brooke and - Shen, Wade and - Moran, Christine and - Zens, Richard and - Dyer, Chris and - Bojar, Ond{\v{r}}ej and - Constantin, Alexandra and - Herbst, Evan", - booktitle = "Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions", - month = jun, - year = "2007", - address = "Prague, Czech Republic", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/P07-2045", - pages = "177--180", -} - -@inproceedings{sunkara20_interspeech, - author={Monica Sunkara and Srikanth Ronanki and Dhanush Bekal and Sravan Bodapati and Katrin Kirchhoff}, - title={{Multimodal Semi-Supervised Learning Framework for Punctuation Prediction in Conversational Speech}}, - year=2020, - booktitle={Proc. 
Interspeech 2020}, - pages={4911--4915}, - doi={10.21437/Interspeech.2020-3074} -} - -@article{chen2019bert, - title={Bert for joint intent classification and slot filling}, - author={Chen, Qian and Zhuo, Zhu and Wang, Wen}, - journal={arXiv preprint arXiv:1902.10909}, - year={2019} -} - -@article{borgeaud2021improving, - title={Improving language models by retrieving from trillions of tokens}, - author={Borgeaud, Sebastian and Mensch, Arthur and Hoffmann, Jordan and Cai, Trevor and Rutherford, Eliza and Millican, Katie and Driessche, George van den and Lespiau, Jean-Baptiste and Damoc, Bogdan and Clark, Aidan and others}, - journal={arXiv preprint arXiv:2112.04426}, - year={2021} -} - -@article{su2021roformer, - title={Roformer: Enhanced transformer with rotary position embedding}, - author={Su, Jianlin and Lu, Yu and Pan, Shengfeng and Wen, Bo and Liu, Yunfeng}, - journal={arXiv preprint arXiv:2104.09864}, - year={2021} -} - -@article{reimers2019sentence, - title={Sentence-bert: Sentence embeddings using siamese bert-networks}, - author={Reimers, Nils and Gurevych, Iryna}, - journal={arXiv preprint arXiv:1908.10084}, - year={2019} -} - -@article{yang2022tensor, - title={Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer}, - author={Yang, Greg and Hu, Edward J and Babuschkin, Igor and Sidor, Szymon and Liu, Xiaodong and Farhi, David and Ryder, Nick and Pachocki, Jakub and Chen, Weizhu and Gao, Jianfeng}, - journal={arXiv preprint arXiv:2203.03466}, - year={2022} -} - -@article{jegou2022faiss, - title={Faiss: Similarity search and clustering of dense vectors library}, - author={J{\'e}gou, Herv{\'e} and Douze, Matthijs and Johnson, Jeff and Hosseini, Lucas and Deng, Chengqi}, - journal={Astrophysics Source Code Library}, - pages={ascl--2210}, - year={2022} -} - -@misc{antonova2023spellmapper, - title={SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings}, - author={Alexandra Antonova and Evelina Bakhturina and Boris Ginsburg}, - year={2023}, - eprint={2306.02317}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{dao2022flashattention, - title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness}, - author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher Ré}, - year={2022}, - eprint={2205.14135}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} - -@misc{vaswani2023attention, - title={Attention Is All You Need}, - author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin}, - year={2023}, - eprint={1706.03762}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{su2022roformer, - title={RoFormer: Enhanced Transformer with Rotary Position Embedding}, - author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu}, - year={2022}, - eprint={2104.09864}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{press2022train, - title={Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation}, - author={Ofir Press and Noah A. Smith and Mike Lewis}, - year={2022}, - eprint={2108.12409}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{chi2022kerple, - title={KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation}, - author={Ta-Chung Chi and Ting-Han Fan and Peter J. Ramadge and Alexander I. 
Rudnicky}, - year={2022}, - eprint={2205.09921}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{sun2022lengthextrapolatable, - title={A Length-Extrapolatable Transformer}, - author={Yutao Sun and Li Dong and Barun Patra and Shuming Ma and Shaohan Huang and Alon Benhaim and Vishrav Chaudhary and Xia Song and Furu Wei}, - year={2022}, - eprint={2212.10554}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{chi2023dissecting, - title={Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis}, - author={Ta-Chung Chi and Ting-Han Fan and Alexander I. Rudnicky and Peter J. Ramadge}, - year={2023}, - eprint={2212.10356}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{shaw2018selfattention, - title={Self-Attention with Relative Position Representations}, - author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani}, - year={2018}, - eprint={1803.02155}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - -@misc{chen2023extending, - title={Extending Context Window of Large Language Models via Positional Interpolation}, - author={Shouyuan Chen and Sherman Wong and Liangjian Chen and Yuandong Tian}, - year={2023}, - eprint={2306.15595}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} \ No newline at end of file diff --git a/SoundScribe/SpeakerID/docs/source/nlp/nlp_model.rst b/SoundScribe/SpeakerID/docs/source/nlp/nlp_model.rst deleted file mode 100644 index 0c0b800fe44b5538d881ef3939931d0ea1ae0472..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/nlp_model.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. _nlp_model: - -Model NLP -========= - -The config file for NLP models contain three main sections: - - - ``trainer``: contains the configs for PTL training. For more information, refer to :doc:`../core/core` and `PTL Trainer class API `. - - ``exp_manager``: the configs of the experiment manager. For more information, refer to :doc:`../core/core`. - - ``model``: contains the configs of the datasets, model architecture, tokenizer, optimizer, scheduler, etc. - -The following sub-sections of the model section are shared among most of the NLP models. - - - ``tokenizer``: specifies the tokenizer - - ``language_model``: specifies the underlying model to be used as the encoder - - ``optim``: the configs of the optimizer and scheduler :doc:`../core/core` - -The ``tokenizer`` and ``language_model`` sections have the following parameters: - -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **Parameter** | **Data Type** | **Description** | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.tokenizer.tokenizer_name** | string | Tokenizer name will be filled automatically based on ``model.language_model.pretrained_model_name``. | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.tokenizer.vocab_file** | string | Path to tokenizer vocabulary. 
| -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.tokenizer.tokenizer_model** | string | Path to tokenizer model (only for sentencepiece tokenizer). | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.language_model.pretrained_model_name** | string | Pre-trained language model name, for example: ``bert-base-cased`` or ``bert-base-uncased``. | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.language_model.lm_checkpoint** | string | Path to the pre-trained language model checkpoint. | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.language_model.config_file** | string | Path to the pre-trained language model config file. | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ -| **model.language_model.config** | dictionary | Config of the pre-trained language model. | -+------------------------------------------------+-----------------+--------------------------------------------------------------------------------------------------------------+ - -The parameter **model.language_model.pretrained_model_name** can be one of the following: - - - ``megatron-bert-345m-uncased``, ``megatron-bert-345m-cased``, ``biomegatron-bert-345m-uncased``, ``biomegatron-bert-345m-cased``, ``bert-base-uncased``, ``bert-large-uncased``, ``bert-base-cased``, ``bert-large-cased`` - - ``distilbert-base-uncased``, ``distilbert-base-cased`` - - ``roberta-base``, ``roberta-large``, ``distilroberta-base`` - - ``albert-base-v1``, ``albert-large-v1``, ``albert-xlarge-v1``, ``albert-xxlarge-v1``, ``albert-base-v2``, ``albert-large-v2``, ``albert-xlarge-v2``, ``albert-xxlarge-v2`` diff --git a/SoundScribe/SpeakerID/docs/source/nlp/punctuation_and_capitalization.rst b/SoundScribe/SpeakerID/docs/source/nlp/punctuation_and_capitalization.rst deleted file mode 100644 index 4be0d2151d8ee86425cb3a3185f1f14a2b1569e7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/docs/source/nlp/punctuation_and_capitalization.rst +++ /dev/null @@ -1,921 +0,0 @@ -.. _punctuation_and_capitalization: - -Punctuation and Capitalization Model -==================================== - -Quick Start Guide ------------------ - -.. code-block:: python - - from nemo.collections.nlp.models import PunctuationCapitalizationModel - - # to get the list of pre-trained models - PunctuationCapitalizationModel.list_available_models() - - # Download and load the pre-trained BERT-based model - model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert") - - # try the model on a few examples - model.add_punctuation_capitalization(['how are you', 'great how about you']) - -Model Description ------------------ - -For each word in the input text, the Punctuation and Capitalization model: - -- predicts a punctuation mark that should follow the word (if any). By default, the model supports commas, periods, and question marks. 
-- predicts if the word should be capitalized or not - -In the Punctuation and Capitalization model, we are jointly training two token-level classifiers on top of a pre-trained -language model, such as `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__ :cite:`nlp-punct-devlin2018bert`. - -.. note:: - - We recommend you try this model in a Jupyter notebook (run on `Google's Colab `_.): `NeMo/tutorials/nlp/Punctuation_and_Capitalization.ipynb `__. - - Connect to an instance with a GPU (**Runtime** -> **Change runtime type** -> select **GPU** for the hardware accelerator). - - An example script on how to train and evaluate the model can be found at: `NeMo/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py `__. - - The default configuration file for the model can be found at: `NeMo/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml `__. - - The script for inference can be found at: `NeMo/examples/nlp/token_classification/punctuate_capitalize_infer.py `__. - -.. _raw_data_format_punct: - -Raw Data Format ---------------- - -The Punctuation and Capitalization model can work with any text dataset, although it is recommended to balance the -data, especially for the punctuation task. Before pre-processing the data to the format expected by the model, the -data should be split into ``train.txt`` and ``dev.txt`` (and optionally ``test.txt``). Each line in the -``train.txt/dev.txt/test.txt`` should represent one or more full and/or truncated sentences. - -Example of the ``train.txt``/``dev.txt`` file: - -.. code:: - - When is the next flight to New York? - The next flight is ... - .... - - -The ``source_data_dir`` structure should look similar to the following: - -.. code:: - - . - |--sourced_data_dir - |-- dev.txt - |-- train.txt - -.. _nemo-data-format-label: - -NeMo Data Format ----------------- - -The Punctuation and Capitalization model expects the data in the following format: - -The training and evaluation data is divided into 2 files: -- ``text.txt`` -- ``labels.txt`` - -Each line of the ``text.txt`` file contains text sequences, where words are separated with spaces. - -[WORD] [SPACE] [WORD] [SPACE] [WORD], for example: - - :: - - when is the next flight to new york - the next flight is ... - ... - -The ``labels.txt`` file contains corresponding labels for each word in ``text.txt``, the labels are separated with -spaces. Each label in ``labels.txt`` file consists of 2 symbols: - -- the first symbol of the label indicates what punctuation mark should follow the word (where ``O`` means no - punctuation needed) - -- the second symbol determines if a word needs to be capitalized or not (where ``U`` indicates that the word should be - upper cased, and ``O`` - no capitalization needed) - -By default, the following punctuation marks are considered: commas, periods, and question marks; the remaining punctuation marks were -removed from the data. This can be changed by introducing new labels in the ``labels.txt`` files. - -Each line of the ``labels.txt`` should follow the format: ``[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]`` (for ``labels.txt``). For example, -labels for the above ``text.txt`` file should be: - - :: - - OU OO OO OO OO OO OU ?U - OU OO OO OO ... - ... 
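-
-For illustration only, the short sketch below shows one way such label strings can be derived from punctuated, capitalized text. It is not the NeMo preprocessing script (that script is described in the next section), and the helper name is made up:
-
-.. code-block:: python
-
-    def to_text_and_labels(sentence, marks=',.?'):
-        """Convert one punctuated, capitalized sentence into a text line and a label line."""
-        words, labels = [], []
-        for token in sentence.split():
-            punct = token[-1] if token[-1] in marks else 'O'   # first symbol: punctuation after the word
-            bare = token.rstrip(marks)
-            capit = 'U' if bare[:1].isupper() else 'O'         # second symbol: capitalization
-            words.append(bare.lower())
-            labels.append(punct + capit)
-        return ' '.join(words), ' '.join(labels)
-
-    text, labels = to_text_and_labels('When is the next flight to New York?')
-    # text   -> 'when is the next flight to new york'
-    # labels -> 'OU OO OO OO OO OO OU ?U'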
-
-The complete list of all possible labels used in this tutorial is:
-
-- ``OO``
-- ``,O``
-- ``.O``
-- ``?O``
-- ``OU``
-- ``,U``
-- ``.U``
-- ``?U``
-
-Converting Raw Data to NeMo Format
-----------------------------------
-
-To pre-process the raw text data, stored under :code:`sourced_data_dir` (see the :ref:`raw_data_format_punct`
-section), run the following command:
-
-.. code::
-
-    python examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py \
-           -s \
-           -o
-
-
-Required Argument for Dataset Conversion
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-- :code:`-s` or :code:`--source_file`: path to the raw file
-- :code:`-o` or :code:`--output_dir` - path to the directory to store the converted files
-
-After the conversion, the :code:`output_dir` should contain :code:`labels_*.txt` and :code:`text_*.txt` files. The
-default names for the training and evaluation in the :code:`conf/punctuation_capitalization_config.yaml` are the
-following:
-
-.. code::
-
-   .
-   |--output_dir
-      |-- labels_dev.txt
-      |-- labels_train.txt
-      |-- text_dev.txt
-      |-- text_train.txt
-
-Tarred dataset
---------------
-
-Tokenization and encoding of data is quite costly for the punctuation and capitalization task. If your dataset contains a
-lot of samples (~4M), you may use a tarred dataset. A tarred dataset is a collection of `.tar` files which
-contain batches ready for passing into a model. A tarred dataset is not loaded into memory entirely, but in small pieces
-that do not overflow memory. Tarred datasets rely on `webdataset `_.
-
-To create a tarred dataset you will need data in NeMo format:
-
-.. code::
-
-    python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \
-        --text \
-        --labels \
-        --output_dir \
-        --num_batches_per_tarfile 100
-
-All tar files contain a similar number of batches, so up to :code:`--num_batches_per_tarfile - 1` batches will be
-discarded during tarred dataset creation.
-
-Besides the `.tar` files with batches, the
-`examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py
-`_
-script will create a metadata JSON file and 2 `.csv` files with punctuation and
-capitalization label vocabularies. To use a tarred dataset you will need to pass the path to the metadata file of your dataset
-in the config parameter :code:`model.train_ds.tar_metadata_file` and set the config parameter
-:code:`model.train_ds.use_tarred_dataset=true`.
-
-Training Punctuation and Capitalization Model
----------------------------------------------
-
-The language model is initialized with a pre-trained model from
-`HuggingFace Transformers `__, unless the user provides a pre-trained
-checkpoint for the language model. To train a model from scratch, you will need to provide the HuggingFace configuration in
-one of the parameters ``model.language_model.config_file``, ``model.language_model.config``. An example of a model
-configuration file for training the model can be found at:
-`NeMo/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml `__.
-
-A configuration file is a `.yaml` file which contains all parameters for model creation, training, testing, and validation.
-The structure of the configuration file for training and testing is described in the :ref:`Run config`
-section. Some parameters are required in a punctuation-and-capitalization `.yaml` config. Default values of
-required parameters are ``???``. If you omit any of the other parameters, they will be initialized according to default
-values from the following tables.
-
-..
_run-config-label: - -Run config -^^^^^^^^^^ - -An example of a config file is -`here `_. - -.. list-table:: Run config. The main config passed to a script `punctuation_capitalization_train_evaluate.py `_ - :widths: 5 5 10 25 - :header-rows: 1 - - * - **Parameter** - - **Data type** - - **Default value** - - **Description** - * - **pretrained_model** - - string - - ``null`` - - Can be an NVIDIA's NGC cloud model or a path to a ``.nemo`` checkpoint. You can get list of possible cloud options - by calling a method :py:meth:`~nemo.collections.nlp.models.PunctuationCapitalizationModel.list_available_models`. - * - **name** - - string - - ``'Punctuation_and_Capitalization'`` - - A name of the model. Used for naming output directories and ``.nemo`` checkpoints. - * - **do_training** - - bool - - ``true`` - - Whether to perform training of the model. - * - **do_testing** - - bool - - ``false`` - - Whether ot perform testing of the model after training. - * - **model** - - :ref:`model config` - - :ref:`model config` - - A configuration for the :class:`~nemo.collections.nlp.models.PunctuationCapitalizationModel`. - * - **trainer** - - trainer config - - - - Parameters of - `pytorch_lightning.Trainer `_. - * - **exp_manager** - - exp manager config - - - - A configuration with various NeMo training options such as output directories, resuming from checkpoint, - tensorboard and W&B logging, and so on. For possible options see :ref:`exp-manager-label` description and class - :class:`~nemo.utils.exp_manager.exp_manager`. - -.. _model-config-label: - -Model config -^^^^^^^^^^^^ - -.. list-table:: Location of model config in parent config - :widths: 5 5 - :header-rows: 1 - - * - **Parent config** - - **Key in parent config** - * - :ref:`Run config` - - ``model`` - -A configuration of -:class:`~nemo.collections.nlp.models.token_classification.punctuation_capitalization_model.PunctuationCapitalizationModel` -model. - -.. list-table:: Model config - :widths: 5 5 10 25 - :header-rows: 1 - - * - **Parameter** - - **Data type** - - **Default value** - - **Description** - * - **class_labels** - - :ref:`class labels config` - - :ref:`class labels config` - - Cannot be omitted in `.yaml` config. The ``class_labels`` parameter containing a dictionary with names of label - id files used in ``.nemo`` checkpoints. These file names can also be used for passing label vocabularies to the - model. If you wish to use ``class_labels`` for passing vocabularies, please provide path to vocabulary files in - ``model.common_dataset_parameters.label_vocab_dir`` parameter. - * - **common_dataset_parameters** - - :ref:`common dataset parameters config` - - :ref:`common dataset parameters config` - - Label ids and loss mask information. - * - **train_ds** - - :ref:`data config` with string in ``ds_item`` - - ``null`` - - A configuration for creating training dataset and data loader. Cannot be omitted in `.yaml` config if training - is performed. - * - **validation_ds** - - :ref:`data config` with string OR list of strings in ``ds_item`` - - ``null`` - - A configuration for creating validation datasets and data loaders. - * - **test_ds** - - :ref:`data config` with string OR list of strings in ``ds_item`` - - ``null`` - - A configuration for creating test datasets and data loaders. Cannot be omitted in `.yaml` config if testing is - performed. - * - **punct_head** - - :ref:`head config` - - :ref:`head config` - - A configuration for creating punctuation MLP head that is applied to a language model outputs. 
- * - **capit_head** - - :ref:`head config` - - :ref:`head config` - - A configuration for creating capitalization MLP head that is applied to a language model outputs. - * - **tokenizer** - - :ref:`tokenizer config` - - :ref:`tokenizer config` - - A configuration for creating source text tokenizer. - * - **language_model** - - :ref:`language model config` - - :ref:`language model config` - - A configuration of a BERT-like language model which serves as a model body. - * - **optim** - - optimization config - - ``null`` - - A configuration of optimizer, learning rate scheduler, and L2 regularization. Cannot be omitted in `.yaml` - config if training is performed. For more information see :ref:`Optimization ` and - `primer `_ tutorial. - -.. _class-labels-config-label: - -Class labels config -^^^^^^^^^^^^^^^^^^^ - -.. list-table:: Location of class labels config in parent configs - :widths: 5 5 - :header-rows: 1 - - * - **Parent config** - - **Key in parent config** - * - :ref:`Run config` - - ``model.class_labels`` - * - :ref:`Model config` - - ``class_labels`` - -.. list-table:: Class labels config - :widths: 5 5 5 35 - :header-rows: 1 - - * - **Parameter** - - **Data type** - - **Default value** - - **Description** - * - **punct_labels_file** - - string - - ??? - - A name of a punctuation labels file. This parameter cannot be omitted in `.yaml` config. This name - is used as a name of label ids file in ``.nemo`` checkpoint. It also can be used for passing label vocabulary to - the model. If ``punct_labels_file`` is used as a vocabulary file, then you should provide parameter - ``label_vocab_dir`` in :ref:`common dataset parameters` - (``model.common_dataset_parameters.label_vocab_dir`` in :ref:`run config`). Each line of - ``punct_labels_file`` file contains 1 label. The values are sorted, ``==Text Normalization Analysis.
')
-    f.write('Inverse Text Normalization Analysis.')
-
-    # TN Section
-    f.write('Text Normalization\n')
-    for errorcase in tn_error_cases:
-        f.write(errorcase.get_html())
-
-    # ITN Section
-    f.write('Inverse Text Normalization
\n') - for errorcase in itn_error_cases: - f.write(errorcase.get_html()) - - -if __name__ == '__main__': - # Parse argument - parser = ArgumentParser() - parser.add_argument('--errors_log_fp', help='Path to the error log file', required=True) - parser.add_argument('--visualization_fp', help='Path to the output visualization file', required=True) - args = parser.parse_args() - - analyze(args.errors_log_fp, args.visualization_fp) diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/conf/duplex_tn_config.yaml b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/conf/duplex_tn_config.yaml deleted file mode 100644 index 9cb7059d4c8f1885081995594c65b81f9f5a88c5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/conf/duplex_tn_config.yaml +++ /dev/null @@ -1,159 +0,0 @@ -name: &name DuplexTextNormalization -mode: joint # Three possible choices ['tn', 'itn', 'joint'] -lang: ??? # Supported languages are ['en', 'ru', 'de', 'multilingual'] - -# Pretrained Nemo Models -tagger_pretrained_model: null -decoder_pretrained_model: null - -# Tagger -tagger_trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 # the number of training epochs (for ru or de or multilingual, try 10) - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - -tagger_model: - do_training: true - transformer: albert-base-v2 # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased - tokenizer: ${tagger_model.transformer} - max_sequence_len: 128 - nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo # exported .nemo path - lang: ${lang} - mode: ${mode} - - optim: - name: adamw - lr: 5e-5 - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_token_precision - reduce_on_plateau: false - - # scheduler config override - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -tagger_exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: tagger_training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_token_precision" - mode: "max" - save_best_model: true - always_save_nemo: true - -# Decoder -decoder_trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 # the number of training epochs - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -decoder_model: - do_training: true - transformer: t5-small # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base - max_sequence_len: 80 - tokenizer: ${decoder_model.transformer} - nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo # exported .nemo path - lang: ${lang} - mode: ${mode} - - # Options related to covering grammars for TN - use_cg: false # Use covering grammars to avoid catastrophic errors - neural_confidence_threshold: 0.99 # If the neural model is not confident, then use the covering grammars - n_tagged: 1 # number of tagged options to consider, -1 - to get all possible tagged options - - optim: - name: adamw - lr: 2e-4 - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_steps: null - warmup_ratio: 0.0 - last_epoch: -1 - -decoder_exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: decoder_training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - save_best_model: True - -# Data -data: - train_ds: - data_path: train.tsv # provide the full path to the file. Ignored when using tarred dataset, tar_metadata_file is used instead. - batch_size: 64 # local training batch size for each worker. Ignored when using tarred dataset, the batch size of the tarred dataset is used instead. - shuffle: true - max_insts: -1 # Maximum number of instances (-1 means no limit) - # Refer to the text_normalization doc for more information about data augmentation - tagger_data_augmentation: false - decoder_data_augmentation: true - use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs) - num_workers: 3 - pin_memory: false - drop_last: false - use_tarred_dataset: False # if true tar_metadata_file will be used - tar_metadata_file: null # metadata for tarred dataset. A JSON file containing the list of tar_files in "text_tar_filepaths" field - tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled - - validation_ds: - data_path: dev.tsv # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets - batch_size: 64 - shuffle: false - max_insts: -1 # Maximum number of instances (-1 means no limit) - use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs) - num_workers: 3 - pin_memory: false - drop_last: false - - test_ds: - data_path: test.tsv # provide the full path to the file - batch_size: 64 - shuffle: false - use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs) - num_workers: 3 - pin_memory: false - drop_last: false - errors_log_fp: errors.txt # Path to the file for logging the errors - -# Inference -inference: - interactive: false # Set to true if you want to enable the interactive mode when running duplex_text_normalization_test.py - from_file: null # Path to the raw text, no labels required. 
Each sentence on a separate line - batch_size: 16 # batch size for inference.from_file diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/create_tarred_dataset.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/create_tarred_dataset.py deleted file mode 100644 index cdedf45e097f53df4e7bdb96b2a3d02807bb4e9e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/create_tarred_dataset.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import pickle -import random -import tarfile -from glob import glob -from typing import List, Tuple - -from joblib import Parallel, delayed -from tqdm import tqdm -from transformers import AutoTokenizer - -import nemo.collections.nlp.data.text_normalization.constants as constants -from nemo.collections.nlp.data.text_normalization.decoder_dataset import TextNormalizationDecoderDataset -from nemo.utils import logging - - -""" -The script builds tar files for Tarred TextNormalizationDecoderDataset - -See `text_normalization doc ` -for more details on data format, and en/data_processing.py on how to pre-process the data before tarring. - -To run the script, use: - - python create_tarred_dataset.py \ - --input_files = "train_processed/output-00099-of-00100" \ - --input_files = "train_processed/output-00098-of-00100" \ - --lang = "en" \ - --out_dir="TARRED_DATA_OUTPUT_DIR" - -See the argparse help for more arguments. -""" - - -def _preprocess_file(input_file: str) -> List[Tuple[List[str]]]: - """ - Performs initial preprocessing, i.e., urls formatting, removal of "_trans" from Ru set - - Args: - input_file: path to a file in google TN format - - Returns: - Processed data. Each element is a Tuple(List[semiotic classes], List[written words], List[spoken words]) - """ - print(f"Reading and running initial pre-processing of {input_file}...") - cur_split = [] - with open(input_file, 'r', encoding='utf-8') as f: - # Loop through each line of the file - cur_classes, cur_tokens, cur_outputs = [], [], [] - for linectx, line in tqdm(enumerate(f)): - es = line.strip().split('\t') - if len(es) == 2 and es[0] == '': - cur_split.append((cur_classes, cur_tokens, cur_outputs)) - # Reset - cur_classes, cur_tokens, cur_outputs = [], [], [] - continue - assert len(es) == 3 - cur_classes.append(es[0]) - cur_tokens.append(es[1]) - cur_outputs.append(es[2]) - return cur_split - - -def _write_batches_to_tarfiles( - input_file: str, - tokenizer: AutoTokenizer, - tokenizer_name: str, - mode: str, - lang: str, - max_seq_len: int, - batch_size: int, - out_dir: str, - num_batches_per_tarfile: int, - decoder_data_augmentation: bool = False, -): - """ - Creates tar files for the input file, i.e.: - 1. Creates a TextNormalizationDecoderDataset from the input file - 2. Constructs batches of size `batch_size` - 3. 
Saves each created batch to a pickle file and then adds `num_batches_per_tarfile` - of the pickle files to a tarfile. - - Args: - input_file: path to cleaned data file. See en/data_processing.py for cleaning. - tokenizer: tokenizer - tokenizer_name: the name of the tokenizer, usually corresponds to the pre-trained LM - mode: model training mode - max_seq_len: maximum length of the sequence (examples that are longer will be discarded) - batch_size: batch size - out_dir: path to output directory - num_batches_per_tarfile: number of batches saved in each tar file - decoder_data_augmentation: Set to True to enable data augmentation for the decoder model - lang: data language - """ - - dataset = TextNormalizationDecoderDataset( - input_file=input_file, - raw_instances=_preprocess_file(input_file=input_file), - tokenizer=tokenizer, - tokenizer_name=tokenizer_name, - mode=mode, - max_len=max_seq_len, - decoder_data_augmentation=decoder_data_augmentation, - lang=lang, - use_cache=False, - max_insts=-1, - do_tokenize=False, - initial_shuffle=True, - ) - dataset.batchify(batch_size) - file_name = os.path.basename(input_file) - tar_file_ctr = 0 - tar_file_path = os.path.join( - out_dir, '%s-batches.%d.%d.%d.tar' % (file_name, batch_size, max_seq_len, tar_file_ctr) - ) - tar_file_ptr = tarfile.open(tar_file_path, 'w') - total_batch_ctr = 0 - batch_ctr = 0 - for batch in dataset.batches: - total_batch_ctr += 1 - batch_ctr += 1 - pickle_file = os.path.join(out_dir, '%s-batch-%d.pkl' % (file_name, total_batch_ctr)) - - pickle.dump(batch, open(pickle_file, 'wb')) - tar_file_ptr.add(pickle_file) - os.remove(pickle_file) - - if batch_ctr == num_batches_per_tarfile: - tar_file_ctr += 1 - tar_file_ptr.close() - tar_file_path = os.path.join( - out_dir, f'%s-batches.%d.%d.%d.tar' % (file_name, batch_size, max_seq_len, tar_file_ctr) - ) - tar_file_ptr = tarfile.open(tar_file_path, 'w',) - batch_ctr = 0 - - # return tar files paths that have batches remaining - remainder_tar_file_path = tar_file_ptr.name - tar_file_ptr.close() - - return total_batch_ctr, remainder_tar_file_path - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='(Inverse) Text Normalization tarred dataset creation') - parser.add_argument('--transformer_name', type=str, default="t5-small", help='Name of the pretrained LM.') - parser.add_argument('--mode', type=str, default='tn', choices=constants.MODES, help='(I)TN model training mode.') - parser.add_argument('--lang', type=str, default='en', choices=constants.SUPPORTED_LANGS, help='language.') - parser.add_argument( - '--decoder_data_augmentation', - action="store_true", - help='Set to True to use data augmentation for the decoder model.', - ) - parser.add_argument( - '-in', - '--input_files', - action='append', - required=True, - help="Example: -in train_processed/output-00099-of-00100 -in train_processed/output-00098-of-00100", - ) - parser.add_argument('--out_dir', type=str, required=True, help='Path to store dataloader and tokenizer models.') - parser.add_argument( - '--max_seq_length', type=int, default=80, help='Maximum sequence length, longer examples will be discarded.' 
- ) - parser.add_argument('--min_seq_length', type=int, default=1, help='Minimum sequence length.') - parser.add_argument( - '--num_batches_per_tarfile', - type=int, - default=2, - help='Number batches, i.e., pickle files, included in a single .tar file.', - ) - parser.add_argument('--n_jobs', type=int, default=-2, help='The maximum number of concurrently running jobs.') - parser.add_argument( - '--batch_size', - type=int, - default=16, - help='Batch size, i.e., number of examples in a single pickle file. This batch size will override the training size.', - ) - parser.add_argument( - '--factor', default=8, type=int, help='The final number of tar files will be divisible by the "factor" value' - ) - - args = parser.parse_args() - - # check if tar files exist - if os.path.exists(args.out_dir): - tar_files_in_out_dir = glob(f'{args.out_dir}/*.tar') - if tar_files_in_out_dir: - raise ValueError( - f'Tar files detected in {args.out_dir}. Delete the files to re-construct the dataset in the same directory.' - ) - else: - os.makedirs(args.out_dir) - - world_size = 1 - tokenizer = AutoTokenizer.from_pretrained(args.transformer_name) - - results_list = Parallel(n_jobs=args.n_jobs)( - delayed(_write_batches_to_tarfiles)( - input_file=input_file, - tokenizer=tokenizer, - tokenizer_name=args.transformer_name, - mode=args.mode, - lang=args.lang, - batch_size=args.batch_size, - max_seq_len=args.max_seq_length, - decoder_data_augmentation=args.decoder_data_augmentation, - out_dir=args.out_dir, - num_batches_per_tarfile=args.num_batches_per_tarfile, - ) - for input_file in args.input_files - ) - - total_batches = sum([batch_count for batch_count, _ in results_list]) - - # save batches from tar files containing the left over batches (if there's enough batches) - remainder_tar_file_ctr = 0 - remainder_tar_file_path = os.path.join( - args.out_dir, f'remainder-batches.tokens.{args.batch_size}.tar_file_{remainder_tar_file_ctr}.tar' - ) - remainder_tar_file_ptr = tarfile.open(remainder_tar_file_path, 'w') - batch_in_tar_ctr = 0 - for _, tar_file_path in results_list: - tar_file_ptr = tarfile.open(tar_file_path, 'r') - for member in tar_file_ptr.getmembers(): - remainder_tar_file_ptr.addfile(member, tar_file_ptr.extractfile(member.name)) - batch_in_tar_ctr += 1 - if batch_in_tar_ctr == args.num_batches_per_tarfile: - remainder_tar_file_ctr += 1 - remainder_tar_file_ptr.close() - remainder_tar_file_path = os.path.join( - args.out_dir, f'remainder-batches.tokens.{args.batch_size}.tar_file_{remainder_tar_file_ctr}.tar', - ) - remainder_tar_file_ptr = tarfile.open(remainder_tar_file_path, 'w',) - batch_in_tar_ctr = 0 - tar_file_ptr.close() - os.remove(tar_file_path) - - # log the number of batches remaining as they will be discarded - num_batches_discarded = len(remainder_tar_file_ptr.getmembers()) - remainder_tar_file_ptr.close() - os.remove(remainder_tar_file_path) - - tar_file_paths = glob(f'{args.out_dir}/*.tar') - if args.factor != 1: - num_tar_files = len(tar_file_paths) - num_tars_to_drop = num_tar_files % args.factor - num_batches_discarded += num_tars_to_drop * args.num_batches_per_tarfile - - random.shuffle(tar_file_paths) - for _ in range(num_tars_to_drop): - os.remove(tar_file_paths.pop(-1)) - - total_batches -= num_batches_discarded - logging.info(f'Number of batches discarded: {num_batches_discarded}, total batches kept: {total_batches}') - - # dump metadata to json - metadata = {} - metadata['num_batches'] = total_batches - - # rename tar files so they can be more easily used with CLI and YAML - 
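-    # The shards are renamed to a contiguous `<file_name>.<index>.tar` range so that the brace-expansion
-    # pattern recorded in metadata.json below (with `_OP_`/`_CL_` standing in for `{`/`}`) can address them.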
file_name = f'{args.mode}.{args.batch_size}_bs.{args.num_batches_per_tarfile}_b_per_tar.{args.max_seq_length}_len' - for index, path in enumerate(tar_file_paths): - os.rename(path, os.path.join(args.out_dir, f'{file_name}.{index}.tar')) - - text_tar_filepaths = f'{file_name}._OP_0..{index}_CL_.tar' - logging.info(f'Files for brace expansion: "{text_tar_filepaths}"') - metadata['text_tar_filepaths'] = text_tar_filepaths - - # add tar files to metadata - tar_file_paths = glob(f'{args.out_dir}/*.tar') - metadata['tar_files'] = tar_file_paths - metadata_path = os.path.join(args.out_dir, 'metadata.json') - json.dump(metadata, open(metadata_path, 'w')) - - num_tar_files = len(tar_file_paths) - if num_tar_files < world_size: - raise ValueError( - ( - f'Number of tar files found: {num_tar_files} is less than world size: {world_size}. ' - f'There should be at least one tar file per GPU (ideally many tar files per GPU). ' - f'This may be due to dataset size. ' - f'Decrease num_batches_per_tarfile or num_tokens_per_batch to increase the number of tarfiles. ' - f'Also using shard_strategy=replicate will use all available tarfiles for every GPU. ' - ) - ) diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/data_split.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/data_split.py deleted file mode 100644 index b05cf6d437884ce5045fc8ac3829a1a3982db7c6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/data_split.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script creates data splits of the Google Text Normalization dataset -of the format mentioned in the `text_normalization doc `. - -USAGE Example: -1. Download the Google TN dataset from https://www.kaggle.com/google-nlu/text-normalization -2. Unzip the English subset (e.g., by running `tar zxvf en_with_types.tgz`). Then there will a folder named `en_with_types`. -3. Run this script -# python data_split.py \ - --data_dir=en_with_types/ \ - --output_dir=data_split/ \ - --lang=en - -In this example, the split files will be stored in the `data_split` folder. -The folder should contain three subfolders `train`, 'dev', and `test` with `.tsv` files. -""" - -from argparse import ArgumentParser -from os import listdir, mkdir -from os.path import isdir, isfile, join - -from tqdm import tqdm - -from nemo.collections.nlp.data.text_normalization import constants - -# Local Constants -TEST_SIZE_EN = 100002 -TEST_SIZE_RUS = 100007 - - -def read_google_data(data_file: str, lang: str, split: str, add_test_full=False): - """ - The function can be used to read the raw data files of the Google Text Normalization - dataset (which can be downloaded from https://www.kaggle.com/google-nlu/text-normalization) - - Args: - data_file: Path to the data file. Should be of the form output-xxxxx-of-00100 - lang: Selected language. 
- split: data split - add_test_full: do not truncate test data i.e. take the whole test file not #num of lines - Return: - data: list of examples - """ - data = [] - cur_classes, cur_tokens, cur_outputs = [], [], [] - with open(data_file, 'r', encoding='utf-8') as f: - for linectx, line in tqdm(enumerate(f)): - es = line.strip().split('\t') - if split == "test" and not add_test_full: - # For the results reported in the paper "RNN Approaches to Text Normalization: A Challenge": - # + For English, the first 100,002 lines of output-00099-of-00100 are used for the test set - # + For Russian, the first 100,007 lines of output-00099-of-00100 are used for the test set - if lang == constants.ENGLISH and linectx == TEST_SIZE_EN: - break - if lang == constants.RUSSIAN and linectx == TEST_SIZE_RUS: - break - if len(es) == 2 and es[0] == '': - data.append((cur_classes, cur_tokens, cur_outputs)) - # Reset - cur_classes, cur_tokens, cur_outputs = [], [], [] - continue - - # Remove _trans (for Russian) - if lang == constants.RUSSIAN: - es[2] = es[2].replace('_trans', '') - # Update the current example - assert len(es) == 3 - cur_classes.append(es[0]) - cur_tokens.append(es[1]) - cur_outputs.append(es[2]) - return data - - -if __name__ == '__main__': - parser = ArgumentParser(description='Preprocess Google text normalization dataset') - parser.add_argument('--data_dir', type=str, required=True, help='Path to folder with data') - parser.add_argument('--output_dir', type=str, default='preprocessed', help='Path to folder with preprocessed data') - parser.add_argument( - '--lang', type=str, default=constants.ENGLISH, choices=constants.SUPPORTED_LANGS, help='Language' - ) - parser.add_argument( - '--add_test_full', - action='store_true', - help='If True, additional folder test_full will be created without truncation of files', - ) - args = parser.parse_args() - - # Create the output dir (if not exist) - if not isdir(args.output_dir): - mkdir(args.output_dir) - mkdir(args.output_dir + '/train') - mkdir(args.output_dir + '/dev') - mkdir(args.output_dir + '/test') - if args.add_test_full: - mkdir(args.output_dir + '/test_full') - - for fn in sorted(listdir(args.data_dir))[::-1]: - fp = join(args.data_dir, fn) - if not isfile(fp): - continue - if not fn.startswith('output'): - continue - - # Determine the current split - split_nb = int(fn.split('-')[1]) - if split_nb < 90: - cur_split = "train" - elif split_nb < 95: - cur_split = "dev" - elif split_nb == 99: - cur_split = "test" - data = read_google_data(data_file=fp, lang=args.lang, split=cur_split) - # write out - output_file = join(args.output_dir, f'{cur_split}', f'{fn}.tsv') - print(fp) - print(output_file) - output_f = open(output_file, 'w', encoding='utf-8') - for inst in data: - cur_classes, cur_tokens, cur_outputs = inst - for c, t, o in zip(cur_classes, cur_tokens, cur_outputs): - output_f.write(f'{c}\t{t}\t{o}\n') - output_f.write('\t\n') - - print(f'{cur_split}_sentences: {len(data)}') - - # additionally generate full test files if needed - if cur_split == "test" and args.add_test_full: - data = read_google_data(data_file=fp, lang=args.lang, split=cur_split, add_test_full=True) - # write out - output_file = join(args.output_dir, 'test_full', f'{fn}.tsv') - output_f = open(output_file, 'w', encoding='utf-8') - for inst in data: - cur_classes, cur_tokens, cur_outputs = inst - for c, t, o in zip(cur_classes, cur_tokens, cur_outputs): - output_f.write(f'{c}\t{t}\t{o}\n') - output_f.write('\t\n') - - print(f'{cur_split}_sentences: {len(data)}') diff --git 
a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py deleted file mode 100644 index 9523d0974db8a416de74fbedc8677d193b376c43..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py +++ /dev/null @@ -1,402 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script can be used to clean the splits of English Google Text Normalization dataset -for better training performance. Without these processing steps we noticed that the model would have a hard time to learn certain input cases, and instead starts to either make unrecoverable errors -or hallucinate. For example, the model struggles to learn numbers with five or more digits due to limited examples in the training data, so we simplified the task for the model by letting it verbalize those cases -digit by digit. This makes the model more rebust to errors. -The operations include: - - numbers that are longer than `max_integer_length` will be verbalized digit by digit, e.g. the mapping "10001" -> "ten thousand and one" in the data -will be changed to "10001" -> "one zero zero zero one" - - denominators of fractions that are longer than `max_denominator_length` will be verbalized digit by digit - - sentences with non-English characters will be removed - - some class formats converted to standardized format, e.g. for `Fraction` "½" become "1/2" - - urls that have a spoken form of "*_letter" e.g. "dot h_letter _letter t_letter _letter m_letter _letter l_letter" are converted to "dot h t m l" - - for class types "PLAIN", "LETTERS", "ELECTRONIC", "VERBATIM", "PUNCT" the spoken form is changed to "" which means this class should be left unchanged - - -USAGE Example: -1. Download the Google TN dataset from https://www.kaggle.com/google-nlu/text-normalization -2. Unzip the English subset (e.g., by running `tar zxvf en_with_types.tgz`). Then there will a folder named `en_with_types`. -3. Run the data_split.py scripts to obtain the data splits -4. Run this script on the different splits -# python data_preprocessing.py \ - --input_path=data_split/train \ - --output_dir=train_processed \ - --max_integer_length=4 \ - --max_denominator_length=3 - -In this example, the cleaned files will be saved in train_processed/. - -After this script, you can use upsample.py to create a more class balanced training dataset for better performance. 
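A minimal sketch of the digit-by-digit rule described above, assuming only the `inflect` package that this script already imports; `spell_out_digits` is a hypothetical helper used for illustration, the file's own implementation is `convert2digits` further below:

    import inflect

    _engine = inflect.engine()

    def spell_out_digits(number: str) -> str:
        # "10001" -> "one zero zero zero one"
        words = [_engine.number_to_words(ch, zero="zero") for ch in number if ch.isdigit()]
        return " ".join(words)

    print(spell_out_digits("10001"))  # one zero zero zero one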
-""" - - -import os -from argparse import ArgumentParser - -import inflect -import regex as re -from tqdm import tqdm - -from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor -from nemo.collections.nlp.data.text_normalization.constants import EN_GREEK_TO_SPOKEN -from nemo.collections.nlp.data.text_normalization.utils import ( - add_space_around_dash, - convert_fraction, - convert_superscript, -) -from nemo.utils import logging - -engine = inflect.engine() - -# these are all words that can appear in a verbalized number, this list will be used later as a filter to detect numbers in verbalizations -number_verbalizations = list(range(0, 20)) + list(range(20, 100, 10)) -number_verbalizations = ( - [engine.number_to_words(x, zero="zero").replace("-", " ").replace(",", "") for x in number_verbalizations] - + ["hundred", "thousand", "million", "billion", "trillion"] - + ["point"] -) -digit = "0123456789" -processor = MosesProcessor(lang_id="en") - - -def process_url(o): - """ - The function is used to process the spoken form of every URL in an example. - E.g., "dot h_letter _letter t_letter _letter m_letter _letter l_letter" -> - "dot h t m l" - Args: - o: The expected outputs for the spoken form - Return: - o: The outputs for the spoken form with preprocessed URLs. - """ - - def flatten(l): - """ flatten a list of lists """ - return [item for sublist in l for item in sublist] - - if o != '' and '_letter' in o: - o_tokens = o.split(' ') - all_spans, cur_span = [], [] - for j in range(len(o_tokens)): - if len(o_tokens[j]) == 0: - continue - if o_tokens[j] == '_letter': - all_spans.append(cur_span) - all_spans.append([' ']) - cur_span = [] - else: - o_tokens[j] = o_tokens[j].replace('_letter', '') - cur_span.append(o_tokens[j]) - if len(cur_span) > 0: - all_spans.append(cur_span) - o_tokens = flatten(all_spans) - - o = '' - for o_token in o_tokens: - if len(o_token) > 1: - o += ' ' + o_token + ' ' - else: - o += o_token - o = o.strip() - o_tokens = processor.tokenize(o).split() - o = ' '.join(o_tokens) - - return o - - -def convert2digits(digits: str): - """ - Verbalizes integer digit by digit, e.g. "12,000.12" -> "one two zero zero zero point one two" - It can also take in a string that has an integer as prefix and outputs only the verbalized part of that, e.g. 
"12 kg" -> "one two" - and outputs a warning - - Args: - digits: integer in string format - Return: - res: number verbalization of the integer prefix of the input - """ - res = [] - for i, x in enumerate(digits): - if x in digit: - res.append(engine.number_to_words(str(x), zero="zero").replace("-", " ").replace(",", "")) - elif x == ".": - res.append("point") - elif x in [" ", ","]: - continue - else: - # logging.warning(f"remove {digits[:i]} from {digits[i:]}") - break - res = " ".join(res) - return res, i - - -def convert(example): - cls, written, spoken = example - - written = convert_fraction(written) - written = re.sub("é", "e", written) - written = convert_superscript(written) - - if cls == "TIME": - written = re.sub("([0-9]): ([0-9])", "\\1:\\2", written) - if cls == "MEASURE": - written = re.sub("([0-9])\s?''", '\\1"', written) - - spoken = process_url(spoken) - - if cls in ["TELEPHONE", "DIGIT", "MEASURE", "DECIMAL", "MONEY", "ADDRESS"]: - spoken = re.sub(" o ", " zero ", spoken) - spoken = re.sub(" o ", " zero ", spoken) - spoken = re.sub("^o ", "zero ", spoken) - spoken = re.sub(" o$", " zero", spoken) - spoken = re.sub("^sil ", "", spoken) - spoken = re.sub(" sil ", " ", spoken) - spoken = re.sub(" sil ", " ", spoken) - spoken = re.sub(" sil$", "", spoken) - - if cls != "ELECTRONIC": - written = add_space_around_dash(written) - - example[1] = written - example[2] = spoken - - l = args.max_integer_length - 2 - - # if written form does not fulfill this format return - if not re.search("[0-9]{%s}[,\s]?[0-9]{3}" % l, written): - if cls != "FRACTION": - return - idx = written.index("/") - denominator = written[idx + 1 :].strip() - if not re.search(r"[0-9]{%s}" % (args.max_denominator_length + 1), denominator): - return - - # convert spoken forms for different classes - if cls == "CARDINAL": - if written[0] == "-": - digits = "minus " + convert2digits(written[1:])[0] - else: - digits = convert2digits(written)[0] - spoken = digits - elif cls == "ADDRESS": - idx = re.search("[0-9]", written).start() - number = convert2digits(written[idx:].strip())[0] - s_words = spoken.split() - for i, x in enumerate(s_words): - if x in number_verbalizations: - break - spoken = " ".join(s_words[:i]) + " " + number - elif cls == "DECIMAL": - res = [] - for i, x in enumerate(written): - if i == 0 and x == "-": - res.append("minus") - elif x in digit: - res.append(engine.number_to_words(str(x), zero="zero").replace("-", " ").replace(",", "")) - elif x == ".": - res.append("point") - spoken = " ".join(res) - m = re.search("([a-z]+)", written) - if m: - spoken += " " + m.group(1) - elif cls == "FRACTION": - res = [] - if written[0] == "-": - res.append("minus") - written = written[1:] - idx = written.index("/") - numerator = written[:idx].strip() - denominator = written[idx + 1 :].strip() - if len(numerator) > args.max_integer_length: - numerator = convert2digits(numerator)[0] - else: - numerator = engine.number_to_words(str(numerator), zero="zero").replace("-", " ").replace(",", "") - if len(denominator) > args.max_denominator_length: - denominator = convert2digits(denominator)[0] - else: - denominator = engine.number_to_words(str(denominator), zero="zero").replace("-", " ").replace(",", "") - spoken = numerator + " slash " + denominator - if res: - spoken = "minus " + spoken - elif cls == "MEASURE": - res = [] - if written[0] == "-": - res.append("minus") - written = written[1:] - idx = re.search("(?s:.*)([0-9]\s?[a-zA-Zµμ\/%Ω'])", written).end() - number, unit_idx = convert2digits(written[:idx].strip()) 
- s_words = spoken.split() - for i, x in enumerate(s_words): - if x not in number_verbalizations: - break - - spoken = number + " " + " ".join(s_words[i:]) - if res: - spoken = "minus " + spoken - elif cls == "MONEY": - res = [] - if written[0] == "-": - res.append("minus") - written = written[1:] - idx = re.search("[0-9]", written).start() - m = re.search("\.", written[idx:]) - idx_end = len(written) - if m: - idx_end = m.start() + idx - number, unit_idx = convert2digits(written[idx:idx_end].strip()) - s_words = spoken.split() - for i, x in enumerate(s_words): - if x not in number_verbalizations: - break - spoken = number + " " + " ".join(s_words[i:]) - if res: - spoken = "minus " + spoken - elif cls == "ORDINAL": - res = [] - if written[0] == "-": - res.append("minus") - written = written[1:] - if "th" in written.lower(): - idx = written.lower().index("th") - elif "rd" in written.lower(): - idx = written.lower().index("rd") - elif "nd" in written.lower(): - idx = written.lower().index("nd") - elif "st" in written.lower(): - idx = written.lower().index("st") - if re.search(r"[¿¡ºª]", written) is None: - spoken = convert2digits(written[:idx].strip())[0] + " " + written[idx:].lower() - if res: - spoken = "minus " + spoken - example[2] = spoken - - -def ignore(example): - """ - This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged. - - Args: - example: data example - """ - cls, _, _ = example - if cls in ["PLAIN", "LETTERS", "ELECTRONIC", "VERBATIM", "PUNCT"]: - example[2] = "" - if example[1] == 'I' and re.search("(first|one)", example[2]): - example[2] = "" - - -def process_file(fp): - """ Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory. - For more info about the data format, refer to the - `text_normalization doc `. - - Args: - fp: file path - """ - file_name = fp.split("/")[-1] - output_path = f"{args.output_dir}/{file_name}" - logging.info(f"-----input_file--------\n{fp}") - logging.info(f"-----output_file--------\n{output_path}") - - insts, w_words, s_words, classes = [], [], [], [] - delete_sentence = False - with open(fp, 'r', encoding='utf-8') as f: - for line in tqdm(f): - es = [e.strip() for e in line.strip().split('\t')] - if es[0] == '': - if not delete_sentence: - inst = (classes, w_words, s_words) - insts.append(inst) - # Reset - w_words, s_words, classes = [], [], [] - delete_sentence = False - else: - # convert data sample - convert(es) - # decide if this data sample's spoken form should be same as written form - ignore(es) - - characters_ignore = "¿¡ºª" + "".join(EN_GREEK_TO_SPOKEN.keys()) - # delete sentence with greek symbols, etc. 
- if re.search(rf"[{characters_ignore}]", es[1]) is not None: - delete_sentence = True - # delete characters from chinese, japanese, korean - if re.search(r'[\u4e00-\u9fff]+', es[1]) is not None: - delete_sentence = True - - if es[0] == 'MONEY' and re.search("\s?DM$", es[1]): - delete_sentence = True - - if es[0] == 'MEASURE' and re.search("\s?Da$", es[1]): - delete_sentence = True - - classes.append(es[0]) - w_words.append(es[1]) - s_words.append(es[2]) - - inst = (classes, w_words, s_words) - insts.append(inst) - - output_f = open(output_path, 'w+', encoding='utf-8') - for _, inst in enumerate(insts): - cur_classes, cur_tokens, cur_outputs = inst - for c, t, o in zip(cur_classes, cur_tokens, cur_outputs): - output_f.write(f'{c}\t{t}\t{o}\n') - - output_f.write(f'\t\n') - - -def main(): - if not os.path.exists(args.input_path): - raise ValueError(f"Input path {args.input_path} does not exist") - if os.path.exists(args.output_dir): - logging.info( - f"Output directory {args.output_dir} exists already. Existing files could be potentially overwritten." - ) - else: - logging.info(f"Creating output directory {args.output_dir}.") - os.makedirs(args.output_dir, exist_ok=True) - - if os.path.isdir(args.input_path): - input_paths = sorted([os.path.join(args.input_path, f) for f in os.listdir(args.input_path)]) - else: - input_paths = [args.input_path] - - for input_file in input_paths: - process_file(input_file) - - -if __name__ == "__main__": - - parser = ArgumentParser(description="Text Normalization Data Preprocessing for English") - parser.add_argument("--output_dir", required=True, type=str, help='Path to output directory.') - parser.add_argument("--input_path", required=True, type=str, help='Path to input file or input directory.') - parser.add_argument( - "--max_integer_length", - default=4, - type=int, - help='Maximum number of digits for integers that are allowed. Beyond this, the integers are verbalized digit by digit.', - ) - parser.add_argument( - "--max_denominator_length", - default=3, - type=int, - help='Maximum number of digits for denominators that are allowed. Beyond this, the denominator is verbalized digit by digit.', - ) - args = parser.parse_args() - - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/upsample.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/upsample.py deleted file mode 100644 index e6331be0f5437c0caa5c21721cbb07c8a3775f18..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/data/en/upsample.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script can be used to create a more class balanced file from a set of the data files of the English Google Text Normalization dataset -for better training performance. 
Currently this script upsamples the class types "MONEY", "MEASURE", "TIME", "FRACTION" since these are underrepresented in the Google Text Normalization dataset, but still diverse in its representations. -Of all the input files in `input_dir` this script takes the first file and computes the class patterns that occurs in it. -For those that are underrepresented, quantitatively defined as lower than `min_number`, the other files are scanned for sentences that have the missing patterns. -Those sentences are appended to the first file and outputted. - -USAGE Example: -1. Download the Google TN dataset from https://www.kaggle.com/google-nlu/text-normalization -2. Unzip the English subset (e.g., by running `tar zxvf en_with_types.tgz`). Then there will a folder named `en_with_types`. -3. Run the data_split.py, data_preprocessing.py scripts to obtain cleaned data files -4. Run this script on the training data portion -# python upsample.py \ - --input_dir=train_processed/ \ - --output_file=train_upsampled.tsv/ \ - --min_number=2000 - -In this example, the final file will be train_upsampled.tsv. -""" - - -import glob -from argparse import ArgumentParser -from collections import defaultdict -from typing import List - -import numpy as np -import regex as re - -parser = ArgumentParser(description="English Text Normalization upsampling") -parser.add_argument("--input_dir", required=True, type=str, help='Path to input directory with preprocessed data') -parser.add_argument("--output_file", required=True, type=str, help='Path to output file') -parser.add_argument("--min_number", default=2000, type=int, help='minimum number per pattern') -parser.add_argument("--pretty", action="store_true", help='Pretty print') -args = parser.parse_args() - -# global pattern tables -MONEY_PATTERNS = defaultdict(int) -MEASURE_PATTERNS = defaultdict(int) -TIME_PATTERNS = defaultdict(int) -FRACTION_PATTERNS = defaultdict(int) - -# global templates/stencils for creating patterns -money_templates = ["([0-9]|\.|,)+"] -measure_templates = ["^-?([0-9]|\.|,|/|\s)+"] -time_templates = [ - "^[0-9]+:[0-9][0-9]$", - "^[0-9]+:[0-9][0-9]\s?[a-zA-Z]+$", - "^[0-9]+\s(p|P|A|a)\.?(m|M)\.?", - "^[0-9]+(p|P|A|a)\.?(m|M)\.?", - "^[0-9]:[0-9][0-9]\s(p|P|A|a)\.?(m|M)\.?", - "^[0-9][0-9]:[0-9][0-9]\s(p|P|A|a)\.?(m|M)\.?", - "^[0-9]:[0-9][0-9](p|P|A|a)\.?(m|M)\.?", - "^[0-9][0-9]:[0-9][0-9](p|P|A|a)\.?(m|M)\.?", - "^[0-9]+.[0-9][0-9]\s?(p|P|A|a)\.?(m|M)\.?", - "^[0-9]+:[0-9]+:[0-9]+", - "^[0-9]+:[0-9]+.[0-9]+", - "^[0-9]+.[0-9]+$", - "^[0-9]+.[0-9]+\s?[a-zA-Z]+$", -] -fraction_templates = [ - "^-?[0-9]+\s?\/\s?[0-9]{3}$", - "^-?[0-9]{3}\s?\/\s?[0-9]+$", - "^[0-9]+\s[0-9]+\/[0-9]+$", - "^[0-9]+\s[0-9]+\/[0-9]+$", - "^[0-9]+\s[0-9]+\s\/\s[0-9]+$", - "^-?[0-9]+\s\/\s[0-9]+$", - "^-?[0-9]+\/[0-9]+$", -] - -# classes that still need to be upsampled, and required number of instances needed -classes_to_upsample = defaultdict(int) - - -def include_sentence(sentence_patterns) -> bool: - """ - Determines whether to use a sentence for upsampling whose patterns are provided as input. This will check the global pattern tables - if this sentence includes any patterns that are still needed. 
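A condensed sketch of that selection logic, with `patterns_of` standing in (hypothetically) for the per-class pattern extraction that `register_patterns`/`lookup_patterns` perform below; the real script additionally tracks counts per class and per template:

    from collections import Counter

    def upsample(seed_sentences, extra_sentences, patterns_of, min_number=2000):
        # Patterns already covered by the first (seed) file.
        counts = Counter()
        for sent in seed_sentences:
            counts.update(patterns_of(sent))
        selected = list(seed_sentences)
        # Take a sentence from the other files only if it still contributes
        # a pattern seen fewer than `min_number` times.
        for sent in extra_sentences:
            pats = patterns_of(sent)
            if any(counts[p] < min_number for p in pats):
                selected.append(sent)
                counts.update(pats)
        return selected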
- - Args: - sentence_patterns: dictionary of patterns for a sentence grouped by class - Returns: - include: whether or not to use the sentence or for upsampling - """ - include = False - for k, v in sentence_patterns["MONEY"].items(): - if v > 0 and k in MONEY_PATTERNS and MONEY_PATTERNS[k] < args.min_number: - include = True - for k, v in sentence_patterns["MEASURE"].items(): - if v > 0 and k in MEASURE_PATTERNS and MEASURE_PATTERNS[k] < args.min_number: - include = True - for k, v in sentence_patterns["TIME"].items(): - if v > 0 and k in TIME_PATTERNS and TIME_PATTERNS[k] < args.min_number: - include = True - for k, v in sentence_patterns["FRACTION"].items(): - if v > 0 and k in FRACTION_PATTERNS and FRACTION_PATTERNS[k] < args.min_number: - include = True - - if include: - for k, v in sentence_patterns["MONEY"].items(): - if v > 0 and k in MONEY_PATTERNS: - MONEY_PATTERNS[k] += v - if MONEY_PATTERNS[k] - v < args.min_number and MONEY_PATTERNS[k] >= args.min_number: - classes_to_upsample["MONEY"] -= 1 - if classes_to_upsample["MONEY"] <= 0: - classes_to_upsample.pop("MONEY") - for k, v in sentence_patterns["MEASURE"].items(): - if v > 0 and k in MEASURE_PATTERNS: - MEASURE_PATTERNS[k] += v - if MEASURE_PATTERNS[k] - v < args.min_number and MEASURE_PATTERNS[k] >= args.min_number: - classes_to_upsample["MEASURE"] -= 1 - if classes_to_upsample["MEASURE"] <= 0: - classes_to_upsample.pop("MEASURE") - for k, v in sentence_patterns["TIME"].items(): - if v > 0 and k in TIME_PATTERNS: - TIME_PATTERNS[k] += v - if TIME_PATTERNS[k] - v < args.min_number and TIME_PATTERNS[k] >= args.min_number: - classes_to_upsample["TIME"] -= 1 - if classes_to_upsample["TIME"] <= 0: - classes_to_upsample.pop("TIME") - for k, v in sentence_patterns["FRACTION"].items(): - if v > 0 and k in FRACTION_PATTERNS: - FRACTION_PATTERNS[k] += v - if FRACTION_PATTERNS[k] - v < args.min_number and FRACTION_PATTERNS[k] >= args.min_number: - classes_to_upsample["FRACTION"] -= 1 - if classes_to_upsample["FRACTION"] <= 0: - classes_to_upsample.pop("FRACTION") - return include - - -def read_data_file(fp: str, upsample_file: bool = False): - """ Reading the raw data from a file of NeMo format - For more info about the data format, refer to the - `text_normalization doc `. - - Args: - fp: file paths - upsample_file: whether or not this input file should be used in full or only for upsampling, i.e. 
only as a subset - Returns: - insts: List of sentences parsed as list of words - """ - - insts, w_words, s_words, classes = [], [], [], [] - with open(fp, 'r', encoding='utf-8') as f: - sentence_patterns = { - "FRACTION": defaultdict(int), - "MEASURE": defaultdict(int), - "TIME": defaultdict(int), - "MONEY": defaultdict(int), - } - for line in f: - es = [e.strip() for e in line.strip().split('\t')] - if es[0] == '': - if not upsample_file: - inst = (classes, w_words, s_words) - insts.append(inst) - else: - ok = include_sentence(sentence_patterns) - if ok: - inst = (classes, w_words, s_words) - insts.append(inst) - # Reset - w_words, s_words, classes = [], [], [] - sentence_patterns = { - "FRACTION": defaultdict(int), - "MEASURE": defaultdict(int), - "TIME": defaultdict(int), - "MONEY": defaultdict(int), - } - - else: - classes.append(es[0]) - w_words.append(es[1]) - s_words.append(es[2]) - if not upsample_file: - register_patterns(cls=es[0], input_str=es[1], pretty=args.pretty) - else: - if es[0] in classes_to_upsample: - patterns = lookup_patterns(cls=es[0], input_str=es[1]) - update_patterns(sentence_patterns[es[0]], patterns) - if not upsample_file: - inst = (classes, w_words, s_words) - insts.append(inst) - return insts - - -def update_patterns(patterns: dict, new_patterns: dict): - """ - updates a given pattern table with counts from another table by adding them to the given table. - - Args: - patterns: main table - new_patterns: new table to update the main table with - """ - for k, v in new_patterns.items(): - patterns[k] += v - - -def register_patterns(cls: str, input_str: str, pretty: bool = False): - """ - Saves all patterns created from input string from global templates/stencils to global pattern table - - Args: - cls: class type of input_str - input_str: input string - pretty: used to pretty print patterns - """ - if cls == "MONEY": - new_dict = create_pattern(money_templates, input_str, pretty=pretty) - update_patterns(MONEY_PATTERNS, new_dict) - if cls == "MEASURE": - new_dict = create_pattern(measure_templates, input_str, pretty=pretty) - update_patterns(MEASURE_PATTERNS, new_dict) - if cls == "TIME": - new_dict = create_pattern(time_templates, input_str, pretty=pretty) - update_patterns(TIME_PATTERNS, new_dict) - if cls == "FRACTION": - new_dict = create_pattern(fraction_templates, input_str, pretty=pretty) - update_patterns(FRACTION_PATTERNS, new_dict) - - -def lookup_patterns(cls: str, input_str: str) -> dict: - """ - Look up all patterns that match an input string from global pattern table - - Args: - cls: class type of input_str - input_str: input string - """ - if cls == "MONEY": - new_dict = create_pattern(MONEY_PATTERNS.keys(), input_str) - if cls == "MEASURE": - new_dict = create_pattern(MEASURE_PATTERNS.keys(), input_str) - if cls == "TIME": - new_dict = create_pattern(TIME_PATTERNS.keys(), input_str) - if cls == "FRACTION": - new_dict = create_pattern(FRACTION_PATTERNS.keys(), input_str) - return new_dict - - -def create_pattern(templates: List[str], input_str: str, pretty: bool = False): - """ - create all patterns based on list of input templates using the input string. 
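For intuition on what a "pattern" is here: with the --pretty flag each matched span is collapsed to "@", so, taking the first MONEY template above as an example, the written form "$5,000" is counted under the key "$@":

    import regex as re

    money_template = r"([0-9]|\.|,)+"
    print(re.sub(money_template, "@", "$5,000"))  # $@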
- - Args: - templates: list of templates/stencils - input_str: string to apply templates on to create patterns - pretty: used to pretty print patterns - """ - res = defaultdict(int) - for template in templates: - if re.search(template, input_str) is None: - continue - if not pretty: - res[re.sub(template, template, input_str)] += 1 - else: - res[re.sub(template, "@", input_str)] += 1 - return res - - -def print_stats(): - """ - print statistics on class patterns to be upsampled - """ - print("MONEY") - for k, v in MONEY_PATTERNS.items(): - print(f"\t{k}\t{v}") - print("no. patterns to upsample", classes_to_upsample["MONEY"]) - print("MEASURE") - for k, v in MEASURE_PATTERNS.items(): - print(f"\t{k}\t{v}") - print("no. patterns to upsample", classes_to_upsample["MEASURE"]) - print("TIME") - for k, v in TIME_PATTERNS.items(): - print(f"\t{k}\t{v}") - print("no. patterns to upsample", classes_to_upsample["TIME"]) - print("FRACTION") - for k, v in FRACTION_PATTERNS.items(): - print(f"\t{k}\t{v}") - print("no. patterns to upsample", classes_to_upsample["FRACTION"]) - - -def main(): - input_files = sorted(glob.glob(f"{args.input_dir}/output-*")) - print("Taking in full: ", input_files[0]) - inst_first_file = read_data_file(input_files[0]) - - measure_keys = list(MEASURE_PATTERNS.keys()) - for k in measure_keys: - if re.search("\s?st$", k) is not None or re.search("\s?Da$", k) is not None: - MEASURE_PATTERNS.pop(k) - - money_keys = list(MONEY_PATTERNS.keys()) - for k in money_keys: - if re.search("(DM|SHP|BMD|SCR|SHP|ARS|BWP|SBD)$", k) is not None: - MONEY_PATTERNS.pop(k) - - classes_to_upsample["FRACTION"] = sum(np.asarray(list(FRACTION_PATTERNS.values())) < args.min_number) - classes_to_upsample["MEASURE"] = sum(np.asarray(list(MEASURE_PATTERNS.values())) < args.min_number) - classes_to_upsample["TIME"] = sum(np.asarray(list(TIME_PATTERNS.values())) < args.min_number) - classes_to_upsample["MONEY"] = sum(np.asarray(list(MONEY_PATTERNS.values())) < args.min_number) - - print_stats() - for fp in input_files[1:]: - print("Upsamling: ", fp) - instances = read_data_file(fp, upsample_file=True) - inst_first_file.extend(instances) - print_stats() - - output_f = open(args.output_file, 'w+', encoding='utf-8') - for ix, inst in enumerate(inst_first_file): - cur_classes, cur_tokens, cur_outputs = inst - for c, t, o in zip(cur_classes, cur_tokens, cur_outputs): - output_f.write(f'{c}\t{t}\t{o}\n') - output_f.write(f'\t\n') - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py deleted file mode 100644 index 6bcc69de7db95dc8cfb218366949a63c4431f392..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to run inference with the DuplexTextNormalizationModel. -DuplexTextNormalizationModel is essentially a wrapper class around DuplexTaggerModel and DuplexDecoderModel. -Therefore, two trained NeMo models should be specified to run the joint evaluation -(one is a trained DuplexTaggerModel and the other is a trained DuplexDecoderModel). - -This script can perform inference for 2 settings: -1. inference from a raw file (no labels required). Each line of the file represents a single example for inference. - Specify in inference.from_file and inference.batch_size parameters. - - python duplex_text_normalization_infer.py \ - tagger_pretrained_model=PATH_TO_TRAINED_TAGGER \ - decoder_pretrained_model=PATH_TO_TRAINED_DECODER \ - mode={tn,itn,joint} \ - lang={en,ru,de} \ - inference.from_file=PATH_TO_RAW_TEXT_FILE - - The predictions will be saved at "_norm" and "_denorm" files. - -2. Interactive inference (one query at a time), set inference.interactive to True to enter the interactive mode - python duplex_text_normalization_infer.py \ - tagger_pretrained_model=PATH_TO_TRAINED_TAGGER \ - decoder_pretrained_model=PATH_TO_TRAINED_DECODER \ - mode={tn,itn,joint} \ - lang={en,ru,de} \ - inference.interactive=true - -This script uses the `/examples/nlp/duplex_text_normalization/conf/duplex_tn_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. -""" - - -import os -from typing import List - -from helpers import DECODER_MODEL, TAGGER_MODEL, instantiate_model_and_trainer -from nn_wfst.en.electronic.normalize import ElectronicNormalizer -from nn_wfst.en.whitelist.normalize import WhitelistNormalizer -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.data.text_normalization import constants -from nemo.collections.nlp.models import DuplexTextNormalizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - -try: - from nemo_text_processing.text_normalization.data_loader_utils import post_process_punct -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. 
Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - - -@hydra_runner(config_path="conf", config_name="duplex_tn_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - lang = cfg.lang - - if cfg.decoder_pretrained_model is None or cfg.tagger_pretrained_model is None: - raise ValueError("Both pre-trained models (DuplexTaggerModel and DuplexDecoderModel) should be provided.") - tagger_trainer, tagger_model = instantiate_model_and_trainer(cfg, TAGGER_MODEL, False) - decoder_trainer, decoder_model = instantiate_model_and_trainer(cfg, DECODER_MODEL, False) - decoder_model.max_sequence_len = 512 - tagger_model.max_sequence_len = 512 - tn_model = DuplexTextNormalizationModel(tagger_model, decoder_model, lang) - - if lang == constants.ENGLISH: - normalizer_electronic = ElectronicNormalizer(input_case="cased", lang=lang, deterministic=True) - normalizer_whitelist = WhitelistNormalizer(input_case="cased", lang=lang, deterministic=True) - - if cfg.inference.get("from_file", False): - text_file = cfg.inference.from_file - logging.info(f'Running inference on {text_file}...') - if not os.path.exists(text_file): - raise ValueError(f'{text_file} not found.') - - with open(text_file, 'r') as f: - lines = f.readlines() - - if lang == constants.ENGLISH: - new_lines = normalizer_electronic.normalize_list(lines) - lines = [ - post_process_punct(input=input_, normalized_text=norm_) for input_, norm_ in zip(lines, new_lines) - ] - new_lines = normalizer_whitelist.normalize_list(lines) - lines = [ - post_process_punct(input=input_, normalized_text=norm_) for input_, norm_ in zip(lines, new_lines) - ] - - def _get_predictions(lines: List[str], mode: str, batch_size: int, text_file: str): - """ Runs inference on a batch data without labels and saved predictions to a file. 
""" - assert mode in ['tn', 'itn'] - file_name, extension = os.path.splitext(text_file) - batch, all_preds = [], [] - for i, line in enumerate(lines): - batch.append(line.strip()) - if len(batch) == batch_size or i == len(lines) - 1: - outputs = tn_model._infer(batch, [constants.DIRECTIONS_TO_MODE[mode]] * len(batch),) - all_preds.extend([x for x in outputs[-1]]) - batch = [] - assert len(all_preds) == len(lines) - out_file = f'{file_name}_{mode}{extension}' - with open(f'{out_file}', 'w') as f_out: - f_out.write("\n".join(all_preds)) - logging.info(f'Predictions for {mode} save to {out_file}.') - - batch_size = cfg.inference.get("batch_size", 8) - if cfg.mode in ['tn', 'joint']: - # TN mode - _get_predictions(lines, 'tn', batch_size, text_file) - if cfg.mode in ['itn', 'joint']: - # ITN mode - _get_predictions(lines, 'itn', batch_size, text_file) - - else: - print('Entering interactive mode.') - done = False - while not done: - print('Type "STOP" to exit.') - test_input = input('Input a test input:') - if test_input == "STOP": - done = True - if not done: - if lang == constants.ENGLISH: - new_input = normalizer_electronic.normalize(test_input, verbose=False) - test_input = post_process_punct(input=test_input, normalized_text=new_input) - new_input = normalizer_whitelist.normalize(test_input, verbose=False) - test_input = post_process_punct(input=test_input, normalized_text=new_input) - directions = [] - inputs = [] - if cfg.mode in ['itn', 'joint']: - directions.append(constants.DIRECTIONS_TO_MODE[constants.ITN_MODE]) - inputs.append(test_input) - if cfg.mode in ['tn', 'joint']: - directions.append(constants.DIRECTIONS_TO_MODE[constants.TN_MODE]) - inputs.append(test_input) - outputs = tn_model._infer(inputs, directions)[-1] - if cfg.mode in ['joint', 'itn']: - print(f'Prediction (ITN): {outputs[0]}') - if cfg.mode in ['joint', 'tn']: - print(f'Prediction (TN): {outputs[-1]}') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_test.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_test.py deleted file mode 100644 index d80c88ec97079c8dee9c1aa1e677c85ca6c8fc0a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_test.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script runs evaluation on the test data. For more details on the data format refer to the -`text_normalization doc ` - -1. To evaluate the tagger model: - python duplex_text_normalization_test.py \ - tagger_pretrained_model=PATH_TO_TRAINED_TAGGER \ - mode={tn,itn,joint} \ - lang={en,ru,de} - -2. To evaluate the decoder model: - python duplex_text_normalization_test.py \ - decoder_pretrained_model=PATH_TO_TRAINED_DECODER \ - mode={tn,itn,joint} \ - lang={en,ru,de} - -3. 
To jointly evaluate "tagger -> decoder" pipeline the DuplexTextNormalizationModel will be used. - DuplexTextNormalizationModel is essentially a wrapper class around DuplexTaggerModel and DuplexDecoderModel. - Therefore, two trained NeMo models should be specified to run the joint evaluation - (one is a trained DuplexTaggerModel and the other is a trained DuplexDecoderModel). - Additionally, an error log will be saved in a file specified with data.test_ds.errors_log_fp (this file can be - later used with analyze_errors.py) - - python duplex_text_normalization_test.py \ - tagger_pretrained_model=PATH_TO_TRAINED_TAGGER \ - decoder_pretrained_model=PATH_TO_TRAINED_DECODER \ - mode={tn,itn,joint} \ - lang={en,ru,de} \ - data.test_ds.errors_log_fp=PATH_TO_FILE_TO_SAVE_ERROR_LOG \ - data.test_ds.use_cache=true \ - data.test_ds.batch_size=256 -""" - -from helpers import DECODER_MODEL, TAGGER_MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig - -from nemo.collections.nlp.data.text_normalization import TextNormalizationTestDataset -from nemo.collections.nlp.models import DuplexTextNormalizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="duplex_tn_config") -def main(cfg: DictConfig) -> None: - lang = cfg.lang - - if cfg.tagger_pretrained_model: - tagger_trainer, tagger_model = instantiate_model_and_trainer(cfg, TAGGER_MODEL, False) - tagger_model.max_sequence_len = 512 - tagger_model.setup_test_data(cfg.data.test_ds) - logging.info('Evaluating the tagger...') - tagger_trainer.test(model=tagger_model, verbose=False) - else: - logging.info('Tagger checkpoint is not provided, skipping tagger evaluation') - - if cfg.decoder_pretrained_model: - decoder_trainer, decoder_model = instantiate_model_and_trainer(cfg, DECODER_MODEL, False) - decoder_model.max_sequence_len = 512 - decoder_model.setup_multiple_test_data(cfg.data.test_ds) - logging.info('Evaluating the decoder...') - decoder_trainer.test(decoder_model) - else: - logging.info('Decoder checkpoint is not provided, skipping decoder evaluation') - - if cfg.tagger_pretrained_model and cfg.decoder_pretrained_model: - logging.info('Running evaluation of the duplex model (tagger + decoder) on the test set.') - tn_model = DuplexTextNormalizationModel(tagger_model, decoder_model, lang) - test_dataset = TextNormalizationTestDataset(cfg.data.test_ds.data_path, cfg.mode, lang) - results = tn_model.evaluate(test_dataset, cfg.data.test_ds.batch_size, cfg.data.test_ds.errors_log_fp) - print(f'\nTest results: {results}') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_train.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_train.py deleted file mode 100644 index f2daff7a8deb19869a4196b4828a3b9e215f9926..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/duplex_text_normalization_train.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script contains an example on how to train a DuplexTextNormalizationModel. -Note that DuplexTextNormalizationModel is essentially a wrapper class around -two other classes: - -(1) DuplexTaggerModel is a model for identifying spans in the input that need to -be normalized. Usually, such spans belong to semiotic classes (e.g., DATE, NUMBERS, ...). - -(2) DuplexDecoderModel is a model for normalizing the spans identified by the tagger. -For example, in the text normalization (TN) problem, each span will be converted to its -spoken form. In the inverse text normalization (ITN) problem, each span will be converted -to its written form. - -Therefore, this script consists of two parts, one is for training the tagger model -and the other is for training the decoder. - -This script uses the `/examples/nlp/duplex_text_normalization/conf/duplex_tn_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: -1. Obtain a processed dataset (refer to the `text_normalization doc `) -2. Run: -# python duplex_text_normalization_train.py \ - data.validation_ds.data_path=PATH_TO_VALIDATION_FILE \ - data.train_ds.data_path=PATH_TO_TRAIN_FILE \ - mode={tn,itn,joint} \ - lang={en,ru,de} - -There are 3 different modes. `tn` mode is for training a system for TN only. -`itn` mode is for training a system for ITN. `joint` is for training a system -that can do both TN and ITN at the same time. Note that the above command will -first train a tagger and then train a decoder sequentially. - -You can also train only a tagger (without training a decoder) by running the -following command: -# python duplex_text_normalization_train.py - data.validation_ds.data_path=PATH_TO_VALIDATION_FILE \ - data.train_ds.data_path=PATH_TO_TRAIN_FILE \ - data.test_ds.data_path=PATH_TO_TEST_FILE \ - mode={tn,itn,joint} - lang={en,ru,de} - decoder_model.do_training=false - -Or you can also train only a decoder (without training a tagger): -# python duplex_text_normalization_train.py \ - data.validation_ds.data_path=PATH_TO_VALIDATION_FILE \ - data.train_ds.data_path=PATH_TO_TRAIN_FILE \ - data.test_ds.data_path=PATH_TO_TEST_FILE \ - mode={tn,itn,joint} \ - lang={en,ru,de} \ - tagger_model.do_training=false - -To use tarred dataset for decoder training set: - - data.train_ds.use_tarred_dataset=True \ - data.train_ds.tar_metadata_file=\PATH_TO\\metadata.json - -Information on the arguments: - -Most arguments in the example config file are quite self-explanatory (e.g., -`decoder_model.optim.lr` refers to the learning rate for training the decoder). -Some arguments we want to mention are: - -+ lang: The language of the dataset. - -+ tagger_model.nemo_path: This is the path where the final trained tagger model -will be saved to. - -+ decoder_model.nemo_path: This is the path where the final trained decoder model -will be saved to. 
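For the tarred-dataset option above, the metadata.json written by the tarred-dataset creation script earlier in this diff looks roughly like the following. The values are only an example, derived from that script's defaults (mode=tn, batch_size=16, num_batches_per_tarfile=2, max_seq_length=80) with, say, 64 surviving tar files; _OP_ and _CL_ appear to be placeholders for "{" and "}" so the pattern can be used with shell-style brace expansion, i.e. tn.16_bs.2_b_per_tar.80_len.{0..63}.tar:

    # Approximate contents of <out_dir>/metadata.json (illustrative values only)
    metadata = {
        "num_batches": 128,  # 64 tar files x 2 batches per tar
        "text_tar_filepaths": "tn.16_bs.2_b_per_tar.80_len._OP_0..63_CL_.tar",
        "tar_files": ["<out_dir>/tn.16_bs.2_b_per_tar.80_len.0.tar", "..."],
    }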
-""" - - -from helpers import DECODER_MODEL, TAGGER_MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.data.text_normalization import TextNormalizationTestDataset -from nemo.collections.nlp.models import DuplexTextNormalizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="duplex_tn_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - # Train the tagger - if cfg.tagger_model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Starting training tagger...') - tagger_trainer, tagger_model = instantiate_model_and_trainer(cfg, TAGGER_MODEL, True) - tagger_exp_manager = cfg.get('tagger_exp_manager', None) - exp_manager(tagger_trainer, tagger_exp_manager) - tagger_trainer.fit(tagger_model) - logging.info('Training finished!') - - # Train the decoder - if cfg.decoder_model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Starting training decoder...') - decoder_trainer, decoder_model = instantiate_model_and_trainer(cfg, DECODER_MODEL, True) - decoder_exp_manager = cfg.get('decoder_exp_manager', None) - exp_manager(decoder_trainer, decoder_exp_manager) - decoder_trainer.fit(decoder_model) - logging.info('Training finished!') - - # Evaluation after training - if ( - hasattr(cfg.data, 'test_ds') - and cfg.data.test_ds.data_path is not None - and cfg.tagger_model.do_training - and cfg.decoder_model.do_training - ): - tn_model = DuplexTextNormalizationModel(tagger_model, decoder_model, cfg.lang) - test_dataset = TextNormalizationTestDataset(cfg.data.test_ds.data_path, cfg.mode, cfg.lang) - results = tn_model.evaluate(test_dataset, cfg.data.test_ds.batch_size, cfg.data.test_ds.errors_log_fp) - print(f'\nTest results: {results}') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/helpers.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/helpers.py deleted file mode 100644 index 6c1cfe37b90d535c854ea9f832d00a3fa698ccd4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/helpers.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.data.text_normalization import constants -from nemo.collections.nlp.models import DuplexDecoderModel, DuplexTaggerModel -from nemo.utils import logging - -__all__ = ['TAGGER_MODEL', 'DECODER_MODEL', 'MODEL_NAMES', 'instantiate_model_and_trainer'] - -TAGGER_MODEL = 'tagger' -DECODER_MODEL = 'decoder' -MODEL_NAMES = [TAGGER_MODEL, DECODER_MODEL] - - -def instantiate_model_and_trainer(cfg: DictConfig, model_name: str, do_training: bool): - """ Function for instantiating a model and a trainer - Args: - cfg: The config used to instantiate the model and the trainer. - model_name: A str indicates whether the model to be instantiated is a tagger or a decoder (i.e., model_name should be either TAGGER_MODEL or DECODER_MODEL). - do_training: A boolean flag indicates whether the model will be trained or evaluated. - - Returns: - trainer: A PyTorch Lightning trainer - model: A NLPModel that can either be a DuplexTaggerModel or a DuplexDecoderModel - """ - assert model_name in MODEL_NAMES - - # Get configs for the corresponding models - trainer_cfg = cfg.get(f'{model_name}_trainer') - model_cfg = cfg.get(f'{model_name}_model') - pretrained_cfg = cfg.get(f'{model_name}_pretrained_model', None) - - trainer = pl.Trainer(**trainer_cfg) - if not pretrained_cfg: - logging.info(f'Initializing {model_name} model') - if model_name == TAGGER_MODEL: - model = DuplexTaggerModel(model_cfg, trainer=trainer) - if model_name == DECODER_MODEL: - model = DuplexDecoderModel(model_cfg, trainer=trainer) - elif os.path.exists(pretrained_cfg): - logging.info(f'Restoring pretrained {model_name} model from {pretrained_cfg}') - if model_name == TAGGER_MODEL: - model = DuplexTaggerModel.restore_from(pretrained_cfg) - if model_name == DECODER_MODEL: - model = DuplexDecoderModel.restore_from(pretrained_cfg) - else: - logging.info(f'Loading pretrained model {pretrained_cfg}') - if model_name == TAGGER_MODEL: - if pretrained_cfg not in DuplexTaggerModel.get_available_model_names(): - raise ( - ValueError( - f'{pretrained_cfg} not in the list of available Tagger models. Select from {DuplexTaggerModel.list_available_models()}' - ) - ) - model = DuplexTaggerModel.from_pretrained(pretrained_cfg) - if model_name == DECODER_MODEL: - if pretrained_cfg not in DuplexDecoderModel.get_available_model_names(): - raise ( - ValueError( - f'{pretrained_cfg} not in the list of available Decoder models. 
Select from {DuplexDecoderModel.list_available_models()}' - ) - ) - model = DuplexDecoderModel.from_pretrained(pretrained_cfg) - - # Set model.lang (if it is still None) - if model.lang is None: - model.lang = cfg.lang - assert model.lang in constants.SUPPORTED_LANGS - # Setup covering grammars (if enabled) - # We only support integrating with English TN covering grammars at the moment - if model_name == DECODER_MODEL and model_cfg.use_cg and cfg.lang == constants.ENGLISH: - if model.cg_normalizer is None: - model.setup_cgs(model_cfg) - - # Setup train and validation data - if do_training: - model.setup_training_data(train_data_config=cfg.data.train_ds) - if model_name == DECODER_MODEL: - model.setup_multiple_validation_data(val_data_config=cfg.data.validation_ds) - else: - model.setup_validation_data(val_data_config=cfg.data.validation_ds) - - logging.info(f'Model {model_name} -- Device {model.device}') - return trainer, model diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/__init__.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/__init__.py deleted file mode 100644 index 7d200df57afefb446fde9b8fcd896f7e110f367f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/__init__.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/__init__.py deleted file mode 100644 index 7d200df57afefb446fde9b8fcd896f7e110f367f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/__init__.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/__init__.py deleted file mode 100644 index 7d200df57afefb446fde9b8fcd896f7e110f367f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py deleted file mode 100644 index a1f8caa7d9592f1626dde77621ba340c6a7d6f30..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from nemo_text_processing.text_normalization.normalize import Normalizer - from nemo_text_processing.text_normalization.token_parser import TokenParser -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - -from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor - - -class ElectronicNormalizer(Normalizer): - """ - Normalizer for ELECTRONIC. - - Args: - input_case: accepting either "lower_cased" or "cased" input. - lang: language - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
- overwrite_cache: set to True to overwrite .far files - """ - - def __init__( - self, - input_case: str = 'cased', - lang: str = 'en', - deterministic: bool = True, - cache_dir: str = None, - overwrite_cache: bool = False, - ): - - from nn_wfst.en.electronic.tokenize_and_classify import ClassifyFst - from nn_wfst.en.electronic.verbalize_final import VerbalizeFinalFst - - self.tagger = ClassifyFst( - input_case=input_case, deterministic=deterministic, cache_dir=cache_dir, overwrite_cache=overwrite_cache - ) - self.verbalizer = VerbalizeFinalFst(deterministic=deterministic) - self.post_processor = None - self.parser = TokenParser() - self.lang = lang - self.processor = MosesProcessor(lang_id=lang) diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py deleted file mode 100644 index 9e0c284d84b0d806e0c97ded64bdb88dd1292475..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os - -try: - import pynini - from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_WHITE_SPACE, - GraphFst, - delete_extra_space, - delete_space, - generator_main, - ) - from nemo_text_processing.text_normalization.en.taggers.electronic import ElectronicFst - from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst - from nemo_text_processing.text_normalization.en.taggers.word import WordFst - from pynini.lib import pynutil -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - -from nemo.utils import logging - - -class ClassifyFst(GraphFst): - """ - Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. - - Args: - input_case: accepting either "lower_cased" or "cased" input. - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
- overwrite_cache: set to True to overwrite .far files - """ - - def __init__( - self, input_case: str, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True - ): - super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) - - far_file = None - if cache_dir is not None and cache_dir != "None": - os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic.far") - if not overwrite_cache and far_file and os.path.exists(far_file): - self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') - else: - logging.info(f"Creating ClassifyFst grammars.") - - punctuation = PunctuationFst(deterministic=deterministic) - punct_graph = punctuation.fst - word_graph = WordFst(deterministic=deterministic, punctuation=punctuation).fst - electonic_graph = ElectronicFst(deterministic=deterministic).fst - - classify = pynutil.add_weight(electonic_graph, 1.1) | pynutil.add_weight(word_graph, 100) - - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") - punct = pynini.closure( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct), - 1, - ) - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) - ) - - graph = ( - token_plus_punct - + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct + pynutil.insert(" ")) - ) - + token_plus_punct - ).optimize() - ) - - graph = delete_space + graph + delete_space - graph |= punct - - self.fst = graph.optimize() - - if far_file: - generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize.py deleted file mode 100644 index 7236be7a19944ad889b60c871d6899f50c5c299a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from nemo_text_processing.text_normalization.en.graph_utils import GraphFst - from nemo_text_processing.text_normalization.en.verbalizers.electronic import ElectronicFst -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. 
Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - - -class VerbalizeFst(GraphFst): - """ - Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. - - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ - - def __init__(self, deterministic: bool = True): - super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) - electronic_graph = ElectronicFst(deterministic=deterministic).fst - - self.fst = electronic_graph diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize_final.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize_final.py deleted file mode 100644 index b2cc69ca9e0979560af6304d5340b8f8ecb0c432..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/verbalize_final.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - import pynini - from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space - from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst - from nn_wfst.en.electronic.verbalize import VerbalizeFst - from pynini.lib import pynutil -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - - -class VerbalizeFinalFst(GraphFst): - """ - Finite state transducer that verbalizes an entire sentence. 
- - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ - - def __init__(self, deterministic: bool = True): - super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) - verbalize = VerbalizeFst(deterministic=deterministic).fst - word = WordFst(deterministic=deterministic).fst - types = verbalize | word - - if deterministic: - graph = ( - pynutil.delete("tokens") - + delete_space - + pynutil.delete("{") - + delete_space - + types - + delete_space - + pynutil.delete("}") - ) - else: - graph = delete_space + types + delete_space - graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space - self.fst = graph diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/__init__.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/__init__.py deleted file mode 100644 index 7d200df57afefb446fde9b8fcd896f7e110f367f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/normalize.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/normalize.py deleted file mode 100644 index cfb4bef5d1c32cc8b96d6430ee0654c32b86093c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/normalize.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from nemo_text_processing.text_normalization.normalize import Normalizer - from nemo_text_processing.text_normalization.token_parser import TokenParser -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - -from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor - - -class WhitelistNormalizer(Normalizer): - """ - Normalizer for WHITELIST. 
- - Args: - input_case: accepting either "lower_cased" or "cased" input. - lang: language - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. - overwrite_cache: set to True to overwrite .far files - whitelist: path to a file with whitelist replacements - """ - - def __init__( - self, - input_case: str, - lang: str = 'en', - deterministic: bool = True, - cache_dir: str = None, - overwrite_cache: bool = False, - whitelist: str = None, - ): - - from nn_wfst.en.whitelist.tokenize_and_classify import ClassifyFst - from nn_wfst.en.whitelist.verbalize_final import VerbalizeFinalFst - - self.tagger = ClassifyFst( - input_case=input_case, - deterministic=deterministic, - cache_dir=cache_dir, - overwrite_cache=overwrite_cache, - whitelist=whitelist, - ) - self.verbalizer = VerbalizeFinalFst(deterministic=deterministic) - self.post_processor = None - self.parser = TokenParser() - self.lang = lang - self.processor = MosesProcessor(lang_id=lang) diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py deleted file mode 100644 index c2d69e765bb4c4986e026007f6fd05c1fcaf0dfe..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/tokenize_and_classify.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os - -try: - import pynini - from nemo_text_processing.text_normalization.en.graph_utils import ( - NEMO_WHITE_SPACE, - GraphFst, - delete_extra_space, - delete_space, - generator_main, - ) - from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst - from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst - from nemo_text_processing.text_normalization.en.taggers.word import WordFst - from pynini.lib import pynutil -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - -from nemo.utils import logging - - -class ClassifyFst(GraphFst): - """ - Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. - - Args: - input_case: accepting either "lower_cased" or "cased" input. 
- deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. - overwrite_cache: set to True to overwrite .far files - whitelist: path to a file with whitelist replacements - """ - - def __init__( - self, - input_case: str, - cache_dir: str = None, - overwrite_cache: bool = False, - deterministic: bool = True, - whitelist: str = None, - ): - super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) - - far_file = None - if cache_dir is not None and cache_dir != "None": - os.makedirs(cache_dir, exist_ok=True) - whitelist_file = os.path.basename(whitelist) if whitelist else "" - far_file = os.path.join( - cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far" - ) - if not overwrite_cache and far_file and os.path.exists(far_file): - self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') - else: - logging.info(f"Creating ClassifyFst grammars.") - - punctuation = PunctuationFst(deterministic=deterministic) - punct_graph = punctuation.fst - word_graph = WordFst(deterministic=deterministic, punctuation=punctuation).fst - whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst - - classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100) - - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") - punct = pynini.closure( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct), - 1, - ) - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) - ) - - graph = ( - token_plus_punct - + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct + pynutil.insert(" ")) - ) - + token_plus_punct - ).optimize() - ) - - graph = delete_space + graph + delete_space - graph |= punct - - self.fst = graph.optimize() - - if far_file: - generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize.py deleted file mode 100644 index c647a142ef8c55679c8197790d797a49fc3d7294..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
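The `cache_dir` handling in the `ClassifyFst` classes above compiles the classification grammar once and stores it as a `.far` archive. A hedged sketch of how `WhitelistNormalizer` might be constructed so that a second instantiation reuses that cache (same environment assumptions as the electronic example; the directory name is arbitrary):

```python
# Assumes nemo_text_processing, pynini, and the local nn_wfst package are importable.
from nn_wfst.en.whitelist.normalize import WhitelistNormalizer

# First construction compiles the grammars and writes a .far archive into cache_dir ...
normalizer = WhitelistNormalizer(input_case="cased", cache_dir="far_cache")

# ... a second construction with the same arguments restores ClassifyFst.fst from that
# .far file instead of recompiling (see the far_file branch in ClassifyFst above).
normalizer_cached = WhitelistNormalizer(input_case="cased", cache_dir="far_cache")

print(normalizer_cached.lang)
```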
- -try: - from nemo_text_processing.text_normalization.en.graph_utils import GraphFst - from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - - -class VerbalizeFst(GraphFst): - """ - Composes other verbalizer grammars. - For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. - More details to deployment at NeMo/tools/text_processing_deployment. - - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ - - def __init__(self, deterministic: bool = True): - super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) - whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - self.fst = whitelist_graph diff --git a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize_final.py b/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize_final.py deleted file mode 100644 index 550a8a85d797088421046a49e20efd78a9f5ab07..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/duplex_text_normalization/nn_wfst/en/whitelist/verbalize_final.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -try: - import pynini - from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space - from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst - from nn_wfst.en.electronic.verbalize import VerbalizeFst - from pynini.lib import pynutil -except (ImportError, ModuleNotFoundError): - raise ModuleNotFoundError( - "The package `nemo_text_processing` was not installed in this environment. Please refer to" - " https://github.com/NVIDIA/NeMo-text-processing and install this package before using " - "this script" - ) - - -class VerbalizeFinalFst(GraphFst): - """ - Finite state transducer that verbalizes an entire sentence. 
- - Args: - deterministic: if True will provide a single transduction option, - for False multiple options (used for audio-based normalization) - """ - - def __init__(self, deterministic: bool = True): - super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) - verbalize = VerbalizeFst(deterministic=deterministic).fst - word = WordFst(deterministic=deterministic).fst - types = verbalize | word - - if deterministic: - graph = ( - pynutil.delete("tokens") - + delete_space - + pynutil.delete("{") - + delete_space - + types - + delete_space - + pynutil.delete("}") - ) - else: - graph = delete_space + types + delete_space - graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space - self.fst = graph diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/build_index.py b/SoundScribe/SpeakerID/examples/nlp/entity_linking/build_index.py deleted file mode 100644 index eeba5c83130ec5e0e7d152a0b4b25f50945ed82e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/build_index.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -import random -from argparse import ArgumentParser - -import h5py -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf -from sklearn.decomposition import PCA -from tqdm import tqdm - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def build_index(cfg: DictConfig, model: object): - """ - Builds faiss index from index dataset specified in the config. 
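As a toy illustration of the index-building flow that `build_index` implements, the sketch below PCA-reduces random stand-in embeddings, trains an IVF index, adds vectors in batches, and writes the index to disk; all dimensions, batch sizes, and file names are invented:

```python
# Toy, self-contained version of the build_index() flow: random vectors stand in for
# encoder embeddings, and the sizes (10_000 x 768 -> 256 dims, nlist=64) are arbitrary.
import numpy as np
import faiss
from sklearn.decomposition import PCA

embeddings = np.random.rand(10_000, 768).astype(np.float32)

# Reduce dimensionality to shrink the index's memory footprint (the cfg.apply_pca branch).
pca = PCA(n_components=256)
pca.fit(embeddings[:2_000])                      # fit on a sample, like cfg.pca.sample_fraction
reduced = pca.transform(embeddings).astype(np.float32)

# IVF index: a flat L2 quantizer plus nlist inverted lists (cfg.dims / cfg.nlist in the config).
quantizer = faiss.IndexFlatL2(256)
index = faiss.IndexIVFFlat(quantizer, 256, 64)
index.train(reduced)

# Add embeddings in batches, mirroring cfg.index_batch_size.
for i in range(0, reduced.shape[0], 1_000):
    index.add(reduced[i : i + 1_000])

faiss.write_index(index, "toy_index")            # build_index() moves GPU indexes back to CPU first
print(index.ntotal)
```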
- - Args: - cfg (DictConfig): Config file specifying index parameters - model (object): Encoder model - """ - - # Get index dataset embeddings - # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them - if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name): - logging.info("Loading reduced dimensionality embeddings") - embeddings = h5py.File(cfg.pca_embeddings_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - elif os.path.isfile(cfg.embedding_save_name): - logging.info("Loading previously extracted index dataset embeddings") - embeddings = h5py.File(cfg.embedding_save_name, "r") - embeddings = embeddings[cfg.index_ds.name][:] - - else: - logging.info("Encoding index dataset, this may take a while") - index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True) - embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model) - - # Create pca model to reduce dimensionality of index dataset and decrease memory footprint - if cfg.apply_pca: - - # Need to train PCA model and apply PCA transformation with newly trained model - if not os.path.isfile(cfg.pca.pca_save_name): - logging.info("Fitting PCA model for embedding dimensionality reduction") - pca_train_set = random.sample(list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction)) - pca = PCA(n_components=cfg.pca.output_dim) - pca.fit(pca_train_set) - pkl.dump(pca, open(cfg.pca.pca_save_name, "wb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # PCA model already trained, just need to reduce dimensionality of all embeddings - elif not os.path.isfile(cfg.pca_embeddings_save_name): - pca = pkl.load(open(cfg.pca.pca_save_name, "rb")) - embeddings = reduce_embedding_dim(pca, embeddings, cfg) - - # Build faiss index from embeddings - logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus") - quantizer = faiss.IndexFlatL2(cfg.dims) - index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist) - index = faiss.index_cpu_to_all_gpus(index) - index.train(embeddings) - - logging.info("Adding dataset embeddings to index") - for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)): - index.add(embeddings[i : i + cfg.index_batch_size]) - - logging.info("Saving index") - faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name) - logging.info("Index built and saved") - - -def reduce_embedding_dim(pca, embeddings, cfg): - """Apply PCA transformation to index dataset embeddings""" - - logging.info("Applying PCA transformation to entire index dataset") - embeddings = np.array(pca.transform(embeddings), dtype=np.float32) - emb_file = h5py.File(cfg.pca_embeddings_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - return embeddings - - -def get_index_embeddings(cfg: DictConfig, dataloader: object, model: object): - """Use entity linking encoder to get embeddings for full index dataset""" - embeddings = [] - concept_ids = [] - - with torch.no_grad(): - for batch in tqdm(dataloader): - input_ids, token_type_ids, input_mask, batch_concept_ids = batch - input_ids = input_ids.to(device) - token_type_ids = token_type_ids.to(device) - input_mask = input_mask.to(device) - batch_embeddings = model.forward( - input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask - ) - - embeddings.extend(batch_embeddings.detach().cpu().numpy()) - 
concept_ids.extend(batch_concept_ids.numpy()) - - emb_file = h5py.File(cfg.embedding_save_name, "w") - emb_file.create_dataset(cfg.index_ds.name, data=embeddings) - emb_file.close() - - pkl.dump(concept_ids, open(cfg.concept_id_save_name, "wb")) - - return embeddings, concept_ids - - -def load_model(cfg: DictConfig, restore: bool): - """ - Loads encoder model. - - Args: - cfg: Config file specifying model parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if restore: - model = EntityLinkingModel.restore_from(cfg.nemo_path) - else: - cfg.train_ds = None - cfg.validation_ds = None - cfg.test_ds = None - model = EntityLinkingModel(cfg) - - model = model.to(device) - - return model - - -def main(cfg: DictConfig, restore: bool): - """ - Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) - ): - logging.info("Building index") - build_index(cfg.index, model) - else: - logging.info("Index and pca model (if required) already exists. Skipping build index step.") - - if not os.path.isfile(cfg.index.idx_to_id): - logging.info("Mapping entity index postions to ids") - map_idx_to_ids(cfg.index) - else: - logging.info("Map from concept index to id already exists. Skipping mapping step.") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml b/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml deleted file mode 100644 index b7f538ccd68f6f89576bf5317306a5667c3882a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -project_dir: null -name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 2 - enable_checkpointing: False - logger: false -model: - nemo_path: ??? - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - loss_params: null - train_ds: - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ??? 
- max_seq_length: ${model.max_seq_length} - batch_size: 8 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 768 - nlist: 2 - top_n: 3 - query_num_factor: 20 - index_save_name: ??? - index_batch_size: 10 - index_ds: - name: tiny_example - data_file: ??? - max_seq_length: ${model.max_seq_length} - batch_size: 100 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/idx_to_id.pkl - id_to_string: ${project_dir}/id_to_string.pkl - concept_id_save_name: ${project_dir}/tiny_example_concept_ids.pkl - embedding_save_name: ${project_dir}/tiny_example_concept_embeddings.hdf5 - pca_embeddings_save_name: null - apply_pca: false - pca: null -exp_manager: - exp_dir: . - name: ${project_dir}/SelfAlignmentPretrainingTinyExample - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml b/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml deleted file mode 100644 index ad636ef23e18c9a94cc022c6a2157819dde17f3f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml +++ /dev/null @@ -1,95 +0,0 @@ -project_dir: ??? -name: SelfAlignmentPretrainingForMedicalEntityLinking -trainer: - devices: 1 - num_nodes: 1 - max_epochs: 2 - max_steps: -1 - accumulate_grad_batches: 1 - precision: 16 - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 1000 - enable_checkpointing: False - logger: false -model: - nemo_path: ${project_dir}/sap_bert_umls.nemo - raw_data: ${project_dir}/data/MRCONSO.RRF - max_seq_length: 128 - language_model: - pretrained_model_name: bert-base-uncased - config_file: null - config: null - lm_checkpoint: null - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} - vocab_file: null - tokenizer_model: null - do_lower_case: true - train_ds: - data_file: ${project_dir}/data/umls_train_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: true - num_workers: 2 - pin_memory: false - drop_last: false - validation_ds: - data_file: ${project_dir}/data/umls_validation_pairs.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - optim: - name: adam - lr: 3.0e-05 - weight_decay: 0.0 - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 -index: - dims: 256 - nlist: 300 - top_n: 5 - query_num_factor: 20 - index_save_name: ${project_dir}/medical_entity_linking_index - index_batch_size: 1000 - raw_data: ${model.raw_data} - index_ds: - name: umls - data_file: ${project_dir}/data/umls_index_concepts.tsv - max_seq_length: ${model.max_seq_length} - batch_size: 128 - shuffle: false - num_workers: 2 - pin_memory: false - drop_last: false - idx_to_id: ${project_dir}/data/idx_to_id.pkl - id_to_string: ${project_dir}/data/id_to_string.pkl - concept_id_save_name: ${project_dir}/data/concept_ids.pkl - embedding_save_name: ${project_dir}/data/medical_concept_embeddings.hdf5 - pca_embeddings_save_name: 
${project_dir}/data/medical_concept_reduced_${index.dims}dim_embeddings.hdf5 - apply_pca: true - pca: - input_dim: 756 - output_dim: ${index.dims} - sample_fraction: 0.5 - pca_save_name: ${project_dir}/${index.pca.input_dim}_to_${index.pca.output_dim}_pca_model.pkl -exp_manager: - exp_dir: ${project_dir}/medical_entity_linking_experiments - name: sap_bert_umls - create_tensorboard_logger: true - create_checkpoint_callback: true -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/data/umls_dataset_processing.py b/SoundScribe/SpeakerID/examples/nlp/entity_linking/data/umls_dataset_processing.py deleted file mode 100644 index 03a17da3c0bc33572baa44973548410e8d6accc1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/data/umls_dataset_processing.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -import pickle as pkl -import random -from argparse import ArgumentParser - -import pandas as pd -from omegaconf import OmegaConf -from tqdm import tqdm - -# Info on these headers can be found here on the UMLS website https://www.ncbi.nlm.nih.gov/books/NBK9685/ -# section 3.3.4 Table 1 -HEADERS = [ - 'CUI', - 'LAT', - 'TS', - 'LUI', - 'STT', - 'SUI', - 'ISPREF', - 'AUI', - 'SAUI', - 'SCUI', - 'SDUI', - 'SAB', - 'TTY', - 'CODE', - 'STR', - 'SRL', - 'SUPPRESS', - 'CVF', -] - - -def process_umls_training_dataset(data_path, train_save_name, val_save_name, max_pairs, train_split, headers): - """ - Generates and saves UMLS self alignment pretraining train and validation data. Takes the raw .RRF UMLS - data file and creates different pair combinations for entities with the same CUI. Each row in the output - will be formatted as 'CUI EntitySynonym1 EntitySynonym2' with each item in a row separated by tabs. - Saves two .tsv output files, one for the train split and one for the validation split. - Only data marked as English is added to the train and val splits. 
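A small worked example of the synonym-pairing step described here: all unordered pairs of a CUI's English synonyms become rows, capped at `max_pairs` and split randomly between the train and validation files. The CUI and synonym strings below are made up:

```python
# Made-up synonyms for one hypothetical concept, to show how pairs are generated and split.
import itertools
import random

random.seed(2021)
cui = 25202                          # label written without the leading "C", as in the script
names = ["heart attack", "myocardial infarction", "cardiac infarction"]

pairs = list(itertools.combinations(names, 2))   # 3 synonyms -> 3 unordered pairs
random.shuffle(pairs)

max_pairs, train_split = 50, 0.99
for first, second in pairs[:max_pairs]:
    split = "train" if random.random() <= train_split else "val"
    print(f"{split}\t{cui}\t{first}\t{second}")
```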
- - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - train_save_name (str): path to where training data will be saved - val_save_name (str): path to where validation data will be saved - max_pairs (int): max number of pairs for any one CUI added to the train - or validation splits - train_split (float): precentage of raw data to be added to train set split - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading training data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - train_file = open(train_save_name, 'w') - val_file = open(val_save_name, 'w') - - cui = df["CUI"].iloc[0] - names = [] - random.seed(2021) - - for idx in tqdm(range(len(df))): - # Address incorrectly formatted data - if type(df["STR"].iloc[idx]) != str or "|" in df["STR"].iloc[idx]: - continue - - # Collect all english concept strings matching the current CUI - if df["CUI"].iloc[idx] == cui and df["LAT"].iloc[idx] == "ENG": - concept_string = df["STR"].iloc[idx] - names.append(concept_string) - - else: - # Pair off concept synonyms to make training and val sets - pairs = list(itertools.combinations(names, 2)) - - if len(pairs) == 0: - # Not enough concepts gathered to make a pair - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - continue - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - random.shuffle(pairs) - - # Keep up to max pairs number pairs for any one concept - for pair in pairs[:max_pairs]: - - # Want concepts in train and val splits to be randomly selected and mutually exclusive - add_to_train = random.random() - - if add_to_train <= train_split: - train_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - else: - val_file.write(f'{cui}\t{pair[0]}\t{pair[1]}\n') - - # Switch to next concept - cui = df["CUI"].iloc[idx] - names = [df["STR"].iloc[idx]] - - train_file.close() - val_file.close() - print("Finished making training and validation data") - - -def process_umls_index_dataset(data_path, data_savename, id2string_savename, headers): - """ - Generates data file needed to build a UMLS index and a hash table mapping each - CUI to one canonical concept string. Takes the raw .RRF data file and saves - a .tsv indec concept file as well as the a .pkl file of cui to concept string - mappings. Only data marked as English is added to the index data file. 
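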
- - Arguments: - data_path (str): path to MRCONSO.RRF UMLS data file - data_savename (str): path to where .tsv index data will be saved - id2string_savename (str): path to where .pkl cui to string mapping will - be saved - headers (list): column lables within MRCONSO.RRF - """ - - print("Loading index data file...") - df = pd.read_table(data_path, names=headers, index_col=False, delimiter='|') - id2string = {} - - with open(data_savename, "w") as outfile: - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - # Address incorrectly formatted data - if type(row["STR"]) != str or "|" in row["STR"]: - continue - - cui = row["CUI"] - sent = row["STR"] - - # Removing leading C to convert label string to int - cui = int(cui[1:]) - - # Only keeping english concepts - if row["LAT"] == "ENG": - outfile.write(f'{cui}\t{sent}\n') - - # Matching each cui to one canonical string represention - if cui not in id2string and ":" not in sent: - id2string[cui] = sent - - outfile.close() - pkl.dump(id2string, open(id2string_savename, "wb")) - print("Finished saving index data and id to concept mapping") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--index", action="store_true", help="Whether to process data for building an index") - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="conf/umls_medical_entity_linking_config.yaml") - parser.add_argument( - "--max_pairs", required=False, type=int, default=50, help="Max number of train pairs for a single concepts" - ) - parser.add_argument( - "--train_split", required=False, type=float, default=0.99, help="Precentage of data to add to train set" - ) - - args = parser.parse_args() - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - if args.index: - process_umls_index_dataset(cfg.index.raw_data, cfg.index.index_ds.data_file, cfg.index.id_to_string, HEADERS) - else: - process_umls_training_dataset( - cfg.model.raw_data, - cfg.model.train_ds.data_file, - cfg.model.validation_ds.data_file, - args.max_pairs, - args.train_split, - HEADERS, - ) diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/query_index.py b/SoundScribe/SpeakerID/examples/nlp/entity_linking/query_index.py deleted file mode 100644 index 6cb51a7de1609a5a0779573c2a06a1856d668cd7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/query_index.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pickle as pkl -from argparse import ArgumentParser -from collections import OrderedDict -from typing import Dict - -import numpy as np -import torch -from build_index import load_model -from omegaconf import DictConfig, OmegaConf - -from nemo.utils import logging - -try: - import faiss -except ModuleNotFoundError: - logging.warning("Faiss is required for building the index. 
Please install faiss-gpu") - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - -def get_query_embedding(query, model): - """Use entity linking encoder to get embedding for index query""" - model_input = model.tokenizer( - query, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=512, - return_token_type_ids=True, - return_attention_mask=True, - ) - - query_emb = model.forward( - input_ids=torch.LongTensor([model_input["input_ids"]]).to(device), - token_type_ids=torch.LongTensor([model_input["token_type_ids"]]).to(device), - attention_mask=torch.LongTensor([model_input["attention_mask"]]).to(device), - ) - - return query_emb - - -def query_index( - query: str, cfg: DictConfig, model: object, index: object, pca: object, idx2id: dict, id2string: dict, -) -> Dict: - - """ - Query the nearest neighbor index of entities to find the - concepts in the index dataset that are most similar to the - query. - - Args: - query (str): entity to look up in the index - cfg (DictConfig): config object to specifiy query parameters - model (EntityLinkingModel): entity linking encoder model - index (object): faiss index - pca (object): sklearn pca transformation to be applied to queries - idx2id (dict): dictionary mapping unique concept dataset index to - its CUI - id2string (dict): dictionary mapping each unqiue CUI to a - representative english description of - the concept - Returns: - A dictionary with the concept ids of the index's most similar - entities as the keys and a tuple containing the string - representation of that concept and its cosine similarity to - the query as the values. - """ - query_emb = get_query_embedding(query, model).detach().cpu().numpy() - - if cfg.apply_pca: - query_emb = pca.transform(query_emb) - - dist, neighbors = index.search(query_emb.astype(np.float32), cfg.query_num_factor * cfg.top_n) - dist, neighbors = dist[0], neighbors[0] - unique_ids = OrderedDict() - neighbor_idx = 0 - - # Many of nearest neighbors could map to the same concept id, their idx is their unique identifier - while len(unique_ids) < cfg.top_n and neighbor_idx < len(neighbors): - concept_id_idx = neighbors[neighbor_idx] - concept_id = idx2id[concept_id_idx] - - # Only want one instance of each unique concept - if concept_id not in unique_ids: - concept = id2string[concept_id] - unique_ids[concept_id] = (concept, 1 - dist[neighbor_idx]) - - neighbor_idx += 1 - - unique_ids = dict(unique_ids) - - return unique_ids - - -def main(cfg: DictConfig, restore: bool): - """ - Loads faiss index and allows commandline queries - to the index. Builds new index if one hasn't been built yet. - - Args: - cfg: Config file specifying index parameters - restore: Whether to restore model weights trained - by the user. Otherwise will load weights - used before self alignment pretraining. - """ - - if not os.path.isfile(cfg.index.index_save_name) or ( - cfg.apply_pca and not os.path.isfile(cfg.index.pca.pca_save_name) or not os.path.isfile(cfg.index.idx_to_id) - ): - logging.warning("Either no index and/or no mapping from entity idx to ids exists. 
Please run `build_index.py`") - return - - logging.info("Loading entity linking encoder model") - model = load_model(cfg.model, restore) - - logging.info("Loading index and associated files") - index = faiss.read_index(cfg.index.index_save_name) - idx2id = pkl.load(open(cfg.index.idx_to_id, "rb")) - id2string = pkl.load(open(cfg.index.id_to_string, "rb")) # Should be created during dataset prep - - pca = None - if cfg.index.apply_pca: - pca = pkl.load(open(cfg.index.pca.pca_save_name, "rb")) - - while True: - query = input("enter index query: ") - - if query == "exit": - break - - output = query_index(query, cfg.index, model, index, pca, idx2id, id2string) - - for concept_id in output: - concept_details = output[concept_id] - concept_id = "C" + str(concept_id).zfill(7) - print(concept_id, concept_details) - - print("----------------\n") - - -if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument( - "--restore", action="store_true", help="Whether to restore encoder model weights from nemo path" - ) - parser.add_argument("--project_dir", required=False, type=str, default=".") - parser.add_argument("--cfg", required=False, type=str, default="./conf/umls_medical_entity_linking_config.yaml") - args = parser.parse_args() - - cfg = OmegaConf.load(args.cfg) - cfg.project_dir = args.project_dir - - main(cfg, args.restore) diff --git a/SoundScribe/SpeakerID/examples/nlp/entity_linking/self_alignment_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/entity_linking/self_alignment_pretraining.py deleted file mode 100644 index a1ac1ac327cbdc47f4ce1af154f2def1c468f0a4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/entity_linking/self_alignment_pretraining.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Please see tutorial at NeMo/tutorials/nlp/Entity_Linking_Medical.ipynb for -# more information on entity linking and self alignment pretraining.
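A toy illustration of the de-duplication loop inside `query_index` above: FAISS is asked for `query_num_factor * top_n` neighbors because several index positions can map to the same CUI, and only the closest hit per concept is kept. All IDs, strings, and distances below are invented:

```python
# Invented FAISS results (index positions and distances) for one query.
from collections import OrderedDict

neighbors = [7, 3, 9, 1, 4]
dist = [0.10, 0.12, 0.15, 0.20, 0.31]
idx2id = {7: 25202, 3: 25202, 9: 4096, 1: 770, 4: 4096}      # several positions -> same CUI
id2string = {25202: "myocardial infarction", 4096: "aspirin", 770: "hypertension"}

top_n = 2
unique_ids = OrderedDict()
neighbor_idx = 0
while len(unique_ids) < top_n and neighbor_idx < len(neighbors):
    concept_id = idx2id[neighbors[neighbor_idx]]
    if concept_id not in unique_ids:                          # keep only the closest hit per concept
        unique_ids[concept_id] = (id2string[concept_id], 1 - dist[neighbor_idx])
    neighbor_idx += 1

print(dict(unique_ids))   # {25202: ('myocardial infarction', 0.9), 4096: ('aspirin', 0.85)}
```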
- -from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer - -from nemo.collections.nlp.models import EntityLinkingModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="umls_medical_entity_linking_config.yaml") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f"\nConfig Params:\n{OmegaConf.to_yaml(cfg)}") - trainer = Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - logging.info(f"Loading weights from pretrained model {cfg.model.language_model.pretrained_model_name}") - model = EntityLinkingModel(cfg=cfg.model, trainer=trainer) - logging.info("===========================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - logging.info("===========================================================================================") - - if cfg.model.nemo_path: - # '.nemo' file contains the last checkpoint and the params to initialize the model - model.save_to(cfg.model.nemo_path) - logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark.py b/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark.py deleted file mode 100644 index 3cb5f8e4af3ef9a6cec0f77f6d7d6d21e0b684fc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -## Tasks -This script works with all GLUE Benchmark tasks, more details about the GLUE Benchmark could be found at -https://gluebenchmark.com/ - -More details on how to use this script could be found in tutorials/nlp/GLUE_Benchmark.ipynb - -## Model Training - -To train GLUEModel with the default config file, run: - python glue_benchmark.py \ - model.dataset.data_dir= \ - model.task_name=TASK_NAME \ - trainer.max_epochs= \ - trainer.devices="[] - -Supported task names: -["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] -Note, MNLI task includes both matched and mismatched dev sets -""" - -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import GLUEModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_name="glue_benchmark_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager_cfg = cfg.get("exp_manager", None) - - if exp_manager_cfg: - exp_manager_cfg.name = cfg.model.task_name - logging.info(f'Setting task_name to {exp_manager_cfg.name} in exp_manager') - exp_manager(trainer, exp_manager_cfg) - - if cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path): - model = GLUEModel.restore_from(cfg.model.nemo_path) - logging.info(f'Restoring model from {cfg.model.nemo_path}') - model.update_data_dir(data_dir=cfg.model.dataset.data_dir) - model.setup_training_data() - model.setup_multiple_validation_data() - trainer.fit(model) - else: - model = GLUEModel(cfg.model, trainer=trainer) - trainer.fit(model) - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark_config.yaml b/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark_config.yaml deleted file mode 100644 index 21cdc04db22fd05ee6266e7ef67aea3913e3748e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/glue_benchmark/glue_benchmark_config.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# GLUE Benchmark with pre-trained BERT models -supported_tasks: &supported_tasks ['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli'] - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - task_name: &task_name mrpc # choose from: ["cola", "sst-2", "mrpc", "sts-b", "qqp", "mnli", "qnli", "rte", "wnli"] GLUE task name, MNLI includes both matched and mismatched dev sets - supported_tasks: *supported_tasks - output_dir: null # dir to write write predictions - nemo_path: null # filename to save the model and associated artifacts to .nemo file - dataset: - data_dir: ??? 
# /path/to/data - max_seq_length: 128 - use_cache: true - - # shared across dataloaders: - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - ds_item: 'train.tsv' - shuffle: true - num_samples: -1 - batch_size: 32 - - validation_ds: - ds_item: 'dev.tsv' # for MNLI 'dev_matched.tsv' and 'dev_mismatched.tsv' will de used - shuffle: false - num_samples: -1 - batch_size: 32 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: *task_name # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_dpr.py b/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_dpr.py deleted file mode 100644 index 2d9cd962ff340731b824aa9a24183a20b734337d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_dpr.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
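The GLUE config above relies on two OmegaConf conventions that recur throughout these YAML files: `???` marks a mandatory value that must be supplied (typically on the command line), and `${...}` interpolates another key. A small sketch using a trimmed-down copy of that config; the data path assigned below is hypothetical:

```python
from omegaconf import OmegaConf

yaml_snippet = """
model:
  dataset:
    data_dir: ???                      # mandatory, normally set via the CLI
  language_model:
    pretrained_model_name: bert-base-uncased
  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name}
"""
cfg = OmegaConf.create(yaml_snippet)

print(OmegaConf.is_missing(cfg.model.dataset, "data_dir"))  # True until a value is supplied
cfg.model.dataset.data_dir = "/data/GLUE/MRPC"              # hypothetical path

# resolve=True expands the interpolation to bert-base-uncased.
print(OmegaConf.to_yaml(cfg, resolve=True))
```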
- - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertDPRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_dpr_model = BertDPRModel(cfg.model, trainer=trainer) - trainer.fit(bert_dpr_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_joint_ir.py b/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_joint_ir.py deleted file mode 100644 index 1bb164e580d1e26f677425862768d58ec36229dd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/bert_joint_ir.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import BertJointIRModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_ir_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_joint_ir_model = BertJointIRModel(cfg.model, trainer=trainer) - trainer.fit(bert_joint_ir_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/conf/bert_ir_config.yaml b/SoundScribe/SpeakerID/examples/nlp/information_retrieval/conf/bert_ir_config.yaml deleted file mode 100644 index 56e573e0bcf655da55cb8d619cb60502242ee617..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/conf/bert_ir_config.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Fine-tuning BERT model for information retrieval -name: &name BertIR -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 0.05 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - - language_model: - pretrained_model_name: bert-base-uncased - sim_score_dropout: 0.1 - lm_checkpoint: null - config: - attention_probs_dropout_prob: 0.1 - hidden_act: gelu - hidden_dropout_prob: 0.1 - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_hidden_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 - config_file: null # json file, precedence over config - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # tokenizer model for sentencepiece - special_tokens: null - - train_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with training queries and their indices - query_to_passages: null - # path to file with training examples which have the form of - # (query_id, relevant_passage_id, irrelevant_passage_1_id, ..., irrelevant_passage_n_id) - num_negatives: 10 - batch_size: 6 - psg_cache_format: npz - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - validation_ds: - passages: null # path to file with passages and their indices - queries: null # path to file with validation queries and their indices - query_to_passages: null # path to file with passages to re-rank for each validation query - num_negatives: 10 - batch_size: 6 - psg_cache_format: pkl - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 1 - drop_last: false - pin_memory: false - - optim: - name: adam - lr: 1e-5 - betas: [0.9, 0.999] - weight_decay: 0 - - sched: - name: WarmupAnnealing - warmup_steps: null - warmup_ratio: 0.05 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/construct_random_negatives.py b/SoundScribe/SpeakerID/examples/nlp/information_retrieval/construct_random_negatives.py deleted file mode 100644 index 36da96f503c600abab29ce1b54cb961d247d71aa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/construct_random_negatives.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -import numpy as np -import pandas as pd - - -def construct_negatives(input_file, output_file, num_passages, num_negatives): - qrels = pd.read_csv(input_file, delimiter="\t", header=None) - with open(output_file, "w") as f: - for i in range(len(qrels)): - query_id, rel_passage_id = qrels[0][i], qrels[2][i] - negatives = np.random.randint(num_passages, size=num_negatives) - output_ids = [query_id, rel_passage_id] + negatives.tolist() - output_str = [str(id_) for id_ in output_ids] - print("\t".join(output_str), file=f) - - -def main(): - parser = argparse.ArgumentParser(description="Negative passages construction") - parser.add_argument("--data", type=str, default="msmarco_dataset", help="path to folder with data") - parser.add_argument("--num_passages", type=int, default=8841823, help="total number of passages") - parser.add_argument("--num_negatives", type=int, default=10, help="number of negatives per positive") - args = parser.parse_args() - - for mode in ["train", "dev"]: - construct_negatives( - input_file=f"{args.data}/qrels.{mode}.tsv", - output_file=f"{args.data}/query2passages.{mode}.tsv", - num_passages=args.num_passages, - num_negatives=args.num_negatives, - ) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/get_msmarco.sh b/SoundScribe/SpeakerID/examples/nlp/information_retrieval/get_msmarco.sh deleted file mode 100644 index 0bf286c3f719e461b168b7062035dcc1afba253c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/information_retrieval/get_msmarco.sh +++ /dev/null @@ -1,22 +0,0 @@ -echo "=== Acquiring MSMARCO dataset ===" -echo "---" - -mkdir -p msmarco_dataset -cd msmarco_dataset - -echo "- Downloading passages" -wget --quiet --continue https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz -tar -xzvf collection.tar.gz -rm collection.tar.gz - -echo "- Downloading queries" -wget --quiet --continue https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz -tar -xzvf queries.tar.gz -rm queries.tar.gz -rm queries.eval.tsv - -echo "- Downloading relevance labels" -wget --quiet --continue https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv -wget --quiet --continue https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv - -echo "---" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml b/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml deleted file mode 100644 index df66111375cb3209895e1d0a9eee6c35b6c46738..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 50 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
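A hedged usage sketch for `construct_negatives()` from `construct_random_negatives.py` above, run on a two-line toy qrels file. The file names and collection size are made up, and the script is assumed to be importable from the working directory:

```python
from construct_random_negatives import construct_negatives  # assumes the script is on sys.path

# Toy qrels in the MSMARCO layout the script reads: query_id, iteration, passage_id, relevance.
with open("qrels.toy.tsv", "w") as f:
    f.write("0\t0\t17\t1\n")
    f.write("1\t0\t42\t1\n")

construct_negatives(
    input_file="qrels.toy.tsv",
    output_file="query2passages.toy.tsv",
    num_passages=100,   # toy collection size
    num_negatives=3,
)

# Each output line is tab-separated: query_id, relevant_passage_id, then 3 random negative ids,
# which is the query_to_passages format described in bert_ir_config.yaml above.
print(open("query2passages.toy.tsv").read())
```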
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: False - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? # /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - prefix: test - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "IntentSlot" # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a modelcheckpoint callback - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml b/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml deleted file mode 100644 index c15c71e67c0720a9f7f4fac563eb8df38794d2b1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -# Intent and Slot classification with pretrained BERT models - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. - accelerator: auto - strategy: ddp - log_every_n_steps: 1 # Interval of logging. 
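In the intent/slot configs above, `intent_loss_weight` controls how the intent and slot losses are blended into the training loss. The exact aggregation lives inside the NeMo model; the sketch below only illustrates the weighting that the config comment describes, with made-up loss values:

```python
# Hypothetical per-batch losses; only the weighting scheme is illustrated here.
intent_loss_weight = 0.6          # from the config: relation of intent to slot loss
intent_loss, slot_loss = 0.83, 1.52

total_loss = intent_loss_weight * intent_loss + (1.0 - intent_loss_weight) * slot_loss
print(round(total_loss, 3))       # 1.106
```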
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - -model: - nemo_path: null # filename to save the model and associated artifacts to .nemo file - data_dir: ??? # /path/to/data - class_labels: - intent_labels_file: intent_labels.csv - slot_labels_file: slot_labels.csv - class_balancing: null # or weighted_loss - intent_loss_weight: 0.6 # relation of intent to slot loss in total loss (between 0 to 1) - pad_label: -1 # if -1 not slot token will be used - ignore_extra_tokens: false - ignore_start_end: true # do not use first and last token for slot training - - train_ds: - prefix: train - batch_size: 32 - shuffle: true - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - validation_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - test_ds: - prefix: dev - batch_size: 32 - shuffle: false - num_samples: -1 - num_workers: 8 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 2e-5 - args: - name: auto - params: - weight_decay: 0.01 - - sched: - name: WarmupAnnealing - iters_per_batch: null # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - args: - name: auto - params: - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -language_model: - max_seq_length: 50 - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "MultiLabelIntentSlot" # The name of your model - create_tensorboard_logger: False # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: False # Whether you want exp_manager to create a modelcheckpoint callback diff --git a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/intent_slot_classification.py b/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/intent_slot_classification.py deleted file mode 100644 index a112ea7785f5b07c73c2bdb0d4b09724b4ec6199..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/intent_slot_classification.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import IntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = IntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. - logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = IntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on Assistant dataset - # for your own dataset change the examples appropriately - queries = [ - 'set alarm for seven thirty am', - 'lower volume by fifty percent', - 'what is my schedule for tomorrow', - ] - - pred_intents, pred_slots = eval_model.predict_from_examples(queries, cfg.model.test_ds) - - logging.info('The prediction results of some sample queries with the trained model:') - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intent: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py b/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py deleted file mode 100644 index 
2441885e2ed280f5e55732ce1ed597dd3f9ec260..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/intent_slot_classification/multi_label_intent_slot_classification.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Sample command to run the script: - -python multi_label_intent_slot_classification.py \ - model.data_dir=/home/user/multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=[0] \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - -fast_dev_run=false will save checkpoints for the model -""" - - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import MultiLabelIntentSlotClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="multi_label_intent_slot_classification_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = MultiLabelIntentSlotClassificationModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - - # Stop further testing as fast_dev_run does not save checkpoints - if trainer.fast_dev_run: - return - - # after model training is done, you can load the model from the saved checkpoint - # and evaluate it on a data file or on given queries. 
- logging.info("================================================================================================") - logging.info("Starting the testing of the trained model on test set...") - logging.info("We will load the latest model saved checkpoint from the training...") - - # for evaluation and inference you can load the previously trained model saved in .nemo file - # like this in your code, but we will just reuse the trained model here - # eval_model = MultiLabelIntentSlotClassificationModel.restore_from(restore_path=checkpoint_path) - eval_model = model - - # we will setup testing data reusing the same config (test section) - eval_model.update_data_dir_for_testing(data_dir=cfg.model.data_dir) - eval_model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=eval_model, ckpt_path=None, verbose=False) - logging.info("Testing finished!") - - # Optimize Threshold - eval_model.optimize_threshold(cfg.model.test_ds, 'dev') - - # run an inference on a few examples - logging.info("======================================================================================") - logging.info("Evaluate the model on the given queries...") - - # this will work well if you train the model on ATIS dataset - # for your own dataset change the examples appropriately - queries = [ - 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis', - 'on april first i need a ticket from tacoma to san jose departing before 7 am', - 'how much is the limousine service in boston', - ] - - # We use the optimized threshold for predictions - pred_intents, pred_slots, pred_list = eval_model.predict_from_examples(queries, cfg.model.test_ds) - logging.info('The prediction results of some sample queries with the trained model:') - - for query, intent, slots in zip(queries, pred_intents, pred_slots): - logging.info(f'Query : {query}') - logging.info(f'Predicted Intents: {intent}') - logging.info(f'Predicted Slots: {slots}') - - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/bert_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/bert_pretraining.py deleted file mode 100644 index 75d0a1072e697c4b232ea287d9822cfe25ca525f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/bert_pretraining.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.strategies import DDPStrategy - -from nemo.collections.nlp.models.language_modeling import BERTLMModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="bert_pretraining_from_text_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(strategy=DDPStrategy(find_unused_parameters=True), **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - bert_model = BERTLMModel(cfg.model, trainer=trainer) - trainer.fit(bert_model) - if cfg.model.nemo_path: - bert_model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_preprocessed_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_preprocessed_config.yaml deleted file mode 100644 index 2c44c6c28aaddf937bd7c16931c2073a7e4d61a1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_preprocessed_config.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# BERT Pretraining from Preprocessed (tokenized) data -name: &name PretrainingBERTFromPreprocessed -trainer: - devices: 8 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_steps: 2285714 # precedence over max_epochs - num_sanity_val_steps: 0 # needed for bert pretraining from preproc - use_distributed_sampler: false # needed for bert pretraining from preproc - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - gradient_clip_val: 1.0 - log_every_n_steps: 1 - val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - only_mlm_loss: true # only use masked language model without next sentence prediction - num_tok_classification_layers: 1 # number of token classification head output layers - num_seq_classification_layers: 2 # number of sequence classification head output layers - - - language_model: - pretrained_model_name: bert-base-uncased # huggingface model name - lm_checkpoint: null - config: - attention_probs_dropout_prob: 0.1 - hidden_act: gelu - hidden_dropout_prob: 0.1 - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_hidden_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 - config_file: null # json file, precedence over config - - tokenizer: null - - train_ds: - data_file: ??? # path to hdf5 file (or directory) - max_predictions_per_seq: 80 - batch_size: 16 - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - optim: - name: adamw - lr: 0.4375e-4 - weight_decay: 0.01 - - sched: - name: SquareRootAnnealing - warmup_steps: null - warmup_ratio: 0.01 - min_lr: 0.0 - last_epoch: -1 - - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - - -hydra: - run: - dir: . 
- job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml deleted file mode 100644 index c29fcb3e912df65c420c96741de89e9926ab1163..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# BERT Pretraining from Text -name: &name PretrainingBERTFromText -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - only_mlm_loss: false # only use masked language model without next sentence prediction - num_tok_classification_layers: 1 # number of token classification head output layers - num_seq_classification_layers: 2 # number of sequence classification head output layers - max_seq_length: 128 - # The maximum total input sequence length after tokenization. Sequences longer than this - # will be truncated, and sequences shorter than this will be padded. - mask_prob: 0.15 - # Probability of masking a token in the input text during data processing. - short_seq_prob: 0.1 - # Probability of having a sequence shorter than the maximum sequence length `max_seq_length` in data processing.", - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config: - attention_probs_dropout_prob: 0.1 - hidden_act: gelu - hidden_dropout_prob: 0.1 - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_hidden_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 - config_file: null # json file, precedence over config - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # tokenizer model for sentencepiece - special_tokens: # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - unk_token: '[UNK]' - sep_token: '[SEP]' - pad_token: '[PAD]' - bos_token: '[CLS]' - mask_token: '[MASK]' - eos_token: '[SEP]' - cls_token: '[CLS]' - - train_ds: - data_file: ??? # path to data file - max_seq_length: ${model.max_seq_length} - mask_prob: ${model.mask_prob} - short_seq_prob: ${model.short_seq_prob} - batch_size: 16 # per GPU - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - data_file: ??? 
# path to data file - max_seq_length: ${model.max_seq_length} - mask_prob: ${model.mask_prob} - short_seq_prob: ${model.short_seq_prob} - batch_size: 16 # per GPU - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.0 - - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 - - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bart_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bart_config.yaml deleted file mode 100644 index 02a41a9a7bc044d3b2c8f604c232d3e6c9667f6c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bart_config.yaml +++ /dev/null @@ -1,151 +0,0 @@ -defaults: - - .@model.encoder: megatron_model_base_config - - .@model.decoder: megatron_model_base_config - -name: megatron_bart -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # model parallelism - micro_batch_size: 4 - global_batch_size: 8 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - resume_from_checkpoint: null # manually set the checkpoint file to load from - pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. - - # model architecture - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. 
- grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - seq_length: 512 - max_position_embeddings: ${.seq_length} - - tokenizer: - library: 'megatron' - type: 'BertWordPieceCase' - model: null - vocab_file: null - merge_file: null - num_sentinel_tokens: 0 # expected to be 0 for BART - sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - - # weight init - embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - - # embedding dropout - embedding_dropout: 0.1 - - # embedding sharing - share_token_embeddings: True # If True share encoder/decoder embeddings - share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits - - # token head - tokens_head_bias: True - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-bart_00_text_document,.5,/raid/data/pile/my-bart_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-bart_00_text_document - # - .5 - # - /raid/data/pile/my-bart_01_text_document - data_prefix: ??? - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - # data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant) - # # defaults for text_memmap - # newline_int: 10 # byte-value of newline (Use ord('\n') to get value) - # header_lines: 0 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # # defaults for csv_memmap - # newline_int: 10 # byte-value of newline - # header_lines: 1 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # data_col: 1 # column to use for data - # data_sep: ',' # string to split text into columns - splits_string: 949,45,5 - seq_length: ${model.seq_length} - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - masked_lm_prob: 0.15 - dataset_type: 'bart' - short_seq_prob: 0.0 - max_ngram_size: 10 - mean_ngram_size: null - geometric_dist: True - permutation: False - whole_word_masking: True - favor_longer_ngrams: False - delete_mask_prob: 0.3 - respect_document_boundaries: True # If true, a single training exampl cannot cross document boundaries, increasing the fraction of tokens within a batch. 
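The `data_prefix` comments in the data section above describe a flat list of alternating weights and file prefixes, and `splits_string` carves the blended dataset into train/validation/test. The actual parsing happens inside NeMo's Megatron dataset builders; this is only an illustrative sketch using the example paths from those comments:

```python
# Weighted blend of two preprocessed datasets, as in the commented example above.
data_prefix = [
    0.5, "/raid/data/pile/my-bart_00_text_document",
    0.5, "/raid/data/pile/my-bart_01_text_document",
]
weights, paths = data_prefix[0::2], data_prefix[1::2]
print(list(zip(weights, paths)))

# splits_string "949,45,5" is normalized into train/validation/test fractions.
splits = [float(s) for s in "949,45,5".split(",")]
fractions = [s / sum(splits) for s in splits]
print([round(f, 3) for f in fractions])   # [0.95, 0.045, 0.005]
```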
- - optim: - name: fused_adam - lr: 0.0001 - betas: - - 0.9 - - 0.999 - eps: 1e-8 - weight_decay: 0.01 - sched: - name: WarmupAnnealing - min_lr: 0.00001 - last_epoch: -1 - warmup_ratio: 0.01 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bert_config.yaml deleted file mode 100644 index 5267b34429fd8cb933886da8ca6fc135931ba195..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ /dev/null @@ -1,155 +0,0 @@ -name: megatron_bert -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_bert - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_bert--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - - -model: - # model parallelism - micro_batch_size: 4 - global_batch_size: 8 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null - - # model architecture - encoder_seq_length: 512 - max_position_embeddings: ${.encoder_seq_length} - position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. - num_layers: 12 - hidden_size: 768 - ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. - num_attention_heads: 12 - init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. - layernorm_epsilon: 1e-5 - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. 
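`make_vocab_size_divisible_by: 128` pads the tokenizer vocabulary so the embedding matrix shards evenly and stays well aligned on GPU. The exact padding routine lives in the Megatron/NeMo utilities; the sketch below shows the commonly used form of the computation, and the tensor-parallel factor is an assumption of this sketch:

```python
import math

def padded_vocab_size(vocab_size, make_divisible_by=128, tensor_model_parallel_size=1):
    # Pad so that each tensor-parallel shard of the embedding table has an aligned size.
    multiple = make_divisible_by * tensor_model_parallel_size
    return int(math.ceil(vocab_size / multiple) * multiple)

print(padded_vocab_size(30522))   # 30592 for the 30522-token BERT vocab in these configs
```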
- pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - - tokenizer: - library: 'megatron' - type: 'BertWordPieceLowerCase' - model: null - vocab_file: null - merge_file: null - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: False - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. - gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - ## Activation Checkpointing - # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. - # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - # 'full' will checkpoint the entire transformer layer. - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - num_micro_batches_with_partial_activation_checkpoints: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed - # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is - # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint - # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. - # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later - # pipeline stages. 
For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than - # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage - # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', - # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. - sequence_parallel: False - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-gpt3_00_text_document - # - .5 - # - /raid/data/pile/my-gpt3_01_text_document - data_prefix: ??? - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic, LDDL - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - masked_lm_prob: 0.15 # Probability of replacing a token with mask. - short_seq_prob: 0.1 # Probability of producing a short sequence. - - optim: - name: fused_adam - lr: 2e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 2e-5 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml deleted file mode 100644 index fdad93d14adfa385e0cf6de5347e1f53bc2dea94..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ /dev/null @@ -1,243 +0,0 @@ -defaults: - - _self_ - - optional tp_overlap@model.ub_tp_comm_overlap_cfg: - -name: megatron_gpt -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models - gradient_clip_val: 1.0 - benchmark: False - enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_gpt - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - resume_from_checkpoint: ${model.resume_from_checkpoint} - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # use GPTModel from megatron.core - mcore_gpt: False - - # specify micro_batch_size, global_batch_size, and model parallelism - # gradient accumulation will be done automatically based on data_parallel_size - micro_batch_size: 4 # limited by GPU memory - global_batch_size: 8 # will use more micro batches to reach global batch size - rampup_batch_size: null # Should be a list of 3 values: [, , ] - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - virtual_pipeline_model_parallel_size: null # interleaved pipeline - - # model architecture - encoder_seq_length: 512 - max_position_embeddings: ${.encoder_seq_length} - num_layers: 12 - hidden_size: 768 - ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. - num_attention_heads: 12 - init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - use_scaled_init_method: True # use scaled residuals initialization - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - attention_dropout: 0.1 # Dropout probability for attention - ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. - normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' - layernorm_epsilon: 1e-5 - do_layer_norm_weight_decay: False # True means weight decay on all params - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - persist_layer_norm: True # Use of persistent fused layer norm kernel. - bias: True # Whether to use bias terms in all weight matrices. - activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] - headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
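The comments above tie `micro_batch_size`, `global_batch_size`, and the parallelism sizes together; gradient accumulation is derived automatically so that the global batch size is reached. A hedged arithmetic sketch with this config's default values:

```python
devices, num_nodes = 1, 1
tensor_model_parallel_size, pipeline_model_parallel_size = 1, 1
micro_batch_size, global_batch_size = 4, 8

world_size = devices * num_nodes
data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size)

# Micro-batches accumulated per optimizer step so that the global batch size is reached.
accumulation_steps = global_batch_size // (micro_batch_size * data_parallel_size)
print(data_parallel_size, accumulation_steps)   # 1 2

# Every optimizer step therefore consumes global_batch_size samples.
global_step = 100
consumed_samples = global_step * global_batch_size
print(consumed_samples)                         # 800
```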
- transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] - openai_gelu: False # Use OpenAI's GELU instead of the default GeLU - normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. - position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. - rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. - attention_type: 'multihead' # Attention type. Options ['multihead'] - share_embeddings_and_output_weights: True # Share embedding and output layer weights. - overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 - batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 - seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. - num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - - # Mixed precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - hysteresis: 2 # Gradient scale hysteresis - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters - grad_allreduce_chunk_size_mb: 125 - - # Fusion - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. - gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. - bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. - bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. - masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. - get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. - - - # Miscellaneous - seed: 1234 - resume_from_checkpoint: null # manually set the checkpoint file to load from - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages - - ## Activation Checkpointing - # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. - # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - # 'full' will checkpoint the entire transformer layer. - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - num_micro_batches_with_partial_activation_checkpoints: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed - # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of the window is - # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint - # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. - # This feature enables using activation checkpointing for a fraction of micro-batches up to the point of full GPU memory usage. - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later - # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 checkpoint 3 layers less than - # stage 0 and stage 2 checkpoint 6 layers less than stage 0, and so on. This is possible because later pipeline stages - # use less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', - # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
- sequence_parallel: False - - ## Transformer Engine - transformer_engine: False - fp8: False # enables fp8 in TransformerLayer forward - fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 - fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID - fp8_margin: 0 # scaling margin - fp8_interval: 1 # scaling update interval - fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor - fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history - reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration - use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. - ub_tp_comm_overlap: False - # Use userbuffer backend to overlap tensor-parallel communications with computes. - # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. - ub_tp_comm_overlap_cfg: null - # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, - # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. - # If the configuration file is not provided a default setting is used for all communicators. - - ## Flash Attention - use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True - - data: - # Path to data must be specified by the user. - # Supports List, String and Dictionary - # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-gpt3_00_text_document - # - .5 - # - /raid/data/pile/my-gpt3_01_text_document - # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} - # Or see example below: - # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} - skip_warmup: True - num_workers: 2 - dataloader_type: single # cyclic - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: True # Set to false if the last partial validation samples is to be consumed - no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size - shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled - exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem - - # Nsys profiling options - nsys_profile: - enabled: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - - optim: - name: fused_adam - lr: 2e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 2e-5 - - gc_interval: 0 - # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. - # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml deleted file mode 100644 index 24d0c1548e69b973c4846a7479def920e8638a11..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ /dev/null @@ -1,25 +0,0 @@ -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: bf16 # 16, 32, or bf16 - -model_type: gpt -tensor_model_parallel_size: 1 -pipeline_model_parallel_size: 1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -gpt_model_file: null # GPT nemo file path -onnx_model_file: null # ONNX file path -checkpoint_dir: null # Checkpoint directory -checkpoint_name: null # Checkpoint name -hparams_file: null # hparams filepath - -export_options: - runtime_check: False - verbose: False - onnx_opset: 17 - do_constant_folding: True - cache_support: False - device: 'cuda' - check_tolerance: 0.01 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml deleted file mode 100644 index 2570251bcdee2df395000541d1e8f2adca235591..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ /dev/null @@ -1,95 +0,0 @@ -inference: - greedy: False # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
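The `inference` block above exposes the usual sampling controls (`greedy`, `top_k`, `top_p`, `temperature`). The dependency-free sketch below shows how such filtering is commonly applied to a next-token distribution; it is an illustration of the knobs only, not NeMo's generation code, and the logits are made up.

```python
import math
import random

def sample_next_token(logits, greedy=False, top_k=0, top_p=0.9, temperature=1.0):
    """Toy illustration of the sampling controls above; not NeMo's implementation."""
    if greedy:
        return max(range(len(logits)), key=lambda i: logits[i])

    # temperature scaling, then softmax
    scaled = [l / temperature for l in logits]
    m = max(scaled)
    exps = [math.exp(l - m) for l in scaled]
    total = sum(exps)
    probs = [e / total for e in exps]

    candidates = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)
    if top_k > 0:
        candidates = candidates[:top_k]          # keep only the k most likely tokens
    if top_p < 1.0:
        kept, cumulative = [], 0.0
        for i in candidates:
            kept.append(i)
            cumulative += probs[i]
            if cumulative >= top_p:              # stop once the nucleus covers top_p mass
                break
        candidates = kept

    weights = [probs[i] for i in candidates]
    return random.choices(candidates, weights=weights, k=1)[0]

print(sample_next_token([2.0, 1.0, 0.5, -1.0], top_k=0, top_p=0.9, temperature=1.0))
```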
- compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - use_distributed_sampler: False - - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory -gpt_model_file: null # GPT nemo file path -checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -prompts: # prompts for GPT inference - - "Q: How are you?" - - "Q: How big is the universe?" -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: False # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server -chat: False # use the chat interface -chatbot_config: - value: False # whether to inject the value attributes - attributes: - - name: Quality - min: 0 - max: 4 - key: quality - type: int - default: 4 - - name: Toxicity - min: 0 - max: 4 - key: toxcity - type: int - default: 0 - - name: Humor - min: 0 - max: 4 - key: humor - type: int - default: 0 - - name: Creativity - min: 0 - max: 4 - key: creativity - type: int - default: 0 - - name: Violence - min: 0 - max: 4 - key: violence - type: int - default: 0 - - name: Helpfulness - min: 0 - max: 4 - key: helpfulness - type: int - default: 4 - - name: Not_Appropriate - min: 0 - max: 4 - key: not_appropriate - type: int - default: 0 - - name: Language - choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] - key: lang - type: list - default: en - - user: User - assistant: Assistant - system: "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml deleted file mode 100644 index a1c5f774cd112705aa273611bc2a2e4c6fa3dbcf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml +++ /dev/null @@ -1,173 +0,0 @@ -name: megatron_virtual_prompt_gpt - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 3 # min 25 recommended - max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 1.0 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - benchmark: False - - - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 2 - mode: min - save_nemo_on_train_end: True - filename: 'megatron_gpt_prompt_tune--{val_loss:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - create_early_stopping_callback: True - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - - -model: - seed: 1234 - nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference' - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - global_batch_size: 8 - micro_batch_size: 4 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - report_validation_metric: False - validation_metric: 'accuracy' - - restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the GPT language model .nemo file, always required - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - existing_tasks: ['boolq', 'intent_and_slot'] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given - new_tasks: ['rte'] # List of new tasknames to be prompt-tuned - - - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - - task_templates: # Add more/replace tasks as needed, these are just examples - - taskname: "boolq" # The task name - prompt_template: "<|VIRTUAL_PROMPT_0|> Passage: {passage} <|VIRTUAL_PROMPT_1|> \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> - total_virtual_tokens: 30 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. - virtual_token_splits: [20, 10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens - truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. - answer_only_loss: True - answer_field: "answer" - - - taskname: "intent_and_slot" - prompt_template: "<|VIRTUAL_PROMPT_0|> intent options: {intent_options} <|VIRTUAL_PROMPT_1|> slot options: {slot_options} <|VIRTUAL_PROMPT_2|> {utterance} \nintent: {intent} \nslot: {slot}" - total_virtual_tokens: 30 - answer_only_loss: False - virtual_token_splits: [15, 10, 5] - truncate_field: null - - - taskname: "rte" - prompt_template: "<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}" - total_virtual_tokens: 9 - virtual_token_splits: [9] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "squad" - prompt_template: "<|VIRTUAL_PROMPT_0|> context: {context} question: {question} answer: {answer}" - total_virtual_tokens: 10 - virtual_token_splits: [10] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "taskname" - prompt_template: "<|VIRTUAL_PROMPT_0|> {prompt} {completion}" - total_virtual_tokens: 100 - virtual_token_splits: [100] - truncate_field: null - answer_only_loss: True - answer_field: "completion" - - prompt_tuning: # Prompt tunin specific params - new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks - new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random - - p_tuning: # P-tuning specific params - encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp'] - dropout: 0.0 - num_layers: 2 # number of layers for MLP or LSTM layers. Note, it has no effect for tpmlp currently as it always assumes it is two layers. 
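Each entry in `task_templates` above pairs a `prompt_template` containing `<|VIRTUAL_PROMPT_#|>` markers with `virtual_token_splits` that must sum to `total_virtual_tokens`. The sketch below only illustrates that bookkeeping, using a hypothetical placeholder string for each virtual token; the real model inserts learned embeddings at those positions rather than text.

```python
import re

def build_prompt(prompt_template: str, virtual_token_splits: list,
                 total_virtual_tokens: int, fields: dict) -> str:
    """Rough sketch: expand each <|VIRTUAL_PROMPT_n|> marker into the configured number of
    placeholder virtual tokens and fill the {field} slots. Not the NeMo implementation."""
    assert sum(virtual_token_splits) == total_virtual_tokens, \
        "virtual_token_splits must add up to total_virtual_tokens"

    def expand(match):
        idx = int(match.group(1))
        # hypothetical placeholder text standing in for one learned virtual token
        return " ".join(f"<prompt_{idx}_{j}>" for j in range(virtual_token_splits[idx]))

    prompt = re.sub(r"<\|VIRTUAL_PROMPT_(\d+)\|>", expand, prompt_template)
    return prompt.format(**fields)

# The 'rte' template above: 9 virtual tokens in a single block before the premise.
print(build_prompt("<|VIRTUAL_PROMPT_0|>{premise}\n{hypothesis}\nAnswer: {answer}",
                   [9], 9,
                   {"premise": "It is raining.",
                    "hypothesis": "The ground is wet.",
                    "answer": "entailment"}))
```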
- encoder_hidden: 2048 # encoder hidden for biglstm and tpmlp - init_std: 0.023 # init std for tpmlp layers - - data: - train_ds: [data/rte_train.jsonl,] - validation_ds: [data/rte_val.jsonl,] - add_eos: True - shuffle: True - num_workers: 8 - pin_memory: True - train_cache_data_path: null # the path to the train cache data - validation_cache_data_path: null # the path to the validation cache data - test_cache_data_path: null # the path to the test cache data - load_cache: False # whether to load from the cache data - max_seq_length: 1024 # filter out training and validation examples longer than 1024 tokens. Set to None will default to model's encoder length. - min_seq_length: 1 # filter out training and validation examples less than 1 token long. - - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false - - # required for reporting validation metrics - inference: - greedy: False - top_k: 0 - top_p: 0.9 - temperature: 1.0 - tokens_to_generate: 30 - repetition_penalty: 1.2 - min_tokens_to_generate: 0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml deleted file mode 100644 index 33ca3f06ddfe42d014a318c59464dfd3e3e5f1a7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml +++ /dev/null @@ -1,30 +0,0 @@ -inference: - greedy: False # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - batch_size: 1 - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -gpt_model_file: null # GPT nemo file path -virtual_prompt_model_file: ??? # path to a MegatronGPTPromptLearningModel model if you want to use soft prompts -pred_file_path: ??? # Path will model predictions will be written -max_seq_length: 8192 # this will filter out inputs whose length is longer than the set value form the generation process. 
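The `optim.sched` blocks in these configs describe a warmup phase, a cosine decay toward `min_lr`, and an optional constant tail controlled by `constant_steps`. The function below is only a sketch of that schedule's shape, to make the three knobs concrete; the exact behaviour is defined by NeMo's `CosineAnnealing` scheduler and may differ in details.

```python
import math

def cosine_annealing_lr(step: int, max_steps: int, lr: float = 1e-4,
                        warmup_steps: int = 50, constant_steps: int = 0,
                        min_lr: float = 0.0) -> float:
    """Sketch of the warmup -> cosine decay -> constant-min_lr shape configured above."""
    if step < warmup_steps:
        return lr * (step + 1) / warmup_steps            # linear warmup
    decay_end = max_steps - constant_steps               # end of the cosine decay region
    if step >= decay_end:
        return min_lr                                    # constant tail at min_lr
    progress = (step - warmup_steps) / max(1, decay_end - warmup_steps)
    return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(math.pi * progress))

# peak lr right after warmup, ~min_lr at the end of the decay region
print(cosine_annealing_lr(50, max_steps=1000), cosine_annealing_lr(999, max_steps=1000))
```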
-data_paths: # paths to .jsonl files you want to perform inference on -num_workers: 8 - \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml deleted file mode 100644 index 39b0c7ed2176cd87f238bf0fbedee2ae36b428f0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_gpt_validate_config.yaml +++ /dev/null @@ -1,22 +0,0 @@ -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - log_every_n_steps: 1 - limit_val_batches: 10 - limit_test_batches: 50 - max_steps: 100 # needed to setup dataloaders - max_epochs: null - replace_sampler_ddp: False - -tensor_model_parallel_size: ??? # should be set the same as the pretrained model that is being restored from -pipeline_model_parallel_size: ??? # should be set the same as the pretrained model that is being restored from -micro_batch_size: null # limited by GPU memory, defaults to pretrained model config -global_batch_size: null # will use more micro batches to reach global batch size, defaults to pretrained model config -virtual_pipeline_model_parallel_size: null -gpt_model_file: null # GPT nemo file path -checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_hiddens_base_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_hiddens_base_config.yaml deleted file mode 100644 index 560b966a9c087943461ab95b59bf00395f4c5e17..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_hiddens_base_config.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# this file main purpose is documentation, and it should not be used directly -enc_output_name: z # name of key in hidden transforms output to pass to decoder (default: hiddens). e.g., z for VAE/MIM. -tokens_loss_weight: 1.0 # weight of tokens loss (if not specified defaults to 1.0) -# the lists below are useful for adding multiple transforms and losses according to order -# if order is not important, you can use a single dictionary in the list with multiple keys -transform: # a list of dictionaries of transforms (or a joint dictionary) to apply to hiddens (list enforces order) - # - : # name of transform - # cls_name: # class name - # : # transform parameters - # ... - - q_z_given_x: # Gaussian posterior with reparameterization - cls_name: cond_gaussian # class name - hidden_size: 512 # hidden size of the encoder - min_logvar: -6.0 # minimum log variance - - logP_cls: # logP classifier logits - cls_name: guided_cls - input_name: hiddens - attr_name: logP - QED_cls: # QED classifier logits - cls_name: guided_cls - input_name: hiddens - attr_name: QED -loss: # a list of dictionaries of loss terms (or a joint dictionary) to add to reconstruction loss (list enforces order) - # - : # name of loss - # cls_name: # class name - # : # loss parameters - # ... 
- # below is example where order of losses does not matter so a single dictionary is enough - mim: # A-MIM example - cls_name: a_mim - loss_weight: 1.0 # weight of the MIM latent loss - vae: # VAE example - cls_name: vae - min_kl_value: null # minimum KL value if a float is provided - loss_weight: 1e-2 # weight of KL term in loss - logP_cls: # logP classifier loss (cross entropy) - cls_name: guided_cls_loss - input_name: logP - loss_weight: 0.1 - QED_cls: # QED classifier loss (cross entropy) - cls_name: guided_cls_loss - input_name: logP - loss_weight: 0.1 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_config.yaml deleted file mode 100644 index 62589b12a9999b30d0b8b28d31045807a86627aa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_config.yaml +++ /dev/null @@ -1,219 +0,0 @@ -name: megatron_llama -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 32 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models - gradient_clip_val: 1.0 - benchmark: False - enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_llama - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - mcore_gpt: True - # specify micro_batch_size, global_batch_size, and model parallelism - # gradient accumulation will be done automatically based on data_parallel_size - micro_batch_size: 4 # limited by GPU memory - global_batch_size: 8 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - virtual_pipeline_model_parallel_size: null # interleaved pipeline - - # model architecture - encoder_seq_length: 4096 - max_position_embeddings: ${.encoder_seq_length} - num_layers: 32 # 7b: 32 | 13b: 40 | 70b: 80 - hidden_size: 4096 # 7b: 4096 | 13b: 5120 | 70b: 8192 - ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. 
| 7b: 11008 | 13b: 13824 | 70b: 28672 - num_attention_heads: 32 # 7b: 32 | 13b: 40 | 70b: 64 - init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - use_scaled_init_method: True # use scaled residuals initialization - hidden_dropout: 0.0 # Dropout probability for hidden state transformer. - attention_dropout: 0.0 # Dropout probability for attention - ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. - normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' - layernorm_epsilon: 1e-5 - do_layer_norm_weight_decay: False # True means weight decay on all params - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - persist_layer_norm: True # Use of persistent fused layer norm kernel. - bias: False # Whether to use bias terms in all weight matrices. - activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] - headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. - transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] - openai_gelu: False # Use OpenAI's GELU instead of the default GeLU - normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. - position_embedding_type: 'rope' # Position embedding type. Options ['learned_absolute', 'rope'] - rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. - attention_type: 'multihead' # Attention type. Options ['multihead'] - share_embeddings_and_output_weights: False # Share embedding and output layer weights. - overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 - batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 - num_query_groups: 32 # Number of query groups for group query attention. If None, normal attention is used. | 7b: 32 | 13b: 40 | 70b: 8 - - tokenizer: - library: 'sentencepiece' - type: null - model: ??? # /path/to/tokenizer.model - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. 
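The tokenizer block above switches `library` to `sentencepiece` and expects a `tokenizer.model` path. A quick sanity check of such a model file can look something like the following; the path is a placeholder and the `sentencepiece` package is assumed to be installed.

```python
# Sanity-check sketch for a sentencepiece tokenizer.model (path below is a placeholder).
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="/path/to/tokenizer.model")
print("vocab size:", sp.get_piece_size())

ids = sp.encode("Hello, world!", out_type=int)      # token ids
pieces = sp.encode("Hello, world!", out_type=str)   # subword pieces
print(ids, pieces)
print(sp.decode(ids))
```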
- - # Mixed precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - hysteresis: 2 # Gradient scale hysteresis - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters - grad_allreduce_chunk_size_mb: 125 - - # Fusion - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. - gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. - bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. - bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. - masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. - get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. - - - # Miscellaneous - seed: 1234 - resume_from_checkpoint: null # manually set the checkpoint file to load from - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages - - ## Activation Checkpointing - # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. - # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - # 'full' will checkpoint the entire transformer layer. - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - num_micro_batches_with_partial_activation_checkpoints: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed - # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. 
The size of window is - # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint - # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. - # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. - # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later - # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than - # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage - # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', - # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - sequence_parallel: False - - ## Transformer Engine - transformer_engine: True - fp8: False # enables fp8 in TransformerLayer forward - fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 - fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID - fp8_margin: 0 # scaling margin - fp8_interval: 1 # scaling update interval - fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor - fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history - reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration - use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. - - data: - # Path to data must be specified by the user. - # Supports List, String and Dictionary - # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-gpt3_00_text_document - # - .5 - # - /raid/data/pile/my-gpt3_01_text_document - # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} - # Or see example below: - # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - # data_prefix: ??? 
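The comment block above lists the accepted shapes for `model.data.data_prefix` (weighted list, plain string, or per-split dictionary) and the matching CLI overrides. The OmegaConf sketch below only demonstrates how the weighted-list override from that comment merges into a config fragment; the dataset paths are placeholders.

```python
from omegaconf import OmegaConf

# Base fragment with the weighted-list form of data_prefix (paths are placeholders).
base = OmegaConf.create(
    {"model": {"data": {"data_prefix": [0.5, "/raid/data/pile/my-gpt3_00_text_document",
                                        0.5, "/raid/data/pile/my-gpt3_01_text_document"]}}}
)

# The same key overridden in CLI/dotlist form, as shown in the comment above.
override = OmegaConf.from_dotlist(
    ["model.data.data_prefix=[0.3,/raid/data/pile/my-gpt3_00_text_document,"
     "0.7,/raid/data/pile/my-gpt3_01_text_document]"]
)

merged = OmegaConf.merge(base, override)
print(OmegaConf.to_yaml(merged))
```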
- index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} - skip_warmup: True - num_workers: 2 - dataloader_type: single # cyclic - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - validation_drop_last: True # Set to false if the last partial validation samples is to be consumed - no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token - pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size - shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled - - # Nsys profiling options - nsys_profile: - enabled: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - - optim: - name: fused_adam - lr: 2e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 2e-5 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_inference.yaml deleted file mode 100644 index e508b01858f52268beea956ec38a10f3d4142860..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_llama_inference.yaml +++ /dev/null @@ -1,39 +0,0 @@ -inference: - greedy: False # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - end_strings: [""] # generation will stop when one of these tokens is generated - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 32 # 16, 32, or bf16 - use_distributed_sampler: False - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory -gpt_model_file: null # GPT nemo file path -checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -prompts: # prompts for GPT inference - - "Q: How are you?" - - "Q: How big is the universe?" -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: False # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml deleted file mode 100644 index 4da8177685a1f2dc8a63736fc7a361f0a552f6b6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml +++ /dev/null @@ -1,40 +0,0 @@ -num_layers: 12 # For perceiver models, this is the number of cross-attention blocks. Each layer has 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers. -hidden_size: 768 -ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. -num_attention_heads: 12 -init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') -hidden_dropout: 0.1 # Dropout probability for hidden state transformer. -attention_dropout: 0.1 # Dropout probability in the attention layer. -ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. -position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative', 'alibi', 'kerple'] -relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias -relative_attention_max_distance: 128 # max_distance to keep relative distance in the attention_num_buckets. -relative_position_bias_self_attention_only: True # whether to only use relative position bias for self attention only. -kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null -apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. -layernorm_epsilon: 1e-5 -persist_layer_norm: True # Use of persistent fused layer norm kernel. -bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. -grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce -masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. -bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. -bias: True # Whether to use bias terms in all weight matrices. -normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' -arch: 'transformer' # Options: ['transformer', 'perceiver'] -activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] -headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. 
-transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] -hidden_steps: 32 # Number of latent vectors to use for pereceiver encoders -num_self_attention_per_cross_attention: 1 # Number of self-attention layers for every cross-attention layer. -openai_gelu: False # Use OpenAI's GELU instead of the default GeLU -onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. -fp32_residual_connection: False # Use FP32 for residual connections. -activations_checkpoint_method: null # 'uniform', 'block' -activations_checkpoint_num_layers: 1 -activations_checkpoint_granularity: null -megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF. -normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. -num_moe_experts: 1 # When >1, FFNs are changed to MoE layers -moe_frequency: 1 # every Nth ffn layer will be made MoE -moe_dropout: 0.0 # Dropout value for MoE layers -use_flash_attention: false # Use flash attention in self-attention module \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_config.yaml deleted file mode 100644 index dafdcf542f11cb22171fd4e1e7b426627ef11db9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_config.yaml +++ /dev/null @@ -1,127 +0,0 @@ -defaults: - - .@model: megatron_model_base_config - -name: test_retro -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: null - limit_test_batches: null - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_retro - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - - -model: - version: 1 # indicate the retro model version - - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. 
not supporting pipeline parallel yet - - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - dump_debug_info: False # dump out the debug information - dump_debug_info_to_file: False # dump out the debug information to files - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 4 # total number of encoder layers - dec_num_layers: 6 # total number of decoder layers - enc_cross_attention: [3] # layer numbers for cross attention in encoder - dec_cross_attention: [3, 5] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - grad_allreduce_chunk_size_mb: 125 - - megatron_lm_compatible: False # a flag to indicate whether the model is compatible with Megatron LM - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-gpt3_00_text_document - # - .5 - # - /raid/data/pile/my-gpt3_01_text_document - data_prefix: ??? # list of training datasets - knn_index: ??? # list of KNN map index files - retrieval_prefix: ??? # a singe path to retrieval data - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: retmmap # for retro model, this is the only allowed type - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - neighbors: 2 # number of retrieved neighbors - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 1e-5 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_finetune_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_finetune_config.yaml deleted file mode 100644 index 7fa5c350614be08358ff4748ae0130de17efd2a1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_finetune_config.yaml +++ /dev/null @@ -1,105 +0,0 @@ -name: fine_tune_retro - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. 
In practice we don't usually train for more than 1 epoch. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: null - limit_test_batches: null - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_retro - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - - -model: - # model parallelism - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet - - micro_batch_size: 4 - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - - restore_path: null # the retro model restore path - - data: - train_ds: - file_name: ??? # train data file path - answer_only_loss: True # whether use answer only loss - seq_length: 128 # must be multiple of the chunk_size in your dataset - add_bos: True # whether to add bos at the beginning - add_eos: True # whether to add eos at the end - seed: 1234 - neighbors: 20 # number of retrieved neighbors - val_ds: - file_name: ??? # train data file path - answer_only_loss: True # whether use answer only loss - seq_length: 128 # must be multiple of the chunk_size in your dataset - add_bos: True # whether to add bos at the beginning - add_eos: True # whether to add eos at the end - seed: 1234 - neighbors: 20 # number of retrieved neighbors - test_ds: - file_name: ??? 
# train data file path - answer_only_loss: True # whether use answer only loss - seq_length: 128 # must be multiple of the chunk_size in your dataset - add_bos: True # whether to add bos at the beginning - add_eos: True # whether to add eos at the end - seed: 1234 - neighbors: 20 # number of retrieved neighbors - - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 1e-5 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml deleted file mode 100644 index 1b99a65f46ad9b26a4540498e13af70972d5c380..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml +++ /dev/null @@ -1,44 +0,0 @@ -inference: - greedy: False # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -inference_batch_size: 2 -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -retro_model_file: null # RETRO nemo file path - -use_predict_method: False # whether to use the predict method - -prompts: # prompts for RETRO model inference - - "hello," - - "good morning," - - "good afternoon," - - "good evening," - -########### Faiss service parameters ######## -retrieval_service: - strategy: RetroModelTextGenerationStrategy # choose customized inference strategy - neighbors: 4 - frequent_query: False # for the current token generation, frequently update the retrieval context. 
If false, update it every 64 tokens - pad_tokens: True # pad the tokens at the beginning to make it minimum of 64 tokens for retrieving at least once - store_retrieved: False # whether store the retrieved documents, so it can be checked - combo_service: - service_ip: '0.0.0.0' - service_port: 17181 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml deleted file mode 100644 index 80103891f22c14e2adf60b60c1e2e471ff963124..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml +++ /dev/null @@ -1,224 +0,0 @@ -defaults: - - .@base_model: megatron_model_base_config - - .@delta_model: megatron_model_base_config - - .@model: megatron_model_base_config - -name: mu_transfer_retro -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: null - limit_test_batches: null - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_retro - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -base_model: - version: 1 # indicate the retro model version - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet - - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 2 # total number of encoder layers - dec_num_layers: 12 # total number of decoder layers - enc_cross_attention: [0] # layer numbers for cross attention in encoder - dec_cross_attention: [5, 8, 11] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. 
- grad_allreduce_chunk_size_mb: 125 - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - -delta_model: - version: 1 # indicate the retro model version - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet - - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 2 # total number of encoder layers - dec_num_layers: 12 # total number of decoder layers - enc_cross_attention: [0] # layer numbers for cross attention in encoder - dec_cross_attention: [5, 8, 11] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - grad_allreduce_chunk_size_mb: 125 - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - # miscellaneous - seed: 1234 - -model: - version: 1 # indicate the retro model version - shape_file: null # the path to the shape file - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet - - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - dump_debug_info: False # dump out the debug information - dump_debug_info_to_file: False # dump out the debug information to files - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 2 # total number of encoder layers - dec_num_layers: 12 # total number of decoder layers - enc_cross_attention: [0] # layer numbers for cross attention in encoder - dec_cross_attention: [5, 8, 11] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - bert_binary_head: True # BERT binary head - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. 
- grad_allreduce_chunk_size_mb: 125 - - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. - # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - delimiter: null # only used for tabular tokenizer - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-gpt3_00_text_document - # - .5 - # - /raid/data/pile/my-gpt3_01_text_document - data_prefix: ??? # list of training datasets - knn_index: ??? # list of KNN map index files - retrieval_prefix: ??? # a singe path to retrieval data - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: retmmap # for retro model, this is the only allowed type - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - neighbors: 2 # number of retrieved neighbors - - optim: - name: muadamw - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 1e-5 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t0_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t0_config.yaml deleted file mode 100644 index 0c76cd7175f4d354cedae0695c7679ab8f7dd0cf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t0_config.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: megatron_t0 - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 300 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t0 - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 10 - mode: max - always_save_nemo: False # TODO: add support - filename: 'megatron_t0--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 0 - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - megatron_amp_O2: False # Enable O2 optimization for megatron amp - resume_from_checkpoint: null - hidden_dropout: 0.1 # Override dropout prob from pretraining - attention_dropout: 0.1 # Override attention dropout prob from pretraining - - data: - train_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: 128 - micro_batch_size: 16 - shuffle: True - num_workers: 8 - pin_memory: True - max_src_seq_length: 512 - max_tgt_seq_length: 512 - drop_last: True - concat_sampling_probabilities: ??? # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: False # Adds bos to the input sequence. - add_eos_to_input: False # Adds eos to the input sequence. - seed: 1234 - - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: 16 - micro_batch_size: 16 - shuffle: False - num_workers: 0 - pin_memory: True - max_src_seq_length: 512 - max_tgt_seq_length: 512 - drop_last: False # TODO: Figure out if there is a way to avoid dropping last. - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
- num_classes: null - replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} - add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.add_eos_to_input} - seed: 1234 - - optim: - name: fused_adam - lr: 5e-6 - weight_decay: 0.0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config.yaml deleted file mode 100644 index e51cfff420a37f53df641ab77a82dc4c17006f98..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ /dev/null @@ -1,155 +0,0 @@ -defaults: - - .@model.encoder: megatron_model_base_config - - .@model.decoder: megatron_model_base_config - -name: megatron_t5 -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # model parallelism - micro_batch_size: 4 - global_batch_size: 8 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - resume_from_checkpoint: null # manually set the checkpoint file to load from - pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. - - # model architecture - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - seq_length: 512 - max_position_embeddings: ${.seq_length} - - tokenizer: - library: 'megatron' - type: 'BertWordPieceCase' - model: null - vocab_file: null - merge_file: null - num_sentinel_tokens: 100 - sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers. 
- - # weight init - embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - - # embedding dropout - embedding_dropout: 0.1 - - # embedding sharing - share_token_embeddings: True # If True share encoder/decoder embeddings - share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits - - # token head - tokens_head_bias: True - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - - data: - # Path to data must be specified by the user. - # Supports List, String and Dictionary - # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-t5_00_text_document - # - .5 - # - /raid/data/pile/my-t5_01_text_document - # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} - # Or see example below: - # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap # mmap, retmmap, text_mmap, csv_mmap - # data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant) - # # defaults for text_memmap - # newline_int: 10 # byte-value of newline (Use ord('\n') to get value) - # header_lines: 0 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # # defaults for csv_memmap - # newline_int: 10 # byte-value of newline - # header_lines: 1 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # data_col: 1 # column to use for data - # data_sep: ',' # string to split text into columns - splits_string: 949,45,5 - seq_length: ${model.seq_length} - seq_length_dec: 128 - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - masked_lm_prob: 0.15 - dataset_type: 't5' - short_seq_prob: 0.0 - max_ngram_size: 10 - mean_ngram_size: null - geometric_dist: True - permutation: False - whole_word_masking: True - favor_longer_ngrams: False - respect_document_boundaries: True # If true, a single training exampl cannot cross document boundaries, increasing the fraction of tokens within a batch. 
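The commented examples above describe `model.data.data_prefix` as a weighted blend of preprocessed documents, passed either as a list, a string, or a dictionary, and overridable from the CLI. As a rough illustration only (paths are placeholders, and this is not taken from NeMo's own config-loading code), the same 50/50 blend could be assembled or overridden with OmegaConf before launching training:

```python
# Hypothetical sketch: building the weighted data_prefix blend from the
# commented example above with OmegaConf. Paths are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {"model": {"data": {
        "data_prefix": [
            0.5, "/raid/data/pile/my-t5_00_text_document",
            0.5, "/raid/data/pile/my-t5_01_text_document",
        ],
        "splits_string": "949,45,5",
    }}}
)

# The same blend expressed as a CLI-style dotlist override.
override = OmegaConf.from_dotlist(
    ["model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,"
     ".5,/raid/data/pile/my-t5_01_text_document]"]
)
print(OmegaConf.to_yaml(OmegaConf.merge(cfg, override)))
```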
- - optim: - name: fused_adam - lr: 0.0001 - betas: - - 0.9 - - 0.999 - eps: 1e-8 - weight_decay: 0.01 - sched: - name: WarmupAnnealing - min_lr: 0.00001 - last_epoch: -1 - warmup_ratio: 0.01 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml deleted file mode 100644 index 40a061bc2822e57e1e0386a17a2460036a52f0be..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: megatron_t5_finetune_eval - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_finetune_eval - create_checkpoint_callback: False - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - megatron_amp_O2: False # Enable O2 optimization for megatron amp - - data: - validation_ds: - src_file_name: null # Path to the txt file corresponding to the source data. - tgt_file_name: null # Path to the txt file corresponding to the target data. - names: null # If src/tgt file names are ListConfigs, the corresponding label is used to log metrics. - global_batch_size: 64 - micro_batch_size: 64 - shuffle: False - num_workers: 0 - pin_memory: True - max_src_seq_length: 512 - max_tgt_seq_length: 128 - drop_last: False # TODO: Figure out if there is a way to avoid dropping last. - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: False # Adds bos to the input sequence. - add_eos_to_input: False # Adds eos to the input sequence. - - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: micro # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null # Number of classes for the metric. Works only for 'F1', 'accuracy' and 'average_precision' etc. Refer to torchmetrics for metrics where this is supported. - class_labels: null # If the targets in your dataset are strings and not integers/float, you need to provide a list of class labels (size = num_classes) so we can convert from strings to integer categories to compute the metric. - labels_are_strings: True # NOTE: This is only required to properly handle metrics like f1, accuracy, average_precision etc. This does not affect extract_string_match. 
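The `class_labels` / `labels_are_strings` notes in the metric block above describe converting string targets into integer categories before a torchmetrics metric such as accuracy or F1 can be computed. A minimal, self-contained sketch of that conversion follows; the label set and predictions are invented for illustration and are not part of the config or of NeMo's implementation:

```python
# Illustrative only: map string labels to integer ids, then score with torchmetrics.
import torch
import torchmetrics

class_labels = ["entailment", "neutral", "contradiction"]  # len(class_labels) == num_classes
label_to_id = {label: i for i, label in enumerate(class_labels)}

predictions = ["entailment", "neutral", "entailment"]      # model outputs (strings)
targets = ["entailment", "contradiction", "entailment"]    # ground truth (strings)

# Real evaluation code would also have to decide what to do with predictions
# that fall outside the known label set; here all predictions are in-vocabulary.
pred_ids = torch.tensor([label_to_id[p] for p in predictions])
tgt_ids = torch.tensor([label_to_id[t] for t in targets])

# `average: micro` in the metric block corresponds to average="micro" here.
accuracy = torchmetrics.Accuracy(
    task="multiclass", num_classes=len(class_labels), average="micro"
)
print(accuracy(pred_ids, tgt_ids))  # tensor(0.6667): 2 of 3 predictions match
```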
diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml deleted file mode 100644 index 2c8994c063261fedfa1d5ed1756548371f9905e5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml +++ /dev/null @@ -1,50 +0,0 @@ -name: megatron_t5_glue_eval - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_glue_eval - create_checkpoint_callback: False - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - megatron_amp_O2: False # Enable O2 optimization for megatron amp - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 0 - - data: - validation_ds: - task_name: 'mnli' - file_path: ??? # Path to the TSV file for MNLI train ex: '/raid/Data/GLUE/MNLI/dev_matched.tsv' - global_batch_size: 1 - micro_batch_size: 1 - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: False - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: False # Adds bos to the input sequence. - add_eos_to_input: False # Adds eos to the input sequence. - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
- num_classes: null diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml deleted file mode 100644 index 459ca317b1bc0ec33bb4ab6aeec63847ad9b3e32..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: megatron_t5_glue - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 3 - max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 300 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_glue - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 10 - mode: max - always_save_nemo: False # TODO: add support - filename: 'megatron_t5--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 0 - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - megatron_amp_O2: False # Enable O2 optimization for megatron amp - resume_from_checkpoint: null - hidden_dropout: 0.1 # Override dropout prob from pretraining - attention_dropout: 0.1 # Override attention dropout prob from pretraining - - data: - train_ds: - task_name: 'mnli' - file_path: ??? # Path to the TSV file for MNLI train ex: '/raid/Data/GLUE/MNLI/train.tsv' - global_batch_size: 128 - micro_batch_size: 64 - shuffle: True - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: True - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: False # Adds bos to the input sequence. - add_eos_to_input: False # Adds eos to the input sequence. - - - validation_ds: - task_name: 'mnli' - file_path: ??? # Path to the TSV file for MNLI train ex: '/raid/Data/GLUE/MNLI/dev_matched.tsv' - global_batch_size: 128 - micro_batch_size: 64 - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: False # TODO: Figure out if there is a way to avoid dropping last. - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. 
- replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} - add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.add_eos_to_input} - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 5e-6 - weight_decay: 0.0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml deleted file mode 100644 index 44bf8ac467b082ddb50e5ae5e558b4917b9f2e67..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ /dev/null @@ -1,116 +0,0 @@ -name: megatron_t5_glue_xnli - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 3 - max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 300 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_glue_xnli - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 10 - mode: max - always_save_nemo: False # TODO: add support - filename: 'megatron_t5--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 1 - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - resume_from_checkpoint: null - megatron_amp_O2: False # Enable O2 optimization for megatron amp - hidden_dropout: 0.1 # Override dropout prob from pretraining - attention_dropout: 0.1 # Override attention dropout prob from pretraining - eval_languages: ['fr', 'de', 'en', 'es'] # List of languages to evaluate zero-shot XNLI performance. - - data: - train_ds: - task_name: 'mnli' - file_path: ??? # Path to the TSV file for MNLI train ex: '/raid/Data/GLUE/MNLI/train.tsv' - global_batch_size: 128 - micro_batch_size: 64 - shuffle: True - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: True - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: True # Adds bos to the input sequence. - add_eos_to_input: True # Adds eos to the input sequence. 
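Several trainer blocks above repeat the bookkeeping formula `consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches`. As a back-of-the-envelope check only (not NeMo's internal accounting), the numbers in this XNLI fine-tuning config work out as follows, assuming tensor and pipeline parallel sizes of 1 so that every GPU is a data-parallel rank:

```python
# Back-of-the-envelope check of the batch-size arithmetic implied by the
# comments above, using values from this XNLI fine-tuning config.
devices, num_nodes = 2, 1
tensor_parallel, pipeline_parallel = 1, 1    # as configured above
micro_batch_size = 64
accumulate_grad_batches = 1

data_parallel_size = (devices * num_nodes) // (tensor_parallel * pipeline_parallel)
global_batch_size = micro_batch_size * data_parallel_size * accumulate_grad_batches
assert global_batch_size == 128              # matches train_ds.global_batch_size

def consumed_samples(global_step: int) -> int:
    """Samples seen after `global_step` optimizer steps, per the comment's formula."""
    return global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches

print(consumed_samples(300))                 # 38400 samples by the first validation check
```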
- - - validation_ds: - task_name: 'xnli' - file_path: ??? # Path to the TSV file for XNLI dev ex: '/raid/Data/GLUE/MNLI/dev_matched.tsv' - global_batch_size: 128 - micro_batch_size: 64 - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: False - write_predictions_to_file: False - prediction_file_path_prefix: null # Prefix of the file to write predictions to. - replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} - add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.add_eos_to_input} - - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - test_ds: - task_name: 'xnli' - file_path: ??? # Path to the TSV file for XNLI dev ex: '/raid/Data/GLUE/MNLI/dev_matched.tsv' - global_batch_size: 128 - micro_batch_size: 64 - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 512 - drop_last: False - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} - add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.add_eos_to_input} - - metric: - name: "exact_string_match" # Name of the evaluation metric to use. - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 5e-6 - weight_decay: 0.0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml deleted file mode 100644 index 2ba68cbc5979cf7cf6f26b4b47457206cf71388a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml +++ /dev/null @@ -1,99 +0,0 @@ -name: megatron_t5_finetuning - -trainer: - devices: 2 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 10 - max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 300 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_finetune - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 10 - mode: max - always_save_nemo: False # TODO: add support - filename: 'megatron_t5--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - restore_from_path: null # Path to a trained T5 .nemo file - pretrained_checkpoint: - checkpoint_dir: null # Path to a folder that contains a .ckpt file - checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. - hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. 
- tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 0 - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - megatron_amp_O2: False # Enable O2 optimization for megatron amp - resume_from_checkpoint: null - hidden_dropout: 0.1 # Override dropout prob from pretraining - attention_dropout: 0.1 # Override attention dropout prob from pretraining - - data: - train_ds: - src_file_name: ??? # Path to the txt file corresponding to the source data. - tgt_file_name: ??? # Path to the txt file corresponding to the target data. - global_batch_size: 128 - micro_batch_size: 64 - shuffle: True - num_workers: 0 - pin_memory: True - max_src_seq_length: 512 - max_tgt_seq_length: 128 - drop_last: True - concat_sampling_technique: temperature # When providing a list of datasets, this arg defines the sampling strategy. Options: ['temperature', 'random'] - concat_sampling_temperature: 5 # When providing a list of datasets, this arg defines the sampling temperature when strategy='temperature' - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' - replace_bos_with_pad: False # Replaces bos with pad for both the encoder and decoder. This is necessary when using Google's T5 checkpoints. - add_bos_to_input: False # Adds bos to the input sequence. - add_eos_to_input: False # Adds eos to the input sequence. - - validation_ds: - src_file_name: ??? # Path to the txt file corresponding to the source data. - tgt_file_name: ??? # Path to the txt file corresponding to the target data. - names: null # If src/tgt file names are ListConfigs, the corresponding label is used to log metrics. - global_batch_size: 128 - micro_batch_size: 64 - shuffle: False - num_workers: 0 - pin_memory: True - max_src_seq_length: 512 - max_tgt_seq_length: 128 - drop_last: False # TODO: Figure out if there is a way to avoid dropping last. - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} - add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.add_eos_to_input} - metric: - name: "exact_string_match" # Name of the evaluation metric to use. Supported metrics: [`exact_string_match`, `rouge`, `pearson_corr_coef`, `spearman_corr_coef`, `f1`, `accuracy`, `average_precision`] - average: micro # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null # Number of classes for the metric. Works only for 'F1', 'accuracy' and 'average_precision' etc. Refer to torchmetrics for metrics where this is supported. - class_labels: null # If the targets in your dataset are strings and not integers/float, you need to provide a list of class labels (size = num_classes) so we can convert from strings to integer categories to compute the metric. - labels_are_strings: True # NOTE: This is only required to properly handle metrics like f1, accuracy, average_precision etc. This does not affect extract_string_match. 
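The train_ds block above exposes `concat_sampling_technique: temperature` with `concat_sampling_temperature: 5` for blending several fine-tuning datasets. The usual formulation of temperature sampling draws from dataset i with probability proportional to (n_i / N) ** (1 / T); the sketch below only illustrates the effect of the temperature, uses made-up dataset sizes, and is not claimed to be NeMo's exact implementation:

```python
# Illustrative sketch of temperature-based sampling across datasets:
# p_i is proportional to (n_i / N) ** (1 / T). Higher T flattens the
# distribution toward uniform; T = 1 recovers size-proportional sampling.
def temperature_sampling_probs(dataset_sizes, temperature=5.0):
    total = sum(dataset_sizes)
    weights = [(n / total) ** (1.0 / temperature) for n in dataset_sizes]
    norm = sum(weights)
    return [w / norm for w in weights]

sizes = [1_000_000, 50_000, 5_000]            # three source datasets (made up)
print(temperature_sampling_probs(sizes, 1))   # ~[0.948, 0.047, 0.005]  size-proportional
print(temperature_sampling_probs(sizes, 5))   # ~[0.527, 0.290, 0.183]  flattened toward uniform
```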
- - optim: - name: fused_adam - lr: 5e-6 - weight_decay: 0.0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml deleted file mode 100644 index 8a00408390ee60aceb0c17761c4c5cd268ec6190..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: megatron_t5_lm_adaptation_finetune -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: megatron_t5_lm_adaptation_finetune - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # pretrained model path - pretrained_model_path: ??? - - # model parallelism - micro_batch_size: 4 - global_batch_size: 8 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 1 - resume_from_checkpoint: null # manually set the checkpoint file to load from - pipeline_model_parallel_split_rank: 1 - - # O2 mixed precision - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - - # JIT fusion params. - bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. - masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. - bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - # Dropout - hidden_dropout: null - attention_dropout: null - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-t5_00_text_document - # - .5 - # - /raid/data/pile/my-t5_01_text_document - data_prefix: ??? 
- index_mapping_dir: null - data_impl: mmap - splits_string: 949,45,5 - seq_length: ${model.seq_length} - seq_length_dec: 128 - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - masked_lm_prob: 0.15 - dataset_type: 't5_prefix_lm' - short_seq_prob: 0.0 - max_ngram_size: 10 - mean_ngram_size: null - geometric_dist: True - permutation: False - whole_word_masking: True - favor_longer_ngrams: False - - optim: - name: fused_adam - lr: 5e-6 - betas: - - 0.9 - - 0.999 - eps: 1e-8 - weight_decay: 0.01 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml deleted file mode 100644 index b966ad0eb6311eadd24a8c23a0a7190cafcaa6d7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml +++ /dev/null @@ -1,108 +0,0 @@ -name: p_tuning_squad_t5 - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 10 - max_steps: -1 - log_every_n_steps: 10 - val_check_interval: 1.0 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: PromptLearning-T5 - name: ${name} - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 2 - mode: min - save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below - filename: "megatron_t5_prompt_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - create_early_stopping_callback: True - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - -model: - seed: 1234 - nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 8 - micro_batch_size: 8 # micro batch size should equal global batch size when pipeline parallel = 1 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - report_validation_metric: False - validation_metric: accuracy - - restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
- existing_tasks: [] - new_tasks: ["squad"] - - - task_templates: - - taskname: "squad" - prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}" - total_virtual_tokens: 100 - virtual_token_splits: [100] - truncate_field: context - answer_field: answer - - p_tuning: # P-tuning specific params - encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default - num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp - dropout: 0.0 - - prompt_tuning: # Prompt tunin specific params - new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks - new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random - - data: - train_ds: ["data/squad_train.jsonl"] - validation_ds: ["data/squad_val.jsonl"] - add_eos: true - add_bos: false - decoder_starts_with_pad: False - add_eos_to_decoder_output: True - add_sentinel_to_input: True - ul2_prompt_token: null # , , - shuffle: true - num_workers: 4 - pin_memory: true - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 - min_lr: 0.0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml deleted file mode 100644 index 0b5929a36a62ea46d9277c65436e6e1b62081124..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml +++ /dev/null @@ -1,21 +0,0 @@ -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -data: - test_ds: ??? - num_workers: 1 - global_batch_size: 8 - micro_batch_size: 8 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -language_model_path: ??? # path to a pretrained T5 nemo file -virtual_prompt_model_file: ??? # path to a MegatronT5PromptLearningModel nemo file -pred_file_path: ??? # Path were all model predicitons will be written to a text file - - diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml deleted file mode 100644 index 86d02b0ce310a061d86ab44096f3c10a5adcc803..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml +++ /dev/null @@ -1,156 +0,0 @@ -defaults: - - .@model.encoder: megatron_model_base_config - - .@model.decoder: megatron_model_base_config - -name: megatron_ul2 -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 100 - limit_val_batches: 50 - limit_test_batches: 500 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # model parallelism - micro_batch_size: 4 - global_batch_size: 8 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - resume_from_checkpoint: null # manually set the checkpoint file to load from - pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. - - # model architecture - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - seq_length: 512 - max_position_embeddings: ${.seq_length} - - tokenizer: - library: 'megatron' - type: 'BertWordPieceCase' - model: null - vocab_file: null - merge_file: null - num_sentinel_tokens: 100 - sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - - # weight init - embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - - # embedding dropout - embedding_dropout: 0.1 - - # embedding sharing - share_token_embeddings: True # If True share encoder/decoder embeddings - share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits - - # token head - tokens_head_bias: True - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - - data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]", - # Or see example below: - # data_prefix: - # - .5 - # - /raid/data/pile/my-t5_00_text_document - # - .5 - # - /raid/data/pile/my-t5_01_text_document - data_prefix: ??? 
- index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap - # data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant) - # # defaults for text_memmap - # newline_int: 10 # byte-value of newline (Use ord('\n') to get value) - # header_lines: 0 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # # defaults for csv_memmap - # newline_int: 10 # byte-value of newline - # header_lines: 1 # skip first N header lines - # workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2) - # sort_dataset_paths: False # if True datasets will be sorted by name - # data_col: 1 # column to use for data - # data_sep: ',' # string to split text into columns - splits_string: 949,45,5 - seq_length: ${model.seq_length} - seq_length_dec: ${model.seq_length} - skip_warmup: True - num_workers: 0 - dataloader_type: single # cyclic - masked_lm_prob: 0.15 - extreme_masked_lm_prob: 0.5 - dataset_type: 'ul2' - short_seq_prob: 0.0 - max_ngram_size: 10 - extreme_max_ngram_size: 128 - extreme_min_ngram_size: 32 - extreme_mean_ngram_size: 64 - ngram_span_length_distribution: 'geometric' - extreme_ngram_span_length_distribution: 'truncated_normal' - prefix_lm_pivot_mean: 0.25 - mean_ngram_size: 3 - permutation: False - whole_word_masking: True - favor_longer_ngrams: False - respect_document_boundaries: True # If true, a single training exampl cannot cross document boundaries, increasing the fraction of tokens within a batch. - - optim: - name: fused_adam - lr: 0.0001 - betas: - - 0.9 - - 0.999 - eps: 1e-8 - weight_decay: 0.01 - sched: - name: WarmupAnnealing - min_lr: 0.00001 - last_epoch: -1 - warmup_ratio: 0.01 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index c6e25c087ffc0cfb094f7242311ea0945a09d815..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml deleted file mode 100644 index 
434e0a29f42cd40082187612b0713e5da7ce0b0b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 8 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp4_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp4_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp4_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp8_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp8_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h6144_tp8_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h8192_tp8_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h8192_tp8_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_a100_h8192_tp8_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index 21d02f3dd22c82c8dfe38e04d69b6c4232526b20..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8 - -# Bulk overlap with AllGather / ReduceScatter -qkv_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - 
num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 1 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 20 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml deleted file mode 100644 index 444c8245e02c4fdbd6d770c456f76e9281f0e27c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 1 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 16 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp4_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp4_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp4_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp8_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp8_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h6144_tp8_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h8192_tp8_mbs4_seqlen2048.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h8192_tp8_mbs4_seqlen2048.yaml deleted file mode 100644 index 2c11fa89af4b1820b36f4481ad648c8f8ac2d8a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/tp_overlap/ub_cfg_h100_h8192_tp8_mbs4_seqlen2048.yaml +++ /dev/null @@ -1 +0,0 @@ -# dummy file to build hydra 
configs diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/transformer_lm_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/transformer_lm_config.yaml deleted file mode 100644 index 31040dd7239a99aca2aae5ddaa2ce1320d849a97..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/conf/transformer_lm_config.yaml +++ /dev/null @@ -1,102 +0,0 @@ -name: TransformerLanguageModel -do_training: True # set to False if only preprocessing data - -model: - label_smoothing: 0.0 - preproc_out_dir: null # path to store data preprocessing outputs - - train_ds: - file_name: ??? # path to file with training data - tokens_in_batch: 4096 - clean: true - shuffle: true - num_workers: 8 - - # tarred dataset specific config - # use_tarred_dataset: true - # tar_files: ??? # path to tarred files - # metadata_file: ??? # metadata for tarred dataset - # shard_strategy: scatter - # tar_shuffle_n: 256 - - validation_ds: - file_name: ??? # path to file with validation data - tokens_in_batch: 512 - clean: false - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - file_name: ??? # path to file with test data - tokens_in_batch: 512 - clean: false - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: adam - lr: 0.001 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - sched: - name: InverseSquareRootAnnealing - min_lr: 0.0 - last_epoch: -1 - warmup_ratio: 0.1 - - tokenizer: - tokenizer_name: yttm - tokenizer_model: ??? - vocab_file: null - special_tokens: null - training_sample_size: null # valid for sentencepiece tokenizer - - encoder: - library: nemo - model_name: null - pretrained: false - max_sequence_length: 512 - num_token_types: 0 - embedding_dropout: 0.1 - learn_positional_encodings: false - hidden_size: 512 - num_layers: 6 - inner_size: 2048 - num_attention_heads: 8 - ffn_dropout: 0.1 - attn_score_dropout: 0.1 - attn_layer_dropout: 0.1 - hidden_act: relu - mask_future: true - pre_ln: false - - head: - num_layers: 1 - activation: relu - log_softmax: true - dropout: 0.0 - -trainer: - devices: 4 - num_nodes: 1 - max_epochs: 200 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False - logger: False - log_every_n_steps: 50 # Interval of logging. - check_val_every_n_epoch: 1 - benchmark: False - -exp_manager: - name: TransformerLM - files_to_copy: [] \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/convert_weights_to_nemo1.0.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/convert_weights_to_nemo1.0.py deleted file mode 100644 index b58b33942ea55a98e942dc495fc0f3da51dbef4d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/convert_weights_to_nemo1.0.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Converts BERT NeMo0.* checkpoints to NeMo1.0 format. -""" - -from argparse import ArgumentParser - -import torch - -parser = ArgumentParser() -parser.add_argument("--bert_encoder", required=True, help="path to BERT encoder, e.g. /../BERT-STEP-2285714.pt") -parser.add_argument( - "--bert_token_classifier", - required=True, - help="path to BERT token classifier, e.g. /../BertTokenClassifier-STEP-2285714.pt", -) -parser.add_argument( - "--bert_sequence_classifier", - required=False, - default=None, - help="path to BERT sequence classifier, e.g /../SequenceClassifier-STEP-2285714.pt", -) -parser.add_argument( - "--output_path", required=False, default="converted_model.pt", help="output path to newly converted model" -) -args = parser.parse_args() - -bert_in = torch.load(args.bert_encoder) -tok_in = torch.load(args.bert_token_classifier) -if args.bert_sequence_classifier: - seq_in = torch.load(args.bert_sequence_classifier) - -new_dict = {} -new_model = {"state_dict": new_dict} -for k in bert_in: - new_name = k.replace("bert.", "bert_model.") - new_dict[new_name] = bert_in[k] - -for k in tok_in: - new_name = "mlm_classifier." + k - new_dict[new_name] = tok_in[k] - -if args.bert_sequence_classifier: - for k in seq_in: - new_name = "nsp_classifier." + k - new_dict[new_name] = seq_in[k] - -torch.save(new_model, args.output_path) diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/get_wkt2.sh b/SoundScribe/SpeakerID/examples/nlp/language_modeling/get_wkt2.sh deleted file mode 100644 index 33b55b2a73051f63bf87503112c1ec6bb2bb8a9e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/get_wkt2.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -""" -This file is adapted from -https://github.com/salesforce/awd-lstm-lm/blob/master/getdata.sh -Copyright by the AWD LSTM authors. -""" -DATA_DIR=$1 -echo "- Downloading WikiText-2" - -wget --continue -P $DATA_DIR https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -unzip -q $DATA_DIR/wikitext-2-v1.zip -d $DATA_DIR -cd $DATA_DIR/wikitext-2 -mv wiki.train.tokens train.txt -sed -i -e "s/<unk>/[UNK]/g" train.txt -mv wiki.valid.tokens valid.txt -sed -i -e "s/<unk>/[UNK]/g" valid.txt -mv wiki.test.tokens test.txt -sed -i -e "s/<unk>/[UNK]/g" test.txt -cd .. -rm wikitext-2-v1.zip - -echo "- WikiText-2 saved at $DATA_DIR/wikitext-2" diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bart_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bart_pretraining.py deleted file mode 100644 index 72c7a755c0ec154841ccbeac5984546bc59b7a94..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
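For reference, the state-dict re-keying that `convert_weights_to_nemo1.0.py` above performs can be reproduced on toy tensors; the sketch below uses dummy keys and shapes rather than real checkpoints, and only exists to illustrate the prefix mapping ("bert." to "bert_model.", plus the "mlm_classifier." namespace).

```
# Minimal sketch of the NeMo 0.* -> 1.0 state-dict re-keying shown above,
# applied to dummy tensors instead of real checkpoint files.
import torch

bert_in = {"bert.encoder.layer.0.attention.weight": torch.zeros(4, 4)}  # dummy encoder weights
tok_in = {"dense.weight": torch.zeros(4, 4)}                            # dummy token-classifier weights

new_dict = {}
for k, v in bert_in.items():
    new_dict[k.replace("bert.", "bert_model.")] = v   # encoder keys gain the bert_model. prefix
for k, v in tok_in.items():
    new_dict["mlm_classifier." + k] = v               # classifier keys are namespaced under mlm_classifier.

torch.save({"state_dict": new_dict}, "converted_model.pt")
```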
- - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bart_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer( - plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3), CustomProgressBar()] - ) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - model = MegatronBARTModel(cfg.model, trainer) - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bert_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bert_pretraining.py deleted file mode 100644 index 2901a58dff1c24b43d1e782922f9e7e0ee804d60..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
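The precision handling in `megatron_bart_pretraining.py` above reduces to a small decision table: FP16 variants get a GradScaler and a '16-mixed' plugin precision, BF16 variants get 'bf16-mixed' with no scaler, and `megatron_amp_O2` picks between the half-precision and pipeline mixed-precision plugins. Below is a plain-Python sketch of that table; the plugin classes are stubbed as strings here, whereas the real script instantiates the NeMo plugin objects.

```
# Schematic of the plugin selection above; plugin classes are represented by name only.
def choose_precision_plugin(trainer_precision, megatron_amp_o2):
    if trainer_precision not in (16, '16', 'bf16', '16-mixed', 'bf16-mixed'):
        return None, None, False  # full precision: no mixed-precision plugin, no scaler
    use_scaler = trainer_precision in (16, '16', '16-mixed')       # FP16 needs loss scaling
    plugin_precision = '16-mixed' if use_scaler else 'bf16-mixed'  # PTL >= 2.0 naming
    plugin = 'MegatronHalfPrecisionPlugin' if megatron_amp_o2 else 'PipelineMixedPrecisionPlugin'
    return plugin, plugin_precision, use_scaler


assert choose_precision_plugin('bf16', False) == ('PipelineMixedPrecisionPlugin', 'bf16-mixed', False)
assert choose_precision_plugin(16, True) == ('MegatronHalfPrecisionPlugin', '16-mixed', True)
```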
- -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_bert_config") -def main(cfg) -> None: - if cfg.model.data.dataloader_type != "LDDL": - mp.set_start_method("spawn", force=True) - - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model = MegatronBertModel(cfg.model, trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_change_num_partitions.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_change_num_partitions.py deleted file mode 100644 index f7ada6ab762f1c0dd0b797f8fe3933b97d25c1e9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ /dev/null @@ -1,1502 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
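Entry points like `megatron_bert_pretraining.py` above receive `cfg` from the `hydra_runner` decorator, so fields such as `model.data.dataloader_type` are ordinary OmegaConf entries that command-line dot-path overrides replace before `main(cfg)` runs. The sketch below shows only that override flow with OmegaConf; the keys are an illustrative subset, not the full `megatron_bert_config`.

```
# Illustration of how Hydra-style dot-path overrides land in cfg before main(cfg) runs.
from omegaconf import OmegaConf

base = OmegaConf.create(
    {
        "trainer": {"devices": 1, "precision": "bf16-mixed"},
        "model": {"data": {"dataloader_type": "single"}},
    }
)
overrides = OmegaConf.from_dotlist(["trainer.devices=8", "model.data.dataloader_type=LDDL"])
cfg = OmegaConf.merge(base, overrides)

assert cfg.model.data.dataloader_type == "LDDL"
print(OmegaConf.to_yaml(cfg))
```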
- -import os -import shutil -import tarfile -import tempfile -from argparse import ArgumentParser -from typing import Dict, List - -import torch -import torch.nn as nn -from omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.nlp.parts.nlp_overrides import ( - NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.utils import logging, model_utils -from nemo.utils.app_state import AppState - -""" -Usage: - -### Tensor Parallelism and Pipeline Parallelism conversion ### - -# Megatron GPT -python megatron_change_num_partitions.py \ - --model_file=PATH_TO_SRC_FILE \ - --target_file=PATH_TO_TGT_FILE \ - --tensor_model_parallel_size=-1 \ - --target_tensor_model_parallel_size=1 \ - --pipeline_model_parallel_size=-1 \ - --target_pipeline_model_parallel_size=1 \ - --precision=bf16 - -# Megatron T5 -python megatron_change_num_partitions.py \ - --model_file=PATH_TO_SRC_FILE \ - --target_file=PATH_TO_TGT_FILE \ - --model_class="nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model" \ - --tensor_model_parallel_size=-1 \ - --target_tensor_model_parallel_size=1 \ - --pipeline_model_parallel_size=-1 \ - --target_pipeline_model_parallel_size=1 \ - --target_pipeline_model_parallel_split_rank=0 \ - --precision=bf16 - -# Megatron GPT + Virtual Pipeline parallelism - -python megatron_change_num_partitions.py \ - --model_extracted_dir="" \ - --target_file="" \ - --ckpt_name="" \ - --tensor_model_parallel_size= \ - --target_tensor_model_parallel_size= \ - --pipeline_model_parallel_size= \ - --target_pipeline_model_parallel_size= \ - --virtual_pipeline_model_parallel_size= \ - --hparams_file="" \ - --precision=bf16 - -### Only Tensor Parallelism conversion ### - -To the above commands, add the following argument: `--tp_conversion_only` - -# Note: This requires that the pipeline_model_parallel_size and tgt_pipeline_model_parallel_size is set to 1. - -### Large Models conversion ### - -When converting large models, ** always ** ensure that you pre-extract the nemo model and then only perform conversion - -$ mkdir "unpacked_nemo_file" -$ tar -xvf "" -C "/unpacked_nemo_file/" - -python megatron_change_num_partitions.py \ - ... - --model_extracted_dir="/unpacked_nemo_file/" - -### Model Classes ### - -# NOTE: Conversion of other model types. -# Default model type is MegatronGPTModel, if you want another model you need to pass classpath of the model -# For example - MegatronT5Model - - -python megatron_change_num_partitions.py \ - ... - --model_class="nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model" - -# Additional arguments: - ---num_gpu_per_node: Number of GPUs per node. Default is 8. ---megatron_legacy: Whether the model is a legacy Megatron model or not. Default is False. May be unsuported for - Pipeline Parallelism change. ---tokenizer_model_path: Path to tokenizer model. Default is None. When not None, overrides the tokenizer model path - in the model config. ---tokenizer_vocab_file: Path to tokenizer vocab file. Default is None. When not None, overrides the tokenizer vocab - file in the model config. - -# Comments - -Passing --tensor_model_parallel_size=-1 or --pipeline_model_parallel_size=-1 will automatically infer the size from the -model config. 
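At its core, the tensor-parallel conversion described above is a split-and-concatenate round trip: column-parallel weights are chunked along the last dimension, row-parallel weights along the first, and merging concatenates the per-rank shards back in rank order. The bare `torch` sketch below captures that invariant; the shapes are arbitrary examples, and the Q/K/V interleaving and gated-activation special cases handled later in this file are omitted.

```
# Split/merge round trip underlying the compute_tp_splits / compute_tp_merge helpers,
# without the megatron_legacy Q/K/V or fast-GLU special cases.
import torch

tp_size = 4
full = torch.randn(8, 16)  # a parameter as it exists at TP=1

# Column-parallel style: shard along the last dimension.
col_shards = torch.split(full, full.shape[-1] // tp_size, dim=-1)
assert torch.equal(torch.cat(col_shards, dim=-1), full)

# Row-parallel style: shard along the first dimension (the generic fallback).
row_shards = torch.split(full, full.shape[0] // tp_size, dim=0)
assert torch.equal(torch.cat(row_shards, dim=0), full)
```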
- -""" - - -def set_virtual_parallel_rank_safely(rank: int): - AppState().virtual_pipeline_model_parallel_rank = rank - - try: - from megatron.core import parallel_state - - parallel_state.set_virtual_pipeline_model_parallel_rank(rank) - - if rank is None: - parallel_state.set_virtual_pipeline_model_parallel_world_size(None) - - except (ImportError, ModuleNotFoundError): - logging.warning("`megatron-core` not installed, cannot set virtual parallel rank !") - - -################# -### Utilities ### -################# - - -def force_cpu_model(cfg): - with open_dict(cfg): - # temporarily set to cpu - original_cpu_init = cfg.get('use_cpu_initialization', False) - if 'megatron_amp_O2' in cfg: - key = 'megatron_amp_O2' - original_amp_o2 = cfg.megatron_amp_O2 - elif 'megatron_amp_02' in cfg: - key = 'megatron_amp_02' - original_amp_o2 = cfg.megatron_amp_02 - else: - key, original_amp_o2 = None, None - - # Set new values - cfg.use_cpu_initialization = True - if key is not None: - cfg[key] = False - - # Setup restore dict - restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2 - if key is not None: - restore_dict[key] = original_amp_o2 - - return cfg, restore_dict - - -def restore_model_config(cfg, original_dict): - with open_dict(cfg): - for key, val in original_dict.items(): - logging.info(f"Restoring model config key ({key}) from {cfg[key]} to original value of {val}") - cfg[key] = val - return cfg - - -################# -### Utilities ### -################# - - -def compute_tp_splits( - param_name, param, partitions, global_idx, tp_size, pp_size, pp_rank, pp_split_rank, megatron_legacy, model_cfg -): - """ - Function to compute the splits required for tensor-parallelism. - - Args: - param_name: Name of the current parameter of the current model (TP X PP Y) - param: Value of the current parameter of the current model (TP X PP Y) - partitions: Partitions of the flattened parameter of the current model (TP 1 PP 1) - global_idx: The index used to select the parameter in the global partition. - tp_size: Int, tensor-parallelism size. - pp_size: Int, pipeline-parallelism size. - pp_rank: Int, pipeline-parallelism rank. - pp_split_rank: Int, pipeline-parallelism split rank. This should be > 1 if TP is being used with EncDec models (T5) - megatron_legacy: Bool, whether the model is a legacy Megatron model or not. - model_cfg: The model config as a OmegaConf DictConfig. - - Returns: - List of torch tensors, each of which is a split of the current parameter. - """ - # alias the global index to idx - idx = global_idx - - fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] - - if param.shape == partitions[0][idx].shape: - split = [partitions[0][idx].data] * tp_size - logging.debug(">> Perfect match, no splitting needed") - elif param.shape[0] == partitions[0][idx].shape[0]: - split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1) - else: - # For T5-converted weights, the splitting needs to be strided such that q,k,v weights are bunched together on each tensor-parallel rank. - if 'query_key_value.weight' in param_name and megatron_legacy: - split_dim = partitions[0][idx].data.shape[0] - if split_dim % (tp_size * 3) != 0: - raise ValueError( - f"Can not split Q,K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 3}." 
- ) - tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 3, dim=0) - split = [] - for i in range(tp_size): - tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 3, tp_size)]) - split.append(tp_qkv) - elif 'key_value.weight' in param_name and megatron_legacy: - split_dim = partitions[0][idx].data.shape[0] - if split_dim % (tp_size * 2) != 0: - raise ValueError( - f"Can not split K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 2}." - ) - tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 2, dim=0) - split = [] - for i in range(tp_size): - tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)]) - split.append(tp_qkv) - elif ('dense_h_to_4h' in param_name or 'linear_fc1' in param_name) and fast_glu_activation: - # For Megatron GPT model with Fast Glu activation - # Handle gated linear units - # concat all the first halves ('W's) and all the second halves ('V's) - w_split, k_split = torch.chunk(partitions[0][idx].data, 2, dim=0) - w_split = torch.chunk(w_split, tp_size, dim=0) - k_split = torch.chunk(k_split, tp_size, dim=0) - split = [torch.cat(weights, dim=0) for weights in zip(w_split, k_split)] # split per tp rank - - # Regular split for Megatron and NeMo-Megatron models. - else: - split = torch.split(partitions[0][idx].data, param.shape[0], dim=0) - - return split - - -def compute_tp_merge(idx, name, param, partitions_pp, model_cfg): - """ - Function to compute the partition merge required for tensor-parallelism. - - Args: - idx: The index used to select the parameter in the current pipeline partition. - name: - param: The parameter to be merged under TP 1 PP 1. - partitions_pp: List of all TP partitions of the flattened parameter of the current model for a given PP rank - (TP X PP Y). Indexed as partitions_pp[tp_rank][idx]. - model_cfg: The model config as an OmegaConf DictConfig. - - Returns: - The concatenated parameter for TP 1 PP 1. - """ - fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] - - # Logic from original TP rank change - if param.shape == partitions_pp[0][idx].shape: - concated = partitions_pp[0][idx].data - elif param.shape[0] == partitions_pp[0][idx].shape[0]: - concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=-1) - else: - concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=0) - - # Logic for Fast Glu activation - if 'dense_h_to_4h' in name and fast_glu_activation: - # concat all the first halves ('W's) and all the second halves ('V's) - wk_splits = [] - for tpr in range(len(partitions_pp)): - wk_splits.append(torch.chunk(partitions_pp[tpr][idx].data, 2, dim=0)) - - w_split = torch.cat([w[0] for w in wk_splits], dim=0) - k_split = torch.cat([w[1] for w in wk_splits], dim=0) - concated = torch.cat([w_split, k_split], dim=0) - - # Trim padding - if concated.shape != param.shape: - logging.info( - f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, merged shape: {concated.shape}. Narrowing to match required size." - ) - if concated.shape[1:] == param.shape[1:]: - concated = torch.narrow(concated, 0, 0, param.shape[0]) - elif concated.shape[:-1] == param.shape[:-1]: - concated = torch.narrow(concated, -1, 0, param.shape[-1]) - else: - raise RuntimeError( - f"Can not handle parameter {name}, required shape: {param.shape}, merged shape: {concated.shape}." 
- ) - return concated - - -def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path): - """ - Function to write the given TP PP split to NeMo File. - - Save each of the TP ranks in reverse order - This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved - The final rank will then save a new NeMo file with all other ranks inside. - - Args: - model: The model corresponding to the current TP PP split. Contains partial parameters. - splits: Nested List of tensors containing the TP splits of the current model given current PP rank. - Indexed as splits[idx][tp_rank]. - app_state: AppState object. - tp_size: The global tensor-parallel size of the final model. - pp_rank: The local pipeline parallel rank of the final model. - write_path: The path to save the NeMo file. - """ - for tp_rank in range(tp_size - 1, -1, -1): - app_state.pipeline_model_parallel_rank = pp_rank - app_state.tensor_model_parallel_rank = tp_rank - - idx = 0 - for name, param in model.named_parameters(): - split_val = splits[idx][tp_rank].clone() - - if param.shape != split_val.shape: - logging.info( - f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, split shape: {split_val.shape}. Padding to match required size." - ) - - if split_val.shape[1:] == param.shape[1:]: - pad = [0, 0] * len(split_val.shape) - pad[-1] = param.shape[0] - split_val.shape[0] - split_val = torch.nn.functional.pad(split_val, pad, 'constant') - elif split_val.shape[:-1] == param.shape[:-1]: - pad = [0, param.shape[-1] - split_val.shape[-1]] - split_val = torch.nn.functional.pad(split_val, pad, 'constant') - else: - raise RuntimeError( - f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}." - ) - - param.data = split_val - idx += 1 - - if write_path is not None: - logging.info(f"Writing pp rank {pp_rank} tp rank {tp_rank} to file {write_path}") - model.save_to(write_path) - - -def debug_log_split_param_diff(idx, param, param_name, partitions): - # Log some useful comparison of tensors that are being mapped. - # Note that the global param index for layers and modules may be different but the shapes - # and semantics of the layer should match. - logging.debug(f"Index: {idx} Model Params : {param_name} - {param.shape}") - logging.debug(f"Index: {idx} Global params: {partitions[1][idx]} - {partitions[0][idx].shape}") - - -################ -### Handlers ### -################ - - -class GPTHandler: - def __init__(self, megatron_legacy: bool): - self.duplicate_gpt_word_embedding_offset = 0 - self.untied_gpt_embedding = False - self.megatron_legacy = megatron_legacy - - def compute_split_index(self, model, idx, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - if pp_rank == (pp_size - 1) and hasattr(model, 'model') and hasattr(model.model, 'word_embeddings'): - # duplicate embedding copy (tied weights) - self.duplicate_gpt_word_embedding_offset = 1 - - if model.cfg.get('share_embeddings_and_output_weights', True) is False: - self.untied_gpt_embedding = True - - if self.duplicate_gpt_word_embedding_offset > 0: - logging.info(f"GPT duplicate_gpt_word_embedding_offset: {self.duplicate_gpt_word_embedding_offset}") - - return idx + self.duplicate_gpt_word_embedding_offset - - def compute_splits(self, model, partitions, idx, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - splits = [] - - # This is the PP X TP Y model with partial parameters present in correct order. 
- # We need to extract the parameters from the global map in reverse order to fill in the - # parameters of this model in forward order. - for param_name, param in model.named_parameters(): - - # Since we are moving forward, we may reach the end of the global map - # but GPT has an additional word embedding as its last parameter - # Therefore we check for this, and reset the index to the parameter of the PP 0 TP 0 rank - # which holds the parameters of the embedding. - if idx == (len(partitions[0])) and self.duplicate_gpt_word_embedding_offset > 0: - logging.info("Found duplicate embedding copy for GPT model, resetting index") - idx = 0 # reset idx parameter to 0 if we have duplicate embedding copy - - debug_log_split_param_diff(idx, param, param_name, partitions) - - # Tensor Parallel Splitting - split = compute_tp_splits( - param_name, - param, - partitions, - idx, - tp_size, - pp_size, - pp_rank, - pp_split_rank, - self.megatron_legacy, - model.cfg, - ) - - splits.append(split) - idx += 1 - - return idx, splits - - def compute_split_offset(self, offset_diff, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - # GPT offset correction - if not self.untied_gpt_embedding and pp_size > 1 and pp_rank == (pp_size - 1) and pp_split_rank == 0: - offset_diff += 1 - - return offset_diff - - -class T5Handler: - def __init__(self, megatron_legacy: bool): - self.shared_enc_dec_embeddings = False - self.shared_enc_dec_embeddings_intermediate = False - self.enc_dec_share_token_embeddings_count = 0 - self.intermediate_shared_embedding_location = -1 - self.megatron_legacy = megatron_legacy - - def compute_split_index(self, model, idx, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - final_idx = idx - - # Special case for T5 models - where the embeddings are shared between encoder and decoder - # and the rank of decoder split is arbitrary. - # Megatron T5 check for pipeline_model_parallel_split_rank in order to inject encoder embeddings - self.shared_enc_dec_embeddings = ( - pp_split_rank > 0 and pp_split_rank == pp_rank and model.cfg.get('share_token_embeddings', True) - ) - # If embedding sharing is active, both vocab and position embeddings are shared - if self.shared_enc_dec_embeddings: - self.enc_dec_share_token_embeddings_count = 2 - else: - self.enc_dec_share_token_embeddings_count = 0 - - # Start to calculate new idx - final_idx = final_idx + self.enc_dec_share_token_embeddings_count - - # Special case for T5 models - where the embeddings are shared between encoder and decoder - # For all decoder ranks which are not the pp_split_rank, we need to inject the vocab embeddings only at - # an intermediate location of the model (usually second last location). 
- # Megatron T5 check for pipeline_model_parallel_split_rank in order to inject encoder embeddings - # when the pipeline_model_parallel_split_rank is not the last PP rank - self.shared_enc_dec_embeddings_intermediate = ( - pp_split_rank > 0 - and pp_split_rank < pp_size - and hasattr(model, 'enc_dec_model') - and hasattr(model.enc_dec_model, 'word_embeddings') - ) - - if self.shared_enc_dec_embeddings_intermediate: - # Loop until we get the location of this tensor - self.intermediate_shared_embedding_location = -1 - for param_name, param in model.named_parameters(): # special case for T5 - if param_name == 'enc_dec_model.word_embeddings.weight': - self.intermediate_shared_embedding_location += 1 - break - self.intermediate_shared_embedding_location += 1 - else: - self.intermediate_shared_embedding_location = -1 - - # Re-evaluate the intermediate shared embedding flag - self.shared_enc_dec_embeddings_intermediate = self.shared_enc_dec_embeddings_intermediate and ( - self.intermediate_shared_embedding_location >= 0 - ) - # If module is present, add a module offset to the index - if self.shared_enc_dec_embeddings_intermediate: - final_idx += 1 - - if self.enc_dec_share_token_embeddings_count: - logging.info(f"EncDec share_token_embeddings_count: {self.enc_dec_share_token_embeddings_count}") - if self.shared_enc_dec_embeddings_intermediate: - logging.info( - f"EncDec share_enc_dec_embeddings_intermediate: {self.intermediate_shared_embedding_location}" - ) - - return final_idx - - def compute_splits(self, model, partitions, idx, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - splits = [] - - # Backup index when EncDec models reset the index to fill in the first embedding matrices (when pp split rank == pp rank) - computed_index = idx - - # This is the PP X TP Y model with partial parameters present in correct order. - # We need to extract the parameters from the global map in reverse order to fill in the - # parameters of this model in forward order. - for param_name, param in model.named_parameters(): - - # Since we are moving forward, we may reach the end of the global map - # but T5 has an additional word embedding as its first two parameter when pp split rank == pp rank - # Therefore we check for this, and update the index to the parameter of the PP 0 TP 0 rank - # which holds the parameters of the embedding. - if self.enc_dec_share_token_embeddings_count: - logging.info("EncDec models decoder shares embedding with encoder, resetting index") - idx = ( - 2 - self.enc_dec_share_token_embeddings_count - ) # 0th index is vocab embedding, 1 is pos embedding, 2 is embedding count - - # Since we are moving forward, we may reach the end of the global map - # but T5 has an additional word embedding as randomly located in the decoder when - # when pp rank > pp_split_rank. - # Therefore we check for this, and skip the parameter of the current TP X PP Y module - # and fill this parameter later. 
- if self.shared_enc_dec_embeddings_intermediate and param_name == 'enc_dec_model.word_embeddings.weight': - logging.info( - "EncDec models decoder shares embedding with encoder in intermediate pos, skipping module for later update" - ) - continue - - debug_log_split_param_diff(idx, param, param_name, partitions) - - # Tensor Parallel Splitting - split = compute_tp_splits( - param_name, - param, - partitions, - idx, - tp_size, - pp_size, - pp_rank, - pp_split_rank, - self.megatron_legacy, - model.cfg, - ) - - splits.append(split) - idx += 1 - - # When pp split rank is equal to current pp rank, we need to first inject the encoder embeddings - # and then reset the index to the originally computed index - if self.enc_dec_share_token_embeddings_count > 0: - if self.enc_dec_share_token_embeddings_count - 1 == 0: - idx = computed_index - - self.enc_dec_share_token_embeddings_count -= 1 - - # Inject the EncDec shared embeddings intermediate tensor - # at one random location in the decoder of this TP PP rank. - # Note that usually it is the second last tensor, but to avoid specific index we search for it - # again. - if self.shared_enc_dec_embeddings_intermediate: - for param_name, param in model.named_parameters(): - if param_name == 'enc_dec_model.word_embeddings.weight': - logging.info("Found intermediate shared embedding, injecting") - split = compute_tp_splits( - param_name, - param, - partitions, - global_idx=0, - tp_size=tp_size, - pp_size=pp_size, - pp_rank=pp_rank, - pp_split_rank=pp_split_rank, - megatron_legacy=self.megatron_legacy, - model_cfg=model.cfg, - ) - splits.insert(self.intermediate_shared_embedding_location, split) - break - - return idx, splits - - def compute_split_offset(self, offset_diff, tp_rank, pp_rank, pp_split_rank, tp_size, pp_size): - # T5 offset correction for shared embedding when pp split rank == pp rank - if self.shared_enc_dec_embeddings: - offset_diff += 2 - - # T5 offset correction for intermediate shared embedding when pp rank > pp split rank - if self.shared_enc_dec_embeddings_intermediate: - offset_diff += 1 - - return offset_diff - - -################## -### Converters ### -################## - - -def merge_partition(model, partitions: Dict[int, List[List[torch.Tensor]]], write_path: str = None): - # Extract the pp_rank and number of modules per tp rank in each pp rank - pp_ranks = list(partitions.keys()) - pp_lens = [] - for pp_rank in pp_ranks: - partition_pp = partitions[pp_rank] - max_len = max([len(x) for x in partition_pp]) # Perform max as we need to iterate through all modules - pp_lens.append(max_len) - - total_params_merged = len([p for p in model.parameters()]) - pp_total_len = sum(pp_lens) - logging.info(f"Total layers in Merged Model: {total_params_merged}") - - og_pp_split_rank = 0 - if pp_total_len > total_params_merged: - og_pp_split_rank = model.cfg.get('pipeline_model_parallel_split_rank', 0) - - idx = 0 - pp_rank = 0 - global_idx = 0 - - # During merge - model is TP 1 PP 1 model with all parameters present in correct order. - # Merge the parameters of the various PP X TP Y models into the TP 1 PP 1 model. 
- for name, param in model.named_parameters(): - # Since the PP ranks each contain the list of all their TP rank parameters - # We need to detect if we need to move to the next PP rank when we run out of tensors in current PP rank - # Reset the index so that it indexes the new pp rank tensor list correctly - if idx >= pp_lens[pp_rank]: - pp_rank += 1 - idx = 0 - - # For EncDec models, after the encoder-decoder PP split occurs, - # the vocab and positional embeddings are duplicated across the PP ranks at the - # beginning of the decoder rank. We can skip them during the merge step. - if pp_total_len > total_params_merged: - if og_pp_split_rank > 0 and og_pp_split_rank == pp_rank: - logging.info( - f"Skipping duplicate vocab and positional embeddings for EncDec model " - f"at the pp split rank: {og_pp_split_rank}" - ) - idx += 2 - - # For EncDec models, after the pp split occurs, final pp rank of the decoder - # has an intermediate embedding tensor at the penultimate positon, skip that. - if og_pp_split_rank > 0 and global_idx == total_params_merged - 1: - logging.info( - f"Skipping intermediate embedding tensor for EncDec model at the final pp split " - f"rank: {og_pp_split_rank}", - ) - idx = pp_lens[pp_rank] - 1 - - # Extract all TP ranks out of current PP rank - partitions_pp = partitions[pp_rank] - - logging.debug( - f"Global idx: {global_idx} Index: {idx} Model Param: {name} " - f"Partition Params: {[p[idx].shape for p in partitions_pp]}" - ) - - # Original TP rank change logic - concated = compute_tp_merge(idx, name, param, partitions_pp, model.cfg) - - # Update the model parameter with the merged tensor - param.data = concated - idx += 1 - global_idx += 1 - - # Save the file iff the original file was PP 1 TP 1 - if write_path is not None: - model.save_to(write_path) - - -def split_partition( - model, - partitions, - pp_size: int, - tp_size: int, - pp_rank: int, - offset: int, - pp_split_rank: int = 0, - write_path: str = None, - megatron_legacy: bool = False, -): - if len(partitions) != 2: - raise ValueError( - "Can only split partitions of model with TP=1. For partitions of models with TP>1, merge first." 
- ) - - if tp_size < 1: - raise ValueError("TP size must to be >= 1.") - - if pp_size < 1: - raise ValueError("PP size must to be >= 1.") - - # Setup app state to mimic current PP and TP ranks with single merged module - app_state = AppState() - app_state.data_parallel_rank = 0 - app_state.pipeline_model_parallel_size = pp_size - app_state.tensor_model_parallel_size = tp_size - app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - - # Go in reverse for TP order, as PP 0 TP 0 will merge all preceding files - app_state.pipeline_model_parallel_rank = pp_rank - app_state.tensor_model_parallel_rank = tp_size - 1 - - # Compute reverse offset of parameter index from global map - num_params = sum([1 for _ in model.parameters()]) # Count number of parameters iteratively - idx = offset - num_params + 1 # start index of current PP TP rank in global map - - assert ( - idx + num_params - 1 == offset - ), f"idx = {idx}, num_params = {num_params}, sum = {idx + num_params}, offset = {offset}" - - # Special case for GPT models - whose last PP TP rank has a duplicate embedding tensor - - if 'gpt' in model.cfg.target.lower(): - logging.info("Splitting GPT model") - handler = GPTHandler(megatron_legacy=megatron_legacy) - - elif 't5' in model.cfg.target.lower(): - logging.info("Splitting T5 model") - handler = T5Handler(megatron_legacy=megatron_legacy) - - else: - raise ValueError(f"Unsupported model for Pipeline Parallelism change - {model.cfg.target}") - - idx = handler.compute_split_index(model, idx, 0, pp_rank, pp_split_rank, tp_size, pp_size) - - # Print some debug info - logging.info(f"Start Layer Idx: {idx} Number of layers in current rank: {num_params} Offset: {offset}") - logging.info("\n") - - # Split the model's parameters according to TP PP ranks - idx, splits = handler.compute_splits(model, partitions, idx, 0, pp_rank, pp_split_rank, tp_size, pp_size) - - # Compute the new offset for the next PP rank in reverse order - # Add 1 to offset to account for last PP rank's duplicated Embedding - offset_diff = offset - num_params - offset_diff = handler.compute_split_offset(offset_diff, 0, pp_rank, pp_split_rank, tp_size, pp_size) - - # Finalize the new offset - new_offset = offset_diff - - # Save each of the TP ranks in reverse order - # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved - # The final rank will then save a new NeMo file with all other ranks inside. - write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path) - - return new_offset - - -def split_tp_partition_only(model, partitions, tp_size, write_path=None, megatron_legacy=False): - if len(partitions) != 2: - raise ValueError( - "Can only split partitions of model with TP=1. For partitions of models with TP>1, merge first." 
- ) - - if tp_size < 1: - raise ValueError("TP size must to be >= 1.") - - app_state = AppState() - app_state.data_parallel_rank = 0 - app_state.pipeline_model_parallel_size = 1 - app_state.tensor_model_parallel_size = tp_size - app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - - app_state.pipeline_model_parallel_rank = 0 - app_state.tensor_model_parallel_rank = tp_size - 1 - - idx = 0 - splits = [] - for param_name, param in model.named_parameters(): - split = compute_tp_splits( - param_name, - param, - partitions, - idx, - tp_size, - pp_size=1, - pp_rank=0, - pp_split_rank=0, - megatron_legacy=megatron_legacy, - model_cfg=model.cfg, - ) - splits.append(split) - idx += 1 - - # Save each of the TP ranks in reverse order - # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved - # The final rank will then save a new NeMo file with all other ranks inside. - write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path) - - -def main(): - parser = ArgumentParser() - parser.add_argument("--model_file", type=str, default=None, required=False, help="Path to source .nemo file") - parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file") - parser.add_argument( - "--tensor_model_parallel_size", type=int, default=-1, required=False, help="TP size of source model" - ) - parser.add_argument("--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model") - parser.add_argument( - '--pipeline_model_parallel_size', type=int, default=-1, required=False, help='PP size of source model' - ) - parser.add_argument( - '--target_pipeline_model_parallel_size', type=int, required=True, help='PP size of target model' - ) - parser.add_argument( - '--target_pipeline_model_parallel_split_rank', type=int, default=0, help='PP rank to split for Enc-Dec models' - ) - parser.add_argument( - '--virtual_pipeline_model_parallel_size', type=int, default=None, help='Virtual Pipeline parallelism size' - ) - parser.add_argument( - '--ckpt_name', type=str, default=None, help='Checkpoint name to load from for Virtual Parallel' - ) - parser.add_argument( - "--model_class", - type=str, - default="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel", - help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel", - ) - parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag") - parser.add_argument('--num_gpu_per_node', default=8, type=int, help='Number of GPUs per node') - parser.add_argument( - "--megatron_legacy", - action="store_true", - help="Converter for legacy megatron modles that have different q,k,v weight splits", - ) - parser.add_argument( - "--tokenizer_model_path", - type=str, - required=False, - default=None, - help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.", - ) - parser.add_argument( - "--tokenizer_vocab_file", - type=str, - required=False, - default=None, - help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. 
This is needed if your model uses a sentencepiece tokenizer.", - ) - parser.add_argument('--hparams_file', type=str, default=None, help='Path to hparams file from PTL training') - parser.add_argument('--tp_conversion_only', action='store_true', help='Only convert TP model to TP model') - parser.add_argument('--model_extracted_dir', type=str, default=None, help='Path to pre-extracted model directory') - - args = parser.parse_args() - - precision = args.precision - num_gpu_per_node = int(args.num_gpu_per_node) - if args.precision in ["32", "16"]: - precision = int(float(args.precision)) - - if precision in ["bf16", "bf16-mixed"]: - if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): - pass - else: - logging.warning("BF16 is not supported on this device. Using FP16 instead.") - precision = precision[2:] - - if precision == 32: - dtype = torch.float32 - elif precision in [16, "16", "16-mixed"]: - dtype = torch.float16 - elif precision in ["bf16", "bf16-mixed"]: - dtype = torch.bfloat16 - else: - dtype = torch.float32 # fallback - - # Built target directory if it does not exist - target_dir = os.path.split(args.target_file)[0] - if not os.path.exists(target_dir): - os.makedirs(target_dir, exist_ok=True) - - tp_size = args.tensor_model_parallel_size - tgt_tp_size = args.target_tensor_model_parallel_size - pp_size = args.pipeline_model_parallel_size - tgt_pp_size = args.target_pipeline_model_parallel_size - pipeline_model_parallel_split_rank = args.target_pipeline_model_parallel_split_rank - vp_size = args.virtual_pipeline_model_parallel_size - if vp_size is None: - vp_size = 1 - - convert_vp = vp_size > 1 - if convert_vp: - from megatron.core import parallel_state - - parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size) - - hparams_filepath = args.hparams_file - if hparams_filepath is None: - logging.warning( - '\n\n\n!!!!!!!!!\n' - 'You are converting a model with virtual pipeline parallelism enabled, \n' - 'but have not passed `hparams_file` argument. 
\n' - 'This will cause each ckpt file to be temporarily laoded onto GPU memory!\n\n' - 'It is highly recommended to pass `hparams_file` argument to avoid this.\n' - ) - else: - hparams_filepath = None - - # Import the class of the model - cls = model_utils.import_class_by_path(args.model_class) - - if args.model_file is None and args.model_extracted_dir is None: - raise ValueError("Cannot pass model_file and model_extracted_dir as None at the same time.") - - tmp_cfg = cls.restore_from( - restore_path=args.model_file, - trainer=Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision), - map_location=torch.device("cpu"), - return_config=True, - ) - plugins = [] - if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=tmp_cfg.get('native_amp_init_scale', 2 ** 32), - growth_interval=tmp_cfg.get('native_amp_growth_interval', 1000), - hysteresis=tmp_cfg.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if tmp_cfg.get('megatron_amp_O2', False): - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) - - if tp_size < 0 or pp_size < 0: - logging.info(f"Loading model config from {args.model_file} to get TP and PP size") - model_config_internal = cls.restore_from( - restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"), return_config=True, - ) - - tp_size = model_config_internal.get('tensor_model_parallel_size', 1) - pp_size = model_config_internal.get('pipeline_model_parallel_size', 1) - - # Check if TP conversion only - tp_conversion_only = args.tp_conversion_only - if tp_conversion_only: - logging.info("Converting TP model to TP model only") - - if pp_size > 1: - raise ValueError("Provided `--tp_conversion_only` but `--pipeline_model_parallel_size` > 1") - - if tgt_pp_size > 1: - raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_size` > 1") - - if pipeline_model_parallel_split_rank > 0: - raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_split_rank` > 0") - - # Force PP size to 1 - pp_size = 1 - tgt_pp_size = 1 - pipeline_model_parallel_split_rank = 0 - - if vp_size is None or vp_size < 0: - vp_size = 1 - - app_state = AppState() - app_state.data_parallel_rank = 0 - app_state.pipeline_model_parallel_size = pp_size - app_state.tensor_model_parallel_size = tp_size - - if vp_size > 1: - app_state.virtual_pipeline_model_parallel_size = vp_size - app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - - world_size = pp_size * tp_size # pseudo world size for simulating load of a specific rank on a single gpu - - app_state.tensor_model_parallel_rank = 0 - app_state.pipeline_model_parallel_rank = 0 - - if vp_size > 1: - set_virtual_parallel_rank_safely(0) - - # Extract tokenizer artifact from the model to temp directory - logging.info("Extracting tokenizer artifact from NeMo file...") - temp_dir = tempfile.mkdtemp() - tokenizer_model_path = None - with tarfile.open(args.model_file, "r") as tar: - for member in 
tar.getmembers(): - if '.model' in member.name: - extracted_file = tar.extractfile(member) - extracted_file_path = os.path.join(temp_dir, member.name) - - if tokenizer_model_path is None: - logging.info(f"Found tokenizer. Extracting {member.name} to {extracted_file_path}") - - tokenizer_model_path = extracted_file_path - with open(extracted_file_path, "wb") as f: - f.write(extracted_file.read()) - else: - if args.tokenizer_model_path is None: - logging.warning( - f"\n\nFound multiple tokenizer artifacts in the model file.\n" - f"Using only {tokenizer_model_path}.\n" - f"If this is incorrect, manually pass the correct tokenizer using " - f"`--tokenizer_model_path`.\n\n" - ) - - # If input model has TP > 1 or PP > 1 - # Reconstruct the model to have TP = 1 and PP = 1 - # Note that this is a forward loop that will process PP [0..N] TP [0..M] in sequential order. - if tp_size > 1 or pp_size > 1: - partitions = {} # 3d list of VP x PP x TP - model = None - - # Build partitions structure - for vp_idx in range(vp_size): - partitions[vp_idx] = [] # Build first layer - VP - - for pp_idx in range(pp_size): - # For each VP, build PP x TP holder - partitions[vp_idx].append({}) - partitions[vp_idx][pp_idx] = [] - - for vp_rank in range(vp_size): - if vp_size > 1: - set_virtual_parallel_rank_safely(vp_rank) - - for pp_rank in range(pp_size): - app_state.pipeline_model_parallel_rank = pp_rank - - for tp_rank in range(tp_size): - app_state.tensor_model_parallel_rank = tp_rank - - logging.info(f"Loading ------------ PP Rank: {pp_rank} TP Rank: {tp_rank}") - - # Override flag that forces Model to use AppState instead of Trainer - # to determine the world size, global and local rank - # Used for simulating load of a specific rank on a single gpu - os.environ[NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE] = "true" - - # Compute the global rank to load the correct subset of parameters - global_rank = pp_rank * tp_size + tp_rank - - # Update AppState - app_state.world_size = world_size - app_state.global_rank = global_rank - app_state.local_rank = global_rank % num_gpu_per_node - app_state.pipeline_model_parallel_size = pp_size - app_state.tensor_model_parallel_size = tp_size - app_state.pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank - app_state.model_parallel_size = ( - app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - ) - - if vp_size > 1: - set_virtual_parallel_rank_safely(vp_rank) - - if vp_rank == 0: - save_restore_connector = NLPSaveRestoreConnector() - - if args.model_extracted_dir is not None: - logging.info(f"Using extracted model directory: {args.model_extracted_dir}") - save_restore_connector.model_extracted_dir = args.model_extracted_dir - - if args.model_file is not None: - model_filepath = args.model_file - else: - model_filepath = args.model_extracted_dir - - if vp_size == 1: - - # Get model config - tmp_cfg = cls.restore_from( - restore_path=model_filepath, - trainer=trainer, - map_location=torch.device("cpu"), - save_restore_connector=save_restore_connector, - return_config=True, - ) - - # Force model onto CPU - tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) - - # Restore model - model = cls.restore_from( - restore_path=model_filepath, - trainer=trainer, - map_location=torch.device("cpu"), - save_restore_connector=save_restore_connector, - override_config_path=tmp_cfg, - ) - model.freeze() - - # Restore model config - restore_model_config(model.cfg, restore_dict) - - else: - if args.ckpt_name is None: - raise ValueError( - "For Virtual 
Parallel, ckpt name is required.\n" - "Please provide `--ckpt_name` argument." - ) - - # inject model parallel rank - checkpoint_path = model_utils.inject_model_parallel_rank( - os.path.join(model_filepath, args.ckpt_name) - ) - - vp_state_dict = torch.load(checkpoint_path, map_location="cpu") - - if hparams_filepath is not None: - # Force the model onto CPU - tmp_cfg = OmegaConf.load(hparams_filepath) - tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) - - with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.yml') as tmp: - OmegaConf.save(tmp_cfg, tmp, resolve=True) - tmp.seek(0) - - model = cls.load_from_checkpoint( - checkpoint_path=checkpoint_path, - trainer=trainer, - map_location=torch.device("cpu"), - hparams_file=tmp.name, - ) - model.freeze() - - restore_model_config(model.cfg, restore_dict) - - else: - model = cls.load_from_checkpoint( - checkpoint_path=checkpoint_path, trainer=trainer, map_location=torch.device("cpu"), - ) - model.freeze() - - model.to(dtype=dtype) - - # Reset env flag - os.environ.pop(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, None) - - logging.info( - f"<<<<<<<< LOADED MODEL PP={pp_rank + 1} TP={tp_rank + 1} | " - f"GLOBAL RANK = {global_rank} >>>>>>>>>" - ) - - # Save the parameters - if vp_size == 1: - params = [p for p in model.parameters()] - partitions[vp_rank][pp_rank].append(params) # vp_rank = 0 - - else: - vp_params_tmp = [] - for vp_idx in range(vp_size): - set_virtual_parallel_rank_safely(vp_idx) - vp_params = vp_state_dict[f'model{vp_idx}'] - model.model[vp_idx].module.load_state_dict(vp_params, strict=True) - model.model[vp_idx].module.to('cpu') - params = [p for p in model.model[vp_idx].module.parameters()] - vp_params_tmp.append(params) - # partitions[pp_rank][vp_idx].append(params) - - for vp_idx in range(vp_size): - partitions[vp_idx][pp_rank].append(vp_params_tmp[vp_idx]) - - del vp_params_tmp - set_virtual_parallel_rank_safely(0) - - # app_state is being updated incorrectly during restore - app_state.data_parallel_rank = 0 - app_state.pipeline_model_parallel_rank = pp_rank - app_state.tensor_model_parallel_rank = tp_rank - app_state.pipeline_model_parallel_size = pp_size - app_state.tensor_model_parallel_size = tp_size - app_state.model_parallel_size = ( - app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - ) - - if vp_size > 1: - app_state.virtual_pipeline_model_parallel_size = vp_size - set_virtual_parallel_rank_safely(vp_rank) - - # Build a unified model with PP 1 TP 1 - with open_dict(model.cfg): - model.cfg.tensor_model_parallel_size = 1 - model.cfg.pipeline_model_parallel_size = 1 - model.cfg.virtual_pipeline_model_parallel_size = None - - app_state.global_rank = 0 - app_state.local_rank = 0 - app_state.data_parallel_rank = 0 - app_state.pipeline_model_parallel_rank = 0 - app_state.tensor_model_parallel_rank = 0 - app_state.pipeline_model_parallel_size = 1 - app_state.tensor_model_parallel_size = 1 - app_state.model_parallel_size = 1 - - if vp_size > 1: - set_virtual_parallel_rank_safely(None) - - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) - - with open_dict(model.cfg): - if args.tokenizer_model_path is not None: - model.cfg.tokenizer.model = args.tokenizer_model_path - if args.tokenizer_vocab_file is not None: - model.cfg.tokenizer.vocab_file = args.tokenizer_vocab_file - - model.cfg, restore_dict = force_cpu_model(model.cfg) - - # Remove Virtual Parallelism - model.cfg.virtual_pipeline_model_parallel_size = 
None - - logging.info(f"<<<<<<<< Building TP 1 PP 1 base model >>>>>>>>>") - model = cls(model.cfg, trainer) # type: nn.Module - model.freeze() - model = model.to('cpu') - model._save_restore_connector = NLPSaveRestoreConnector() - - restore_model_config(model.cfg, restore_dict) - - vp_param_count = 0 - for vp in range(vp_size): - for pp in range(pp_size): - for tp in range(tp_size): - vp_param_count += len(partitions[vp][pp][tp]) - - if vp_size > 1: - logging.debug(f"Total params in TP PP VP = 1 : {len(list(model.parameters()))}") - logging.debug(f"Total params in VP PP TP (og): {vp_param_count}") - - # Flatten Virtual Pipeline - if vp_size == 1: - # unpack vp container, pack pp tp container - partitions = partitions[0] - partitions = {idx: val for idx, val in enumerate(partitions)} - else: - flat_partitions = {idx: [] for idx in range(pp_size)} - - """ - Under VP convention - Notation : - Stage = PP rank - Number = GPT model / layer index - Ignore TP - every PP has all TP corresponding to that PP - chunk_index = the physical index of any [] in the list. Ex idx = 2 in below map corresponds to [2: PP 0 VP 1]] - - - For a PP 2 VP 4 model with 8 GPT layers- - - Indices - # Stage 0: [0:PP 0 VP 0] [2:PP 0 VP 1] [4:PP 0 VP 2] [6:PP 0 VP 3] - # Stage 1: [1:PP 1 VP 0] [3:PP 1 VP 1] [5:PP 1 VP 2] [7:PP 1 VP 3] - - after conversion will become - - # Stage 0: [0,1,2,3:PP 0] - # Stage 1: [4,5,6,7:PP 1] - - """ - pp_index = 0 - chunk_counter = 0 - tp_cache = [[] for _ in range(tp_size)] - - for vp in range(vp_size): - for pp in range(pp_size): - # Gather all TP under this VP PP combination. - # We will accumulate TP parameters from multiple layers in this cache. - for tp in range(tp_size): - tp_cache[tp].extend(partitions[vp][pp][tp]) - - # This counter indexes the global selection of a VP PP combination in the above map - chunk_counter += 1 - - # Log the mapping from old VP x PP to new PP index - logging.info(f"VP Conversion - vp: {vp} pp: {pp} -> pp_idx: {pp_index}") - - # Every vp_size chunks, we can fill a new PP index in the flat_partitions - if chunk_counter % vp_size == 0: - flat_partitions[pp_index].extend(tp_cache) - tp_cache = [[] for _ in range(tp_size)] - pp_index += 1 - - logging.debug( - f"VP merge step: \n" - f"vp: {vp} pp: {pp} pp_idx: {pp_index - 1} " - f"len(flat_partitions): {len(flat_partitions[pp_index - 1])}" - ) - - logging.debug(f"PP Size len(flat partitions) : {len(flat_partitions)}") - logging.debug(f"TP Size len(flat partitions[0]): {len(flat_partitions[0])}") - logging.debug(f"Layers len(flat partitions[0][0]) : {len(flat_partitions[0][0])}") - - partitions = flat_partitions - del tp_cache - - if tgt_tp_size > 1 or tgt_pp_size > 1: - merge_partition(model, partitions) - else: - # Write out the PP 1 TP 1 model to disk - merge_partition(model, partitions, args.target_file) - - # Empty cache memory of all parameters from all PP TP partitions - partitions.clear() - - else: - # If input model has TP = 1 and PP = 1 - app_state.model_parallel_size = 1 - - save_restore_connector = NLPSaveRestoreConnector() - - if args.model_extracted_dir is not None: - logging.info(f"Using extracted model directory: {args.model_extracted_dir}") - save_restore_connector.model_extracted_dir = args.model_extracted_dir - - if args.model_file is not None: - model_filepath = args.model_file - else: - model_filepath = args.model_extracted_dir - - tmp_cfg = cls.restore_from( - restore_path=model_filepath, - trainer=trainer, - map_location=torch.device("cpu"), - 
save_restore_connector=save_restore_connector, - return_config=True, - ) - - tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) - - model = cls.restore_from( - restore_path=model_filepath, - trainer=trainer, - map_location=torch.device("cpu"), - save_restore_connector=save_restore_connector, - override_config_path=tmp_cfg, - ) - model.to(dtype=dtype) - - restore_model_config(model.cfg, restore_dict) - - # If target model has TP > 1 or PP > 1 - if tgt_pp_size > 1 or tgt_tp_size > 1: - - # Preserve the TP 1 PP 1 model parameters and names - global_params = [] - global_params.append([p for n, p in model.named_parameters()]) # params - global_params.append([n for n, p in model.named_parameters()]) # names - - logging.debug("Global parameters:") - for idx, (name, p) in enumerate(zip(global_params[1], global_params[0])): - logging.debug(f"{name} - {p.shape}") - - logging.info(f"TP 1 PP 1 Number of Parameters : {len(global_params[0])}") - - world_size = ( - tgt_pp_size * tgt_tp_size - ) # pseudo world size for simulating load of a specific rank on a single gpu - new_global_batch_size = model.cfg.micro_batch_size * world_size - old_global_batch_size = model.cfg.get('global_batch_size', model.cfg.micro_batch_size) - - global_offset = len(global_params[0]) - 1 # -1 cause this indexes the array, range [0, L-1] - logging.info(f"Final layer offset for parameters: {global_offset}") - - for pp_rank in range(tgt_pp_size - 1, -1, -1): # reverse order - - with open_dict(model.cfg): - model.cfg.pipeline_model_parallel_size = tgt_pp_size - model.cfg.tensor_model_parallel_size = tgt_tp_size - - if 'pipeline_model_parallel_split_rank' in model.cfg: - if pipeline_model_parallel_split_rank > 0: - model.cfg.pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank - elif pp_size > 1: - logging.warning( - f"Model config has `pipeline_model_parallel_split_rank` set to " - f"{model.cfg.pipeline_model_parallel_split_rank} and target PP " - f"size is {tgt_pp_size}. " - f"Provided `pipeline_model_parallel_split_rank` is " - f"{pipeline_model_parallel_split_rank}. " - f"Be careful that the model config is correct " - f"if encoder-decoder models are being converted." 
- ) - - model.cfg.global_batch_size = old_global_batch_size # Used for restoration - - # Override flag that forces Model to use AppState instead of Trainer - # to determine the world size, global and local rank - # Used for simulating load of a specific rank on a single gpu - os.environ[NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE] = "true" - - # Compute the global rank - global_rank = ( - pp_rank * tgt_tp_size + 0 - ) # tp_rank = 0 needed just for modules, all TP will be merged to this PP rank - - # Update AppState - app_state.world_size = world_size - app_state.global_rank = global_rank - app_state.local_rank = global_rank % num_gpu_per_node - app_state.pipeline_model_parallel_size = tgt_pp_size - app_state.tensor_model_parallel_size = tgt_tp_size - app_state.model_parallel_size = ( - app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size - ) - - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) - if args.tokenizer_model_path is not None: - with open_dict(model.cfg): - model.cfg.tokenizer.model = args.tokenizer_model_path - - else: - if tokenizer_model_path is None: - logging.warning("Could not extract tokenizer model file from checkpoint.") - - else: - # Extract tokenizer info - with open_dict(model.cfg): - model.cfg.tokenizer.model = tokenizer_model_path - - model.cfg, restore_dict = force_cpu_model(model.cfg) - - model = cls(model.cfg, trainer) - model = model.to('cpu') - model._save_restore_connector = NLPSaveRestoreConnector() - model.freeze() - model.to(dtype=dtype) - - restore_model_config(model.cfg, restore_dict) - - # Update global batch size - if old_global_batch_size % new_global_batch_size != 0 or old_global_batch_size < new_global_batch_size: - logging.info( - f"Global batch size {old_global_batch_size} is not divisible by new global batch size {new_global_batch_size}." - f" The model config will be updated with new global batch size {new_global_batch_size}." - ) - with open_dict(model.cfg): - model.cfg.global_batch_size = new_global_batch_size - - logging.info(f"Global rank: {global_rank} Local rank: {app_state.local_rank} World size: {world_size}") - logging.info(f"PP rank: {pp_rank} TP rank: {0}") - logging.info(f"TP 1 PP 1 Number of Layers : {len(global_params[0])}") - logging.info(f"Remaining layer offset for parameters: {global_offset}") - logging.info("\n") - - # Special case for TP conversion only mode - if tp_conversion_only: - logging.info(f"Skipping PP split due to flag `--tp_conversion_only`") - - split_tp_partition_only(model, global_params, tgt_tp_size, args.target_file, args.megatron_legacy) - break - - global_offset = split_partition( - model, - global_params, - tgt_pp_size, - tgt_tp_size, - pp_rank, - global_offset, - pipeline_model_parallel_split_rank, - args.target_file, - args.megatron_legacy, - ) - - # Reset env flag - os.environ.pop(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, None) - - # Check if invalid global offset - after all PP splits, global offset should be -1 - if global_offset < -1 and not tp_conversion_only: - raise ValueError( - f"Invalid global offset {global_offset} found for global rank {app_state.global_rank} " - f"and local rank {app_state.local_rank}. Should be -1 if all parameters have been assigned. " - f"Currently, seems some parameters were duplicated." - ) - elif global_offset > -1 and not tp_conversion_only: - logging.error("\n") - logging.error("!" 
* 80) - logging.error("Error: Some parameters were not correctly added to model partitions.") - logging.error("Below is list of parameters skipped in reverse order: ") - - for param_id in range(global_offset, -1, -1): - logging.error( - f"Param ID: {param_id} : {global_params[1][param_id]} {global_params[0][param_id].shape}" - ) - logging.error("!" * 80) - - raise ValueError( - f"Invalid global offset {global_offset} found for global rank {app_state.global_rank} " - f"and local rank {app_state.local_rank}. Should be -1 if all parameters have been assigned. " - f"Currently, seems some parameters were not assigned." - ) - - logging.info("Successfully finished changing partitions!") - - if temp_dir is not None: - shutil.rmtree(temp_dir, ignore_errors=True) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py deleted file mode 100644 index 1161614e0a53471575a8ec8807cf55c15c1447c4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ /dev/null @@ -1,198 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r""" -Conversion script to convert PTL checkpoints into nemo checkpoint. 
- Example to run this conversion script: - python -m torch.distributed.launch --nproc_per_node= * \ - megatron_ckpt_to_nemo.py \ - --checkpoint_folder \ - --checkpoint_name \ - --nemo_file_path \ - --tensor_model_parallel_size \ - --pipeline_model_parallel_size -""" - -import dis -import os -from argparse import ArgumentParser - -import torch -from genericpath import isdir -from megatron.core import parallel_state -from omegaconf import open_dict -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.utils import AppState, logging -from nemo.utils.distributed import initialize_distributed -from nemo.utils.model_utils import inject_model_parallel_rank - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_folder", - type=str, - default=None, - required=True, - help="Path to PTL checkpoints saved during training. Ex: /raid/nemo_experiments/megatron_gpt/checkpoints", - ) - parser.add_argument( - "--checkpoint_name", - type=str, - default=None, - required=True, - help="Name of checkpoint to be used. Ex: megatron_gpt--val_loss=6.34-step=649-last.ckpt", - ) - - parser.add_argument( - "--hparams_file", - type=str, - default=None, - required=False, - help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", - ) - parser.add_argument("--nemo_file_path", type=str, default=None, required=True, help="Path to output .nemo file.") - parser.add_argument("--gpus_per_node", type=int, required=True, default=None) - parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) - parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None) - parser.add_argument( - "--pipeline_model_parallel_split_rank", - type=int, - required=False, - default=None, - help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", - ) - parser.add_argument( - "--model_type", - type=str, - required=True, - default="gpt", - choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], - ) - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) - parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") - - args = parser.parse_args() - return args - - -def convert(local_rank, rank, world_size, args): - - app_state = AppState() - app_state.data_parallel_rank = 0 - num_nodes = world_size // args.gpus_per_node - plugins = [] - strategy = "auto" - if args.bcp: - plugins.append(TorchElasticEnvironment()) - if args.model_type == 'gpt': - strategy = NLPDDPStrategy() - - trainer = Trainer( - devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu', plugins=plugins, strategy=strategy - ) - - app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = args.tensor_model_parallel_size - # Auto set split rank for T5, BART, NMT if split rank is None. - if args.pipeline_model_parallel_size > 1 and args.model_type in ['t5', 'bart', 'nmt']: - if args.pipeline_model_parallel_split_rank is not None: - app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank - else: - if args.pipeline_model_parallel_size % 2 != 0: - raise ValueError( - f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified." - ) - else: - # If split rank is not set, then we set it to be pipeline_model_parallel_size // 2 - this is because in most cases we have the same number of enc/dec layers. 
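# A minimal illustrative sketch (not part of the script above) of the split-rank
# default described in the preceding comment: for encoder-decoder models (t5/bart/nmt),
# when --pipeline_model_parallel_split_rank is omitted, the encoder is assumed to take
# the first half of the pipeline stages, so the split falls at pipeline_model_parallel_size // 2.
def default_split_rank(pipeline_model_parallel_size: int) -> int:
    if pipeline_model_parallel_size % 2 != 0:
        raise ValueError(
            f"Pipeline model parallel size {pipeline_model_parallel_size} must be even "
            "if split rank is not specified."
        )
    return pipeline_model_parallel_size // 2


assert default_split_rank(4) == 2  # 2 encoder stages, 2 decoder stages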
- app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2 - else: - app_state.pipeline_model_parallel_split_rank = None - - app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size - - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=app_state.tensor_model_parallel_size, - pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, - ) - - app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() - app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank() - - # check for distributed checkpoint - dist_ckpt_dir = os.path.join(args.checkpoint_folder, args.checkpoint_name) - if os.path.isdir(dist_ckpt_dir): - checkpoint_path = dist_ckpt_dir - else: - # legacy checkpoint needs model parallel injection - checkpoint_path = inject_model_parallel_rank(os.path.join(args.checkpoint_folder, args.checkpoint_name)) - - logging.info( - f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}' - ) - - if args.model_type == 'gpt': - model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) - elif args.model_type == 'sft': - model = MegatronGPTSFTModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - # we force the target for the loaded model to have the correct target - # because the hparams.yaml sometimes contains MegatronGPTModel as the target. - with open_dict(model.cfg): - model.cfg.target = f"{MegatronGPTSFTModel.__module__}.{MegatronGPTSFTModel.__name__}" - - elif args.model_type == 'bert': - model = MegatronBertModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - elif args.model_type == 't5': - model = MegatronT5Model.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) - elif args.model_type == 'bart': - model = MegatronBARTModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - elif args.model_type == 'nmt': - model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) - elif args.model_type == 'retro': - model = MegatronRetrievalModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - model._save_restore_connector = NLPSaveRestoreConnector() - - if torch.distributed.is_initialized(): - torch.distributed.barrier() - - model.save_to(args.nemo_file_path) - - logging.info(f'NeMo model saved to: {args.nemo_file_path}') - - -if __name__ == '__main__': - args = get_args() - - local_rank, rank, world_size = initialize_distributed(args) - - convert(local_rank, rank, world_size, args) diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_export.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_export.py deleted file mode 100644 index bf9157884bfc989b6d07d459b8d46c6798f334e4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_export.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
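# A hedged sketch (illustration only) of the checkpoint-path resolution used by the
# conversion script above: distributed (sharded) checkpoints are directories and can be
# loaded as-is, while legacy checkpoints need the model-parallel rank injected into the
# file path via nemo.utils.model_utils.inject_model_parallel_rank.
import os

from nemo.utils.model_utils import inject_model_parallel_rank


def resolve_checkpoint_path(checkpoint_folder: str, checkpoint_name: str) -> str:
    candidate = os.path.join(checkpoint_folder, checkpoint_name)
    if os.path.isdir(candidate):
        return candidate  # distributed checkpoint: the directory itself is the checkpoint
    return inject_model_parallel_rank(candidate)  # legacy per-rank checkpoint file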
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core import ModelPT -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank - - -def get_model_class(cfg): - if cfg.model_type == 'gpt': - return MegatronGPTModel - elif cfg.model_type == 'bert': - return MegatronBertModel - elif cfg.model_type == 't5': - return MegatronT5Model - elif cfg.model_type == 'bart': - return MegatronBARTModel - elif cfg.model_type == 'nmt': - return MegatronNMTModel - elif cfg.model_type == 'retro': - return MegatronRetrievalModel - else: - raise ValueError("Invalid Model Type") - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_export") -def nemo_export(cfg): - """Convert a nemo model into .onnx ONNX format.""" - nemo_in = None - if cfg.gpt_model_file: - nemo_in = cfg.gpt_model_file - elif cfg.checkpoint_dir: - nemo_in = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) - assert nemo_in is not None, "NeMo model not provided. 
Please provide the path to the .nemo or .ckpt file" - - onnx_out = cfg.onnx_model_file - - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - logging.info("Restoring NeMo model from '{}'".format(nemo_in)) - try: - if cfg.gpt_model_file: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - - pretrained_cfg = ModelPT.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.sequence_parallel = False - pretrained_cfg.activations_checkpoint_granularity = None - pretrained_cfg.activations_checkpoint_method = None - pretrained_cfg.precision = trainer.precision - if trainer.precision == "16": - pretrained_cfg.megatron_amp_O2 = False - model = ModelPT.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=save_restore_connector, - ) - elif cfg.checkpoint_dir: - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) - model_cls = get_model_class(cfg) - model = model_cls.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) - else: - raise ValueError("need at least a nemo file or checkpoint dir") - except Exception as e: - logging.error( - "Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format( - nemo_in - ) - ) - raise e - - logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in)) - - # Export - check_trace = cfg.export_options.runtime_check - - try: - model.to(device=cfg.export_options.device).freeze() - model.eval() - model.export( - onnx_out, - onnx_opset_version=cfg.export_options.onnx_opset, - do_constant_folding=cfg.export_options.do_constant_folding, - dynamic_axes={ - 'input_ids': {0: "sequence", 1: "batch"}, - 'position_ids': {0: "sequence", 1: "batch"}, - 'logits': {0: "sequence", 1: "batch"}, - }, - check_trace=check_trace, - check_tolerance=cfg.export_options.check_tolerance, - verbose=cfg.export_options.verbose, - ) - except Exception as e: - logging.error( - "Export failed. 
Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( - model.__class__ - ) - ) - raise e - - -if __name__ == '__main__': - nemo_export() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_continue_training.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_continue_training.py deleted file mode 100644 index 7b8a9897e1c00554790ae259257dc36cde97f2bb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_continue_training.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import AppState, logging -from nemo.utils.exp_manager import exp_manager -from nemo.utils.model_utils import inject_model_parallel_rank - - -def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original gpt pre-training config (t5_cfg) with attributes from the finetuning config (cfg). - The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. 
- """ - OmegaConf.set_struct(gpt_cfg, True) - OmegaConf.resolve(cfg) - with open_dict(gpt_cfg): - gpt_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - gpt_cfg.micro_batch_size = cfg.model.micro_batch_size - gpt_cfg.global_batch_size = cfg.model.global_batch_size - gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) - gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) - gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) - gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) - gpt_cfg.data = cfg.model.data - gpt_cfg.optim = cfg.model.optim - gpt_cfg.precision = cfg.trainer.precision - gpt_cfg.restore_from_path = cfg.restore_from_path - gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint - gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view - gpt_cfg.encoder_seq_length = cfg.model.encoder_seq_length - gpt_cfg.max_position_embeddings = cfg.model.max_position_embeddings - gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor - gpt_cfg.use_flash_attention = cfg.model.use_flash_attention - gpt_cfg.tensor_model_parallel_size = cfg.model.get('tensor_model_parallel_size', 1) - gpt_cfg.pipeline_model_parallel_size = cfg.model.get('pipeline_model_parallel_size', 1) - gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get('pipeline_model_parallel_split_rank', 0) - - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. - if add_cfg_to_tree: - OmegaConf.resolve(gpt_cfg) - gpt_cfg.cfg = gpt_cfg - - return gpt_cfg - - -def load_from_nemo(cls, cfg, trainer, gpt_cfg, modify_confg_fn): - gpt_cfg = modify_confg_fn(gpt_cfg, cfg, add_cfg_to_tree=False) - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.restore_from_path - model = cls.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - override_config_path=gpt_cfg, - save_restore_connector=save_restore_connector, - ) - return model - - -def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): - app_state = AppState() - if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank( - os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) - ) - hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) - gpt_cfg = 
modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) - with tempfile.NamedTemporaryFile(suffix='.yaml') as f: - OmegaConf.save(config=gpt_cfg, f=f.name) - model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) - return model - - -def validate_checkpoint_loading_args(cfg): - if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): - raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') - if cfg.checkpoint_name is None: - raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') - if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): - raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name', 'fused_adam') == 'distributed_fused_adam' - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - if cfg.restore_from_path: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.restore_from_path - gpt_cfg = MegatronGPTModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - model = load_from_nemo(MegatronGPTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) - elif cfg.model.get("pretrained_checkpoint", None) is not None: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronGPTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) - else: - print(' > WARNING: No checkpoint provided. 
Starting from scratch.') - model = MegatronGPTModel(cfg.model, trainer) - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_eval.py deleted file mode 100644 index c9eb013b64e98e50328b320ab208af39ff1ca38f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_eval.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import datetime -import os -import threading -from functools import partial - -import torch -from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer -from torch.utils.data import DataLoader, Dataset - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -""" -This is the script to run GPT text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference from a nemo file: - python megatron_gpt_eval.py \ - gpt_model_file=PATH_TO_MODEL \ - inference.greedy=True \ - inference.add_BOS=True \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - prompts=[prompt1,prompt2] - - b. run greedy inference from a PTL checkpoint file: - python megatron_gpt_eval.py \ - checkpoint_dir=PATH_TO_CHECKPOINT_FILE \ - checkpoint_name=CHECKPOINT_FILE_NAME \ - hparams_file=HPARAMS_FILE \ - inference.greedy=True \ - inference.add_BOS=True \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - prompts=[prompt1,prompt2] - - c. run top_p inference from a nemo file: - python megatron_gpt_eval.py \ - gpt_model_file=PATH_TO_MODEL \ - inference.greedy=False \ - inference.top_k=0 \ - inference.top_p=0.9 \ - inference.repetition_penalty=1.2 \ - inference.add_BOS=True \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - prompts=[prompt1,prompt2] - - d. 
If you don't need to generate tokens and need model to compute logprobs: - python megatron_gpt_eval.py \ - gpt_model_file=PATH_TO_MODEL \ - inference.compute_logprob=True \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - prompts=[text to get logprob] - - e. Launch the inference server - python megatron_gpt_eval.py \ - gpt_model_file=PATH_TO_MODEL \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - server=True - - To send a request to the server, here is one example code: - ```python - import json - import requests - - batch_size = 8 - port_num = 5555 - headers = {"Content-Type": "application/json"} - - - def request_data(data): - resp = requests.put('http://localhost:{}/generate'.format(port_num), - data=json.dumps(data), - headers=headers) - sentences = resp.json()['sentences'] - return sentences - - - data = { - "sentences": [""] * batch_size, - "tokens_to_generate": 300, - "temperature": 1.0, - "add_BOS": True, - "top_k": 0, - "top_p": 0.9, - "greedy": False, - "all_probs": False, - "repetition_penalty": 1.2, - "min_tokens_to_generate": 2, - } - - sentences = request_data(data) - ``` -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -class RequestDataSet(Dataset): - def __init__(self, sentences): - super().__init__() - self.sentences = sentences - - def __len__(self,): - return len(self.sentences) - - def __getitem__(self, idx): - return self.sentences[idx] - - -def remove_padded_prompts(response, nb_paddings): - result = {} - for k, v in response.items(): - if v != None and (type(v) is list or type(v) is torch.Tensor): - v = v[:-nb_paddings] - result[k] = v - return result - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer( - strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), - **cfg.trainer, - callbacks=[CustomProgressBar()], - ) - - if cfg.gpt_model_file is not None: - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - - # with dist checkpointing we don't need to set this - if not model_config.get('mcore_gpt', False): - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - if cfg.gpt_model_file: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - - pretrained_cfg = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - 
return_config=True, - save_restore_connector=save_restore_connector, - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.sequence_parallel = False - pretrained_cfg.activations_checkpoint_granularity = None - pretrained_cfg.activations_checkpoint_method = None - pretrained_cfg.precision = trainer.precision - if pretrained_cfg.get('mcore_gpt', False): - # with dist checkpointing we can use the model parallel config specified by the user - pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size - pretrained_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - if trainer.precision == "16": - pretrained_cfg.megatron_amp_O2 = False - elif trainer.precision in ['bf16', 'bf16-mixed'] and cfg.get('megatron_amp_O2', False): - pretrained_cfg.megatron_amp_O2 = True - model = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=save_restore_connector, - map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models - ) - elif cfg.checkpoint_dir: - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) - # checkpoint_path is a dir in case of distributed checkpointing - if not os.path.isdir(checkpoint_path): - # legacy checkpoint needs model parallel rank injection - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) - model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) - else: - raise ValueError("need at least a nemo file or checkpoint dir") - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - length_params: LengthParam = { - "max_length": cfg.inference.tokens_to_generate, - "min_length": cfg.inference.min_tokens_to_generate, - } - - sampling_params: SamplingParam = { - "use_greedy": cfg.inference.greedy, - "temperature": cfg.inference.temperature, - "top_k": cfg.inference.top_k, - "top_p": cfg.inference.top_p, - "repetition_penalty": cfg.inference.repetition_penalty, - "add_BOS": cfg.inference.add_BOS, - "all_probs": cfg.inference.all_probs, - "compute_logprob": cfg.inference.compute_logprob, - "end_strings": cfg.inference.end_strings, - } - - fp8_enabled = hasattr(model.cfg, "fp8") and (model.cfg.fp8 == True) - if fp8_enabled: - nb_paddings = 0 - while len(cfg.prompts) % 8 != 0: - cfg.prompts.append("") - nb_paddings += 1 - 
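# The FP8 padding above, condensed into a small helper for clarity (a sketch, not part
# of the original script): prompt batches are padded with empty strings to a multiple
# of 8, and the same count is later stripped from the response by remove_padded_prompts
# (which slices v[:-nb_paddings], so stripping is only meaningful when nb_paddings > 0).
def pad_prompts_for_fp8(prompts: list) -> tuple:
    nb_paddings = (-len(prompts)) % 8
    return prompts + [""] * nb_paddings, nb_paddings


padded, nb_paddings = pad_prompts_for_fp8(["How are you?", "Tell me a joke."])
assert len(padded) % 8 == 0 and nb_paddings == 6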
- # First method of running text generation, call model.generate method - response = model.generate( - inputs=OmegaConf.to_container(cfg.prompts), length_params=length_params, sampling_params=sampling_params - ) - - if fp8_enabled: - response = remove_padded_prompts(response, nb_paddings) - print("***************************") - print(response) - print("***************************") - - # Second method of running text generation, call trainer.predict [recommended] - bs = 8 if fp8_enabled else 2 - ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) - request_dl = DataLoader(dataset=ds, batch_size=bs) - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, request_dl) - - if fp8_enabled: - response[-1] = remove_padded_prompts(response[-1], nb_paddings) - print("***************************") - print(response) - print("***************************") - - # Third method of running text generation, use inference server - if cfg.server: - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, - daemon=True, - args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_pretraining.py deleted file mode 100644 index 44834b35a0cfab0c5de8620618531f2194e8904a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-
-import torch.multiprocessing as mp
-from omegaconf.omegaconf import OmegaConf, open_dict
-
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
-from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
-from nemo.core.config import hydra_runner
-from nemo.utils import logging
-from nemo.utils.exp_manager import exp_manager
-
-mp.set_start_method("spawn", force=True)
-
-
-@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
-def main(cfg) -> None:
-    logging.info("\n\n************** Experiment configuration ***********")
-    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
-
-    trainer = MegatronTrainerBuilder(cfg).create_trainer()
-    exp_manager(trainer, cfg.exp_manager)
-
-    model = MegatronGPTModel(cfg.model, trainer)
-
-    trainer.fit(model)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py
deleted file mode 100644
index 89077f099aebf0beb6437822911e3e8990a916c3..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch.multiprocessing as mp
-from omegaconf.omegaconf import OmegaConf, open_dict
-from pytorch_lightning import Trainer
-from pytorch_lightning.plugins.environments import TorchElasticEnvironment
-
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import (
-    MegatronGPTPromptLearningModel,
-)
-from nemo.collections.nlp.parts.nlp_overrides import (
-    CustomProgressBar,
-    GradScaler,
-    MegatronHalfPrecisionPlugin,
-    NLPDDPStrategy,
-    NLPSaveRestoreConnector,
-    PipelineMixedPrecisionPlugin,
-)
-from nemo.core.config import hydra_runner
-from nemo.utils import logging
-from nemo.utils.exp_manager import exp_manager
-
-mp.set_start_method("spawn", force=True)
-
-"""
-This is an example of how to ptune/prompt-tune a pretrained GPT model.
-Be sure to use a .nemo gpt model with this code. If you've downloaded
-a model from NGC or are otherwise using a MegatronLM model, please use
-either megatron_ckpt_to_nemo.py or megatron_lm_ckpt_to_nemo.py found
-within this examples directory to convert your model to .nemo format.
-""" - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_prompt_learning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - - plugins = [] - strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - enabled=False - if cfg.model.pipeline_model_parallel_size > 1 - else True, # turn off the grad scale for pipeline parallel LM model - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronGPTPromptLearningModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronGPTPromptLearningModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py deleted file mode 100644 index e096bafc6132840f7369c1983c538b4b599f01f9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import os
-
-import torch
-import torch.multiprocessing as mp
-from megatron.core import parallel_state
-from omegaconf import OmegaConf
-from omegaconf.omegaconf import open_dict
-from pytorch_lightning.trainer.trainer import Trainer
-
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import (
-    MegatronGPTPromptLearningModel,
-)
-from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
-from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector
-from nemo.core.config import hydra_runner
-from nemo.utils import logging
-
-mp.set_start_method("spawn", force=True)
-
-"""
-This is the script to run GPT text generation.
-    a. run greedy inference from a p-tuned/prompt-tuned model's nemo file:
-        python megatron_gpt_prompt_learning_eval.py \
-            virtual_prompt_model_file=PATH_TO_NEMO_PROMPT_LEARNING_MODEL_FILE \
-            gpt_model_file=PATH_TO_FROZEN_GPT_MODEL_FILE \
-            inference.greedy=True \
-            inference.add_BOS=False \
-            trainer.devices=1 \
-            trainer.num_nodes=1 \
-            tensor_model_parallel_size=1 \
-            pipeline_model_parallel_size=1 \
-            pred_file_path=PATH_WHERE_PRED_TEXT_FILE_WILL_BE_SAVED \
-            data_paths=[path/to/dataset1.jsonl, path/to/dataset2.jsonl]
-
-    virtual_prompt_model_file should be a path to a .nemo file saved after p-tuning/prompt tuning,
-    and gpt_model_file is still the path to the base GPT model's .nemo file.
-
-    data_paths should be a list of .json or .jsonl files containing json objects similar to the ones
-    used during prompt learning. They should have keys that match the fields specified in the prompt template.
-    Fields can be dropped from the prompt dict and their corresponding section of the prompt template will
-    be automatically removed.
-
-    For example, say the prompt template during p-tuning/prompt-tuning looked like:
-
-    '<|VIRTUAL_PROMPT_0|> Context: {context} Question: {question} Answer: {answer}'
-
-    but you don't want to include the answer field during inference. Just don't
-    include the answer field in the prompt dict like below:
-
-    {"taskname": "squad", "context": "some paragraph", "question": "question related to paragraph"}
-    {"taskname": "squad", "context": "another paragraph", "question": "a different question related to paragraph"}
-
-    And the dataset class will automatically format your input to have the form:
-
-    [
-        '<|VIRTUAL_PROMPT_0|> Context: some paragraph Question: question related to paragraph Answer:',
-        '<|VIRTUAL_PROMPT_0|> Context: another paragraph Question: a different question related to paragraph Answer:'
-    ]
-
-    Similarly for other scenarios, just add virtual_prompt_model_file=PATH_TO_NEMO_PROMPT_LEARNING_MODEL_FILE if you're using a
-    p-tuned/prompt-tuned model.
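# A rough illustration (not the actual NeMo prompt-learning dataset class) of the
# formatting described in the docstring above: known fields are substituted into the
# prompt template and placeholders whose fields are missing from the json object are dropped.
import re


def format_prompt(template: str, example: dict) -> str:
    out = template
    for field in re.findall(r"\{(\w+)\}", template):
        out = out.replace("{" + field + "}", str(example.get(field, "")))
    return out.rstrip()


assert format_prompt(
    '<|VIRTUAL_PROMPT_0|> Context: {context} Question: {question} Answer: {answer}',
    {"taskname": "squad", "context": "some paragraph", "question": "question related to paragraph"},
) == '<|VIRTUAL_PROMPT_0|> Context: some paragraph Question: question related to paragraph Answer:'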
-""" - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_prompt_learning_inference") -def main(cfg) -> None: - if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer, callbacks=[CustomProgressBar()]) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - # Update frozen GPT model path if it is given in case it has changed - prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from( - cfg.virtual_prompt_model_file, trainer=trainer, return_config=True, - ) - if cfg.get("gpt_model_file"): - with open_dict(prompt_learning_cfg): - prompt_learning_cfg.language_model_path = cfg.gpt_model_file - prompt_learning_cfg.sequence_parallel = False - prompt_learning_cfg.activations_checkpoint_method = None - prompt_learning_cfg.activations_checkpoint_granularity = None - prompt_learning_cfg.activations_checkpoint_num_layers = None - - # Load prompt tuned model, virtual_prompt_model_file must be provided in config - # Now load prompt learning model with frozen gpt model base - - model = MegatronGPTPromptLearningModel.restore_from( - restore_path=cfg.virtual_prompt_model_file, trainer=trainer, override_config_path=prompt_learning_cfg, - ) - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - # Check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def placeholder(): - return - - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(placeholder, trainer=model.trainer) - model.trainer.strategy.setup_environment() - - length_params: LengthParam = { - "max_length": cfg.inference.tokens_to_generate, - "min_length": cfg.inference.min_tokens_to_generate, - } - - sampling_params: SamplingParam = { - "use_greedy": cfg.inference.greedy, - "temperature": cfg.inference.temperature, - "top_k": cfg.inference.top_k, - "top_p": cfg.inference.top_p, - "repetition_penalty": cfg.inference.repetition_penalty, - "add_BOS": cfg.inference.add_BOS, - "all_probs": cfg.inference.all_probs, - "compute_logprob": cfg.inference.compute_logprob, - } - - max_seq_length = model.frozen_model.cfg.encoder_seq_length - length_params["max_length"] - max_seq_length = min(max_seq_length, cfg.get("max_seq_length", 8192)) - - _, dataloader = model.build_virtual_prompt_dataset( - data=cfg.data_paths, - 
batch_size=cfg.inference.get('batch_size', 1), - max_seq_length=max_seq_length, - min_seq_length=model.cfg.data.get('min_seq_length', 1), - add_bos=sampling_params["add_BOS"], - add_eos=False, - for_train=False, - tokens_to_generate=length_params["max_length"], - drop_last=False, - shuffle=False, - num_workers=cfg.get("num_workers", 1), - ) - - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, dataloader) - - print("***************************") - with open(cfg.pred_file_path, "w", encoding="utf-8") as pred_file: - for i in range(len(response)): - for sent in response[i]["sentences"]: - sent = sent.strip() - sent = sent.replace("\n", " ") - pred_file.write(sent + "\n") - print(f"Inference Complete, prediction file saved at {cfg.pred_file_path}") - print("***************************") - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_test.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_test.py deleted file mode 100644 index 5fc40039098b85c010b24ede0042f55d4393f438..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
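Editor's note: the eval script's docstring above describes how a prompt template and a JSON record are combined, with placeholders for missing keys (e.g. "answer" at inference time) dropped. The sketch below is an illustration of that described behavior only, not the actual NeMo dataset class.

```python
import re

# Illustrative sketch: fill template placeholders from the record, drop the ones
# whose keys are missing (e.g. "answer" at inference time).
def format_prompt(template: str, record: dict) -> str:
    def fill(match: re.Match) -> str:
        key = match.group(1)
        return str(record[key]) if key in record else ""
    # rstrip collapses the space left behind by a dropped trailing field
    return re.sub(r"\{(\w+)\}", fill, template).rstrip()

record = {"taskname": "squad",
          "context": "some paragraph",
          "question": "question related to paragraph"}
template = "<|VIRTUAL_PROMPT_0|> Context: {context} Question: {question} Answer: {answer}"
print(format_prompt(template, record))
# -> '<|VIRTUAL_PROMPT_0|> Context: some paragraph Question: question related to paragraph Answer:'
```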
- -from omegaconf.omegaconf import OmegaConf -from pytorch_lightning import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank -from nemo.collections.nlp.parts.nlp_overrides import ( - NLPDDPStrategy, - NLPMixedPrecisionPlugin, - NLPPrecisionPlugin, - NLPSaveRestoreConnector, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.app_state import AppState - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = None - if cfg.trainer.precision == 16: - trainer = Trainer( - plugins=[ - NLPMixedPrecisionPlugin( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - ), - ], - strategy=NLPDDPStrategy(), - **cfg.trainer, - ) - elif cfg.trainer.precision == 'bf16': - trainer = Trainer(plugins=[NLPNativeBfloat16PrecisionPlugin(),], strategy=NLPDDPStrategy(), **cfg.trainer,) - else: - trainer = Trainer(plugins=[NLPPrecisionPlugin()], strategy=NLPDDPStrategy(), **cfg.trainer) - - app_state = AppState() - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size - app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size) - - model = MegatronGPTModel.restore_from( - cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(), - ) - - # Note: most nemo models must have the data paths configured before instantiating the model - # MegatronGPTMOdel sets up the data in the PTL method .setup which happens after DDP spawns. - model.cfg.data.splits_string = cfg.model.data.splits_string - - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_validate.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_validate.py deleted file mode 100644 index b5a61e627a14ea4da4861579c95862f1fc88b0fa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_gpt_validate.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
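Editor's note: both the prompt-learning eval script above and the validate script below assert that the launched world size matches the model-parallel layout. A tiny standalone sketch of that constraint:

```python
# Sanity-check sketch of the assertion used in these scripts: the number of ranks
# (devices * num_nodes) must equal tensor_parallel * pipeline_parallel.
def check_world_size(devices: int, num_nodes: int, tp: int, pp: int) -> None:
    if devices * num_nodes != tp * pp:
        raise ValueError(
            f"devices * num_nodes ({devices * num_nodes}) must equal "
            f"tensor_model_parallel_size * pipeline_model_parallel_size ({tp * pp})"
        )

check_world_size(devices=4, num_nodes=1, tp=1, pp=4)  # matches the validate sample usage below
```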
- -import os -import tempfile - -from omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import ( - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank - -""" Example script showing how to run validation on a MegatronGPT model. - - Sample usage: - - From nemo model: - - python megatron_gpt_validate.py \ - trainer.devices=4 \ - trainer.num_nodes=1 \ - trainer.limit_val_batches=10 \ - trainer.max_steps=100 \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=4 \ - trainer.precision=bf16 \ - gpt_model_file=/path/to/megatron_gpt_tp_1_pp4.nemo - - from PTL checkpoint: - python megatron_gpt_validate.py \ - trainer.devices=4 \ - trainer.num_nodes=1 \ - trainer.limit_val_batches=10 \ - trainer.max_steps=100 \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=4 \ - virtual_pipeline_model_parallel_size=4 \ - trainer.precision=bf16 \ - checkpoint_dir='/path/to/experiment/checkpoints' \ - checkpoint_name='megatron_gpt--val_loss=7.78-step=100-consumed_samples=6336.0-last.ckpt' \ - hparams_file='/path/to/experiment/hparams.yaml - -""" - - -def modify_pretrained_cfg(pretrained_cfg, trainer, cfg): - with open_dict(pretrained_cfg): - OmegaConf.set_struct(pretrained_cfg, True) - pretrained_cfg.sequence_parallel = False - pretrained_cfg.activations_checkpoint_granularity = None - pretrained_cfg.activations_checkpoint_method = None - pretrained_cfg.precision = trainer.precision - if cfg.micro_batch_size is not None: - pretrained_cfg.micro_batch_size = cfg.micro_batch_size - if cfg.global_batch_size is not None: - pretrained_cfg.global_batch_size = cfg.global_batch_size - if trainer.precision == "16": - pretrained_cfg.megatron_amp_O2 = False - return pretrained_cfg - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_validate_config") -def main(cfg) -> None: - - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - if cfg.gpt_model_file: - logging.info(f"Restoring model from {cfg.gpt_model_file}") - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - - pretrained_cfg = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - pretrained_cfg = modify_pretrained_cfg(pretrained_cfg, trainer, cfg) - model = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=save_restore_connector, - map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models - ) - elif cfg.checkpoint_dir: - logging.info( - f"Restoring model from checkpoint_dir: {cfg.checkpoint_dir} with checkpoint name: {cfg.checkpoint_name}" - 
) - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - app_state.virtual_pipeline_model_parallel_size = cfg.virtual_pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size_=cfg.virtual_pipeline_model_parallel_size, - ) - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) - pretrained_cfg = OmegaConf.load(cfg.hparams_file) - pretrained_cfg = modify_pretrained_cfg(pretrained_cfg.cfg, trainer, cfg) - with tempfile.NamedTemporaryFile(suffix='.yaml') as f: - OmegaConf.save(config=pretrained_cfg, f=f.name) - model = MegatronGPTModel.load_from_checkpoint( - checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name, - ) - else: - raise ValueError("need at least a nemo file or checkpoint dir") - - logging.info("\n\n************** Model configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(model.cfg)}') - - trainer.validate(model=model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py deleted file mode 100644 index e06f52a55acdbdd9f9a7dd6a4afae0190746fb84..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r""" -Conversion script to convert Megatron_LM checkpoints into nemo checkpoint. - Example to run this conversion script: - python -m torch.distributed.launch --nproc_per_node= megatron_lm_ckpt_to_nemo.py \ - --checkpoint_folder \ - --checkpoint_name megatron_gpt--val_loss=99.99-step={steps}-consumed_samples={consumed}.0 \ - --nemo_file_path \ - --model_type \ - --hparams_file - --tensor_model_parallel_size - --pipeline_model_parallel_size - --gpus_per_node -Note, hparams_file usually is generated by pytorch lightning when running the training job. -It is the model section of the model pretraining conf with an extra cfg key. -Check https://github.com/NVIDIA/NeMo/issues/4993 for an example. 
-To resume the training from converted MegatronLM checkpoint, make sure to set the -`trainer.max_steps=round(lr-warmup-fraction * lr-decay-iters + lr-decay-iters)` -where `lr-warmup-fraction` and `lr-decay-iters` are arguments from MegatronLM training -so the learning rate scheduler will follow the same curve. -""" - -import importlib -import os -import pathlib -import sys -from argparse import ArgumentParser -from collections import OrderedDict -from typing import Any, Optional - -import torch -from lightning_fabric.utilities.cloud_io import _load as pl_load -from megatron.core import parallel_state -from pytorch_lightning.core.saving import _load_state as ptl_load_state -from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml -from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.migration import pl_legacy_patch - -from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.utils import AppState, logging -from nemo.utils.distributed import initialize_distributed -from nemo.utils.model_utils import inject_model_parallel_rank, uninject_model_parallel_rank - -# this enums code is copied from Megatron_LM -enum_code = ''' -import enum - -class ModelType(enum.Enum): - encoder_or_decoder = 1 - encoder_and_decoder = 2 - - -class LayerType(enum.Enum): - encoder = 1 - decoder = 2 - - -class AttnType(enum.Enum): - self_attn = 1 - cross_attn = 2 - - -class AttnMaskType(enum.Enum): - padding = 1 - causal = 2 -''' - - -def install_megatron_dependence(): - # this is a hack to install required modules for MegatronLM checkpoints - # run the following so we don't have to install Megatron_LM code - megatron_name = 'megatron' - megatron_spec = importlib.util.spec_from_loader(megatron_name, loader=None, is_package=True) - - megatron_module = importlib.util.module_from_spec(megatron_spec) - sys.modules[megatron_name] = megatron_module - - model_name = 'model' - model_spec = importlib.util.spec_from_loader(model_name, loader=None, is_package=True) - - model_module = importlib.util.module_from_spec(model_spec) - - megatron_module.__dict__['model'] = model_module - - sys.modules[megatron_name + '.' + model_name] = model_module - - enums_name = 'enums' - enums_spec = importlib.util.spec_from_loader(enums_name, loader=None, is_package=True) - enums_module = importlib.util.module_from_spec(enums_spec) - - model_module.__dict__['enums'] = enums_module - - sys.modules[megatron_name + '.' + model_name + '.' + enums_name] = enums_module - - exec(enum_code, enums_module.__dict__) - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_folder", - type=str, - default=None, - required=True, - help="Path to Megatron-LM checkpoints saved during training. Ex: /raid/Megatron_LM/checkpoints", - ) - parser.add_argument( - "--checkpoint_name", - type=str, - default='model_optim_rng.pt', - required=True, - help="Name of checkpoint to be used. Ex: model_optim_rng.pt", - ) - - parser.add_argument( - "--hparams_file", - type=str, - default=None, - required=False, - help="Path config for restoring. 
It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", - ) - parser.add_argument("--nemo_file_path", type=str, default=None, required=False, help="Path to output .nemo file.") - - parser.add_argument( - "--output_ckpt_file_path", type=str, default=None, required=False, help="Path to output .ckpt file." - ) - - parser.add_argument("--gpus_per_node", type=int, required=False, default=1) - - parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) - parser.add_argument("--pipeline_model_parallel_size", type=int, required=False, default=1) - - parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) - - parser.add_argument("--model_type", type=str, required=True, default="gpt", choices=["gpt", "t5", "bert"]) - - args = parser.parse_args() - return args - - -def parse_weights(weight_dict: OrderedDict, parent_key: str, total: list, converted: OrderedDict, translator: dict): - for key in weight_dict: - new_key = key - name_translate = translator - - for replace_key in name_translate: - if key.find(replace_key) >= 0: - new_key = key.replace(replace_key, name_translate[replace_key]) - if isinstance(weight_dict[key], OrderedDict) or isinstance(weight_dict[key], dict): - parse_weights(weight_dict[key], parent_key + '.' + new_key, total, converted, translator) - else: - num_parameters = torch.prod(torch.tensor(weight_dict[key].cpu().size())).item() - total[0] += num_parameters - final_key = 'model' + parent_key + '.' + new_key - converted[final_key] = weight_dict[key] - - -def add_optimizer_state(lm_checkpoint, new_checkpoint, megatron_amp_o2=True): - # this method is to convert lm_checkpoint optimizer states for nemo checkpoint - OPTIMIZER_KEY = 'optimizer' - FP32_FP16_KEY = 'fp32_from_fp16_params' - NEW_OPTIMIZER_KEY = 'optimizer_states' - STEP_KEY = 'iteration' - NEW_STEP_KEY = 'global_step' - LR_SCHEDULER = 'lr_scheduler' - NEW_LR_SCHEDULER = 'lr_schedulers' - if OPTIMIZER_KEY in lm_checkpoint and OPTIMIZER_KEY in lm_checkpoint[OPTIMIZER_KEY]: - opt_state = lm_checkpoint[OPTIMIZER_KEY][OPTIMIZER_KEY] - if megatron_amp_o2: - opt_dict = dict() - if LR_SCHEDULER in lm_checkpoint: - sched = lm_checkpoint[LR_SCHEDULER] - for param_group in opt_state['param_groups']: - param_group['initial_lr'] = sched['max_lr'] - if FP32_FP16_KEY in lm_checkpoint[OPTIMIZER_KEY]: - fp32_state = lm_checkpoint[OPTIMIZER_KEY][FP32_FP16_KEY] - opt_dict[FP32_FP16_KEY] = fp32_state - opt_dict[OPTIMIZER_KEY] = opt_state - new_checkpoint[NEW_OPTIMIZER_KEY] = [opt_dict] - else: - new_checkpoint[NEW_OPTIMIZER_KEY] = [opt_state] - - if STEP_KEY in lm_checkpoint: - new_checkpoint[NEW_STEP_KEY] = lm_checkpoint[STEP_KEY] - new_checkpoint['epoch'] = 1 # always one epoch - if LR_SCHEDULER in lm_checkpoint: - gbs = lm_checkpoint['args'].global_batch_size - sched = lm_checkpoint[LR_SCHEDULER] - content = OrderedDict() - content['max_steps'] = int(sched['decay_steps']) // gbs + sched['warmup_steps'] // gbs - content['warmup_steps'] = int(sched['warmup_steps']) // gbs - content['constant_steps'] = 0 # no such conf in lm checkpoint - content['decay_steps'] = int(sched['decay_steps']) // gbs - content['min_lr'] = sched['min_lr'] - if OPTIMIZER_KEY in lm_checkpoint: - content['base_lrs'] = [ - i['initial_lr'] for i in new_checkpoint['optimizer_states'][0]['optimizer']['param_groups'] - ] - content['last_epoch'] = int(sched['num_steps']) // gbs - 
content['_last_lr'] = [i['lr'] for i in new_checkpoint['optimizer_states'][0]['optimizer']['param_groups']] - else: - content['base_lrs'] = [sched['max_lr']] - content['last_epoch'] = int(sched['num_steps']) // gbs - content['_step_count'] = int(sched['num_steps']) // gbs - content['verbose'] = False - content['_get_lr_called_within_step'] = False - new_checkpoint[NEW_LR_SCHEDULER] = [content] - - -def load_model(cls, checkpoint, strict, **kwargs): - try: - if 'cfg' in kwargs: - model = ptl_load_state(cls, checkpoint, strict=strict, **kwargs) - else: - model = ptl_load_state( - cls, checkpoint, strict=strict, cfg=checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].cfg, **kwargs - ) - # register the artifacts - cfg = checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].cfg - if cfg.tokenizer.model is not None: - model.register_artifact("tokenizer.tokenizer_model", cfg.tokenizer.model) - if cfg.tokenizer.vocab_file is not None: - model.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file) - if cfg.tokenizer.merge_file is not None: - model.register_artifact("tokenizer.merge_file", cfg.tokenizer.merge_file) - finally: - cls._set_model_restore_state(is_being_restored=False) - return model - - -def load_from_checkpoint( - cls, - checkpoint_path: str, - map_location: Any = None, - hparams_file: Optional[str] = None, - strict: bool = True, - **kwargs, -): - """ - Loads Megatron_LM checkpoints, convert it, with some maintenance of restoration. - For documentation, please refer to LightningModule.load_from_checkpoin() documentation. - """ - checkpoint = None - try: - cls._set_model_restore_state(is_being_restored=True) - # TODO: replace with proper PTL API - - with pl_legacy_patch(): - if map_location is not None: - old_checkpoint = pl_load(checkpoint_path, map_location=map_location) - else: - old_checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) - - total_params = [0] - checkpoint = OrderedDict() - checkpoint['state_dict'] = OrderedDict() - parse_weights( - old_checkpoint['model'], "", total_params, checkpoint['state_dict'], translator=kwargs['translator'] - ) - print('converted {:.2f}M parameters'.format(total_params[0] / 1e6)) - - if hparams_file is not None: - extension = hparams_file.split(".")[-1] - if extension.lower() == "csv": - hparams = load_hparams_from_tags_csv(hparams_file) - elif extension.lower() in ("yml", "yaml"): - hparams = load_hparams_from_yaml(hparams_file) - else: - raise ValueError(".csv, .yml or .yaml is required for `hparams_file`") - - hparams["on_gpu"] = False - - # overwrite hparams by the given file - checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = hparams - - check_point_version = old_checkpoint.get('checkpoint_version', 0) - if check_point_version < 3: - # need to do the transpose of query_key_value variables - if hparams_file is not None: - np = hparams['cfg']['num_attention_heads'] - elif 'config' in old_checkpoint and 'num-attention-heads' in old_checkpoint['config']: - np = old_checkpoint['config']['num-attention-heads'] - - else: - logging.warning("cannot determine the number attention heads") - raise ValueError('need to know number of attention heads') - - if check_point_version == 0: - # 3, np, hn -> np, 3, hn - for key in checkpoint['state_dict']: - if key.find('query_key_value') >= 0: - weight = checkpoint['state_dict'][key] - if len(weight.size()) == 2: - # weight - weight = weight.view(3, np, -1, weight.size()[-1]) - weight = weight.transpose(0, 1).contiguous() - checkpoint['state_dict'][key] = weight.view(-1, weight.size()[-1]) - 
else: - # biase - weight = weight.view(3, np, -1) - weight = weight.transpose(0, 1).contiguous() - checkpoint['state_dict'][key] = weight.view(-1) - elif check_point_version == 1: - # np, hn, 3 -> np, 3, hn - for key in checkpoint['state_dict']: - if key.find('query_key_value') >= 0: - weight = checkpoint['state_dict'][key] - if len(weight.size()) == 2: - # weight - weight = weight.view(np, -1, 3, weight.size()[-1]) - weight = weight.transpose(1, 2).contiguous() - checkpoint['state_dict'][key] = weight - else: - # biase - weight = weight.view(np, -1, 3) - weight = weight.transpose(1, 2).contiguous() - checkpoint['state_dict'][key] = weight - - # for past checkpoint need to add the new key - if cls.CHECKPOINT_HYPER_PARAMS_KEY not in checkpoint: - checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY] = {} - # override the hparams with values that were passed in - # TODO: can we do this without overriding? - config_kwargs = kwargs.copy() - if 'trainer' in config_kwargs: - config_kwargs.pop('trainer') - checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].update(config_kwargs) - add_optimizer_state(old_checkpoint, checkpoint) - consumed = None - if 'args' in old_checkpoint and hasattr(old_checkpoint['args'], 'consumed_train_samples'): - consumed = getattr(old_checkpoint['args'], 'consumed_train_samples') - steps = None - if 'iteration' in old_checkpoint: - steps = old_checkpoint['iteration'] - finally: - cls._set_model_restore_state(is_being_restored=False) - logging.warning(f"the checkpoint version is {check_point_version}") - return checkpoint, consumed, steps, check_point_version - - -def megatron_lm_inject_model_parallel_rank(filepath): - """ - Injects tensor/pipeline model parallel ranks into the filepath. - Does nothing if not using model parallelism. - """ - # first make sure filepath does not have rank - filepath = uninject_model_parallel_rank(filepath) - - app_state = AppState() - if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1: - # filepath needs to be updated to include mp_rank - dirname = os.path.dirname(filepath) - basename = os.path.basename(filepath) - if app_state.pipeline_model_parallel_size is None or app_state.pipeline_model_parallel_size == 1: - filepath = f'{dirname}/mp_rank_{app_state.tensor_model_parallel_rank:02d}/{basename}' - else: - filepath = f'{dirname}/mp_rank_{app_state.tensor_model_parallel_rank:02d}_{app_state.pipeline_model_parallel_rank:03d}/{basename}' - return filepath - else: - return filepath - - -def convert(local_rank, rank, world_size, args): - - app_state = AppState() - initialize_model_parallel_for_nemo( - world_size=world_size, - global_rank=rank, - local_rank=local_rank, - tensor_model_parallel_size=args.tensor_model_parallel_size, - pipeline_model_parallel_size=args.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=None, - pipeline_model_parallel_split_rank=0, - micro_batch_size=None, - global_batch_size=None, - seed=1234, - apex_transformer_log_level=30, - ) - # hard set the data parallel rank to 0, otherwiaze it is default to None - app_state.data_parallel_rank = 0 - - # tensor_model_parallel_size = args.tensor_model_parallel_size - num_nodes = world_size // args.gpus_per_node - assert world_size % args.gpus_per_node == 0, "world_size must be divisible by gpus_per_node" - - trainer = Trainer(devices=args.gpus_per_node, accelerator='gpu', num_nodes=num_nodes) - checkpoint_path = megatron_lm_inject_model_parallel_rank( - os.path.join(args.checkpoint_folder, args.checkpoint_name) - ) - logging.info(f"loading 
checkpoint {checkpoint_path}") - - if args.model_type == 'gpt': - # this dictionary is used to rename the model parameters - name_translate = {} - name_translate['transformer'] = 'encoder' - name_translate['.attention.'] = '.self_attention.' - # nemo megatron doesn't have _for_head key - name_translate['word_embeddings_for_head'] = 'word_embeddings' - checkpoint, consumed, steps, version = load_from_checkpoint( - MegatronGPTModel, - checkpoint_path, - hparams_file=args.hparams_file, - trainer=trainer, - translator=name_translate, - strict=False, - ) - elif args.model_type == 'bert': - # this dictionary is used to rename the model parameters - name_translate = {} - name_translate['transformer'] = 'encoder' - name_translate['.attention.'] = '.self_attention.' - # nemo megatron doesn't have _for_head key - name_translate['word_embeddings_for_head'] = 'word_embeddings' - checkpoint, consumed, steps, version = load_from_checkpoint( - MegatronBertModel, - checkpoint_path, - hparams_file=args.hparams_file, - trainer=trainer, - translator=name_translate, - strict=False, - ) - else: - raise NotImplemented("{} is not supported".format(args.model_type)) - - if torch.distributed.is_initialized(): - torch.distributed.barrier() - - if args.output_ckpt_file_path: - filepath = args.output_ckpt_file_path - base_dir = pathlib.Path(filepath).parent - filename_str = pathlib.Path(filepath).name - suffix = '.ckpt' - content = {} - if consumed is not None: - content['consumed'] = consumed - else: - content['consumed'] = 0 - if steps is not None: - content['steps'] = steps - else: - content['steps'] = 0 - filename = filename_str.format(**content) + suffix - checkpoint_path_output = inject_model_parallel_rank(os.path.join(base_dir, filename)) - trainer.training_type_plugin.checkpoint_io.save_checkpoint(checkpoint, checkpoint_path_output) - logging.info(f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}') - - if args.nemo_file_path: - if args.model_type == 'gpt': - model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer) - elif args.model_type == 'bert': - model = load_model(MegatronBertModel, checkpoint, strict=False, trainer=trainer) - else: - raise NotImplemented("{} is not supported".format(args.model_type)) - - # verify tensor parallel rank id and pipeline parallel rank id matches - assert app_state.data_parallel_size == 1 - model._save_restore_connector = NLPSaveRestoreConnector() - model.save_to(args.nemo_file_path) - logging.info(f'NeMo model saved to: {args.nemo_file_path}') - - -if __name__ == '__main__': - install_megatron_dependence() - args = get_args() - if args.local_rank == -1: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - rank = args.local_rank - local_rank = rank - world_size = 1 - else: - local_rank, rank, world_size = initialize_distributed(args) - - # make sure the world size is divisible by tensor model parallel_size - assert world_size % args.tensor_model_parallel_size == 0 - - torch.distributed.barrier() - convert(local_rank, rank, world_size, args) - torch.distributed.barrier() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_cal_shape.py deleted file mode 100644 index 008ce1445929caa755bd8a5a15bfebf7795ec2dc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021, 
NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin - -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.megatron.mup.shape import make_base_shapes -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="megatron_retro_mutransfer") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams - with open_dict(cfg): - cfg.base_model.precision = cfg.trainer.precision - cfg.delta_model.precision = cfg.trainer.precision - - base_model = MegatronRetrievalModel(cfg.base_model, trainer) - delta_model = MegatronRetrievalModel(cfg.delta_model, trainer) - make_base_shapes(base_model, delta_model, savefile=cfg.model.shape_file) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_eval.py deleted file mode 100644 index 79b1e2debdfa1f60e29b2473ca8abf60dfb1cac6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_eval.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from torch.utils.data import DataLoader - -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.config import hydra_runner - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - -""" -This is the script to run RETRO Model text generation. - -Usage: - Assume the model has TP=1, PP=1 - run greedy inference from a nemo file: - python megatron_retro_eval.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.precision=16 \ - inference.tokens_to_generate=128 \ - inference.greedy=True \ - retro_model_file=path_to_retro_nemo_file \ - tensor_model_parallel_size=-1 \ - pipeline_model_parallel_size=-1 \ - retrieval_service.faiss_devices='0' \ - retrieval_service.faiss_index=path_to_faiss_index \ - retrieval_service.retrieval_index=path_to_retrieval_dataset \ - retrieval_service.neighbors=20 -""" - - -@hydra_runner(config_path="conf", config_name="megatron_retro_inference") -def main(cfg) -> None: - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - model_path = cfg.retro_model_file - - save_restore_connector = NLPSaveRestoreConnector() - - if os.path.isdir(model_path): - save_restore_connector.model_extracted_dir = model_path - - model_cfg = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector, - ) - - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - model_cfg.sequence_parallel = False - model_cfg.activations_checkpoint_granularity = None - model_cfg.activations_checkpoint_method = None - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_cfg.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_cfg.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_cfg.get('pipeline_model_parallel_split_rank', 0) - - model = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg, - ) - - length_params: LengthParam = { - "max_length": cfg.inference.tokens_to_generate, - "min_length": cfg.inference.min_tokens_to_generate, - } - - sampling_params: SamplingParam = { - "use_greedy": cfg.inference.greedy, - "temperature": cfg.inference.temperature, - "top_k": 
cfg.inference.top_k, - "top_p": cfg.inference.top_p, - "repetition_penalty": cfg.inference.repetition_penalty, - "add_BOS": cfg.inference.add_BOS, - "all_probs": cfg.inference.all_probs, - "compute_logprob": cfg.inference.compute_logprob, - } - - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) - model.trainer.strategy.setup_environment() - - config = OmegaConf.to_container(cfg.inference) - retrieval_service = OmegaConf.to_container(cfg.retrieval_service) - model.set_inference_config(config, retrieval_service) - - if not cfg.use_predict_method: - # First method of running text generation, call model.generate method - response = model.generate( - inputs=OmegaConf.to_container(cfg.prompts), - length_params=length_params, - sampling_params=sampling_params, - strategy=model.inference_strategy, - ) - else: - # Second method of running text generation, call trainer.predict - ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) - request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size) - response = trainer.predict(model, request_dl) - - print("***************************") - print(response) - print("***************************") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_fine_tune.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_fine_tune.py deleted file mode 100644 index 031191c22b0a0b85831587de35dab829acfe4145..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_fine_tune.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime -import os - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_retro_fine_tune_model import MegatronRetroFinetuneModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import StatelessTimer, exp_manager - - -def _modify_config(retro_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original retro pre-training config with attributes from the finetuning config (cfg). 
- The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. - """ - OmegaConf.set_struct(retro_cfg, True) - with open_dict(retro_cfg): - retro_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - retro_cfg.data = cfg.model.data - retro_cfg.precision = cfg.trainer.precision - retro_cfg.optim = cfg.model.optim - retro_cfg.micro_batch_size = cfg.model.micro_batch_size - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. - if add_cfg_to_tree: - OmegaConf.resolve(retro_cfg) - retro_cfg.cfg = retro_cfg - return retro_cfg - - -def load_from_nemo(cls, cfg, trainer, retro_cfg, modify_confg_fn, save_restore_connector): - retro_cfg = modify_confg_fn(retro_cfg, cfg, add_cfg_to_tree=False) - model = cls.restore_from( - restore_path=cfg.model.restore_path, - trainer=trainer, - override_config_path=retro_cfg, - save_restore_connector=save_restore_connector, - ) - return model - - -@hydra_runner(config_path="conf", config_name="megatron_retro_finetune_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - ###### following is the workaround for num_workers=0 issue ##### - # import torch.multiprocessing as mp - # mp.set_start_method("spawn", force=True) - ##################################################### - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - timeout=datetime.timedelta(seconds=18000), - ) - - if cfg.trainer.precision in [16, '16', '16-mixed', 'bf16', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - # Override timer callback to a stateless one - for idx, callback in enumerate(trainer.callbacks): - if isinstance(callback, Timer): - trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.model.restore_path): - save_restore_connector.model_extracted_dir = cfg.model.restore_path - - model_cfg = MegatronRetroFinetuneModel.restore_from( - restore_path=cfg.model.restore_path, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - # hydra interpolation 
does not work here as the interpolation key is lost when PTL saves hparams - model = load_from_nemo( - MegatronRetroFinetuneModel, - cfg, - trainer, - model_cfg, - modify_confg_fn=_modify_config, - save_restore_connector=save_restore_connector, - ) - else: - model = MegatronRetroFinetuneModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py deleted file mode 100644 index 79b25fd2c5e89de1f2b710616fadbbd4d48eaa81..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.megatron.mup.optim import MuAdam, MuAdamW -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, -) -from nemo.core.config import hydra_runner -from nemo.core.config.optimizers import AdamParams, AdamWParams -from nemo.core.optim.optimizers import register_optimizer -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_retro_mutransfer") -def main(cfg) -> None: - register_optimizer("muadamw", MuAdamW, AdamWParams()) - register_optimizer("muadam", MuAdam, AdamParams()) - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - 
plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - exp_manager(trainer, cfg.exp_manager) - - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - model = MegatronRetrievalModel(cfg.model, trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_pretraining.py deleted file mode 100644 index e90fdddf1169639e4389f3e5d74893bd3ab1ce08..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_retro_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - else: - 
plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - exp_manager(trainer, cfg.exp_manager) - - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - # load existing nemo retro model - if cfg.get("restore_from_path", None) is not None: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.restore_from_path - model = MegatronRetrievalModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - override_config_path=cfg.model, - save_restore_connector=save_restore_connector, - strict=False, - ) - else: - model = MegatronRetrievalModel(cfg.model, trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_eval.py deleted file mode 100644 index 0b6ea54b6b9947563f692ea8cf15b2d6555fa906..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_eval.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
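Editor's note: several of these scripts repeat the same restore idiom: if the model path is an already-extracted directory rather than a packed .nemo file, the connector's model_extracted_dir is set so restore_from() skips re-extraction. A small sketch of that pattern, using only calls that appear verbatim in the scripts above:

```python
import os

from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector


# Sketch of the restore pattern repeated across these scripts.
def make_connector(model_path: str) -> NLPSaveRestoreConnector:
    connector = NLPSaveRestoreConnector()
    if os.path.isdir(model_path):
        # point the connector at the extracted directory instead of a packed .nemo file
        connector.model_extracted_dir = model_path
    return connector
```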
- - -import os -from argparse import ArgumentParser - -import torch -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer -from torch.utils.data import DataLoader - -from nemo.collections.nlp.data.language_modeling.megatron.request_dataset import T5RequestDataset -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.utils.app_state import AppState - -assert torch.cuda.is_available() - - -def main(): - parser = ArgumentParser() - parser.add_argument("--model_file", type=str, default="", required=True, help="Pass path to model's .nemo file") - parser.add_argument( - "--prompt", type=str, default="", required=True, help="Prompt for the model (a text to complete)" - ) - parser.add_argument( - "--tokens_to_generate", type=int, default="16", required=False, help="How many tokens to add to prompt" - ) - parser.add_argument( - "--tensor_model_parallel_size", type=int, default=-1, required=False, - ) - parser.add_argument( - "--pipeline_model_parallel_size", type=int, default=-1, required=False, - ) - parser.add_argument( - "--pipeline_model_parallel_split_rank", type=int, default=-1, required=False, - ) - parser.add_argument("--precision", default="16", type=str, help="PyTorch Lightning Trainer precision flag") - parser.add_argument("--decoder_starts_with_pad", action="store_true", help="Decoder starts with pad token") - parser.add_argument("--add_eos_to_encoder_input", action="store_true", help="Encoder input ends with EOS token") - args = parser.parse_args() - - # cast precision to int if 32 or 16 - if args.precision in ["32", "16"]: - args.precision = int(float(args.precision)) - - if ( - args.tensor_model_parallel_size < 0 - or args.pipeline_model_parallel_size < 0 - or args.pipeline_model_parallel_split_rank < 0 - ): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(args.model_file): - save_restore_connector.model_extracted_dir = args.model_file - - model_config = MegatronT5Model.restore_from( - restore_path=args.model_file, - trainer=Trainer(strategy=NLPDDPStrategy()), - return_config=True, - save_restore_connector=save_restore_connector, - ) - - args.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - args.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - args.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - # trainer required for restoring model parallel models - trainer = Trainer( - strategy=NLPDDPStrategy(), - devices=args.tensor_model_parallel_size * args.pipeline_model_parallel_size, - accelerator='gpu', - precision=args.precision, - ) - - app_state = AppState() - if args.tensor_model_parallel_size > 1 or args.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - 
tensor_model_parallel_size_=args.tensor_model_parallel_size, - pipeline_model_parallel_size_=args.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=args.pipeline_model_parallel_split_rank, - ) - - model_cfg = MegatronT5Model.restore_from( - restore_path=args.model_file, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - return_config=True, - ) - OmegaConf.set_struct(model_cfg, True) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - - model = MegatronT5Model.restore_from( - restore_path=args.model_file, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=model_cfg, - ) - model.freeze() - model.training = False - - request = { - "prompt": args.prompt, - "tokens_to_generate": args.tokens_to_generate, - "bos_id": model.tokenizer.pad_id if args.decoder_starts_with_pad else model.tokenizer.bos_id, - "add_eos_to_encoder_input": args.add_eos_to_encoder_input, - } - - dataset = T5RequestDataset(request, model.tokenizer) - - request_dl = DataLoader(dataset) - - response = trainer.predict(model, request_dl) - - print("***************************") - print(response) - print(response[0]['completion']['text']) - print("***************************") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py deleted file mode 100644 index 0a10d434127da2f3960193cb410457f85f8669ba..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
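The eval script that just ended (megatron_t5_eval.py) resolves its parallelism flags lazily: if any of the three flags is left at -1, all three are filled in from the config stored in the .nemo file (defaulting to TP=1, PP=1, split_rank=0), and the Trainer is then sized with one GPU per model-parallel rank. A minimal sketch of that resolution rule follows; `resolve_parallel_sizes` is a hypothetical plain-Python stand-in for the argparse/OmegaConf handling in the script.

```python
# Hypothetical helper mirroring the argument-resolution step in
# megatron_t5_eval.py: flags left at -1 fall back to the values recorded
# in the restored model config.
def resolve_parallel_sizes(args: dict, model_config: dict) -> dict:
    resolved = dict(args)
    if (
        resolved['tensor_model_parallel_size'] < 0
        or resolved['pipeline_model_parallel_size'] < 0
        or resolved['pipeline_model_parallel_split_rank'] < 0
    ):
        resolved['tensor_model_parallel_size'] = model_config.get('tensor_model_parallel_size', 1)
        resolved['pipeline_model_parallel_size'] = model_config.get('pipeline_model_parallel_size', 1)
        resolved['pipeline_model_parallel_split_rank'] = model_config.get('pipeline_model_parallel_split_rank', 0)
    # The Trainer is then launched with one device per model-parallel rank.
    resolved['devices'] = (
        resolved['tensor_model_parallel_size'] * resolved['pipeline_model_parallel_size']
    )
    return resolved


if __name__ == '__main__':
    flags = {
        'tensor_model_parallel_size': -1,
        'pipeline_model_parallel_size': -1,
        'pipeline_model_parallel_split_rank': -1,
    }
    saved_cfg = {'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 2}
    print(resolve_parallel_sizes(flags, saved_cfg))  # TP=2, PP=2, split_rank=0, devices=4
```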
- - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_t5_lm_adaptation_finetune") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', '16-mixed', 'bf16', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer( - plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3), CustomProgressBar()] - ) - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None: - pretrained_cfg = MegatronT5Model.restore_from( - cfg.model.pretrained_model_path, trainer=trainer, return_config=True - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - - # Override data from T5 to Prefix-LM - encoder_seq_length = pretrained_cfg.data.seq_length - decoder_seq_length = ( - pretrained_cfg.data.seq_length - ) # Set decoder seq length to be enoder seq length for prefix-lm - pretrained_cfg.data = cfg.model.data - pretrained_cfg.data.seq_length = encoder_seq_length - pretrained_cfg.data.seq_length_dec = ( - decoder_seq_length - 1 - ) # -1 is to account for the addition of and and right shifting to create targets. - - # Override fusion params. 
- pretrained_cfg.masked_softmax_fusion = cfg.model.masked_softmax_fusion - pretrained_cfg.bias_dropout_add_fusion = cfg.model.bias_dropout_add_fusion - pretrained_cfg.bias_gelu_fusion = cfg.model.bias_gelu_fusion - - # Override dropout - if cfg.model.hidden_dropout is not None: - pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout - - if cfg.model.attention_dropout is not None: - pretrained_cfg.attention_dropout = cfg.model.attention_dropout - - # Override precision - pretrained_cfg.precision = trainer.precision # Set above from trainer.precision - - # Override micro/global batch - pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size - pretrained_cfg.global_batch_size = cfg.model.global_batch_size - - # O2 AMP - pretrained_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - - # Optimizer overrides. - pretrained_cfg.optim = cfg.model.optim - - model = MegatronT5Model.restore_from( - cfg.model.pretrained_model_path, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) - else: - raise ValueError(f'No pretrained model path specified or does not exist {cfg.model.pretrained_model_path}') - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_pretraining.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_pretraining.py deleted file mode 100644 index dd5dde2f09e68226165607d0b1b7b4e315f961c5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from omegaconf.omegaconf import OmegaConf, open_dict - -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronT5TrainerBuilder -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_t5_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronT5TrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model = MegatronT5Model(cfg.model, trainer) - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning.py deleted file mode 100644 index 3edca99e15a5f24800690d06b8009d1b46428b8f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import ( - MegatronT5PromptLearningModel, -) -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -""" -This is an example of how to ptune/prompt-tune a pretrained T5 model. -Be sure to use a .nemo T5 model with this code. If you've downloaded -a model from NGC or are otherwise using a MegatronLM model, please use -either megatron_ckpt_to_nemo.py or megatron_lm_ckpt_to_nemo.py found -within this examples directory to convert your model to .nemo format. -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_t5_prompt_learning.yaml") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - plugins = [] - strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) - if cfg.trainer.precision == 16 or cfg.trainer.precision == '16-mixed': - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - enabled=False - if cfg.model.pipeline_model_parallel_size > 1 - else True, # turn off the grad scale for pipeline parallel LM model - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugins.append(PipelineMixedPrecisionPlugin(precision='16-mixed', device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt T5 model - if cfg.model.get("restore_path", None): - model = MegatronT5PromptLearningModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - - else: - model = MegatronT5PromptLearningModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py deleted file mode 100644 index 67640138b3ff5d700386bf2c6628e6f9f6991824..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import ( - MegatronT5PromptLearningModel, -) -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - HAVE_MEGATRON_CORE = False - - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." 
- "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_t5_prompt_learning_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronT5PromptLearningModel.restore_from( - restore_path=cfg.language_model_path, trainer=trainer, return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - - # Load prompt tuned model, virtual_prompt_model_file and language_model_path must be provided in config - if cfg.get('virtual_prompt_model_file', None) is not None and cfg.get('language_model_path', None) is not None: - - # Update frozen T5 model path in case it has changed - prompt_learning_cfg = MegatronT5PromptLearningModel.restore_from( - cfg.virtual_prompt_model_file, trainer=trainer, return_config=True - ) - with open_dict(prompt_learning_cfg): - if cfg.get("language_model_path"): - # This is for backward compatibility with old checkpoints that used `pretrained_language_model_path` instead of `language_model_path`. 
- if hasattr(prompt_learning_cfg, 'pretrained_language_model_path'): - prompt_learning_cfg.pretrained_language_model_path = cfg.language_model_path - else: - prompt_learning_cfg.language_model_path = cfg.language_model_path - prompt_learning_cfg.micro_batch_size = cfg.data.get('micro_batch_size', 4) - prompt_learning_cfg.global_batch_size = cfg.data.get('global_batch_size', 4) - - # Now load prompt learning model with frozen T5 model base - model = MegatronT5PromptLearningModel.restore_from( - restore_path=cfg.virtual_prompt_model_file, trainer=trainer, override_config_path=prompt_learning_cfg - ) - - else: - raise ValueError("virtual_prompt_model_file and pretrained_language_model_file must be provided in config") - - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) - model.trainer.strategy.setup_environment() - - model.freeze() - - _, test_dl = model.build_virtual_prompt_dataset( - dataset_paths=cfg.data.test_ds, - batch_size=cfg.data.global_batch_size, - for_train=False, - drop_last=False, - shuffle=False, - num_workers=cfg.data.num_workers, - pin_memory=True, - ) - - outputs = trainer.predict(model, test_dl) - with open(cfg.pred_file_path, "w", encoding="utf-8") as pred_file: - for batch in outputs: - preds = batch["preds_text"] - for pred in preds: - pred = pred.strip().replace("\n", " ") - pred_file.write(pred + "\n") - print('test finish---------------------------------') - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py deleted file mode 100644 index 6a5ab9fbf481bfcac4be8688176a969a276125b4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
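The prediction-writing loop at the end of megatron_t5_prompt_learning_eval.py (just above) flattens each predicted batch's `preds_text`, strips whitespace, collapses internal newlines, and writes one prediction per line of the output file. The standalone sketch below reproduces only that post-processing on dummy data, with no NeMo or Lightning dependency; `write_predictions` is an illustrative name, not a function from the script.

```python
# Standalone sketch of the prediction post-processing performed at the end of
# megatron_t5_prompt_learning_eval.py: one cleaned prediction per output line.
from typing import Iterable


def write_predictions(outputs: Iterable[dict], pred_file_path: str) -> None:
    with open(pred_file_path, "w", encoding="utf-8") as pred_file:
        for batch in outputs:
            for pred in batch["preds_text"]:
                # Newlines inside a prediction would break the one-line-per-example format.
                pred_file.write(pred.strip().replace("\n", " ") + "\n")


if __name__ == '__main__':
    dummy_outputs = [{"preds_text": ["entailment\n", "  not entailment  "]}]
    write_predictions(dummy_outputs, "preds.txt")
    # preds.txt now contains two lines: 'entailment' and 'not entailment'
```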
- -from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin - -from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel -from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model -from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). - The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. - """ - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.precision = cfg.trainer.precision - # Overwrite data configs - if cfg.model.data.validation_ds.get('src_file_name', None) is not None: - logging.info( - 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name - if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: - logging.info( - 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name - - if "write_predictions_to_file" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.write_predictions_to_file = ( - cfg.model.data.validation_ds.write_predictions_to_file - ) - if "output_file_path_prefix" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix - - t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size - t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size - - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. 
- if add_cfg_to_tree: - OmegaConf.resolve(t5_cfg) - t5_cfg.cfg = t5_cfg - - return t5_cfg - - -@hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', '16-mixed', 'bf16', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(MixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) - - exp_manager(trainer, cfg.exp_manager) - - if hasattr(cfg.model.data.validation_ds, 'task_name'): - if cfg.model.restore_from_path: - t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) - elif hasattr(cfg.model.data.validation_ds, 'file_names'): - if cfg.model.restore_from_path: - t5_cfg = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config) - else: - if cfg.model.restore_from_path: - t5_cfg = MegatronT5SFTModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT5SFTModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5SFTModel, cfg, trainer, modify_confg_fn=_modify_config) - - model.freeze() - trainer.validate(model) - if hasattr(cfg.model.data, 'test_ds'): - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py deleted file mode 100644 index 59ca94082f26b353dc1b61f2e1c88e5ceedc4949..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2022, NVIDIA 
CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel -from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model -from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import AppState, logging -from nemo.utils.exp_manager import exp_manager -from nemo.utils.model_utils import inject_model_parallel_rank - -mp.set_start_method("spawn", force=True) - - -def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). - The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. 
- """ - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): - t5_cfg.encoder.masked_softmax_fusion = False - t5_cfg.decoder.masked_softmax_fusion = False - t5_cfg.encoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.decoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - if hasattr(t5_cfg.encoder, 'ffn_dropout'): - t5_cfg.encoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - if hasattr(t5_cfg.decoder, 'ffn_dropout'): - t5_cfg.decoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - - if hasattr(cfg.model, 'encoder'): - if hasattr(cfg.model.encoder, 'position_embedding_type'): - t5_cfg.encoder.position_embedding_type = cfg.model.encoder.position_embedding_type - if hasattr(cfg.model.encoder, 'use_flash_attention'): - t5_cfg.encoder.use_flash_attention = cfg.model.encoder.use_flash_attention - if hasattr(cfg.model.encoder, 'attention_dropout'): - t5_cfg.encoder.attention_dropout = cfg.model.encoder.attention_dropout - if hasattr(cfg.model, 'decoder'): - if hasattr(cfg.model.decoder, 'position_embedding_type'): - t5_cfg.decoder.position_embedding_type = cfg.model.decoder.position_embedding_type - if hasattr(cfg.model.decoder, 'use_flash_attention'): - t5_cfg.decoder.use_flash_attention = cfg.model.decoder.use_flash_attention - if hasattr(cfg.model.decoder, 'attention_dropout'): - t5_cfg.decoder.attention_dropout = cfg.model.decoder.attention_dropout - else: - t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) - t5_cfg.masked_softmax_fusion = False - t5_cfg.data = cfg.model.data - t5_cfg.precision = cfg.trainer.precision - t5_cfg.optim = cfg.model.optim - t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - # XNLI has eval languages in the yaml config. - if hasattr(cfg.model, 'eval_languages'): - t5_cfg.eval_languages = cfg.model.eval_languages - - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. 
- if add_cfg_to_tree: - OmegaConf.resolve(t5_cfg) - t5_cfg.cfg = t5_cfg - - return t5_cfg - - -def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): - t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False) - model = cls.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) - return model - - -def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): - app_state = AppState() - if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank( - os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) - ) - hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) - t5_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) - with tempfile.NamedTemporaryFile(suffix='.yaml') as f: - OmegaConf.save(config=t5_cfg, f=f.name) - model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) - return model - - -def validate_checkpoint_loading_args(cfg): - if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): - raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') - if cfg.checkpoint_name is None: - raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') - if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): - raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') - - -@hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2: - 
plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - if hasattr(cfg.model.data.train_ds, 'task_name'): - if cfg.model.restore_from_path: - t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) - elif hasattr(cfg.model.data.train_ds, 'file_names'): - if cfg.model.restore_from_path: - t5_cfg = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config) - else: - if cfg.model.restore_from_path: - t5_cfg = MegatronT5SFTModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - model = load_from_nemo(MegatronT5SFTModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5SFTModel, cfg, trainer, modify_confg_fn=_modify_config) - - trainer.fit(model) - trainer.validate(model) - if hasattr(cfg.model.data, 'test_ds'): - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/transformer_lm.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/transformer_lm.py deleted file mode 100644 index caaa0e0d2935fd71329887a9fa93dde316dde922..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/transformer_lm.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
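Both megatron_t5_seq2seq_eval.py and megatron_t5_seq2seq_finetune.py (which ends above) choose the model class from the shape of the dataset config: a `task_name` key selects the GLUE model, a `file_names` key selects the T0 model, and anything else falls back to the generic SFT model; the eval script applies the same rule to `validation_ds` instead of `train_ds`. The sketch below captures only that dispatch and returns class names as strings; `select_t5_finetune_class` is illustrative, not a NeMo API, and the real scripts check attributes on an OmegaConf node rather than dict keys.

```python
# Illustrative dispatch mirroring megatron_t5_seq2seq_finetune.py /
# megatron_t5_seq2seq_eval.py. Returns the class name that would be used;
# the real scripts import these classes from nemo.collections.nlp.models.
def select_t5_finetune_class(dataset_cfg: dict) -> str:
    if 'task_name' in dataset_cfg:
        return 'MegatronT5GLUEModel'   # GLUE-style single-task finetuning
    if 'file_names' in dataset_cfg:
        return 'MegatronT0Model'       # T0-style multi-dataset finetuning
    return 'MegatronT5SFTModel'        # generic seq2seq SFT


if __name__ == '__main__':
    assert select_t5_finetune_class({'task_name': 'mnli'}) == 'MegatronT5GLUEModel'
    assert select_t5_finetune_class({'file_names': ['a.jsonl']}) == 'MegatronT0Model'
    assert select_t5_finetune_class({'src_file_name': 'x.txt'}) == 'MegatronT5SFTModel'
```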
- - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.language_modeling import TransformerLMModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="transformer_lm_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - transformer_lm = TransformerLMModel(cfg.model, trainer=trainer) - trainer.fit(transformer_lm) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_inference.yaml deleted file mode 100644 index bf724ad4a060ebda1e8df42b026aa34681fbabd5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_inference.yaml +++ /dev/null @@ -1,34 +0,0 @@ -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -gpt_model_file: ??? # GPT nemo file path # used when starting from a .nemo file -adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py) -pred_file_path: null # save predictions to this file -checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -data_paths: ??? 
# prompts for GPT inference -server: False # whether launch the inference server -port: 5555 # the port number for the inference server -batch_size: 8 -num_workers: 8 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_tuning_config.yaml deleted file mode 100644 index a7829e212f53cd08d81d8429539604f45f207809..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_adapter_tuning_config.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_adapter_dim${model.adapter_tuning.adapter_dim} - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 0.2 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: null - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 1 - mode: min - save_nemo_on_train_end: True # Should be false, correct prompt learning model file is saved at model.nemo_path set below, - filename: 'megatron_gpt_adapter_tuning--{val_loss:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: True - save_best_model: True - create_early_stopping_callback: True - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - -model: - seed: 1234 - nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts - encoder_seq_length: 2048 - gradient_as_bucket_view: false - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - global_batch_size: 8 - micro_batch_size: 4 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - - restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the GPT language model .nemo file, always required - existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given - new_tasks: ["rte"] # List of new tasknames to be prompt-tuned - - task_templates: # Add more/replace tasks as needed, these are just examples - - taskname: "boolq" # The task name - prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> - total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. 
- virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens - truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. - answer_only_loss: True - answer_field: "answer" - - - taskname: "intent_and_slot" - prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}" - total_virtual_tokens: 0 - answer_only_loss: False - virtual_token_splits: [] - truncate_field: null - - - taskname: "rte" - prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "squad" - prompt_template: "context: {context} question: {question} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "arc-challenge" - prompt_template: "question: {question} choices: {choices} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "xsum" - prompt_template: "{source} Summary: {target}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "target" - - - taskname: "taskname" - prompt_template: "{prompt} {completion}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: "prompt" - answer_only_loss: True - answer_field: "completion" - - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 50 - adapter_dropout: 0.1 - norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - - data: - train_ds: ??? # expects a list of paths to training data files - validation_ds: ??? # expects a paths to validation data files - add_eos: True - shuffle: True - num_workers: 8 - pin_memory: True - - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning - monitor: val_loss - reduce_on_plateau: false diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_inference.yaml deleted file mode 100644 index 0cb8467c66f0b5521145a102b177ba1be72375b0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_inference.yaml +++ /dev/null @@ -1,32 +0,0 @@ -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. 
- top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -gpt_model_file: ??? # GPT nemo file path # used when starting from a .nemo file -adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py) -pred_file_path: null # save predictions to this file -checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -data_paths: ??? # prompts for GPT inference -batch_size: 8 -num_workers: 8 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_tuning_config.yaml deleted file mode 100644 index b5e2afb73186e9d6f829986d14673277784588de..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_ia3_tuning_config.yaml +++ /dev/null @@ -1,130 +0,0 @@ -name: ia3_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs} - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 0.2 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: null - wandb_logger_kwargs: - project: null - name: null - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
- resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 1 - mode: min - save_nemo_on_train_end: True # Should be false, correct prompt learning model file is saved at model.nemo_path set below, - filename: 'megatron_gpt_ia3_tuning--{val_loss:.3f}-{step}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - seed: 1234 - nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts - encoder_seq_length: 2048 - gradient_as_bucket_view: false - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - global_batch_size: 8 - micro_batch_size: 4 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - - restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the GPT language model .nemo file, always required - existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given - new_tasks: ["rte"] # List of new tasknames to be prompt-tuned - - task_templates: # Add more/replace tasks as needed, these are just examples - - taskname: "boolq" # The task name - prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> - total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. - virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens - truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. - answer_only_loss: True - answer_field: "answer" - - - taskname: "intent_and_slot" - prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}" - total_virtual_tokens: 0 - answer_only_loss: True - virtual_token_splits: [] - truncate_field: null - - - taskname: "rte" - prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "squad" - prompt_template: "context: {context} question: {question} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "arc-challenge" - prompt_template: "question: {question} choices: {choices} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "answer" - - - taskname: "xsum" - prompt_template: "{source} Summary: {target}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_only_loss: True - answer_field: "target" - - data: - train_ds: ??? 
# expects a list of paths to training data files - validation_ds: ??? # expects a paths to validation data files - add_eos: True - shuffle: True - num_workers: 8 - pin_memory: True - - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning - monitor: val_loss - reduce_on_plateau: false diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml deleted file mode 100644 index 0409e0fec0aa4c0f92f627a8b825b0bacc01e703..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml +++ /dev/null @@ -1,213 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.test_ds.metric.name} - save_top_k: 1 - mode: max - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: True - save_best_model: False - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 1 - micro_batch_size: 1 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - answer_only_loss: False # not used right now - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - - peft: - peft_scheme: "adapter" # can be either adapter,ia3, or ptuning - restore_from_path: null - restore_from_ckpt_name: null - restore_from_hparams_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - data: - test_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ??? # Names of the corresponding datasets used to log metrics. - global_batch_size: 1 - micro_batch_size: 1 - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - context_key: 'input' - label_key: ${data.train_ds.label_key} - add_eos: ${data.train_ds.add_eos} - add_sep: ${data.train_ds.add_sep} - add_bos: ${data.train_ds.add_bos} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. 
- truncation_field: ${data.train_ds.truncation_field} # Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${data.train_ds.prompt_template} - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: output.txt - compute_attention_mask: True - -# server-related configs -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: True # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server 1058 -chat: False # use the chat interface -chatbot_config: - value: False # whether to inject the value attributes - attributes: - - name: Quality - min: 0 - max: 4 - key: quality - type: int - default: 4 - - name: Toxicity - min: 0 - max: 4 - key: toxcity - type: int - default: 0 - - name: Humor - min: 0 - max: 4 - key: humor - type: int - default: 0 - - name: Creativity - min: 0 - max: 4 - key: creativity - type: int - default: 0 - - name: Violence - min: 0 - max: 4 - key: violence - type: int - default: 0 - - name: Helpfulness - min: 0 - max: 4 - key: helpfulness - type: int - default: 4 - - name: Not_Appropriate - min: 0 - max: 4 - key: not_appropriate - type: int - default: 0 - - name: Language - choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] - key: lang - type: list - default: en - - user: User - assistant: Assistant - system: "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml deleted file mode 100644 index 6b6af4b1c81bec999e0145493a2423b475353117..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml +++ /dev/null @@ -1,216 +0,0 @@ -name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: True - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
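Editor's note: the `create_early_stopping_callback` / `early_stopping_callback_params` settings in the exp_manager block above map onto PyTorch Lightning's `EarlyStopping` callback. NeMo's exp_manager constructs it internally; a standalone equivalent would look roughly like the sketch below.

```python
# Roughly what exp_manager builds from early_stopping_callback_params (a sketch, not NeMo code).
from pytorch_lightning.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor="val_loss",   # early_stopping_callback_params.monitor
    mode="min",
    min_delta=0.001,
    patience=10,          # stop after 10 validation checks without a >= min_delta improvement
    verbose=True,
    strict=False,         # don't error if the monitored metric is missing, e.g. right after resuming
)
# trainer = pytorch_lightning.Trainer(callbacks=[early_stopping], ...)
```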
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - answer_only_loss: True - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - - peft: - peft_scheme: "adapter" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. 
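Editor's note: the `lora_tuning` block above (adapter_dim as the low-rank dimension, xavier/zero init) corresponds to the standard LoRA update, W·x plus B(A·x) with A xavier-initialized and B zero-initialized. Below is a conceptual PyTorch sketch, not NeMo's implementation; the class name is a placeholder.

```python
# Conceptual LoRA sketch: a low-rank update added to a frozen linear layer.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 32, dropout: float = 0.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                                    # the pretrained weight stays frozen
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)    # column_init_method: 'xavier'
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)   # row_init_method: 'zero'
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_uniform_(self.lora_a.weight)
        nn.init.zeros_(self.lora_b.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Zero-initialized B means the update is a no-op at the start of training.
        return self.base(x) + self.lora_b(self.lora_a(self.dropout(x)))

layer = LoRALinear(nn.Linear(1024, 1024), rank=32)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only the low-rank factors train
```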
- global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' - label_key: 'output' - add_eos: True - add_sep: False - add_bos: False - truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. 
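Editor's note: `prompt_template`, `label_key`, and `truncation_field` in the data sections above describe how each JSONL record becomes a training string. The sketch below is a simplified illustration of that assembly; the real NeMo dataset additionally handles tokenization, BOS/EOS/SEP tokens, token-level truncation, and loss masking.

```python
# Simplified illustration of prompt_template / label_key; not the actual NeMo dataset code.
import json

prompt_template = "{input} {output}"   # from the config; e.g. "Q: {input}\nA: {output}" for QA-style data
label_key = "output"

record = json.loads(
    '{"input": "John von Neumann made fundamental contributions... '
    'Q: What did the math of artificial viscosity do?", '
    '"output": "smoothed the shock transition without sacrificing basic physics"}'
)

full_text = prompt_template.format(**record)         # what the model is trained on
context_only = prompt_template.split("{" + label_key + "}")[0].format(**record)  # prompt without the answer

print(full_text)
print(context_only)
# With answer_only_loss: True, the loss is only computed on tokens after `context_only`.
```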
- prompt_template: ${model.data.train_ds.prompt_template} - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml deleted file mode 100644 index 27e73996225fb1531c64cdd5daa8632b979eeb9f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml +++ /dev/null @@ -1,191 +0,0 @@ -name: megatron_gpt_sft - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 2 - mode: max - save_nemo_on_train_end: False - filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
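Editor's note: the `max_steps` comment in these trainer blocks repeats the relationship consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches. The helpers below are illustrative, not NeMo utilities; they just sanity-check that arithmetic and the usual divisibility constraint between global and micro batch size.

```python
# Batch-size bookkeeping sketch; helper names are illustrative, not NeMo APIs.
def consumed_samples(global_step: int, micro_batch_size: int,
                     data_parallel_size: int, accumulate_grad_batches: int = 1) -> int:
    return global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches

def check_batch_config(global_batch_size: int, micro_batch_size: int, data_parallel_size: int) -> int:
    """Return the gradient-accumulation factor implied by the config, or raise if it doesn't divide evenly."""
    per_step = micro_batch_size * data_parallel_size
    if global_batch_size % per_step:
        raise ValueError(f"global_batch_size={global_batch_size} not divisible by "
                         f"micro_batch_size*data_parallel_size={per_step}")
    return global_batch_size // per_step

# With the tuning config defaults (global 128, micro 4) on a single GPU:
print(check_batch_config(128, 4, data_parallel_size=1))   # -> 32 accumulation steps per global batch
print(consumed_samples(20_000, 4, 1, 32))                 # samples seen by max_steps=20000
```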
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. - answer_only_loss: False # not used right now - gradient_as_bucket_view: False - seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value - use_flash_attention: null # if not None, will match the base model's value - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - - data: - chat: False # whether use chatbot data or not - chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. This is not supported, try to avoid - system_turn_start: '' - turn_start: '' - label_start: '' - end_of_turn: "\x0A" # \0x0A is '\n' - end_of_name: "\x0A" # \0x0A is '\n' - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 4 - memmap_workers: null - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' - label_key: 'output' - add_eos: True - add_sep: False - add_bos: False - truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. 
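Editor's note: `concat_sampling_probabilities` above controls how examples are drawn when several JSONL files are blended with strategy='random'. Roughly, dataset indices are sampled according to those weights, as in the sketch below (illustrative only, not NeMo's blending implementation; the file paths are the placeholder paths from the config comments).

```python
# Illustration of sampling across concatenated datasets; not NeMo's blending implementation.
import random

file_names = ["/path/to/squad.jsonl", "/path/to/mnli.jsonl", "/path/to/boolq.jsonl"]
concat_sampling_probabilities = [0.5, 0.25, 0.25]   # must sum to 1 and match file_names in length

assert len(file_names) == len(concat_sampling_probabilities)
assert abs(sum(concat_sampling_probabilities) - 1.0) < 1e-6

random.seed(1234)
picks = random.choices(range(len(file_names)), weights=concat_sampling_probabilities, k=10_000)
for i, name in enumerate(file_names):
    print(name, picks.count(i) / len(picks))   # empirical fractions approach the configured probabilities
```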
- global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 4 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. - truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss', 'rouge', 'token_f1'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - test_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 4 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: ${model.data.train_ds.max_seq_length} - min_seq_length: 1 - drop_last: False - label_key: ${model.data.train_ds.label_key} - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. - truncation_method: 'right' # Truncation from which position, Options: Options: ['left', 'right'] - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. 
- lr: 3e-5 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - compute_attention_mask: True \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_inference.yaml deleted file mode 100644 index fcd92a40197067d60f3c09187b5b35befc0ec16c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_inference.yaml +++ /dev/null @@ -1,37 +0,0 @@ -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -data: - test_ds: ??? - num_workers: 1 - global_batch_size: 4 - micro_batch_size: 4 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -language_model_path: ??? # GPT nemo file path # used when starting from a .nemo file -adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py) -pred_file_path: null # save predictions to this file -checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -batch_size: 8 - diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_tuning_config.yaml deleted file mode 100644 index 5fc411f61b11f5ccd73fc230a9d61a78a1ecc0ee..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_adapter_tuning_config.yaml +++ /dev/null @@ -1,135 +0,0 @@ -name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_adapter_dim${model.adapter_tuning.adapter_dim} - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 - max_steps: 100 - log_every_n_steps: 10 - val_check_interval: 20 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 1 - mode: min - save_nemo_on_train_end: True # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below - filename: "megatron_t5_adapter_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - seed: 1234 - nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'no-prompts' #'prompt-tuning' # adapter tuning requires no virtual prompts - encoder_seq_length: 2048 - gradient_as_bucket_view: false - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 8 - micro_batch_size: 4 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - report_validation_metric: False - validation_metric: accuracy - - restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required - existing_tasks: [] - new_tasks: ["squad"] - - task_templates: - - taskname: "boolq" # The task name - prompt_template: "Passage: {passage} \nQuestion: {question} \nAnswer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> - total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. - virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens - truncate_field: "passage" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. 
- answer_field: "answer" - - - taskname: "intent_and_slot" - prompt_template: "intent options: {intent_options} slot options: {slot_options} {utterance} \nintent: {intent} \nslot: {slot}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - - - taskname: "rte" - prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - - taskname: "squad" - prompt_template: "context: {context} question: {question} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - - taskname: "arc-challenge" - prompt_template: "question: {question} choices: {choices} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - - taskname: "xsum" - prompt_template: "{source} Summary: {target}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "target" - - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 50 - adapter_dropout: 0.1 - norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - - data: - train_ds: ??? - validation_ds: ??? - add_eos: True - shuffle: True - num_workers: 8 - pin_memory: True - - - optim: - name: fused_adam - lr: 1e-3 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 - min_lr: 0.0 - monitor: val_loss - reduce_on_plateau: false - - diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_inference.yaml deleted file mode 100644 index fcd92a40197067d60f3c09187b5b35befc0ec16c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_inference.yaml +++ /dev/null @@ -1,37 +0,0 @@ -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -data: - test_ds: ??? 
- num_workers: 1 - global_batch_size: 4 - micro_batch_size: 4 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -language_model_path: ??? # GPT nemo file path # used when starting from a .nemo file -adapter_model_file: ??? # .nemo file saved during training (using megatron_gpt_adapter_tuning.py) -pred_file_path: null # save predictions to this file -checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -batch_size: 8 - diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_tuning_config.yaml deleted file mode 100644 index 5c12993bd12e7d3030c9166755f604f7178b732a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_ia3_tuning_config.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: ia3_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs} - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: -1 - max_steps: 100 - log_every_n_steps: 10 - val_check_interval: 20 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: "megatron_t5_ia3_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - seed: 1234 - nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'no-prompts' #'prompt-tuning' # adapter tuning requires no virtual prompts - encoder_seq_length: 2048 - gradient_as_bucket_view: false - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 4 - micro_batch_size: 2 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - report_validation_metric: False - - restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? 
# Path to the pretrained T5 language model .nemo file, always required - existing_tasks: [] - new_tasks: ["squad"] - - task_templates: - - taskname: "squad" - prompt_template: "context: {context} question: {question} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - - taskname: "arc-challenge" - prompt_template: "question: {question} choices: {choices} answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - - taskname: "xsum" - prompt_template: "{source} Summary: {target}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "target" - - - taskname: "rte" - prompt_template: "sentence1: {premise} sentence2: {hypothesis} Answer: {answer}" - total_virtual_tokens: 0 - virtual_token_splits: [] - truncate_field: null - answer_field: "answer" - - data: - train_ds: ["data/squad_train.jsonl"] - validation_ds: ["data/squad_val.jsonl"] - add_eos: True - shuffle: True - num_workers: 8 - pin_memory: True - - - optim: - name: fused_adam - lr: 1e-3 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 - min_lr: 0.0 - monitor: val_loss - reduce_on_plateau: false - - diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml deleted file mode 100644 index 008241d193891310b109894c939c87ccbb785086..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_inference.yaml +++ /dev/null @@ -1,36 +0,0 @@ -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt - tokens_to_generate: 30 # The minimum length of the sequence to be generated. - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. - compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -data: - test_ds: ??? - num_workers: 1 - global_batch_size: 4 - micro_batch_size: 4 - -tensor_model_parallel_size: -1 -pipeline_model_parallel_size: -1 -pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -language_model_path: ??? # GPT nemo file path # used when starting from a .nemo file -adapter_model_file: ??? # .nemo file saved during training (using megatron_t5_lora_tuning.py) -pred_file_path: null # save predictions to this file -checkpoint_dir: null # checkpoint file dir. 
This is used to load the PTL checkpoint generated during the GPT training -checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading -hparams_file: null # model configuration file, only used for PTL checkpoint loading -batch_size: 8 diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml deleted file mode 100644 index 8f46f1f3720d6c9dc22c78c9a23c18e4f76041e0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_lora_tuning_config.yaml +++ /dev/null @@ -1,99 +0,0 @@ -name: adapter_tuning_${model.new_tasks[0]}_max_epochs${trainer.max_epochs}_lora_dim${model.lora_tuning.kqv_adapter_dim} - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False - enable_checkpointing: False - replace_sampler_ddp: False - max_epochs: 10 - max_steps: 1000 - log_every_n_steps: 1 - val_check_interval: 2 - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - benchmark: False - -exp_manager: - explicit_log_dir: null - exp_dir: nemo-lora-mt0-tr - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_from_checkpoint: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: reduced_train_loss - save_top_k: 1 - mode: min - save_nemo_on_train_end: True # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below - filename: "megatron_t5_adapter_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" - model_parallel_size: ${model.tensor_model_parallel_size} - save_best_model: True - -model: - seed: 1234 - nemo_path: ${exp_manager.exp_dir}/${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved - virtual_prompt_style: 'no-prompts' #'prompt-tuning' # adapter tuning requires no virtual prompts - encoder_seq_length: 2048 - gradient_as_bucket_view: false - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - global_batch_size: 4 - micro_batch_size: 4 - validation_global_batch_size: ${model.global_batch_size} - validation_micro_batch_size: ${model.micro_batch_size} - validation_drop_last: False - report_validation_metric: False - validation_metric: accuracy - - restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required - existing_tasks: [] - new_tasks: ["taskname"] - - task_templates: - - taskname: "taskname" # The task name - prompt_template: "{prompt} {completion}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> - total_virtual_tokens: 0 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. - virtual_token_splits: [] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens - truncate_field: "prompt" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. 
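Editor's note: `truncate_field` in the task templates above names the field whose text is trimmed when the formatted prompt exceeds the sequence length (examples are skipped when it is null). The sketch below is a word-level simplification written for illustration; the real code truncates at the token level.

```python
# Word-level illustration of truncate_field; NeMo truncates tokens, not words.
def build_prompt(template: str, example: dict, truncate_field: str, max_words: int) -> str:
    text = template.format(**example)
    overflow = len(text.split()) - max_words
    if overflow > 0 and truncate_field is not None:
        words = example[truncate_field].split()
        example = {**example, truncate_field: " ".join(words[:max(len(words) - overflow, 0)])}
        text = template.format(**example)
    return text

example = {"prompt": "a very long instruction " * 50, "completion": "short answer"}
print(len(build_prompt("{prompt} {completion}", example, truncate_field="prompt", max_words=64).split()))
```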
- answer_field: "completion" - - lora_tuning: - kqv_adapter_dim: 24 - kv_adapter_dim: 16 - q_adapter_dim: 8 - adapter_dropout: 0.1 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - - data: - train_ds: ??? - validation_ds: ??? - shuffle: True - num_workers: 0 - pin_memory: True - add_eos: True - - - optim: - name: fused_adam - lr: 1e-3 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - constant_steps: 0 - min_lr: 0.0 - monitor: val_loss - reduce_on_plateau: false diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_eval_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_eval_config.yaml deleted file mode 100644 index 0ef90a2343fa2c41a4edbcdd8b3dad26cb6b3b2b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_eval_config.yaml +++ /dev/null @@ -1,213 +0,0 @@ -name: megatron_t5_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.test_ds.metric.name} - save_top_k: 1 - mode: max - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: True - save_best_model: False - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 1 - micro_batch_size: 1 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
- sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - answer_only_loss: False # not used right now - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - - peft: - peft_scheme: "adapter" # can be either adapter,ia3, or ptuning - restore_from_path: null - restore_from_ckpt_name: null - restore_from_hparams_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - data: - test_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: ??? # Names of the corresponding datasets used to log metrics. 
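Editor's note: the `p_tuning` block above (virtual_tokens, bottleneck_dim, embedding_dim, init_std) describes a small prompt encoder whose output embeddings are prepended to the input of the frozen model. The sketch below is conceptual, not NeMo's module; the hidden_size of 4096 is an assumed LM hidden size.

```python
# Conceptual p-tuning prompt encoder: learned virtual-token embeddings passed through an MLP bottleneck.
import torch
import torch.nn as nn

class PromptEncoder(nn.Module):
    def __init__(self, virtual_tokens: int = 10, embedding_dim: int = 1024,
                 bottleneck_dim: int = 1024, hidden_size: int = 4096, init_std: float = 0.023):
        super().__init__()
        self.embeddings = nn.Parameter(torch.randn(virtual_tokens, embedding_dim) * init_std)
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Linear(bottleneck_dim, hidden_size),   # project up to the frozen LM's hidden size (assumed 4096)
        )

    def forward(self, batch_size: int) -> torch.Tensor:
        virtual = self.mlp(self.embeddings)                     # (virtual_tokens, hidden_size)
        return virtual.unsqueeze(0).expand(batch_size, -1, -1)  # prepended to the real token embeddings

encoder = PromptEncoder()
print(encoder(batch_size=2).shape)   # torch.Size([2, 10, 4096])
```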
- global_batch_size: 1 #${model.global_batch_size} - micro_batch_size: 1 #${model.micro_batch_size} - shuffle: False - num_workers: 0 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - context_key: 'input' - label_key: ${data.train_ds.label_key} - add_eos: ${data.train_ds.add_eos} - add_sep: ${data.train_ds.add_sep} - add_bos: ${data.train_ds.add_bos} - separate_prompt_and_response_with_newline: ${data.train_ds.separate_prompt_and_response_with_newline} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: ${data.train_ds.truncation_field} # Options: keys in prompt_template index_mapping_dir: null # Path to a directory to write index mapping files. - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${data.train_ds.prompt_template} - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - -inference: - greedy: True # Whether or not to use sampling ; use greedy decoding otherwise - top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - temperature: 1.0 # sampling temperature - all_probs: False # whether return the log prob for all the tokens in vocab - repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. - min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
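Editor's note: the sampling parameters just above (greedy, top_k, top_p, temperature) follow the usual logit-filtering recipe. The sketch below shows that filtering step in generic PyTorch for illustration; it is not NeMo's generation code.

```python
# Standard temperature / top-k / top-p filtering over a logits vector (illustrative only).
import torch

def filter_logits(logits: torch.Tensor, temperature: float = 1.0,
                  top_k: int = 0, top_p: float = 0.9) -> torch.Tensor:
    logits = logits / max(temperature, 1e-5)
    if top_k > 0:                                   # keep only the k highest-probability tokens
        kth = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    if 0.0 < top_p < 1.0:                           # nucleus filtering: smallest set with cumulative prob >= top_p
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumprobs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        to_drop = cumprobs > top_p
        to_drop[..., 1:] = to_drop[..., :-1].clone()   # always keep the most likely token
        to_drop[..., 0] = False
        logits = logits.masked_fill(to_drop.scatter(-1, sorted_idx, to_drop), float("-inf"))
    return logits

logits = torch.randn(32000)
next_token = torch.multinomial(torch.softmax(filter_logits(logits), dim=-1), 1)  # greedy would take argmax
```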
- compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - outfile_path: output.txt - compute_attention_mask: True - -# server-related configs -server: False # whether launch the API server -port: 5555 # the port number for the inference server -web_server: False # whether launch the web inference server -share: True # whether create a public URL -username: test # user name for web client -password: test2 # password for web client -web_port: 9889 # the port number of the web server 1058 -chat: False # use the chat interface -chatbot_config: - value: False # whether to inject the value attributes - attributes: - - name: Quality - min: 0 - max: 4 - key: quality - type: int - default: 4 - - name: Toxicity - min: 0 - max: 4 - key: toxcity - type: int - default: 0 - - name: Humor - min: 0 - max: 4 - key: humor - type: int - default: 0 - - name: Creativity - min: 0 - max: 4 - key: creativity - type: int - default: 0 - - name: Violence - min: 0 - max: 4 - key: violence - type: int - default: 0 - - name: Helpfulness - min: 0 - max: 4 - key: helpfulness - type: int - default: 4 - - name: Not_Appropriate - min: 0 - max: 4 - key: not_appropriate - type: int - default: 0 - - name: Language - choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] - key: lang - type: list - default: en - - user: User - assistant: Assistant - system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_tuning_config.yaml b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_tuning_config.yaml deleted file mode 100644 index d0ee2c4178563fad4f5c306285683ae497d3338b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/conf/megatron_t5_peft_tuning_config.yaml +++ /dev/null @@ -1,220 +0,0 @@ -name: megatron_t5_peft_${model.peft.peft_scheme}_tuning - -trainer: - devices: 1 - accelerator: gpu - num_nodes: 1 - precision: 16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 9999 - max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} - save_top_k: 1 - mode: min - save_nemo_on_train_end: True - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${model.tensor_model_parallel_size} - always_save_nemo: False - save_best_model: True - create_early_stopping_callback: True - early_stopping_callback_params: - monitor: "val_loss" - mode: "min" - min_delta: 0.001 - patience: 10 - verbose: True - strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. - -model: - seed: 1234 - tensor_model_parallel_size: 1 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism - - global_batch_size: 128 - micro_batch_size: 4 - restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. - sync_batch_comm: False - megatron_amp_O2: False - - ## Sequence Parallelism - # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially - # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - sequence_parallel: False - - ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation - # of each chunk at the specified granularity - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_num_layers: null # not used with 'selective' - activations_checkpoint_layers_per_pipeline: null - answer_only_loss: True - gradient_as_bucket_view: False - - hidden_dropout: 0.0 - attention_dropout: 0.0 - ffn_dropout: 0.0 - - peft: - peft_scheme: "adapter" # can be either adapter,ia3, or ptuning - restore_from_path: null - - # Used for adapter peft training - adapter_tuning: - type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' - adapter_dim: 32 - adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] - layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. 
null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - lora_tuning: - adapter_dim: 32 - adapter_dropout: 0.0 - column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal - row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal - layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - weight_tying: False - position_embedding_strategy: null # used only when weight_tying is True - - # Used for p-tuning peft training - p_tuning: - virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence - bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck - embedding_dim: 1024 # the size of the prompt encoder embeddings - init_std: 0.023 - - ia3_tuning: - layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers - - data: - train_ds: - # Example of how to specify paths to multiple datasets - # file_names: - # - /path/to/squad.jsonl - # - /path/to/mnli.jsonl - # - /path/to/boolq.jsonl - # Example of how each dataset is formatted - # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: ??? # Path to a list of JSONL files corresponding to the source data. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: True - num_workers: 0 - memmap_workers: 2 - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: True - # Example of how to specify concat_sampling_probabilities - # concat_sampling_probabilities: - # - 0.5 - # - 0.25 - # - 0.25 - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' - context_key: 'input' - label_key: 'output' - add_eos: True - add_sep: False - add_bos: False - separate_prompt_and_response_with_newline: False - truncation_field: "context" # Options: ['context', 'answer'] - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - - validation_ds: - file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - context_key: 'input' - label_key: 'output' - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. 
- truncation_field: "context" # Options: ['context', 'answer'] - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - test_ds: - file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: null # Names of the corresponding datasets used to log metrics. - global_batch_size: ${model.global_batch_size} - micro_batch_size: ${model.micro_batch_size} - shuffle: False - num_workers: 0 - memmap_workers: ${model.data.train_ds.memmap_workers} - pin_memory: True - max_seq_length: 2048 - min_seq_length: 1 - drop_last: False - context_key: 'input' - label_key: 'output' - add_eos: ${model.data.train_ds.add_eos} - add_sep: ${model.data.train_ds.add_sep} - add_bos: ${model.data.train_ds.add_bos} - separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: "context" # Options: ['context', 'answer'] - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${model.data.train_ds.prompt_template} - tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 50 - min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 - constant_steps: 0 # Constant steps should also be 0 when min_lr=0 - monitor: val_loss - reduce_on_plateau: false \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_eval.py deleted file mode 100644 index d08f4291321adfe5b234422c377f082252081a5f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_eval.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import os - -import torch -import torch.multiprocessing as mp -from megatron.core import parallel_state -from omegaconf import OmegaConf -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated - -mp.set_start_method("spawn", force=True) - -""" -This is the script to run an Adapter Tuned GPT Model for text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference using a base gpt nemo file, and an adapter nemo file: - python megatron_gpt_adapter_eval.py \ - gpt_model_file=PATH TO GPT MODEL NEMO FILE \ - adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_adapter_tuning.py) \ - data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ - pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronGPTSFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_gpt_peft_tuning.py` and `megatron_gpt_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_gpt_adapter_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - # Load an adapter model, must be provided in config - if cfg.get("adapter_model_file", None) is not None: - # Update frozen GPT model path in case it has changed - adapter_tuning_cfg = MegatronGPTAdapterLearningModel.restore_from( - cfg.adapter_model_file, trainer=trainer, return_config=True - ) - with open_dict(adapter_tuning_cfg): - adapter_tuning_cfg.language_model_path = cfg.gpt_model_file - - # Now load prompt learning model with frozen gpt model base - model = MegatronGPTAdapterLearningModel.restore_from( - restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=adapter_tuning_cfg - ) - - # Or load regular GPT model - else: - raise NotImplementedError( - "This script is meant for inference from an Adapter Tuned GPT Model, for inference from a Megatron GPT model, refer to ../megatron_gpt_eval.py" 
- ) - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - max_input_length = model.frozen_model.cfg.encoder_seq_length - cfg.inference.tokens_to_generate - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if trainer.strategy.launcher is not None: - trainer.strategy.launcher.launch(dummy, trainer=trainer) - trainer.strategy.setup_environment() - - _, dataloader = model.build_virtual_prompt_dataset( - data=cfg.data_paths, - batch_size=cfg.get("batch_size", 1), - max_seq_length=max_input_length, - min_seq_length=model.cfg.data.get('min_seq_length', 1), - add_bos=cfg.inference.add_BOS, - add_eos=False, - for_train=False, - tokens_to_generate=cfg.inference.tokens_to_generate, - drop_last=False, - shuffle=False, - num_workers=cfg.get("num_workers", 1), - ) - - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, dataloader) - print("***************************") - if cfg.pred_file_path is not None: - with open(cfg.pred_file_path, "w", encoding="utf-8") as f: - for batch in response: - for sentence in batch['sentences']: - s = ' '.join(sentence.split('\n')) - f.write(s + "\n") - print("predictions saved to {}".format(cfg.pred_file_path)) - else: - print(response) - print("***************************") - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() # noqa pylint: disable=no-value-for-parameter - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py deleted file mode 100644 index 15a7ed800ce4aea4251c8481b0875dac94173a29..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
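A minimal sketch, assuming the dump format used by the adapter eval script above (one generated sentence per line, internal newlines collapsed to single spaces), of how such a predictions file could be read back; the helper name is illustrative and not part of NeMo:

from typing import List


def read_predictions(pred_file_path: str) -> List[str]:
    # Each non-empty line holds one generated sentence, because the eval script
    # joins sentence.split('\n') with spaces before writing it out.
    predictions: List[str] = []
    with open(pred_file_path, "r", encoding="utf-8") as f:
        for line in f:
            text = line.rstrip("\n")
            if text:
                predictions.append(text)
    return predictions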
- - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronGPTSFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_gpt_peft_tuning.py` and `megatron_gpt_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_gpt_adapter_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronGPTAdapterLearningModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronGPTAdapterLearningModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py deleted file mode 100644 index 2a6ca4b4804931695388bf9674dae74f62cb2038..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
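A rough equivalence sketch, with placeholder paths and run name, of the config fields that the Hydra-style overrides in the adapter tuning usage string above populate; this only illustrates the mapping and is not a substitute for the full megatron_gpt_adapter_tuning_config.yaml:

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "name": "my_adapter_run",                       # name=...
        "exp_manager": {"exp_dir": "/path/to/ckpts"},   # exp_manager.exp_dir=...
        "trainer": {"max_epochs": 2},                   # trainer.max_epochs=2
        "model": {
            "language_model_path": "/path/to/base_gpt.nemo",
            "data": {
                "train_ds": ["/path/to/train.jsonl"],
                "validation_ds": ["/path/to/val.jsonl"],
            },
        },
    }
)
print(OmegaConf.to_yaml(cfg))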
- -import os -import torch -import torch.multiprocessing as mp -from megatron.core import parallel_state -from omegaconf import OmegaConf -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.config import hydra_runner - -from nemo.utils import logging -from nemo.utils.decorators import deprecated - -mp.set_start_method("spawn", force=True) - -""" -This is the script to run an Adapter Tuned GPT Model for text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference using a base gpt nemo file, and an adapter nemo file: - python megatron_gpt_ia3_eval.py \ - gpt_model_file=PATH TO GPT MODEL NEMO FILE \ - adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_ia3_tuning.py) \ - data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ - pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronGPTSFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_gpt_peft_tuning.py` and `megatron_gpt_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_gpt_adapter_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.gpt_model_file): - save_restore_connector.model_extracted_dir = cfg.gpt_model_file - model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - # Load an adapter model, must be provided in config - if cfg.get("adapter_model_file", None) is not None: - # Update frozen GPT model path in case it has changed - ia3_tuning_cfg = MegatronGPTInfusedAdapterModel.restore_from( - cfg.adapter_model_file, trainer=trainer, return_config=True - ) - with open_dict(ia3_tuning_cfg): - ia3_tuning_cfg.language_model_path = cfg.gpt_model_file - - # Now load prompt learning model with frozen gpt model base - model = MegatronGPTInfusedAdapterModel.restore_from( - restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=ia3_tuning_cfg - ) - - # Or load regular GPT model - else: - raise NotImplementedError( - "This script is meant for inference from an Adapter Tuned GPT Model, for inference from a Megatron GPT model, refer to ../megatron_gpt_eval.py" - ) - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - 
model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - max_input_length = model.frozen_model.cfg.encoder_seq_length - cfg.inference.tokens_to_generate - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if trainer.strategy.launcher is not None: - trainer.strategy.launcher.launch(dummy, trainer=trainer) - trainer.strategy.setup_environment() - - _, dataloader = model.build_virtual_prompt_dataset( - data=cfg.data_paths, - batch_size=cfg.get("batch_size", 1), - max_seq_length=max_input_length, - min_seq_length=model.cfg.data.get('min_seq_length', 1), - add_bos=cfg.inference.add_BOS, - add_eos=False, - for_train=False, - tokens_to_generate=cfg.inference.tokens_to_generate, - drop_last=False, - shuffle=False, - num_workers=cfg.get("num_workers", 1), - ) - - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, dataloader) - print("***************************") - if cfg.pred_file_path is not None: - with open(cfg.pred_file_path, "w", encoding="utf-8") as f: - for batch in response: - for sentence in batch['sentences']: - s = ' '.join(sentence.split('\n')) - f.write(s + "\n") - print("predictions saved to {}".format(cfg.pred_file_path)) - else: - print(response) - print("***************************") - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() # noqa pylint: disable=no-value-for-parameter - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py deleted file mode 100644 index b55739b08794d2db706efe8287ddf07b5637e65a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
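A small worked example, with illustrative numbers, of the max_input_length computation used by the eval scripts above: the prompt budget is whatever remains of the encoder sequence length after reserving room for the generated tokens.

encoder_seq_length = 2048   # e.g. model.frozen_model.cfg.encoder_seq_length
tokens_to_generate = 30     # e.g. cfg.inference.tokens_to_generate
max_input_length = encoder_seq_length - tokens_to_generate
print(max_input_length)     # 2018 tokens left for the prompt itself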
- - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronGPTSFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_gpt_peft_tuning.py` and `megatron_gpt_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_gpt_ia3_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronGPTInfusedAdapterModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronGPTInfusedAdapterModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - dep_msg = "* Please switch to using examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py *" - dep = "Deprecation Notice!!".center(len(dep_msg) - 2, " ") - banner = "*" * len(dep_msg) - spacer = " " * (len(dep_msg) - 2) - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") - main() - logging.warning(f"\n\n{banner}\n*{spacer}*\n*{dep}*\n{dep_msg}\n*{spacer}*\n{banner}\n\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py deleted file mode 100644 index 8098e62684eea1e42161d31a18d605ecdc601176..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
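An illustrative summary, not an exhaustive reference, of how the peft_scheme values mentioned in these tuning scripts line up with the per-scheme sections of the PEFT tuning config shown earlier:

peft_sections = {
    "adapter": "model.peft.adapter_tuning",  # parallel_adapter / linear_adapter settings
    "lora": "model.peft.lora_tuning",
    "ptuning": "model.peft.p_tuning",        # prompt-encoder virtual tokens
    "ia3": "model.peft.ia3_tuning",
}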
- - -import asyncio -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf - - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.core.config import hydra_runner -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except: - pass - -mp.set_start_method("spawn", force=True) -""" -This is the script to run inference with a PEFT model or an SFT Model. - -If you want to evaluate an SFT .nemo file: - -python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ - model.restore_from_path= \ - model.peft.restore_from_path=null \ - trainer.devices=1 model.data.test_ds.file_names=\[, ] \ - model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier - model.data.test_ds.global_batch_size=4 \ # or some other value - model.data.test_ds.micro_batch_size=4 \ - model.data.test_ds.tokens_to_generate=30 \ - inference.greedy=True \ - inference.outfile_path=\'' - -If you want to evaluate a PEFT Model, you should provide a base GPT model and a PEFT model .nemo file - -python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ - model.restore_from_path= \ - model.peft.restore_from_path= \ # this will be created if you use `megatron_gpt_peft_tuning.py` - trainer.devices=1 model.data.test_ds.file_names=\[, ] \ - model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier - model.data.test_ds.global_batch_size=4 \ # or some other value - model.data.test_ds.micro_batch_size=4 \ - model.data.test_ds.tokens_to_generate=30 \ - inference.greedy=True \ - inference.outfile_path=\'' - -""" - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, daemon=True, args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_peft_eval_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - trainer = 
MegatronLMPPTrainerBuilder(cfg).create_trainer() - - if cfg.model.peft.restore_from_path: - model_cfg = MegatronGPTSFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - else: - model_cfg = MegatronGPTSFTModel.merge_inference_cfg(cfg.model.restore_from_path, cfg) - - model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - if cfg.model.peft.restore_from_path: - model.load_adapters(cfg.model.peft.restore_from_path) - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py deleted file mode 100644 index cffe974a071ad8f03c351180a110d678d4279500..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP - -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to finetuning a GPT Model with any PEFT method. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. 
run a training run for a base gpt nemo file: - python megatron_gpt_peft_tuning.py \ - "model.data.train_ds.file_names=[PATH TO TRAINING JSONL FILE]", - "model.data.train_ds.concat_sampling_probabilities=[SAMPLING VAL]", - "model.data.validation_ds.file_names=[PATH TO VALIDATION JSONL FILE]", - "model.data.validation_ds.names=[NAME FOR METRIC LOGGING]", - model.restore_from_path="PATH TO BASE GPT MODEL .nemo FILE" - model.peft.peft_scheme='lora' # lora, ptuning, adapter, ia3, or none for full fineutning - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", -Please see lora.ipynb for a step-by-step guide. -""" - - -@hydra_runner(config_path="conf", config_name="megatron_gpt_peft_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint instead of randomly - # This is not the same as resume training because optimizer states are not restored. - logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - model.add_adapter(peft_cfg_cls(model_cfg)) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py deleted file mode 100644 index 4c4c001014dba9a81a7bc7f8a96b92b1112e2dfd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
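A hypothetical data-preparation sketch (the file name and records are illustrative): the train_ds and validation_ds file_names consumed by the tuning scripts above point to JSONL files whose records carry an 'input' and an 'output' field, which the configured prompt_template (default "{input} {output}") joins at training time.

import json

records = [
    {"input": "Q: What does adapter tuning train?", "output": "Only the inserted adapter weights."},
    {"input": "Q: Are the base GPT weights updated?", "output": "No, they stay frozen."},
]

# Write one JSON object per line, as expected by model.data.train_ds.file_names.
with open("train.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")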
- -import os -import tempfile - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import get_prompt_template_example -from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import AppState, logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager -from nemo.utils.model_utils import inject_model_parallel_rank - -mp.set_start_method("spawn", force=True) - - -def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original gpt pre-training config (gpt_cfg) with attributes from the finetuning config (cfg). - The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. - """ - OmegaConf.set_struct(gpt_cfg, True) - OmegaConf.resolve(cfg) - with open_dict(gpt_cfg): - gpt_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - gpt_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - gpt_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) - gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) - gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) - gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) - gpt_cfg.activations_checkpoint_layers_per_pipeline = cfg.model.get( - "activations_checkpoint_layers_per_pipeline", None - ) - gpt_cfg.data = cfg.model.data - gpt_cfg.optim = cfg.model.optim - gpt_cfg.precision = cfg.trainer.precision - gpt_cfg.answer_only_loss = cfg.model.answer_only_loss - gpt_cfg.restore_from_path = cfg.model.restore_from_path - gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint - gpt_cfg.save_nemo_on_validation_end = cfg.model.save_nemo_on_validation_end - gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view - gpt_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.0) - gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0) - gpt_cfg.ffn_dropout = cfg.model.ffn_dropout - gpt_cfg.use_flash_attention = cfg.model.get('use_flash_attention', False) - gpt_cfg.tensor_model_parallel_size = cfg.model.get('tensor_model_parallel_size', 1) - gpt_cfg.pipeline_model_parallel_size = cfg.model.get('pipeline_model_parallel_size', 1) - gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get('pipeline_model_parallel_split_rank', 0) - - if cfg.model.data.get('chat', False): - # chat model, overwrite the prompt template - prompt_template = get_prompt_template_example(cfg.model.data.chat_prompt_tokens) - gpt_cfg.data.train_ds.prompt_template = prompt_template - gpt_cfg.data.validation_ds.prompt_template = prompt_template - gpt_cfg.data.test_ds.prompt_template = prompt_template - - sft_cls = 
MegatronGPTSFTModel - gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" - - if cfg.model.get('use_flash_attention', None) is not None: - gpt_cfg.use_flash_attention = cfg.model.use_flash_attention - - if cfg.model.get('seq_len_interpolation_factor', None) is not None: - gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor - - sft_cls = MegatronGPTSFTModel - gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" - - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. - if add_cfg_to_tree: - OmegaConf.resolve(gpt_cfg) - gpt_cfg.cfg = gpt_cfg - - return gpt_cfg - - -def load_from_nemo(cls, cfg, trainer, gpt_cfg, modify_confg_fn): - gpt_cfg = modify_confg_fn(gpt_cfg, cfg, add_cfg_to_tree=False) - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.model.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - model = cls.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=gpt_cfg, - save_restore_connector=save_restore_connector, - ) - return model - - -def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): - app_state = AppState() - if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size - app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size - app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, - ) - checkpoint_path = inject_model_parallel_rank( - os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) - ) - hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) - gpt_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) - with tempfile.NamedTemporaryFile(suffix='.yaml') as f: - OmegaConf.save(config=gpt_cfg, f=f.name) - model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) - return model - - -def validate_checkpoint_loading_args(cfg): - if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): - raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') - if cfg.checkpoint_name is None: - raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') - if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): - raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') - - -@deprecated( - explanation=f"{__file__} is deprecated. PEFT and SFT scripts are now consolidated" - "See updated scripts `megatron_gpt_peft_tuning.py` and `megatron_gpt_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_gpt_sft") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name', 'fused_adam') == 'distributed_fused_adam' - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - if cfg.model.restore_from_path: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.model.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - gpt_cfg = MegatronGPTSFTModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - return_config=True, - save_restore_connector=save_restore_connector, - ) - model = load_from_nemo(MegatronGPTSFTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config) - else: - validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronGPTSFTModel, cfg, trainer, modify_confg_fn=_modify_config) - - if 'inference' in cfg: - if not cfg.model.use_flash_attention: - cfg.inference.compute_attention_mask = True - config = OmegaConf.to_container(cfg.inference, resolve=True) - model.set_inference_config(config) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py deleted file mode 100644 index 5fd07e85ce2df506e205a6eaa100d9df7507cb24..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch -import torch.multiprocessing as mp -from megatron.core import parallel_state -from omegaconf import OmegaConf -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated - -mp.set_start_method("spawn", force=True) - -""" -This is the script to run an Adapter Tuned GPT Model for text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference using a base gpt nemo file, and an adapter nemo file: - python megatron_gpt_ia3_eval.py \ - gpt_model_file=PATH TO GPT MODEL NEMO FILE \ - adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_ia3_tuning.py) \ - data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ - pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_t5_adapter_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronT5AdapterLearningModel.restore_from( - restore_path=cfg.language_model_path, trainer=trainer, return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - - # Load an adapter model, must be provided in config - if cfg.get("adapter_model_file", None) is not None and cfg.get("language_model_path", None) is not None: - # Update frozen GPT model path in case it has changed - adapter_tuning_cfg = MegatronT5AdapterLearningModel.restore_from( - cfg.adapter_model_file, trainer=trainer, return_config=True - ) - with open_dict(adapter_tuning_cfg): - adapter_tuning_cfg.language_model_path = cfg.language_model_path - adapter_tuning_cfg.pretrained_language_model_path = cfg.language_model_path - adapter_tuning_cfg.micro_batch_size = cfg.data.micro_batch_size - adapter_tuning_cfg.global_batch_size = cfg.data.global_batch_size - - # Now load prompt learning model with frozen gpt model base - model = MegatronT5AdapterLearningModel.restore_from( - restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=adapter_tuning_cfg - ) - - # Or load regular GPT model - else: - raise NotImplementedError( - "This script is meant for inference from an Infused Adapter Tuned T5 Model, config should contain an adapter_model_file and a language_model_path" - ) - - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if trainer.strategy.launcher is not None: - trainer.strategy.launcher.launch(dummy, trainer=trainer) - trainer.strategy.setup_environment() - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - test_ds, test_dl = model.build_virtual_prompt_dataset( - dataset_paths=cfg.data.test_ds, - batch_size=cfg.data.global_batch_size, - for_train=False, - drop_last=False, - shuffle=False, - num_workers=cfg.data.num_workers, - pin_memory=True, - ) - - config = 
OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, test_dl) - print("***************************") - if cfg.pred_file_path is not None: - with open(cfg.pred_file_path, "w", encoding="utf-8") as f: - for batch in response: - for inp, pred in zip(batch['input_text'], batch['preds_text']): - inp = ' '.join(inp.split('\n')) - pred = ' '.join(pred.split('\n')) - f.write(f'{inp} {pred}\n') - print("predictions saved to {}".format(cfg.pred_file_path)) - else: - print(response) - print("***************************") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py deleted file mode 100644 index ebc6a0327b782c490823f4c14adf2066afab1695..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." 
- "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_t5_adapter_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronT5AdapterLearningModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronT5AdapterLearningModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py deleted file mode 100644 index cc9dfef059b849605039894d34bf159b9bfde6c7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import torch -import torch.multiprocessing as mp -from megatron.core import parallel_state -from omegaconf import OmegaConf -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated - -mp.set_start_method("spawn", force=True) - -""" -This is the script to run an Adapter Tuned GPT Model for text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference using a base gpt nemo file, and an adapter nemo file: - python megatron_gpt_ia3_eval.py \ - gpt_model_file=PATH TO GPT MODEL NEMO FILE \ - adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_ia3_tuning.py) \ - data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ - pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_t5_ia3_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronT5InfusedAdapterModel.restore_from( - restore_path=cfg.language_model_path, trainer=trainer, return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - - # Load an adapter model, must be provided in config - if cfg.get("adapter_model_file", None) is not None and cfg.get("language_model_path", None) is not None: - # Update frozen GPT model path in case it has changed - ia3_tuning_cfg = MegatronT5InfusedAdapterModel.restore_from( - cfg.adapter_model_file, trainer=trainer, return_config=True - ) - with open_dict(ia3_tuning_cfg): - 
ia3_tuning_cfg.language_model_path = cfg.language_model_path - ia3_tuning_cfg.pretrained_language_model_path = cfg.language_model_path - ia3_tuning_cfg.micro_batch_size = cfg.data.micro_batch_size - ia3_tuning_cfg.global_batch_size = cfg.data.global_batch_size - - # Now load prompt learning model with frozen gpt model base - model = MegatronT5InfusedAdapterModel.restore_from( - restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=ia3_tuning_cfg - ) - - # Or load regular GPT model - else: - raise NotImplementedError( - "This script is meant for inference from an Infused Adapter Tuned T5 Model, config should contain an adapter_model_file and a language_model_path" - ) - - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if trainer.strategy.launcher is not None: - trainer.strategy.launcher.launch(dummy, trainer=trainer) - trainer.strategy.setup_environment() - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - test_ds, test_dl = model.build_virtual_prompt_dataset( - dataset_paths=cfg.data.test_ds, - batch_size=cfg.data.global_batch_size, - for_train=False, - drop_last=False, - shuffle=False, - num_workers=cfg.data.num_workers, - pin_memory=True, - ) - - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, test_dl) - print("***************************") - if cfg.pred_file_path is not None: - with open(cfg.pred_file_path, "w", encoding="utf-8") as f: - for batch in response: - for inp, pred in zip(batch['input_text'], batch['preds_text']): - inp = ' '.join(inp.split('\n')) - pred = ' '.join(pred.split('\n')) - f.write(f'{inp} {pred}\n') - print("predictions saved to {}".format(cfg.pred_file_path)) - else: - print(response) - print("***************************") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py deleted file mode 100644 index fc508611c9b11cab16aa15d263c90b752baa36ae..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
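Note: the usage blocks in the eval scripts above (and in the LoRA eval script later in this diff) are copied from the GPT variants; they mention `megatron_gpt_ia3_eval.py` with flags such as `gpt_model_file` and `data_paths`, while the T5 code actually reads `language_model_path`, `adapter_model_file`, `data.test_ds` and `pred_file_path`. An invocation matching the keys these scripts read would look roughly like the following (paths are placeholders; the exact field names ultimately come from the inference config files, which are not part of this diff):

    python megatron_t5_ia3_eval.py \
        language_model_path=<PATH TO BASE T5 .nemo FILE> \
        adapter_model_file=<PATH TO TRAINED ADAPTER/IA3 .nemo FILE> \
        data.test_ds=[<PATH TO JSONL TEST FILE>] \
        data.global_batch_size=4 \
        data.micro_batch_size=4 \
        pred_file_path=<PATH TO OUTPUT PREDICTIONS FILE>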
- - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_t5_ia3_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams - with open_dict(cfg): - cfg.model.pretrained_language_model_path = cfg.model.language_model_path - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronT5InfusedAdapterModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronT5InfusedAdapterModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py deleted file mode 100644 index 38032d06a8c84dd718649b91738435aeec534959..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_eval.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
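The IA3 tuning script above copies `language_model_path` into `pretrained_language_model_path` by value because, as its comment notes, a `${...}` interpolation is lost once PTL saves the resolved hparams. A minimal, self-contained illustration of that pattern (the path is a placeholder):

```python
# Why the value is copied literally instead of using a "${model.language_model_path}"
# interpolation: once the config is resolved and re-saved, the reference is gone.
from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({"model": {"language_model_path": "/path/to/base_t5.nemo"}})
with open_dict(cfg):  # hydra configs are struct-mode, so adding a new key needs open_dict
    cfg.model.pretrained_language_model_path = cfg.model.language_model_path
print(OmegaConf.to_yaml(cfg.model))
```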
- - -import torch -import torch.multiprocessing as mp -from megatron.core import parallel_state -from omegaconf import OmegaConf -from omegaconf.omegaconf import open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5LoraModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils.app_state import AppState -from nemo.utils.decorators import deprecated - -mp.set_start_method("spawn", force=True) - -""" -This is the script to run an Adapter Tuned GPT Model for text generation. - -Usage: - Assume the model has TP=1, PP=1 in the following use cases. - a. run greedy inference using a base gpt nemo file, and an adapter nemo file: - python megatron_gpt_ia3_eval.py \ - gpt_model_file=PATH TO GPT MODEL NEMO FILE \ - adapter_model_file=PATH TO ADAPTER MODEL NEMO FILE (generated by training script: ./megatron_gpt_ia3_tuning.py) \ - data_paths=[PATH TO A JSONL FILE CONTAINING PROMPTS], \ - pred_file_path=PATH TO OUTPUT FILE TO DUMP PREDICTIONS -""" - -if not torch.cuda.is_available(): - raise EnvironmentError("GPU is needed for the inference") - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." -) -@hydra_runner(config_path="conf", config_name="megatron_t5_adapter_inference") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - model_config = MegatronT5LoraModel.restore_from( - restore_path=cfg.language_model_path, trainer=trainer, return_config=True, - ) - - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) - - app_state = AppState() - if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - - # Load an adapter model, must be provided in config - if cfg.get("adapter_model_file", None) is not None and cfg.get("language_model_path", None) is not None: - # Update frozen GPT model path in case it has changed - adapter_tuning_cfg = MegatronT5LoraModel.restore_from( - cfg.adapter_model_file, trainer=trainer, return_config=True - ) - with open_dict(adapter_tuning_cfg): - 
adapter_tuning_cfg.language_model_path = cfg.language_model_path - adapter_tuning_cfg.pretrained_language_model_path = cfg.language_model_path - adapter_tuning_cfg.micro_batch_size = cfg.data.micro_batch_size - adapter_tuning_cfg.global_batch_size = cfg.data.global_batch_size - - # Now load prompt learning model with frozen gpt model base - model = MegatronT5LoraModel.restore_from( - restore_path=cfg.adapter_model_file, trainer=trainer, override_config_path=adapter_tuning_cfg - ) - - # Or load regular GPT model - else: - raise NotImplementedError( - "This script is meant for inference from an Infused Adapter Tuned T5 Model, config should contain an adapter_model_file and a language_model_path" - ) - - # check whether the DDP is initialized - if parallel_state.is_unitialized(): - - def dummy(): - return - - if trainer.strategy.launcher is not None: - trainer.strategy.launcher.launch(dummy, trainer=trainer) - trainer.strategy.setup_environment() - - model.freeze() - - # Have to turn off activations_checkpoint_method for inference - try: - model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - try: - model.frozen_model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError: - pass - - test_ds, test_dl = model.build_virtual_prompt_dataset( - dataset_paths=cfg.data.test_ds, - batch_size=cfg.data.global_batch_size, - for_train=False, - drop_last=False, - shuffle=False, - num_workers=cfg.data.num_workers, - pin_memory=True, - ) - - config = OmegaConf.to_container(cfg.inference) - model.set_inference_config(config) - response = trainer.predict(model, test_dl) - print("***************************") - if cfg.pred_file_path is not None: - with open(cfg.pred_file_path, "w", encoding="utf-8") as f: - for batch in response: - for inp, pred in zip(batch['input_text'], batch['preds_text']): - inp = ' '.join(inp.split('\n')) - pred = ' '.join(pred.split('\n')) - f.write(f'{inp} {pred}\n') - print("predictions saved to {}".format(cfg.pred_file_path)) - else: - print(response) - print("***************************") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py deleted file mode 100644 index 73cfae76b7a5a19853cc95499fc387efa02faaf2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_lora_tuning.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
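The adapter and IA3 tuning scripts above (and the LoRA tuning script that follows) share the same precision setup verbatim. A condensed restatement with the intent spelled out; it mirrors that code rather than adding behaviour, and uses the same imports those scripts already declare:

```python
# Condensed restatement of the shared precision/plugin selection in the tuning scripts.
from nemo.collections.nlp.parts.nlp_overrides import (
    GradScaler,
    MegatronHalfPrecisionPlugin,
    PipelineMixedPrecisionPlugin,
)


def build_precision_plugins(cfg, megatron_amp_o2, with_distributed_adam):
    plugins = []
    if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
        scaler = None
        if cfg.trainer.precision in [16, '16', '16-mixed']:
            # A loss scaler is only needed for fp16; bf16 has enough dynamic range.
            scaler = GradScaler(
                init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
                growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
                hysteresis=cfg.model.get('hysteresis', 2),
            )
            plugin_precision = '16-mixed'  # PTL >= 2.0 precision naming
        else:
            plugin_precision = 'bf16-mixed'
        if megatron_amp_o2 and not with_distributed_adam:
            # O2-style mixed precision (megatron_amp_O2 in the configs) uses the Megatron
            # plugin unless the distributed optimizer already handles it.
            plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
        else:
            plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
    return plugins
```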
- - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment - -from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5LoraModel -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.decorators import deprecated -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 -""" - - -@deprecated( - explanation=f"{__file__} is deprecated. Please use MegatronT5SFTModel.add_adapter() for PEFT features." - "See updated scripts `megatron_t5_peft_tuning.py` and `megatron_t5_peft_eval.py` for examples." 
-) -@hydra_runner(config_path="conf", config_name="megatron_t5_lora_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()]) - exp_manager(trainer, cfg.exp_manager) - - # load existing or init new soft prompt GPT model - if cfg.model.get("restore_path", None): - model = MegatronT5LoraModel.restore_from( - cfg.model.restore_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() - ) - else: - model = MegatronT5LoraModel(cfg.model, trainer=trainer) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py deleted file mode 100644 index d9d4d075362bdf1d312ee5c70cc74ee1a71d56b2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import asyncio -import threading -from functools import partial - -import torch -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf - - -from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel -from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer -from nemo.collections.nlp.modules.common.text_generation_utils import generate -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.core.config import hydra_runner -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True -except (ImportError, ModuleNotFoundError): - HAVE_MEGATRON_CORE = False - -mp.set_start_method("spawn", force=True) -""" -This is the script to run inference with a PEFT model or an SFT Model. - -If you want to evaluate an SFT .nemo file: - -python examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py \ - model.restore_from_path= \ - model.peft.restore_from_path=null \ - trainer.devices=1 model.data.test_ds.file_names=\[, ] \ - model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier - model.data.test_ds.global_batch_size=4 \ # or some other value - model.data.test_ds.micro_batch_size=4 \ - model.data.test_ds.tokens_to_generate=30 \ - inference.greedy=True \ - inference.outfile_path=\'' - -If you want to evaluate a PEFT Model, you should provide a base T5 model and a PEFT model .nemo file - -python examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py \ - model.restore_from_path= \ - model.peft.restore_from_path= \ # this will be created if you use `megatron_t5_peft_tuning.py` - trainer.devices=1 model.data.test_ds.file_names=\[, ] \ - model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier - model.data.test_ds.global_batch_size=4 \ # or some other value - model.data.test_ds.micro_batch_size=4 \ - model.data.test_ds.tokens_to_generate=30 \ - inference.greedy=True \ - inference.outfile_path=\'' - -""" - - -def use_inference_server(cfg, model, trainer): - if not HAVE_MEGATRON_CORE: - raise ValueError('Megatron-core needs to be installed to use this feature!') - - from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo - - trainer.test(model, dataloaders=None) - - if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0: - if cfg.web_server: - if cfg.chat: - defaults = { - 'user': cfg.chatbot_config.user, - 'assistant': cfg.chatbot_config.assistant, - 'system': cfg.chatbot_config.system, - } - web_ui = partial( - get_chatbot_demo, - defaults=defaults, - value=cfg.chatbot_config.value, - attributes=cfg.chatbot_config.attributes, - ) - else: - web_ui = get_demo - loop = asyncio.new_event_loop() - thread = threading.Thread( - target=web_ui, daemon=True, args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop), - ) - thread.start() - server = MegatronServer(model.cuda()) - server.run("0.0.0.0", port=cfg.port) - - while True: - choice = torch.cuda.LongTensor(1) - torch.distributed.broadcast(choice, 0) - if choice[0].item() == 0: - generate(model.cuda()) - - -@hydra_runner(config_path="conf", config_name="megatron_t5_peft_eval_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f"\n{OmegaConf.to_yaml(cfg)}") - 
trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - - model_cfg = MegatronT5SFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg) - model = MegatronT5SFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - - model.load_adapters(cfg.model.peft.restore_from_path) - - model.freeze() - logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}") - - if not cfg.model.get('use_flash_attention', False): - cfg.inference.compute_attention_mask = True - - if not cfg.server: - trainer.test(model) - else: - use_inference_server(cfg, model, trainer) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py b/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py deleted file mode 100644 index 04e25956aed20ce8c7b5fabe3a9191dcd626a202..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf - -from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - -""" -This is the script to finetuning a T5 Model with any PEFT method. -A base T5 Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base T5 Model weights will remain frozen. - -This script is exactly the same as the peft tuning script for GPT. For more details -please refer to the GPT script and docs. -""" - - -@hydra_runner(config_path="conf", config_name="megatron_t5_peft_tuning_config") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - - model_cfg = MegatronT5SFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - model = MegatronT5SFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) - peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] - - if cfg.model.peft.restore_from_path is not None: - # initialize peft weights from a checkpoint instead of randomly - # This is not the same as resume training because optimizer states are not restored. 
- logging.info("PEFT Weights will be loaded from", cfg.model.peft.restore_from_path) - model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg)) - elif peft_cfg_cls is not None: - logging.info("Adding adapter weights to the model for PEFT") - model.add_adapter(peft_cfg_cls(model_cfg)) - else: - logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}") - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base.yaml deleted file mode 100644 index 39f500da70bda66ced7f233139433ed50858d16d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base.yaml +++ /dev/null @@ -1,159 +0,0 @@ -name: AttentionIsAllYouNeed -do_training: True # set to False if only preprocessing data -do_testing: False # set to True to run evaluation on test data after training - -model: - beam_size: 4 - len_pen: 0.6 - multilingual: False - max_generation_delta: -1 - label_smoothing: 0.1 - shared_tokenizer: True # train tokenizer model across src and tgt train data - preproc_out_dir: null # path to store data preprocessing outputs - src_language: 'en' - tgt_language: 'de' - shared_embeddings: false - - train_ds: - src_file_name: null - tgt_file_name: null - use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically) - # config for preprocessing training data and creating a tarred datset automatically - tar_file_prefix: parallel # prefix for tar file names - tar_files: null # if data has already been preprocessed (rest of config ignored) - metadata_file: null # metadata for tarred dataset - lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding - num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile - tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled - shard_strategy: scatter # tarred dataset shard distribution strategy - n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2) - tokens_in_batch: 512 - clean: true - max_seq_length: 512 - shuffle: true - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - concat_sampling_technique: temperature # only used with ConcatTranslationDataset - concat_sampling_temperature: 5 # only used with ConcatTranslationDataset - concat_sampling_probabilities: null # only used with ConcatTranslationDataset - - validation_ds: - src_file_name: ??? - tgt_file_name: ??? - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - src_file_name: ??? - tgt_file_name: ??? 
- tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: adam - lr: 0.001 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - sched: - name: InverseSquareRootAnnealing - min_lr: 0.0 - last_epoch: -1 - warmup_ratio: 0.1 - - encoder_tokenizer: - library: yttm - tokenizer_model: null - vocab_size: null # vocab size for training bpe - bpe_dropout: null - vocab_file: null - special_tokens: null - training_sample_size: null # valid for sentencepiece tokenizer - r2l: false - - decoder_tokenizer: - library: yttm - tokenizer_model: null - vocab_size: null # vocab size for training bpe - bpe_dropout: null - vocab_file: null - special_tokens: null - training_sample_size: null # valid for sentencepiece tokenizer - r2l: false - - encoder: - library: nemo - model_name: null - pretrained: false - max_sequence_length: 512 - num_token_types: 0 - embedding_dropout: 0.1 - learn_positional_encodings: false - hidden_size: 512 - num_layers: 6 - inner_size: 2048 - num_attention_heads: 8 - ffn_dropout: 0.1 - attn_score_dropout: 0.1 - attn_layer_dropout: 0.1 - hidden_act: relu - mask_future: false - pre_ln: false - pre_ln_final_layer_norm: true - - decoder: - library: nemo - model_name: null - pretrained: false - max_sequence_length: 512 - num_token_types: 0 - embedding_dropout: 0.1 - learn_positional_encodings: false - hidden_size: 512 - inner_size: 2048 - num_layers: 6 - num_attention_heads: 8 - ffn_dropout: 0.1 - attn_score_dropout: 0.1 - attn_layer_dropout: 0.1 - hidden_act: relu - pre_ln: false - pre_ln_final_layer_norm: true - - head: - num_layers: 1 - activation: relu - log_softmax: true - dropout: 0.0 - use_transformer_init: true - -trainer: - devices: 4 - num_nodes: 1 - max_epochs: 200 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False - logger: False - log_every_n_steps: 50 # Interval of logging. - check_val_every_n_epoch: 1 - benchmark: False - -exp_manager: - name: AAYNBase - files_to_copy: [] diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base_megatron.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base_megatron.yaml deleted file mode 100644 index 3cd434e195588aa42365c0f0a5897f82d345c6af..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_base_megatron.yaml +++ /dev/null @@ -1,179 +0,0 @@ -defaults: - - ../../language_modeling/conf@model.encoder: megatron_model_base_config - - ../../language_modeling/conf@model.decoder: megatron_model_base_config - -name: megatron_nmt -restore_from_path: null # used when starting from a .nemo file - -trainer: - devices: 1 - num_nodes: 1 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. 
- max_steps: 400000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: null - exp_dir: null - name: ${name} - create_wandb_logger: False - wandb_logger_kwargs: - project: null - name: null - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 10 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - -model: - # NMT Params - multilingual: False - label_smoothing: 0.1 # TODO: Implement this. - shared_tokenizer: True # train tokenizer model across src and tgt train data - preproc_out_dir: null # path to store data preprocessing outputs - src_language: 'en' - tgt_language: 'de' - max_generation_delta: 20 # Maximum decoder sequence length is encoder sequence length + this parameter. - pretrained_model_path: null # Path to a pretrained model - pretrained_model_type: T5 - - # model parallelism - micro_batch_size: 32 - global_batch_size: 512 # will use more micro batches to reach global batch size - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. - resume_from_checkpoint: null # manually set the checkpoint file to load from - - # model architecture - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. - grad_allreduce_chunk_size_mb: 125 - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - seq_length: 512 - max_position_embeddings: ${.seq_length} - - # weight init - embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') - - # embedding dropout - embedding_dropout: 0.1 - - # embedding sharing - share_token_embeddings: True # If True share encoder/decoder embeddings - share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits - - # token head - tokens_head_bias: True - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # miscellaneous - seed: 1234 - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - - train_ds: - src_file_name: null - tgt_file_name: null - dataset_type: 'text_memmap' # Options ['bin_memmap', 'text_memmap'] - sampler: 'megatron' # Options ['megatron']. Note megatron samplers do not shuffle across epochs. 
- objective: 'nmt' # Options ['nmt', 'nmt-xlm'] - # NOTE: These ratios are used only when the objective is `nmt-xlm` - sampling_ratios: - x-masking: 0.17 # Extreme span masking task selection probability (either large spans or large masking prob) - r-masking: 0.17 # T5-style random span masking task selection probability - s-masking: 0.16 # Prefix-LM task selection probability - nmt: 0.5 # NMT selection probability - micro_batch_size: ${model.micro_batch_size} - global_batch_size: ${model.global_batch_size} - # config for preprocessing training data and creating a tarred datset automatically - max_seq_length: 512 - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - concat_sampling_probabilities: null # only used with ConcatTranslationDataset - - validation_ds: - src_file_name: ??? - tgt_file_name: ??? - dataset_type: 'text' # Options: ['text']. Validation data needs to be raw text. - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - src_file_name: ??? - tgt_file_name: ??? - dataset_type: 'text' # Options: ['text']. Validation data needs to be raw text. - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: fused_adam - lr: 0.0004 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - sched: - name: InverseSquareRootAnnealing - min_lr: 0.0 - last_epoch: -1 - warmup_ratio: 0.1 - - encoder_tokenizer: - library: yttm - model: null - vocab_size: null # vocab size for training bpe - bpe_dropout: null - vocab_file: null - special_tokens: null - training_sample_size: null # valid for sentencepiece tokenizer - r2l: false - sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - num_sentinel_tokens: 0 - - decoder_tokenizer: - library: yttm - model: null - vocab_size: null # vocab size for training bpe - bpe_dropout: null - vocab_file: null - special_tokens: null - training_sample_size: null # valid for sentencepiece tokenizer - r2l: false - sentencepiece_legacy: True - num_sentinel_tokens: 0 diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_bottleneck.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_bottleneck.yaml deleted file mode 100644 index 463ee06277bb21e538b6e3a0703b478fb6e9a9a4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_bottleneck.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# The bottleneck architecture supports three learning framework (i.e., losses) -# via model.model_type: -# 1) nll - Conditional cross entropy (the usual NMT loss) -# 2) mim - MIM learning framework. A latent variable model with good -# reconstruction and compressed latent representation. -# https://arxiv.org/pdf/2003.02645.pdf -# 3) vae - VAE learning framework. A latent variable model which learns -# good probability estimation over observations and -# a regularized latent representation. 
-# https://arxiv.org/pdf/1312.6114.pdf -# The bottleneck architecture supports three encoder architectures via -# model.encoder.arch: -# 1) seq2seq - the usual NMT model without bottleneck -# 2) bridge - a bottleneck which projects the encoder output to a fixed -# number of steps using attention bridge (https://arxiv.org/pdf/1703.03130.pdf) -# 3) perceiver - a bottleneck by projecting inputs to a fixed -# number of steps using perceiver architecture (https://arxiv.org/pdf/2103.03206.pdf) -# 4) max_pool / avg_pool - a reduction by halving the number of steps at the end of every hidden block. -# reduction is using max pooling or average pooling. - -defaults: - - aayn_base - -name: AttentionIsAllYouNeedBottleneck - -do_training: True # set to False if only preprocessing data -do_testing: False # set to True to run evaluation on test data after training - -model: - model_type: 'nll' # learning (i.e., loss) type: nll (i.e., cross-entropy/auto-encoder), mim, vae (see description above) - min_logv: -6 # minimal allowed log variance for mim - latent_size: -1 # dimension of latent (projected from hidden) -1 will take value of hidden size - non_recon_warmup_batches: 200000 # warm-up steps for mim, and vae losses - recon_per_token: true # when false reconstruction is computed per sample, not per token - - encoder: - mask_future: false - arch: perceiver # avg_pool, max_pool, seq2seq, bridge, perceiver (see description above) - hidden_steps: 32 # fixed number of hidden steps - hidden_blocks: 1 # number of repeat blocks (see classes for description) - hidden_init_method: default # see classes for available values - - decoder: - arch: seq2seq # currently only seq2seq is supported - -exp_manager: - name: AAYNBottleneck diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_finetune.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_finetune.yaml deleted file mode 100644 index f62e2a629464a5f6725213f5e160a7668a27fe27..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/aayn_finetune.yaml +++ /dev/null @@ -1,77 +0,0 @@ -name: AttentionIsAllYouNeedFinetune -do_training: True # set to False if only preprocessing data -do_testing: False # set to True to run evaluation on test data after training -model_path: ??? 
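The comment block at the top of `aayn_bottleneck.yaml` above lists the supported learning frameworks (`nll`, `mim`, `vae`) and encoder bottleneck architectures (`seq2seq`, `bridge`, `perceiver`, `max_pool`/`avg_pool`). As a quick orientation, the relevant keys can be sketched standalone with OmegaConf; the values below are placeholders drawn from the options listed in that file, not recommended settings:

```python
# Illustrative only: the bottleneck-related keys from aayn_bottleneck.yaml.
from omegaconf import OmegaConf

bottleneck_overrides = OmegaConf.create(
    {
        "model": {
            "model_type": "mim",             # one of: nll, mim, vae
            "encoder": {
                "arch": "perceiver",         # seq2seq, bridge, perceiver, max_pool, avg_pool
                "hidden_steps": 32,          # fixed number of hidden steps (bridge/perceiver)
                "hidden_blocks": 1,
            },
            "decoder": {"arch": "seq2seq"},  # only seq2seq decoders are supported
        }
    }
)
print(OmegaConf.to_yaml(bottleneck_overrides))
```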
- -model: - train_ds: - src_file_name: null - tgt_file_name: null - use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically) - # config for preprocessing training data and creating a tarred datset automatically - tar_file_prefix: parallel # prefix for tar file names - tar_files: null # if data has already been preprocessed (rest of config ignored) - metadata_file: null # metadata for tarred dataset - lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding - num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile - tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled - shard_strategy: scatter # tarred dataset shard distribution strategy - n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2) - tokens_in_batch: 512 - clean: true - max_seq_length: 512 - shuffle: true - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - concat_sampling_technique: temperature # only used with ConcatTranslationDataset - concat_sampling_temperature: 5 # only used with ConcatTranslationDataset - concat_sampling_probabilities: null # only used with ConcatTranslationDataset - - validation_ds: - src_file_name: ??? - tgt_file_name: ??? - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - src_file_name: ??? - tgt_file_name: ??? - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: adam - lr: 0.00002 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - -trainer: - devices: 4 - num_nodes: 1 - max_epochs: 200 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False - logger: False - log_every_n_steps: 50 # Interval of logging. 
- check_val_every_n_epoch: 1 - -exp_manager: - name: AAYNBaseFineTune - files_to_copy: [] diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/huggingface.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/huggingface.yaml deleted file mode 100644 index f1874a93d05525a400313a3cf8439a7c73fa71ff..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/huggingface.yaml +++ /dev/null @@ -1,132 +0,0 @@ -name: HuggingFaceEncoder -do_training: True # set to False if only preprocessing data -do_testing: False # set to True to run evaluation on test data after training - -model: - beam_size: 4 - len_pen: 0.6 - max_generation_delta: -1 - label_smoothing: 0.1 - shared_tokenizer: false - preproc_out_dir: null - src_language: 'en' - tgt_language: 'de' - - train_ds: - src_file_name: null - tgt_file_name: null - use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically) - # config for preprocessing training data and creating a tarred datset automatically - tar_file_prefix: parallel # prefix for tar file names - tar_files: null # if data has already been preprocessed (rest of config ignored) - metadata_file: null # metadata for tarred dataset - lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding - num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile - tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled - shard_strategy: scatter # tarred dataset shard distribution strategy - n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2) - tokens_in_batch: 512 - clean: true - max_seq_length: 512 - shuffle: true - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - validation_ds: - src_file_name: null - tgt_file_name: null - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - src_file_name: null - tgt_file_name: null - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: adam - lr: 0.001 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - sched: - name: InverseSquareRootAnnealing - min_lr: 0.0 - last_epoch: -1 - warmup_ratio: 0.1 - - encoder_tokenizer: - library: huggingface - tokenizer_model: null - vocab_file: null - special_tokens: null - vocab_size: null - - decoder_tokenizer: - library: yttm - tokenizer_model: null - vocab_file: null - special_tokens: null - vocab_size: null - - encoder: - library: huggingface - model_name: bert-base-uncased - pretrained: false - - decoder: - library: nemo - model_name: null - pretrained: false - max_sequence_length: 512 - num_token_types: 2 - embedding_dropout: 0.1 - learn_positional_encodings: false - hidden_size: 512 - inner_size: 2048 - num_layers: 6 - num_attention_heads: 8 - ffn_dropout: 0.1 - attn_score_dropout: 0.1 - attn_layer_dropout: 0.1 - hidden_act: relu - pre_ln: false - - head: - num_layers: 1 - activation: relu - log_softmax: true - dropout: 0.0 - use_transformer_init: true - -trainer: - devices: 4 - num_nodes: 1 - max_epochs: 200 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False - logger: False - log_every_n_steps: 50 # Interval of logging. 
- check_val_every_n_epoch: 1 - benchmark: False - -exp_manager: - name: HuggingFaceEncoder - files_to_copy: [] diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/megatron.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/megatron.yaml deleted file mode 100644 index abb5c39d485215483c1fc81fe1238eb73ba92fcb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/megatron.yaml +++ /dev/null @@ -1,160 +0,0 @@ -name: MegatronEncoder -do_training: True # set to False if only preprocessing data -do_testing: False # set to True to run evaluation on test data after training - -model: - beam_size: 4 - len_pen: 0.6 - max_generation_delta: -1 - label_smoothing: 0.1 - shared_tokenizer: false - preproc_out_dir: null - src_language: 'en' - tgt_language: 'de' - - train_ds: - src_file_name: null - tgt_file_name: null - use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically) - # config for preprocessing training data and creating a tarred datset automatically - tar_file_prefix: parallel # prefix for tar file names - tar_files: null # if data has already been preprocessed (rest of config ignored) - metadata_file: null # metadata for tarred dataset - lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding - num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile - tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled - shard_strategy: scatter # tarred dataset shard distribution strategy - n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2) - tokens_in_batch: 512 - clean: true - max_seq_length: 512 - shuffle: true - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - validation_ds: - src_file_name: null - tgt_file_name: null - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - test_ds: - src_file_name: null - tgt_file_name: null - tokens_in_batch: 512 - clean: false - max_seq_length: 512 - shuffle: false - num_samples: -1 - drop_last: false - pin_memory: false - num_workers: 8 - - optim: - name: adam - lr: 0.001 - betas: - - 0.9 - - 0.98 - weight_decay: 0.0 - sched: - name: InverseSquareRootAnnealing - min_lr: 0.0 - last_epoch: -1 - warmup_ratio: 0.1 - - encoder_tokenizer: - library: megatron - tokenizer_model: null - vocab_file: null - special_tokens: null - vocab_size: null - model_name: null - - decoder_tokenizer: - library: yttm - tokenizer_model: null - vocab_file: null - special_tokens: null - vocab_size: null - - encoder: - library: megatron - - # If using a pretrained megatron bert model from NGC, then use the corresponding model name - # For example, 'megatron-bert-345m-uncased'. - # If restoring from a local checkpoint, then use either 'megatron-bert-uncased' or 'megatron-bert-cased' - model_name: megatron-bert-uncased # or megatron-bert-cased - - # If restoring from a model parallel checkpoint, then checkpoint_file should be a path to - # the directory containing the megatron-lm checkpoints. 
The directory will have the structure: - - # /path/to/my/checkpoint/ - # ├── mp_rank_00 - # │ └── model_optim_rng.pt - # └── mp_rank_01 - # └── model_optim_rng.pt - - # If not using a model parallel checkpoint, then use the full path to the checkpoint: - - # /path/to/my/checkpoint/model_optim_rng.pt - checkpoint_file: null - vocab_file : null - - pretrained: true # only pretrained=true supported for now - - # model architecture configuration - hidden_size: 1024 - num_attention_heads: 16 - num_layers: 24 - max_position_embeddings: 512 - num_tokentypes: 0 - - decoder: - library: nemo - model_name: null - pretrained: false - max_sequence_length: 512 - num_token_types: 2 - embedding_dropout: 0.1 - learn_positional_encodings: false - hidden_size: 512 - inner_size: 2048 - num_layers: 6 - num_attention_heads: 8 - ffn_dropout: 0.1 - attn_score_dropout: 0.1 - attn_layer_dropout: 0.1 - hidden_act: relu - pre_ln: false - - head: - num_layers: 1 - activation: relu - log_softmax: true - dropout: 0.0 - use_transformer_init: true - -trainer: - devices: 4 - num_nodes: 1 - max_epochs: 200 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False - logger: False - log_every_n_steps: 50 # Interval of logging. - check_val_every_n_epoch: 1 - -exp_manager: - name: ${name} - files_to_copy: [] \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/nmt_megatron_infer.yaml b/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/nmt_megatron_infer.yaml deleted file mode 100644 index 6ca11153d39ffae66b6ca0ddf07bf48af7579165..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/conf/nmt_megatron_infer.yaml +++ /dev/null @@ -1,16 +0,0 @@ -trainer: - devices: 1 - num_nodes: 1 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - -model_file: ??? -checkpoint_dir: null -srctext: ??? -tgtout: ??? -batch_size: 128 -source_lang: null -target_lang: null -tensor_model_parallel_size: 1 -pipeline_model_parallel_size: 1 -pipeline_model_parallel_split_rank: 0 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_monolingual_dataset.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_monolingual_dataset.py deleted file mode 100644 index d8b93f5bd7ce036283086eaabf626c506b9885d2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_monolingual_dataset.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
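For reference, the monolingual preprocessing script below is driven entirely by command-line flags; a hypothetical invocation (placeholder paths, using only flags defined in its argument parser) might look like:

```
# Illustrative sketch only: build a tarred monolingual dataset from a raw text file.
# All paths are placeholders; flag names are taken from the parser defined below.
python create_tarred_monolingual_dataset.py \
    --tokenizer_name yttm \
    --tokenizer_model /path/to/tokenizer.BPE.8192.model \
    --fname /path/to/train.mono.en \
    --out_dir /path/to/tarred_mono_en \
    --tokens_in_batch 16000 \
    --lines_per_dataset_fragment 1000000 \
    --num_batches_per_tarfile 1000 \
    --clean
```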
-
-import argparse
-import os
-
-from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='NMT dataset pre-processing')
-    parser.add_argument(
-        '--tokenizer_name', type=str, default='yttm', help='Supports yttm, sentencepiece and HuggingFace tokenizers',
-    )
-    parser.add_argument('--tokenizer_model', type=str, default=None, help='Path to tokenizer model')
-    parser.add_argument('--bpe_dropout', type=float, default=0.0, help='BPE dropout to use')
-    parser.add_argument('--clean', action="store_true", help='Whether to clean dataset based on length diff')
-    parser.add_argument('--pkl_file_prefix', type=str, default='parallel', help='Prefix for tar and pickle files')
-    parser.add_argument('--fname', type=str, required=True, help='Path to monolingual data file')
-    parser.add_argument('--out_dir', type=str, required=True, help='Path to store dataloader and tokenizer models')
-    parser.add_argument('--max_seq_length', type=int, default=512, help='Max Sequence Length')
-    parser.add_argument('--min_seq_length', type=int, default=1, help='Min Sequence Length')
-    parser.add_argument('--tokens_in_batch', type=int, default=16000, help='# Tokens per batch per GPU')
-    parser.add_argument(
-        '--lines_per_dataset_fragment',
-        type=int,
-        default=1000000,
-        help='Number of lines to consider for bucketing and padding',
-    )
-    parser.add_argument(
-        '--num_batches_per_tarfile',
-        type=int,
-        default=1000,
-        help='Number of batches (pickle files) within each tarfile',
-    )
-
-    args = parser.parse_args()
-    if not os.path.exists(args.out_dir):
-        os.mkdir(args.out_dir)
-    if args.tokenizer_name in ["yttm", "sentencepiece"] and not os.path.exists(args.tokenizer_model):
-        raise FileNotFoundError("Could not find tokenizer model %s" % (args.tokenizer_model))
-
-    tokenizer_model = MTDataPreproc.get_monolingual_tokenizer(
-        tokenizer_name=args.tokenizer_name, tokenizer_model=args.tokenizer_model, bpe_dropout=args.bpe_dropout
-    )
-
-    MTDataPreproc.preprocess_monolingual_dataset(
-        clean=args.clean,
-        fname=args.fname,
-        out_dir=args.out_dir,
-        tokenizer=tokenizer_model,
-        max_seq_length=args.max_seq_length,
-        min_seq_length=args.min_seq_length,
-        tokens_in_batch=args.tokens_in_batch,
-        lines_per_dataset_fragment=args.lines_per_dataset_fragment,
-        num_batches_per_tarfile=args.num_batches_per_tarfile,
-        pkl_file_prefix=args.pkl_file_prefix,
-        global_rank=0,
-        world_size=1,
-    )
diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_parallel_dataset.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_parallel_dataset.py
deleted file mode 100644
index ce61cc81dafef3efc980462aa42337bbe87da6e1..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/create_tarred_parallel_dataset.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
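Likewise, a hypothetical invocation of the parallel-data script below, training shared YTTM tokenizers from scratch (placeholder paths; flags taken from its argument parser):

```
# Illustrative sketch only: train tokenizers and build a tarred parallel dataset.
# All paths are placeholders; flag names are taken from the parser defined below.
python create_tarred_parallel_dataset.py \
    --shared_tokenizer \
    --clean \
    --src_fname /path/to/train.de \
    --tgt_fname /path/to/train.en \
    --out_dir /path/to/tarred_de_en \
    --encoder_tokenizer_name yttm \
    --encoder_tokenizer_vocab_size 32000 \
    --decoder_tokenizer_name yttm \
    --decoder_tokenizer_vocab_size 32000 \
    --max_seq_length 512 \
    --tokens_in_batch 16000
```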
- -import argparse -import os - -from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='NMT dataset pre-processing') - parser.add_argument('--shared_tokenizer', action="store_true", help='Whether to share encoder/decoder tokenizers') - parser.add_argument('--clean', action="store_true", help='Whether to clean dataset based on length diff') - parser.add_argument('--tar_file_prefix', type=str, default='parallel', help='Prefix for tar files') - parser.add_argument('--src_fname', type=str, required=True, help='Path to the source file') - parser.add_argument('--tgt_fname', type=str, required=True, help='Path to the target file') - parser.add_argument('--out_dir', type=str, required=True, help='Path to store dataloader and tokenizer models') - parser.add_argument('--encoder_model_name', type=str, default=None, help='For use with pretrained encoders') - parser.add_argument( - '--decoder_model_name', type=str, default=None, help='For use with pretrained decoders (not yet supported)' - ) - parser.add_argument( - '--encoder_tokenizer_model', type=str, default='None', help='Path to pre-trained encoder tokenizer model' - ) - parser.add_argument( - '--encoder_tokenizer_name', - type=str, - default='yttm', - help='Encoder BPE Tokenizer Name, Options: [yttm, sentencepiece]', - ) - parser.add_argument('--encoder_tokenizer_vocab_size', type=int, default=32000, help='Encoder Vocab size after BPE') - parser.add_argument( - '--encoder_tokenizer_coverage', type=float, default=0.999, help='Encoder Character coverage for BPE' - ) - parser.add_argument('--encoder_tokenizer_bpe_dropout', type=float, default=0.1, help='Encoder BPE dropout prob') - parser.add_argument( - '--encoder_tokenizer_r2l', action="store_true", help='Whether to return encoded sequence from right to left' - ) - parser.add_argument( - '--encoder_tokenizer_legacy', - action="store_true", - help='Whether to use legacy tokenizer implementation of sentencepiece', - ) - parser.add_argument( - '--decoder_tokenizer_model', type=str, default='None', help='Path to pre-trained decoder tokenizer model' - ) - parser.add_argument( - '--decoder_tokenizer_name', - type=str, - default='yttm', - help='Encoder BPE Tokenizer Name, Options: [yttm, sentencepiece]', - ) - parser.add_argument('--decoder_tokenizer_vocab_size', type=int, default=32000, help='Encoder Vocab size after BPE') - parser.add_argument( - '--decoder_tokenizer_coverage', type=float, default=0.999, help='Encoder Character coverage for BPE' - ) - parser.add_argument('--decoder_tokenizer_bpe_dropout', type=float, default=0.1, help='Encoder BPE dropout prob') - parser.add_argument( - '--decoder_tokenizer_r2l', action="store_true", help='Whether to return encoded sequence from right to left' - ) - parser.add_argument( - '--decoder_tokenizer_legacy', - action="store_true", - help='Whether to use legacy tokenizer implementation of sentencepiece', - ) - parser.add_argument('--max_seq_length', type=int, default=512, help='Max Sequence Length') - parser.add_argument('--min_seq_length', type=int, default=1, help='Min Sequence Length') - parser.add_argument('--tokens_in_batch', type=int, default=16000, help='# Tokens per batch per GPU') - parser.add_argument('--coverage', type=float, default=0.999, help='BPE character coverage [0-1]') - parser.add_argument( - '--lines_per_dataset_fragment', - type=int, - default=1000000, - help='Number of lines to consider for bucketing and padding', - ) - 
parser.add_argument( - '--num_batches_per_tarfile', - type=int, - default=1000, - help='Number of batches (pickle files) within each tarfile', - ) - parser.add_argument( - '--n_preproc_jobs', type=int, default=-2, help='Number of processes to use for creating the tarred dataset.', - ) - parser.add_argument( - '--byte_fallback', - action="store_true", - help='Whether to use byte fallback with sentencepiece for BPE tokenization.', - ) - parser.add_argument( - '--split_digits', action="store_true", help='Whether to split digits while tokenizing with sentencepiece.' - ) - parser.add_argument( - '--no_split_by_whitespace', - action="store_true", - help='If True, this will not respect whitepsaces while learning BPE merges.', - ) - args = parser.parse_args() - if not os.path.exists(args.out_dir): - os.mkdir(args.out_dir) - - if ( - args.encoder_tokenizer_model != 'None' - and args.decoder_tokenizer_model == 'None' - or args.decoder_tokenizer_model != 'None' - and args.encoder_tokenizer_model == 'None' - ): - if args.shared_tokenizer: - raise ValueError( - ''' - If using a pre-trained shared tokenizer, - both encoder and decoder tokenizers must be the same - ''' - ) - else: - raise ValueError('Both encoder and decoder pre-trained tokenizer models must be specified') - - if args.encoder_tokenizer_model == 'None' and args.decoder_tokenizer_model == 'None': - encoder_tokenizer_model, decoder_tokenizer_model = MTDataPreproc.train_tokenizers( - out_dir=args.out_dir, - src_fname=args.src_fname, - tgt_fname=args.tgt_fname, - shared_tokenizer=args.shared_tokenizer, - encoder_tokenizer_name=args.encoder_tokenizer_name, - encoder_tokenizer_vocab_size=args.encoder_tokenizer_vocab_size, - encoder_tokenizer_coverage=args.encoder_tokenizer_coverage, - decoder_tokenizer_name=args.decoder_tokenizer_name, - decoder_tokenizer_vocab_size=args.decoder_tokenizer_vocab_size, - decoder_tokenizer_coverage=args.decoder_tokenizer_coverage, - global_rank=0, - byte_fallback=args.byte_fallback, - split_digits=args.split_digits, - split_by_whitespace=not args.no_split_by_whitespace, - ) - else: - encoder_tokenizer_model, decoder_tokenizer_model = args.encoder_tokenizer_model, args.decoder_tokenizer_model - - encoder_tokenizer, decoder_tokenizer = MTDataPreproc.get_enc_dec_tokenizers( - encoder_tokenizer_name=args.encoder_tokenizer_name, - encoder_tokenizer_model=encoder_tokenizer_model, - encoder_bpe_dropout=args.encoder_tokenizer_bpe_dropout, - encoder_r2l=args.encoder_tokenizer_r2l, - decoder_tokenizer_name=args.decoder_tokenizer_name, - decoder_tokenizer_model=decoder_tokenizer_model, - decoder_bpe_dropout=args.decoder_tokenizer_bpe_dropout, - decoder_r2l=args.decoder_tokenizer_r2l, - encoder_tokenizer_legacy=args.encoder_tokenizer_legacy, - decoder_tokenizer_legacy=args.decoder_tokenizer_legacy, - ) - - _, _ = MTDataPreproc.preprocess_parallel_dataset( - clean=args.clean, - src_fname=args.src_fname, - tgt_fname=args.tgt_fname, - out_dir=args.out_dir, - encoder_tokenizer_name=args.encoder_tokenizer_name, - encoder_model_name=args.encoder_model_name, - encoder_tokenizer_model=encoder_tokenizer_model, - encoder_bpe_dropout=args.encoder_tokenizer_bpe_dropout, - encoder_tokenizer_r2l=args.encoder_tokenizer_r2l, - decoder_tokenizer_name=args.decoder_tokenizer_name, - decoder_model_name=args.decoder_model_name, - decoder_tokenizer_model=decoder_tokenizer_model, - decoder_tokenizer_r2l=args.decoder_tokenizer_r2l, - decoder_bpe_dropout=args.decoder_tokenizer_bpe_dropout, - max_seq_length=args.max_seq_length, - 
min_seq_length=args.min_seq_length, - tokens_in_batch=args.tokens_in_batch, - lines_per_dataset_fragment=args.lines_per_dataset_fragment, - num_batches_per_tarfile=args.num_batches_per_tarfile, - tar_file_prefix=args.tar_file_prefix, - global_rank=0, - world_size=1, - n_jobs=args.n_preproc_jobs, - encoder_tokenizer_legacy=args.encoder_tokenizer_legacy, - decoder_tokenizer_legacy=args.decoder_tokenizer_legacy, - ) diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py deleted file mode 100644 index 87898da156434f34dab0a436aada9242a2235bdf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional - -from omegaconf import OmegaConf -from pytorch_lightning import Trainer - -from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_bottleneck_model import MTBottleneckModel -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTBottleneckModelConfig -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.core.config.modelPT import NemoConfig -from nemo.core.config.pytorch_lightning import TrainerConfig -from nemo.utils import logging -from nemo.utils.config_utils import update_model_config -from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - - -""" -Usage: - 1. If you need to start docker and install NeMo, otherwise skip this step: - - a. ```docker run --gpus all -it --rm -v /home/okuchaiev/repos/NeMo/:/NeMo -p 6006:6006 -v /mnt:/mnt --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:20.11-py3``` - b. ```cd /NeMo``` - c. ```./reinstall.sh``` - - 2. Train a new tokenizer (or use pre-trained one): - ```yttm bpe --data /mnt/D1/Data/NMT/wmt16_de_en/train.clean.en-de.shuffled.common --model tokenizer.BPE.8192.model --vocab_size 8192``` - -(To use WANDB, optionally, do login first) -``wandb login [YOUR WANDB login]`` - - 3. 
Start training: - - - (This example for "base" model on 2 GPUs for 150000 steps with batch size of 12500 tokens per GPU) - - python enc_dec_nmt-bottleneck.py \ - --config-path=conf \ - --config-name=aayn_bottleneck \ - trainer.devices=[0,1] \ - ~trainer.max_epochs \ - +trainer.max_steps=150000 \ - model.beam_size=4 \ - model.max_generation_delta=256 \ - model.label_smoothing=0.1 \ - model.model_type=nll \ - model.non_recon_warmup_batches=7500 \ - model.encoder_tokenizer.tokenizer_model=tokenizer.BPE.8192.model \ - model.decoder_tokenizer.tokenizer_model=tokenizer.BPE.8192.model \ - model.encoder.arch=perceiver \ - model.encoder.hidden_steps=32 \ - model.encoder.hidden_blocks=2 \ - model.encoder.hidden_init_method=bridge \ - model.encoder.num_layers=6 \ - model.encoder.hidden_size=512 \ - model.encoder.inner_size=2048 \ - model.encoder.num_attention_heads=8 \ - model.encoder.ffn_dropout=0.1 \ - model.decoder.num_layers=6 \ - model.decoder.hidden_size=512 \ - model.decoder.inner_size=2048 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_dropout=0.1 \ - model.train_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/train.clean.de.shuffled \ - model.train_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/train.clean.en.shuffled \ - model.train_ds.tokens_in_batch=12500 \ - model.validation_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.ref \ - model.validation_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.src \ - model.validation_ds.tokens_in_batch=8192 \ - model.test_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.ref \ - model.test_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.src \ - model.optim.lr=0.001 \ - model.optim.sched.warmup_ratio=0.05 \ - +exp_manager.create_wandb_logger=True \ - +exp_manager.wandb_logger_kwargs.name=TEST-nmt-base \ - +exp_manager.wandb_logger_kwargs.project=nmt-de-en \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.exp_dir=nmt_base \ - +exp_manager.checkpoint_callback_params.mode=max -""" - - -@dataclass -class MTBottleneckConfig(NemoConfig): - name: Optional[str] = 'MTBottleneck' - do_training: bool = True - do_testing: bool = False - model: MTBottleneckModelConfig = MTBottleneckModelConfig() - trainer: Optional[TrainerConfig] = TrainerConfig() - exp_manager: Optional[ExpManagerConfig] = ExpManagerConfig(name='MTBottleneck', files_to_copy=[]) - - -@hydra_runner(config_path="conf", config_name="aayn_bottleneck") -def main(cfg: MTBottleneckConfig) -> None: - # merge default config with user specified config - default_cfg = MTBottleneckConfig() - cfg = update_model_config(default_cfg, cfg) - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - - # training is managed by PyTorch Lightning - trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('strategy', None) - trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) - - # tokenizers will be trained and and tarred training data will be created if needed - # model config is then updated - if cfg.model.preproc_out_dir is not None: - MTDataPreproc(cfg=cfg.model, trainer=trainer) - - # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning - exp_manager(trainer, cfg.exp_manager) - - # everything needed to train translation models is encapsulated in the NeMo MTEncdDecModel - mt_model = MTBottleneckModel(cfg.model, trainer=trainer) - - 
logging.info("\n\n************** Model parameters and their sizes ***********") - for name, param in mt_model.named_parameters(): - print(name, param.size()) - logging.info("***********************************************************\n\n") - - if cfg.do_training: - trainer.fit(mt_model) - - if cfg.do_testing: - trainer.test(mt_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt.py deleted file mode 100644 index 0d7334f9b39f9df96acf1590bd5c859c392b5fb3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional - -from omegaconf import OmegaConf -from pytorch_lightning import Trainer - -from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.core.config.modelPT import NemoConfig -from nemo.core.config.pytorch_lightning import TrainerConfig -from nemo.utils import logging -from nemo.utils.config_utils import update_model_config -from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - - -""" -Usage: - 1. If you need to start docker and install NeMo, otherwise skip this step: - - a. ```docker run --gpus all -it --rm -v /home/okuchaiev/repos/NeMo/:/NeMo -p 6006:6006 -v /mnt:/mnt --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:20.11-py3``` - b. ```cd /NeMo``` - c. ```./reinstall.sh``` - - 2. Train a new tokenizer (or use pre-trained one): - ```yttm bpe --data /mnt/D1/Data/NMT/wmt16_de_en/train.clean.en-de.shuffled.common --model tokenizer.BPE.8192.model --vocab_size 8192``` - -(To use WANDB, optionally, do login first) -``wandb login [YOUR WANDB login]`` - - 3. 
Start training: - - - (This example for "base" model on 2 GPUs for 150000 steps with batch size of 12500 tokens per GPU) - - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - trainer.devices=[0,1] \ - ~trainer.max_epochs \ - +trainer.max_steps=150000 \ - model.beam_size=4 \ - model.max_generation_delta=5 \ - model.label_smoothing=0.1 \ - model.encoder_tokenizer.tokenizer_model=tokenizer.BPE.8192.model \ - model.decoder_tokenizer.tokenizer_model=tokenizer.BPE.8192.model \ - model.encoder.num_layers=6 \ - model.encoder.hidden_size=512 \ - model.encoder.inner_size=2048 \ - model.encoder.num_attention_heads=8 \ - model.encoder.ffn_dropout=0.1 \ - model.decoder.num_layers=6 \ - model.decoder.hidden_size=512 \ - model.decoder.inner_size=2048 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_dropout=0.1 \ - model.train_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/train.clean.de.shuffled \ - model.train_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/train.clean.en.shuffled \ - model.train_ds.tokens_in_batch=12500 \ - model.validation_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.ref \ - model.validation_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.src \ - model.validation_ds.tokens_in_batch=8192 \ - model.test_ds.src_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.ref \ - model.test_ds.tgt_file_name=/mnt/D1/Data/NMT/wmt16_de_en/wmt14-en-de.src \ - model.optim.lr=0.001 \ - model.optim.sched.warmup_ratio=0.05 \ - +exp_manager.create_wandb_logger=True \ - +exp_manager.wandb_logger_kwargs.name=TEST-nmt-base \ - +exp_manager.wandb_logger_kwargs.project=nmt-de-en \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.exp_dir=nmt_base \ - +exp_manager.checkpoint_callback_params.mode=max -""" - - -@dataclass -class MTEncDecConfig(NemoConfig): - name: Optional[str] = 'MTEncDec' - do_training: bool = True - do_testing: bool = False - model: MTEncDecModelConfig = MTEncDecModelConfig() - trainer: Optional[TrainerConfig] = TrainerConfig() - exp_manager: Optional[ExpManagerConfig] = ExpManagerConfig(name='MTEncDec', files_to_copy=[]) - - -@hydra_runner(config_path="conf", config_name="aayn_base") -def main(cfg: MTEncDecConfig) -> None: - # merge default config with user specified config - default_cfg = MTEncDecConfig() - cfg = update_model_config(default_cfg, cfg) - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - - # training is managed by PyTorch Lightning - trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('strategy', None) - trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) - - # tokenizers will be trained and and tarred training data will be created if needed - # model config is then updated - if cfg.model.preproc_out_dir is not None: - MTDataPreproc(cfg=cfg.model, trainer=trainer) - - # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning - exp_manager(trainer, cfg.exp_manager) - - # everything needed to train translation models is encapsulated in the NeMo MTEncdDecModel - mt_model = MTEncDecModel(cfg.model, trainer=trainer) - - logging.info("\n\n************** Model parameters and their sizes ***********") - for name, param in mt_model.named_parameters(): - print(name, param.size()) - logging.info("***********************************************************\n\n") - - if cfg.do_training: - trainer.fit(mt_model) - # 
Reset for PTL 2.0 as test uses the same ckpt as train via previously set self._ckpt_path - trainer.ckpt_path = None - if cfg.do_testing: - trainer.test(mt_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt_finetune.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt_finetune.py deleted file mode 100644 index 16a635d09dee0abffa97abe775d955165b64dc9a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/enc_dec_nmt_finetune.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional - -from omegaconf import OmegaConf -from omegaconf.omegaconf import MISSING -from pytorch_lightning import Trainer - -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.core.config.modelPT import NemoConfig -from nemo.core.config.pytorch_lightning import TrainerConfig -from nemo.utils import logging -from nemo.utils.config_utils import update_model_config -from nemo.utils.exp_manager import ExpManagerConfig, exp_manager - - -""" -Usage: - python enc_dec_nmt_finetune.py \ - model_path=/raid/models/de_en_24x6.nemo \ - trainer.devices=2 \ - ~trainer.max_epochs \ - +trainer.max_steps=4500 \ - +trainer.val_check_interval=500 \ - model.train_ds.tgt_file_name=/raid/data/train_lang_filtered.en \ - model.train_ds.src_file_name=/raid/data/train_lang_filtered.de \ - model.train_ds.tokens_in_batch=6000 \ - model.validation_ds.tgt_file_name=/raid/data/2015.norm.tok.en \ - model.validation_ds.src_file_name=/raid/data/2015.norm.tok.de \ - model.validation_ds.tokens_in_batch=4000 \ - model.test_ds.tgt_file_name=/raid/data/2015.en \ - model.test_ds.src_file_name=/raid/data/2015.de \ - +exp_manager.exp_dir=/raid/results/finetune-test \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true -""" - - -@dataclass -class MTFineTuneConfig(NemoConfig): - name: Optional[str] = 'MTEncDec' - model_path: str = MISSING - do_training: bool = True - do_testing: bool = False - model: MTEncDecModelConfig = MTEncDecModelConfig() - trainer: Optional[TrainerConfig] = TrainerConfig() - exp_manager: Optional[ExpManagerConfig] = ExpManagerConfig(name='MTEncDec', files_to_copy=[]) - - -@hydra_runner(config_path="conf", config_name="aayn_finetune") -def main(cfg: MTFineTuneConfig) -> None: - # merge default config with user specified config - default_cfg = MTFineTuneConfig() - default_cfg.model = 
MTEncDecModel.restore_from(restore_path=cfg.model_path, return_config=True) - del default_cfg.model.optim, default_cfg.model.train_ds, default_cfg.model.validation_ds, default_cfg.model.test_ds - cfg = update_model_config(default_cfg, cfg, drop_missing_subconfigs=False) - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - - # training is managed by PyTorch Lightning - trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('strategy', None) - trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) - - # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning - exp_manager(trainer, cfg.exp_manager) - - # everything needed to train translation models is encapsulated in the NeMo MTEncdDecModel - mt_model = MTEncDecModel.restore_from(restore_path=cfg.model_path, override_config_path=cfg.model, trainer=trainer) - - mt_model.setup_training_data(cfg.model.train_ds) - mt_model.setup_multiple_validation_data(val_data_config=cfg.model.validation_ds) - - logging.info("\n\n************** Model parameters and their sizes ***********") - for name, param in mt_model.named_parameters(): - print(name, param.size()) - logging.info("***********************************************************\n\n") - - if cfg.do_training: - trainer.fit(mt_model) - - if cfg.do_testing: - mt_model.setup_multiple_test_data(test_data_config=cfg.model.test_ds) - trainer.test(mt_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/megatron_nmt_training.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/megatron_nmt_training.py deleted file mode 100644 index 5d4768d0f1464e1351e2575b8d144aa7b662b6a5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/megatron_nmt_training.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
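megatron_nmt_training.py below is Hydra-configured like the other training scripts in this folder; a hedged sketch of fine-tuning a pre-trained Megatron T5 checkpoint is shown here. Paths, languages, and batch sizes are placeholders, and it assumes the referenced keys are present in the (not shown) conf/aayn_base_megatron.yaml:

```
# Illustrative sketch only: fine-tune a pre-trained Megatron T5 model for NMT.
# Values are placeholders; the keys mirror those read by main() below.
python megatron_nmt_training.py \
    --config-path=conf \
    --config-name=aayn_base_megatron \
    trainer.devices=2 \
    trainer.precision=bf16-mixed \
    model.pretrained_model_path=/path/to/megatron_t5.nemo \
    model.pretrained_model_type=T5 \
    model.src_language=de \
    model.tgt_language=en \
    model.micro_batch_size=4 \
    model.global_batch_size=128
```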
- - -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector - -from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel -from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, - PipelineMixedPrecisionPlugin, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -mp.set_start_method("spawn", force=True) - - -@hydra_runner(config_path="conf", config_name="aayn_base_megatron") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if megatron_amp_o2: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - trainer._checkpoint_connector = _CheckpointConnector(trainer) - - if hasattr(cfg.model, 'pretrained_model_path') and cfg.model.pretrained_model_path is not None: - if not hasattr(cfg.model, 'pretrained_model_type'): - raise ValueError(f"Pretrained model type must be in [T5, BART].") - - assert cfg.model.pretrained_model_type in ['T5', 'BART'] - if cfg.model.pretrained_model_type == 'T5': - pretrained_cfg = MegatronT5Model.restore_from( - cfg.model.pretrained_model_path, trainer=trainer, return_config=True - ) - else: - pretrained_cfg = MegatronBARTModel.restore_from( - cfg.model.pretrained_model_path, trainer=trainer, return_config=True - ) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.masked_softmax_fusion = False - # Set source and target language/multilingual - pretrained_cfg.src_language = 
cfg.model.src_language - pretrained_cfg.tgt_language = cfg.model.tgt_language - pretrained_cfg.multilingual = cfg.model.multilingual - pretrained_cfg.shared_tokenizer = True - - # Max generation delta - pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta - - # Set label smoothing - pretrained_cfg.label_smoothing = cfg.model.label_smoothing - - # Set tokenizer paths: - pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer - pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer - - # Pre-trained models should use the legacy sentencepiece tokenizer ex: mT5 - pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True - pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True - - # Override dropout - - # Old pre-trained checkpoints do not have separate encoder/decoder configurations, so replicate the config to encoder/decoder. - if not hasattr(pretrained_cfg, 'encoder'): - assert not hasattr(pretrained_cfg, 'decoder') - logging.warning( - "No separate configuration for encoder, found in pretrained model, using encoder dropout settings everywhere." - ) - pretrained_cfg.hidden_dropout = cfg.model.encoder.hidden_dropout - pretrained_cfg.attention_dropout = cfg.model.encoder.attention_dropout - else: - assert hasattr(pretrained_cfg, 'decoder') and hasattr(pretrained_cfg, 'encoder') - pretrained_cfg.encoder.hidden_dropout = cfg.model.encoder.hidden_dropout - pretrained_cfg.encoder.attention_dropout = cfg.model.encoder.attention_dropout - pretrained_cfg.decoder.hidden_dropout = cfg.model.decoder.hidden_dropout - pretrained_cfg.decoder.attention_dropout = cfg.model.decoder.attention_dropout - - # Override precision - pretrained_cfg.precision = trainer.precision # Set above from trainer.precision - - # Override micro/global batch - pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size - pretrained_cfg.global_batch_size = cfg.model.global_batch_size - - # O2 AMP - pretrained_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - - # Override data and global/micro batch size. - pretrained_cfg.train_ds = cfg.model.train_ds - pretrained_cfg.train_ds.micro_batch_size = cfg.model.micro_batch_size - pretrained_cfg.train_ds.global_batch_size = cfg.model.global_batch_size - if hasattr(cfg.model, 'validation_ds'): - pretrained_cfg.validation_ds = cfg.model.validation_ds - else: - raise AttributeError(f"No validation dataset found in config.") - if hasattr(cfg.model, 'test_ds'): - pretrained_cfg.test_ds = cfg.model.test_ds - - # Class target for the new class being restored. - pretrained_cfg.target = ( - "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel" - ) - - # Optimizer overrides. - pretrained_cfg.optim = cfg.model.optim - - model = MegatronNMTModel.restore_from( - cfg.model.pretrained_model_path, - trainer=trainer, - override_config_path=pretrained_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) - else: - model = MegatronNMTModel(cfg.model, trainer) - - trainer.fit(model) - trainer.validate(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer.py deleted file mode 100644 index 882350a09dad8a75d1269cb88df01e1ee0a058db..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer.py +++ /dev/null @@ -1,304 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Given NMT model's .nemo file(s), this script can be used to translate text. -USAGE Example: -1. Obtain text file in src language. You can use sacrebleu to obtain standard test sets like so: - sacrebleu -t wmt14 -l de-en --echo src > wmt14-de-en.src -2. Translate: - python nmt_transformer_infer.py --model=[Path to .nemo file(s)] --srctext=wmt14-de-en.src --tgtout=wmt14-de-en.pre -""" - - -import json -from argparse import ArgumentParser - -import torch - -import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.modules.common.transformer import ( - BeamSearchSequenceGenerator, - BeamSearchSequenceGeneratorWithLanguageModel, - EnsembleBeamSearchSequenceGenerator, -) -from nemo.utils import logging - - -def translate_text( - models, args, src_text, tgt_text, tgt_text_all, src_texts, all_scores, all_timing, ensemble_generator -): - if len(models) > 1: - src_ids, src_mask = models[0].prepare_inference_batch(src_text) - best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores) - if args.write_scores: - all_results, scores, best_translations = ( - best_translations[0], - best_translations[1], - best_translations[2], - ) - scores = scores.view(-1).data.cpu().numpy().tolist() - all_scores += scores - src_texts += [item for item in src_text for i in range(args.beam_size)] - all_results = models[0].ids_to_postprocessed_text( - all_results, models[0].decoder_tokenizer, models[0].target_processor - ) - tgt_text_all += all_results - best_translations = models[0].ids_to_postprocessed_text( - best_translations, models[0].decoder_tokenizer, models[0].target_processor - ) - tgt_text += best_translations - else: - model = models[0] - best_translations = model.translate( - text=src_text, - source_lang=args.source_lang, - target_lang=args.target_lang, - return_beam_scores=args.write_scores, - log_timing=args.write_timing, - ) - - if args.write_timing: - *best_translations, timing_dict = best_translations - all_timing.append(timing_dict) - else: - best_translations = (best_translations,) - - if args.write_scores: - all_results, scores, best_translations = ( - best_translations[0], - best_translations[1], - best_translations[2], - ) - all_scores += scores - src_texts += [item for item in src_text for i in range(args.beam_size)] - tgt_text_all += all_results - else: - best_translations = best_translations[0] - - tgt_text += best_translations - - print(f"Translated {len(tgt_text)} sentences") - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "--model", - type=str, - required=True, - help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.", - ) - parser.add_argument("--srctext", type=str, required=True, help="Path to the file to translate.") - parser.add_argument( - "--tgtout", type=str, required=True, help="Path to the file where translations are to be written." 
- ) - parser.add_argument( - "--batch_size", type=int, default=256, help="Number of sentences to batch together while translatiing." - ) - parser.add_argument("--beam_size", type=int, default=4, help="Beam size.") - parser.add_argument( - "--len_pen", type=float, default=0.6, help="Length Penalty. Ref: https://arxiv.org/abs/1609.08144" - ) - parser.add_argument( - "--max_delta_length", - type=int, - default=5, - help="Stop generating if target sequence length exceeds source length by this number.", - ) - parser.add_argument( - "--target_lang", - type=str, - default=None, - help="Target language identifier ex: en,de,fr,es etc. If both `--target_lang` and `--source_lang` are " - "not set, then target language processing will be done the same way as during model training. If " - "`--target_lang` parameter is not set but `--source_lang` parameter is set, then target language " - "processing will not be performed. If `--target_lang` equals 'ignore', then target language processing " - "will not be performed regardless of value of `--source_lang` parameter.", - ) - parser.add_argument( - "--source_lang", - type=str, - default=None, - help="Source language identifier ex: en,de,fr,es etc. If both `--target_lang` and `--source_lang` are " - "not set, then source language processing will be done the same way as during model training. If " - "`--source_lang` parameter is not set but `--target_lang` parameter is set, then source language " - "processing will not be performed. If `--source_lang` equals 'ignore', then source language processing " - "will not be performed regardless of value of `--target_lang` parameter.", - ) - parser.add_argument( - "--write_scores", - action="store_true", - help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)", - ) - parser.add_argument( - "--write_timing", - action="store_true", - help="Whether to write a separate file with detailed timing info (.timing.json suffix)", - ) - # shallow fusion specific parameters - parser.add_argument( - "--lm_model", - type=str, - default=None, - help="Optional path to an LM model that has the same tokenizer as NMT models for shallow fuison. Note: If using --write_scores, it will add LM scores as well.", - ) - parser.add_argument( - "--fusion_coef", type=float, default=0.07, help="Weight assigned to LM scores during shallow fusion." 
- ) - - args = parser.parse_args() - torch.set_grad_enabled(False) - logging.info("Attempting to initialize from .nemo file") - models = [] - for model_path in args.model.split(','): - if not model_path.endswith('.nemo'): - raise NotImplementedError(f"Only support .nemo files, but got: {model_path}") - model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval() - models.append(model) - - if (len(models) > 1) and (args.write_timing): - raise RuntimeError("Cannot measure timing when more than 1 model is used") - - src_text = [] - tgt_text = [] - tgt_text_all = [] - src_texts = [] - all_scores = [] - all_timing = [] - - if torch.cuda.is_available(): - models = [model.cuda() for model in models] - - if args.lm_model is not None: - lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from(restore_path=args.lm_model).eval() - else: - lm_model = None - - if len(models) > 1: - ensemble_generator = EnsembleBeamSearchSequenceGenerator( - encoders=[model.encoder for model in models], - embeddings=[model.decoder.embedding for model in models], - decoders=[model.decoder.decoder for model in models], - log_softmaxes=[model.log_softmax for model in models], - max_sequence_length=512, - beam_size=args.beam_size, - bos=models[0].decoder_tokenizer.bos_id, - pad=models[0].decoder_tokenizer.pad_id, - eos=models[0].decoder_tokenizer.eos_id, - len_pen=args.len_pen, - max_delta_length=args.max_delta_length, - language_model=lm_model, - fusion_coef=args.fusion_coef, - ) - else: - model = models[0] - ensemble_generator = None - if lm_model is not None: - model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel( - embedding=model.decoder.embedding, - decoder=model.decoder.decoder, - log_softmax=model.log_softmax, - bos=model.decoder_tokenizer.bos_id, - pad=model.decoder_tokenizer.pad_id, - eos=model.decoder_tokenizer.eos_id, - language_model=lm_model, - fusion_coef=args.fusion_coef, - max_sequence_length=model.decoder.max_sequence_length, - beam_size=args.beam_size, - len_pen=args.len_pen, - max_delta_length=args.max_delta_length, - ) - else: - model.beam_search = BeamSearchSequenceGenerator( - embedding=model.decoder.embedding, - decoder=model.decoder.decoder, - log_softmax=model.log_softmax, - bos=model.decoder_tokenizer.bos_id, - pad=model.decoder_tokenizer.pad_id, - eos=model.decoder_tokenizer.eos_id, - max_sequence_length=model.decoder.max_sequence_length, - beam_size=args.beam_size, - len_pen=args.len_pen, - max_delta_length=args.max_delta_length, - ) - - logging.info(f"Translating: {args.srctext}") - - with open(args.srctext, 'r') as src_f: - for line in src_f: - src_text.append(line.strip()) - if len(src_text) == args.batch_size: - # warmup when measuring timing - if args.write_timing and (not all_timing): - print("running a warmup batch") - translate_text( - models=models, - args=args, - src_text=src_text, - tgt_text=[], - tgt_text_all=[], - src_texts=[], - all_scores=[], - all_timing=[], - ensemble_generator=ensemble_generator, - ) - translate_text( - models=models, - args=args, - src_text=src_text, - tgt_text=tgt_text, - tgt_text_all=tgt_text_all, - src_texts=src_texts, - all_scores=all_scores, - all_timing=all_timing, - ensemble_generator=ensemble_generator, - ) - src_text = [] - - if len(src_text) > 0: - translate_text( - models=models, - args=args, - src_text=src_text, - tgt_text=tgt_text, - tgt_text_all=tgt_text_all, - src_texts=src_texts, - all_scores=all_scores, - all_timing=all_timing, - ensemble_generator=ensemble_generator, - 
) - - with open(args.tgtout, 'w') as tgt_f: - for line in tgt_text: - tgt_f.write(line + "\n") - - if args.write_scores: - with open(args.tgtout + '.score', 'w') as tgt_f_scores: - for line, score, inp in zip(tgt_text_all, all_scores, src_texts): - tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n") - - if args.write_timing: - # collect list of dicts to a dict of lists - timing_dict = {} - if len(all_timing): - for k in all_timing[0].keys(): - timing_dict[k] = [t[k] for t in all_timing] - - with open(args.tgtout + '.timing.json', 'w') as timing_fh: - json.dump(timing_dict, timing_fh) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py deleted file mode 100644 index c8ab668fc16cb7cfdf8cfaf16f59174d8cc08670..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Given NMT model's .nemo file(s), this script can be used to translate text. -USAGE Example: -1. Obtain text file in src language. You can use sacrebleu to obtain standard test sets like so: - sacrebleu -t wmt14 -l de-en --echo src > wmt14-de-en.src -2. 
Translate: - python nmt_transformer_infer.py --model=[Path to .nemo file(s)] --srctext=wmt14-de-en.src --tgtout=wmt14-de-en.pre -""" - - -import os - -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning.trainer.trainer import Trainer - -from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.app_state import AppState -from nemo.utils.model_utils import inject_model_parallel_rank - -try: - from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator - - HAVE_APEX = True -except (ImportError, ModuleNotFoundError): - ModelType = ApexGuardDefaults() - HAVE_APEX = False - - -@hydra_runner(config_path="conf", config_name="nmt_megatron_infer") -def main(cfg) -> None: - - # trainer required for restoring model parallel models - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - assert ( - cfg.trainer.devices * cfg.trainer.num_nodes - == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - - app_state = AppState() - app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size - ( - app_state.tensor_model_parallel_rank, - app_state.pipeline_model_parallel_rank, - app_state.model_parallel_size, - app_state.data_parallel_size, - app_state.pipeline_model_parallel_split_rank, - app_state.virtual_pipeline_model_parallel_rank, - ) = fake_initialize_model_parallel( - world_size=app_state.model_parallel_size, - rank=trainer.global_rank, - tensor_model_parallel_size_=cfg.tensor_model_parallel_size, - pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, - pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, - ) - - if cfg.model_file is not None: - if not os.path.exists(cfg.model_file): - raise ValueError(f"Model file {cfg.model_file} does not exist") - pretrained_cfg = MegatronNMTModel.restore_from(cfg.model_file, trainer=trainer, return_config=True) - OmegaConf.set_struct(pretrained_cfg, True) - with open_dict(pretrained_cfg): - pretrained_cfg.precision = trainer.precision - model = MegatronNMTModel.restore_from( - restore_path=cfg.model_file, - trainer=trainer, - save_restore_connector=NLPSaveRestoreConnector(), - override_config_path=pretrained_cfg, - ) - elif cfg.checkpoint_dir is not None: - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) - model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) - else: - raise ValueError("need at least a nemo file or checkpoint dir") - - model.freeze() - - logging.info(f"Translating: {cfg.srctext}") - src_text = [] - translations = [] - with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f: - for line in src_f: - src_text.append(line.strip()) - if len(src_text) == cfg.batch_size: - translations = model.translate( - text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang, - ) - for translation in translations: - tgt_f.write(translation + "\n") - src_text = [] - if len(src_text) > 0: - 
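            # Translate any sentences left over after the last full batch.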
translations = model.translate(text=src_text, source_lang=cfg.source_lang, target_lang=cfg.target_lang,) - for translation in translations: - tgt_f.write(translation + "\n") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/noisy_channel_reranking.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/noisy_channel_reranking.py deleted file mode 100644 index c390722ea8d3f388b034180dd6e81027982d9649..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/noisy_channel_reranking.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script implemts Noisy Channel Reranking (NCR) - https://arxiv.org/abs/1908.05731 -Given .nemo files for a, reverse model (target -> source) and transformer LM (target LM) NMT model's .nemo file, -this script can be used to re-rank a forward model's (source -> target) beam candidates. - -This script can be used in two ways 1) Given the score file generated by `nmt_transformer_infer.py`, re-rank beam candidates and -2) Given NCR score file generated by 1), Re-rank beam candidates based only on cached scores in the ncr file. This is meant to tune NCR coeficients. - -Pre-requisite: Generating translations using `nmt_transformer_infer.py` -1. Obtain text file in src language. You can use sacrebleu to obtain standard test sets like so: - sacrebleu -t wmt14 -l de-en --echo src > wmt14-de-en.src -2. Translate using `nmt_transformer_infer.py` with a large beam size.: - python nmt_transformer_infer.py --model=[Path to .nemo file(s)] --srctext=wmt14-de-en.src --tgtout=wmt14-de-en.translations --beam_size 15 --write_scores - -USAGE Example (case 1): -Re-rank beam candidates: - python noisy_channel_reranking.py \ - --reverse_model=[Path to .nemo file] \ - --language_model=[Path to .nemo file] \ - --srctext=wmt14-de-en.translations.scores \ - --tgtout=wmt14-de-en.ncr.translations \ - --forward_model_coef=1.0 \ - --reverse_model_coef=0.7 \ - --target_lm_coef=0.05 \ - --write_scores \ - -USAGE Example (case 2): -Re-rank beam candidates using cached score file only - python noisy_channel_reranking.py \ - --cached_score_file=wmt14-de-en.ncr.translations.scores \ - --forward_model_coef=1.0 \ - --reverse_model_coef=0.7 \ - --target_lm_coef=0.05 \ - --tgtout=wmt14-de-en.ncr.translations \ -""" - - -from argparse import ArgumentParser - -import numpy as np -import torch - -import nemo.collections.nlp as nemo_nlp -from nemo.utils import logging - - -def score_fusion(args, forward_scores, rev_scores, lm_scores, src_lens, tgt_lens): - """ - Fuse forward, reverse and language model scores. 
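    Concretely, the fused value computed below for each beam candidate is

        score = forward_model_coef * forward_score
              + reverse_model_coef * rev_score
              + target_lm_coef * lm_score

    where the forward and LM scores are optionally divided by the target length and
    the reverse score by the source length (--length_normalize_scores), and the total
    is optionally divided by the length penalty ((5 + tgt_len) / 6) ** len_pen when
    --len_pen is given.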
- """ - fused_scores = [] - for forward_score, rev_score, lm_score, src_len, tgt_len in zip( - forward_scores, rev_scores, lm_scores, src_lens, tgt_lens - ): - score = 0 - - forward_score = forward_score / tgt_len if args.length_normalize_scores else forward_score - score += args.forward_model_coef * forward_score - - rev_score = rev_score / src_len if args.length_normalize_scores else rev_score - score += args.reverse_model_coef * rev_score - - lm_score = lm_score / tgt_len if args.length_normalize_scores else lm_score - score += args.target_lm_coef * lm_score - - if args.len_pen is not None: - score = score / (((5 + tgt_len) / 6) ** args.len_pen) - - fused_scores.append(score) - - return fused_scores - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "--reverse_model", - type=str, - help="Path to .nemo model file(s). If ensembling, provide comma separated paths to multiple models.", - ) - parser.add_argument( - "--language_model", type=str, help="Optional path to an LM model that has the same tokenizer as NMT models.", - ) - parser.add_argument( - "--forward_model_coef", - type=float, - default=1.0, - help="Weight assigned to the forward NMT model for re-ranking.", - ) - parser.add_argument( - "--reverse_model_coef", - type=float, - default=0.7, - help="Weight assigned to the reverse NMT model for re-ranking.", - ) - parser.add_argument( - "--target_lm_coef", type=float, default=0.07, help="Weight assigned to the target LM model for re-ranking.", - ) - parser.add_argument( - "--srctext", - type=str, - default=None, - help="Path to a TSV file containing forward model scores of the format source \t beam_candidate_i \t forward_score", - ) - parser.add_argument( - "--cached_score_file", - type=str, - default=None, - help="Path to a TSV file containing cached scores for each beam candidate. Format source \t target \t forward_score \t reverse_score \t lm_score \t src_len \t tgt_len", - ) - parser.add_argument( - "--tgtout", type=str, required=True, help="Path to the file where re-ranked translations are to be written." - ) - parser.add_argument( - "--beam_size", - type=int, - default=4, - help="Beam size with which forward model translations were generated. IMPORTANT: mismatch can lead to wrong results and an incorrect number of generated translations.", - ) - parser.add_argument( - "--target_lang", type=str, default=None, help="Target language identifier ex: en,de,fr,es etc." - ) - parser.add_argument( - "--source_lang", type=str, default=None, help="Source language identifier ex: en,de,fr,es etc." - ) - parser.add_argument( - "--write_scores", action="store_true", help="Whether to write forward, reverse and lm scores to a file." 
- ) - parser.add_argument( - "--length_normalize_scores", - action="store_true", - help="If true, it will divide forward, reverse and lm scores by the corresponding sequence length.", - ) - parser.add_argument( - "--len_pen", - type=float, - default=None, - help="Apply a length penalty based on target lengths to the final NCR score.", - ) - - args = parser.parse_args() - torch.set_grad_enabled(False) - - if args.cached_score_file is None: - reverse_models = [] - for model_path in args.reverse_model.split(','): - if not model_path.endswith('.nemo'): - raise NotImplementedError(f"Only support .nemo files, but got: {model_path}") - model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval() - model.eval_loss_fn.reduction = 'none' - reverse_models.append(model) - - lm_model = nemo_nlp.models.language_modeling.TransformerLMModel.restore_from( - restore_path=args.language_model - ).eval() - - if args.srctext is not None and args.cached_score_file is not None: - raise ValueError("Only one of --srctext or --cached_score_file must be provided.") - - if args.srctext is None and args.cached_score_file is None: - raise ValueError("Neither --srctext nor --cached_score_file were provided.") - - if args.srctext is not None: - logging.info(f"Re-ranking: {args.srctext}") - else: - logging.info(f"Re-ranking from cached score file only: {args.cached_score_file}") - - if args.cached_score_file is None: - if torch.cuda.is_available(): - reverse_models = [model.cuda() for model in reverse_models] - lm_model = lm_model.cuda() - - src_text = [] - tgt_text = [] - all_reverse_scores = [] - all_lm_scores = [] - all_forward_scores = [] - all_src_lens = [] - all_tgt_lens = [] - - # Chceck args if re-ranking from cached score file. - if args.cached_score_file is not None: - if args.write_scores: - raise ValueError("--write_scores cannot be provided with a cached score file.") - if args.reverse_model is not None: - raise ValueError( - "--reverse_model cannot be provided with a cached score file since it assumes reverse scores already present in the cached file." - ) - if args.language_model is not None: - raise ValueError( - "--language_model cannot be provided with a cached score file since it assumes language model scores already present in the cached file." - ) - - if args.srctext is not None: - # Compute reverse scores and LM scores from the provided models since cached scores file is not provided. - with open(args.srctext, 'r') as src_f: - count = 0 - for line in src_f: - src_text.append(line.strip().split('\t')) - if len(src_text) == args.beam_size: - # Source and target sequences are flipped for the reverse direction model. - src_texts = [item[1] for item in src_text] - tgt_texts = [item[0] for item in src_text] - src, src_mask = reverse_models[0].prepare_inference_batch(src_texts) - tgt, tgt_mask = reverse_models[0].prepare_inference_batch(tgt_texts, target=True) - src_lens = src_mask.sum(1).data.cpu().tolist() - tgt_lens = tgt_mask.sum(1).data.cpu().tolist() - forward_scores = [float(item[2]) for item in src_text] - - # Ensemble of reverse model scores. - nmt_lls = [] - for model in reverse_models: - nmt_log_probs = model(src, src_mask, tgt[:, :-1], tgt_mask[:, :-1]) - nmt_nll = model.eval_loss_fn(log_probs=nmt_log_probs, labels=tgt[:, 1:]) - nmt_ll = nmt_nll.view(nmt_log_probs.size(0), nmt_log_probs.size(1)).sum(1) * -1.0 - nmt_ll = nmt_ll.data.cpu().numpy().tolist() - nmt_lls.append(nmt_ll) - reverse_scores = np.stack(nmt_lls).mean(0) - - # LM scores. 
- if lm_model is not None: - # Compute LM score for the src of the reverse model. - lm_log_probs = lm_model(src[:, :-1], src_mask[:, :-1]) - lm_nll = model.eval_loss_fn(log_probs=lm_log_probs, labels=src[:, 1:]) - lm_ll = lm_nll.view(lm_log_probs.size(0), lm_log_probs.size(1)).sum(1) * -1.0 - lm_ll = lm_ll.data.cpu().numpy().tolist() - else: - lm_ll = None - lm_scores = lm_ll - - all_reverse_scores.extend(reverse_scores) - all_lm_scores.extend(lm_scores) - all_forward_scores.extend(forward_scores) - - # Swapping source and target here back again since this is what gets written to the file. - all_src_lens.extend(tgt_lens) - all_tgt_lens.extend(src_lens) - - fused_scores = score_fusion(args, forward_scores, reverse_scores, lm_scores, src_lens, tgt_lens) - tgt_text.append(src_texts[np.argmax(fused_scores)]) - src_text = [] - count += 1 - print(f'Reranked {count} sentences') - - else: - # Use reverse and LM scores from the cached scores file to re-rank. - with open(args.cached_score_file, 'r') as src_f: - count = 0 - for line in src_f: - src_text.append(line.strip().split('\t')) - if len(src_text) == args.beam_size: - if not all([len(item) == 7 for item in src_text]): - raise IndexError( - "All lines did not contain exactly 5 fields. Format - src_txt \t tgt_text \t forward_score \t reverse_score \t lm_score \t src_len \t tgt_len" - ) - src_texts = [item[0] for item in src_text] - tgt_texts = [item[1] for item in src_text] - forward_scores = [float(item[2]) for item in src_text] - reverse_scores = [float(item[3]) for item in src_text] - lm_scores = [float(item[4]) for item in src_text] - src_lens = [float(item[5]) for item in src_text] - tgt_lens = [float(item[6]) for item in src_text] - - fused_scores = score_fusion(args, forward_scores, reverse_scores, lm_scores, src_lens, tgt_lens) - tgt_text.append(tgt_texts[np.argmax(fused_scores)]) - src_text = [] - count += 1 - print(f'Reranked {count} sentences') - - with open(args.tgtout, 'w') as tgt_f: - for line in tgt_text: - tgt_f.write(line + "\n") - - # Write scores file - if args.write_scores: - with open(args.tgtout + '.scores', 'w') as tgt_f, open(args.srctext, 'r') as src_f: - src_lines = [] - for line in src_f: - src_lines.append(line.strip().split('\t')) - if not (len(all_reverse_scores) == len(all_lm_scores) == len(all_forward_scores) == len(src_lines)): - raise ValueError( - f"Length of scores files do not match. {len(all_reverse_scores)} != {len(all_lm_scores)} != {len(all_forward_scores)} != {len(src_lines)}. This is most likely because --beam_size is set incorrectly. This needs to be set to the same value that was used to generate translations." - ) - for f, r, lm, sl, tl, src in zip( - all_forward_scores, all_reverse_scores, all_lm_scores, all_src_lens, all_tgt_lens, src_lines - ): - tgt_f.write( - src[0] - + '\t' - + src[1] - + '\t' - + str(f) - + '\t' - + str(r) - + '\t' - + str(lm) - + '\t' - + str(sl) - + '\t' - + str(tl) - + '\n' - ) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/nlp/machine_translation/translate_ddp.py b/SoundScribe/SpeakerID/examples/nlp/machine_translation/translate_ddp.py deleted file mode 100644 index cbcc1afa52c80c426828ff6c20c89e1cb3afa7a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/machine_translation/translate_ddp.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from argparse import ArgumentParser - -import torch -import torch.multiprocessing as mp -from torch.utils.data import DataLoader - -from nemo.collections.nlp.data.language_modeling import TarredSentenceDataset -from nemo.collections.nlp.data.machine_translation import TarredTranslationDataset -from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from nemo.utils import logging - - -def get_args(): - parser = ArgumentParser(description='Batch translation of sentences from a pre-trained model on multiple GPUs') - parser.add_argument("--model", type=str, required=True, help="Path to the .nemo translation model file") - parser.add_argument( - "--text2translate", type=str, required=True, help="Path to the pre-processed tarfiles for translation" - ) - parser.add_argument("--result_dir", type=str, required=True, help="Folder to write translation results") - parser.add_argument( - "--twoside", action="store_true", help="Set flag when translating the source side of a parallel dataset" - ) - parser.add_argument( - '--metadata_path', type=str, required=True, help="Path to the JSON file that contains dataset info" - ) - parser.add_argument('--topk', type=int, default=500, help="Value of k for topk sampling") - parser.add_argument('--src_language', type=str, required=True, help="Source lang ID for detokenization") - parser.add_argument('--tgt_language', type=str, required=True, help="Target lang ID for detokenization") - parser.add_argument( - '--reverse_lang_direction', - action="store_true", - help="Reverse source and target language direction for parallel dataset", - ) - parser.add_argument('--n_gpus', type=int, default=1, help="Number of GPUs to use") - args = parser.parse_args() - return args - - -def translate(rank, world_size, args): - if args.model.endswith(".nemo"): - logging.info("Attempting to initialize from .nemo file") - model = MTEncDecModel.restore_from(restore_path=args.model, map_location=f"cuda:{rank}") - elif args.model.endswith(".ckpt"): - logging.info("Attempting to initialize from .ckpt file") - model = MTEncDecModel.load_from_checkpoint(checkpoint_path=args.model, map_location=f"cuda:{rank}") - model.replace_beam_with_sampling(topk=args.topk) - model.eval() - if args.twoside: - dataset = TarredTranslationDataset( - text_tar_filepaths=args.text2translate, - metadata_path=args.metadata_path, - encoder_tokenizer=model.encoder_tokenizer, - decoder_tokenizer=model.decoder_tokenizer, - shuffle_n=100, - shard_strategy="scatter", - world_size=world_size, - global_rank=rank, - reverse_lang_direction=args.reverse_lang_direction, - ) - else: - dataset = TarredSentenceDataset( - text_tar_filepaths=args.text2translate, - metadata_path=args.metadata_path, - tokenizer=model.encoder_tokenizer, - shuffle_n=100, - shard_strategy="scatter", - world_size=world_size, - global_rank=rank, - ) - loader = DataLoader(dataset, batch_size=1) - result_dir = os.path.join(args.result_dir, f'rank{rank}') - 
os.makedirs(result_dir, exist_ok=True) - originals_file_name = os.path.join(result_dir, 'originals.txt') - translations_file_name = os.path.join(result_dir, 'translations.txt') - num_translated_sentences = 0 - - with open(originals_file_name, 'w') as of, open(translations_file_name, 'w') as tf: - for batch_idx, batch in enumerate(loader): - for i in range(len(batch)): - if batch[i].ndim == 3: - batch[i] = batch[i].squeeze(dim=0) - batch[i] = batch[i].to(rank) - if args.twoside: - src_ids, src_mask, _, _, _ = batch - else: - src_ids, src_mask = batch - if batch_idx % 100 == 0: - logging.info( - f"{batch_idx} batches ({num_translated_sentences} sentences) were translated by process with " - f"rank {rank}" - ) - num_translated_sentences += len(src_ids) - inputs, translations = model.batch_translate(src=src_ids, src_mask=src_mask) - for src, translation in zip(inputs, translations): - of.write(src + '\n') - tf.write(translation + '\n') - - -def main() -> None: - args = get_args() - world_size = torch.cuda.device_count() if args.n_gpus == -1 else args.n_gpus - mp.spawn(translate, args=(world_size, args), nprocs=world_size, join=True) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/qa_conf.yaml b/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/qa_conf.yaml deleted file mode 100644 index bb53054acd4b9d53d3533c63b98770d0134ee73e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/qa_conf.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -pretrained_model: null # pretrained model from list_available_models() -do_training: true # true for training mode, false for testing -trainer: - devices: [0] # 0 for CPU, or list of the GPUs to use e.g. [0, 1] or [0] - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 1.0 - precision: 16 # should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - log_every_n_steps: 5 # interval of logging. - val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - enable_checkpointing: False # provided by exp_manager - logger: False # provided by exp_manager - strategy: ddp - -model: - tensor_model_parallel_size: 1 - nemo_path: null # filename to save the model and associated artifacts to .nemo file - library: huggingface # [huggingface, megatron]. 
Used by S2SQAModel and GPTQAModel - save_model: False # save validation model checkpoints - - tokens_to_generate: 32 # used by S2SQAModel and GPTQAModel to limit number of generated tokens - - dataset: - version_2_with_negative: true # if true, dataset contains some questions that do not have an answer - doc_stride: 128 # stride for splitting long documents into chunks - max_query_length: 64 - max_seq_length: 512 # max sequence length for input to the model - max_answer_length: 30 # max ground truth answer length - use_cache: false - do_lower_case: true - - # if true, context spans/chunks that do not contain answer are treated as unanswerable, - # useful for extractive datasets like SQuAD - # if false, all context spans/chunks are treated as relevant for answering given query, - # useful for generative datasets where answer is not necessarily in the context - # used by S2SQAModel and GPTQAModel - check_if_answer_in_context: true - - # if all, keep all doc spans - # if only_positive, keep doc spans containing answer only - # if limited_negative, keep 10 doc spans closest to answer per question - # used by BERTQAModel - keep_doc_spans: all # [all, only_positive, limited_negative] - - null_score_diff_threshold: 0.0 # If null_score - best_non_null is greater than the threshold predict null. - n_best_size: 20 - - num_workers: 1 - pin_memory: false - drop_last: false - - train_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: true - num_samples: -1 - - # default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - validation_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: false - num_samples: -1 - - # default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - test_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: false - num_samples: -1 - - # default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - language_model: - pretrained_model_name: bert-base-uncased # main config to select model (between bert, gpt2, t5/bart based models) - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - token_classifier: # used only by BERTQAModel for defining the extractive QA head - num_layers: 1 - dropout: 0. - num_classes: 2 - activation: relu - log_softmax: false - use_transformer_init: true - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - - # expand the following to a dictionary if special tokens need to be added. - # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - special_tokens: null - - optim: - name: adamw - lr: 5e-5 - - # optimizer arguments - betas: [0.9, 0.999] - weight_decay: 0. - - # scheduler setup - sched: - name: SquareRootAnnealing - - # scheduler params - warmup_steps: null - warmup_ratio: 0. 
- last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "QnA" # the name of your model - create_wandb_logger: False - wandb_logger_kwargs: - name: ??? - project: QnA - create_tensorboard_logger: True # whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # whether you want exp_manager to create a modelcheckpoint callback - resume_if_exists: false - resume_ignore_no_checkpoint: false \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/question_answering_squad_config.yaml b/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/question_answering_squad_config.yaml deleted file mode 100644 index 2e54b6fecc7ef25ad3a41da50aa7bd5d3a6b956d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/question_answering/conf/question_answering_squad_config.yaml +++ /dev/null @@ -1,143 +0,0 @@ -# Question Answering with SQUAD -name: &name QA - -pretrained_model: null # pretrained QAModel model from list_available_models() -do_training: true # training mode, for testing change to false -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 # 16 to use AMP - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - num_sanity_val_steps: 0 - log_every_n_steps: 1 # Interval of logging. - -model: - nemo_path: null # exported .nemo path - dataset: - version_2_with_negative: false - # If true, the examples contain some that do not have an answer. - doc_stride: 128 - # When splitting up a long document into chunks, - # how much stride to take between chunks. - max_query_length: 64 - # The maximum number of tokens for the question. - # Questions longer than this will be truncated to - # this length. - max_seq_length: 384 - # The maximum total input sequence length after - # WordPiece tokenization. Sequences longer than this - # will be truncated, and sequences shorter than this - # will be padded. - max_answer_length: 30 - # The maximum length of an answer that can be - # generated. This is needed because the start - # and end predictions are not conditioned - # on one another. - null_score_diff_threshold: 0.0 - # If null_score - best_non_null is greater than the threshold predict null. - n_best_size: 20 - # The total number of n-best predictions to generate at testing. 
- use_cache: true - do_lower_case: true - # if true does lower case - keep_doc_spans: all - # if all, keep all doc spans - # if only_positive, keep doc spans containing answer only - # if limited_negative, keep 10 doc spans closest to answer per question - - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: true - num_samples: -1 - # Default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - validation_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: false - num_samples: -1 - # Default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - test_ds: - file: null # .json file - batch_size: 24 # per GPU - shuffle: false - num_samples: -1 - # Default values for the following params are retrieved from dataset config section, but you may override them - num_workers: ${model.dataset.num_workers} - drop_last: ${model.dataset.drop_last} - pin_memory: ${model.dataset.pin_memory} - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # expand the following to a dictionary if special tokens need to be added. - # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased # BERT-like model name - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null # if specified initializes model from scratch - - token_classifier: - num_layers: 1 - dropout: 0.0 - num_classes: 2 - activation: relu - log_softmax: false - use_transformer_init: true - - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.0 - sched: - name: SquareRootAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_steps: null - warmup_ratio: 0.0 - last_epoch: -1 - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - create_wandb_logger: False - wandb_logger_kwargs: - name: ??? - project: QA - -hydra: - run: - dir: . - job_logging: - root: - handlers: null \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/question_answering/convert_msmarco_to_squad_format.py b/SoundScribe/SpeakerID/examples/nlp/question_answering/convert_msmarco_to_squad_format.py deleted file mode 100644 index 4e1a99733afde2941a6dcb665f61f2a5b3c36c9e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/question_answering/convert_msmarco_to_squad_format.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -from ast import literal_eval - -from tqdm import tqdm - - -def load_json(filepath): - with open(filepath, "r") as f: - data = json.load(f) - return data - - -def dump_json(filepath, data): - with open(filepath, "w") as f: - json.dump(data, f) - - -def get_context_from_passages(passages, keep_only_relevant_passages): - contexts = [] - if keep_only_relevant_passages: - for passage in passages: - if passage["is_selected"] == 1: - contexts.append(passage["passage_text"]) - else: - contexts = [passage["passage_text"] for passage in passages] - - return " ".join(contexts) - - -def format_answers_into_squad_format(answers): - is_impossible = True if "No Answer Present." in answers else False - if is_impossible: - answers = [] - else: - answers = [{"text": ans, "answer_start": -1} for ans in answers] - - return answers - - -def convert_msmarco_to_squad_format(msmarco_data, args): - ids = list(msmarco_data["query"]) - squad_data = {"data": [{"title": "MSMARCO", "paragraphs": []}], "version": "v2.1"} - for index, _id in enumerate(tqdm(ids)): - - context = get_context_from_passages(msmarco_data["passages"][_id], args.keep_only_relevant_passages) - if not context: - continue - - query = msmarco_data["query"][_id] - - # use well formed answers if present, else use the 'answers' field - well_formed_answers = msmarco_data['wellFormedAnswers'][_id] - well_formed_answers = ( - well_formed_answers if isinstance(well_formed_answers, list) else literal_eval(well_formed_answers) - ) - answers = well_formed_answers if well_formed_answers else msmarco_data["answers"][_id] - answers = format_answers_into_squad_format(answers) - if args.exclude_negative_samples and (not answers): - continue - - squad_data["data"][0]["paragraphs"].append( - { - "context": context, - "qas": [ - {"id": index, "question": query, "answers": answers, "is_impossible": False if answers else True,} - ], - } - ) - - return squad_data - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--msmarco_train_input_filepath", default=None, type=str, required=True) - parser.add_argument("--msmarco_dev_input_filepath", default=None, type=str, required=True) - parser.add_argument("--converted_train_save_path", default=None, type=str, required=True) - parser.add_argument("--converted_dev_save_path", default=None, type=str, required=True) - parser.add_argument( - "--exclude_negative_samples", - default=False, - type=bool, - help="whether to keep No Answer samples in the dataset", - required=False, - ) - parser.add_argument( - "--keep_only_relevant_passages", - default=False, - type=bool, - help="if True, will only use passages with is_selected=True for context", - required=False, - ) - args = parser.parse_args() - - print("converting MS-MARCO train dataset...") - msmarco_train_data = load_json(args.msmarco_train_input_filepath) - squad_train_data = convert_msmarco_to_squad_format(msmarco_train_data, args) - dump_json(args.converted_train_save_path, squad_train_data) - - print("converting MS-MARCO dev dataset...") - msmarco_dev_data = load_json(args.msmarco_dev_input_filepath) - 
squad_dev_data = convert_msmarco_to_squad_format(msmarco_dev_data, args) - dump_json(args.converted_dev_save_path, squad_dev_data) - - -if __name__ == "__main__": - """ - Please agree to the Terms of Use at: - https://microsoft.github.io/msmarco/ - Download data at: - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz - - Example usage: - python convert_msmarco_to_squad_format.py \ - --msmarco_train_input_filepath=/path/to/msmarco_train_v2.1.json \ - --msmarco_dev_input_filepath=/path/to/msmarco_dev_v2.1.json \ - --converted_train_save_path=/path/to/msmarco_squad_format_train.json \ - --converted_dev_save_path=/path/to/msmarco_squad_format_dev.json \ - --exclude_negative_samples=False \ - --keep_only_relevant_passages=False - """ - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/question_answering/get_squad.py b/SoundScribe/SpeakerID/examples/nlp/question_answering/get_squad.py deleted file mode 100644 index 255b040c36a26f3d070b1d24e99ffdadfd2b05e5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/question_answering/get_squad.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
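
For reference, a minimal sketch of the record shape that `convert_msmarco_to_squad_format.py` (above) emits. The field names mirror the script; the sample values here are invented.

```
# Illustrative only: one SQuAD-style record as built by convert_msmarco_to_squad_format.py.
# Field names come from the script; the example strings are made up.
import json

sample = {
    "version": "v2.1",
    "data": [
        {
            "title": "MSMARCO",
            "paragraphs": [
                {
                    "context": "Passage text assembled from the MS-MARCO passages field.",
                    "qas": [
                        {
                            "id": 0,
                            "question": "example query text",
                            # answer_start is -1 because MS-MARCO answers are not span-aligned
                            "answers": [{"text": "example answer", "answer_start": -1}],
                            "is_impossible": False,
                        }
                    ],
                }
            ],
        }
    ],
}

print(json.dumps(sample, indent=2))
```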
- -import argparse -import os -import urllib.request - -from nemo.utils import logging - - -class SquadDownloader: - def __init__(self, save_path): - self.save_path = save_path + '/squad' - - if not os.path.exists(self.save_path): - os.makedirs(self.save_path) - - if not os.path.exists(self.save_path + '/v1.1'): - os.makedirs(self.save_path + '/v1.1') - - if not os.path.exists(self.save_path + '/v2.0'): - os.makedirs(self.save_path + '/v2.0') - - self.download_urls = { - 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v1.1.json': 'v1.1/train-v1.1.json', - 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v1.1.json': 'v1.1/dev-v1.1.json', - 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v2.0.json': 'v2.0/train-v2.0.json', - 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v2.0.json': 'v2.0/dev-v2.0.json', - } - - def download(self): - for item in self.download_urls: - url = item - file = self.download_urls[item] - - logging.info('Downloading: %s', url) - if os.path.isfile(self.save_path + '/' + file): - logging.info('** Download file already exists, skipping download') - else: - response = urllib.request.urlopen(url) - with open(self.save_path + '/' + file, "wb") as handle: - handle.write(response.read()) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Download Squad') - parser.add_argument( - '--destDir', - type=str, - required=False, - help='directory to store data', - default=os.path.split(os.path.abspath(__file__))[0], - ) - args = parser.parse_args() - logging.info(args.destDir) - squad_dl = SquadDownloader(args.destDir) - squad_dl.download() diff --git a/SoundScribe/SpeakerID/examples/nlp/question_answering/question_answering.py b/SoundScribe/SpeakerID/examples/nlp/question_answering/question_answering.py deleted file mode 100644 index fcde03582e5c4aea68d7b1afab0494c21183b1c9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/question_answering/question_answering.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
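
A minimal usage sketch for the `SquadDownloader` class defined in `get_squad.py` above; the destination directory and the import path are assumptions, while the resulting folder layout follows from the class itself.

```
# Assumes get_squad.py is importable from the working directory.
from get_squad import SquadDownloader

downloader = SquadDownloader(save_path="./data")
downloader.download()
# Expected layout afterwards:
#   ./data/squad/v1.1/train-v1.1.json, ./data/squad/v1.1/dev-v1.1.json
#   ./data/squad/v2.0/train-v2.0.json, ./data/squad/v2.0/dev-v2.0.json
```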
- -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel -from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel -from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="qa_conf") -def main(cfg: DictConfig) -> None: - pl.seed_everything(42) - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - - if "bert" in cfg.model.language_model.pretrained_model_name.lower(): - model_class = BERTQAModel - elif "gpt" in cfg.model.language_model.pretrained_model_name.lower(): - model_class = GPTQAModel - elif ( - "bart" in cfg.model.language_model.pretrained_model_name.lower() - or "t5" in cfg.model.language_model.pretrained_model_name.lower() - ): - model_class = S2SQAModel - - if cfg.pretrained_model or (cfg.model.nemo_path and os.path.exists(cfg.model.nemo_path)): - if cfg.pretrained_model: - logging.info(f'Loading pretrained model {cfg.pretrained_model}') - model = model_class.from_pretrained(cfg.pretrained_model) - else: - logging.info(f'Restoring model from {cfg.model.nemo_path}') - model = model_class.restore_from(cfg.model.nemo_path) - - if cfg.do_training: - model.setup_training_data(train_data_config=cfg.model.train_ds) - model.setup_multiple_validation_data(val_data_config=cfg.model.validation_ds) - else: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = model_class(cfg.model, trainer=trainer) - - if cfg.do_training: - trainer.fit(model) - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.file is not None: - eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1 - trainer = pl.Trainer(devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16) - model.setup_test_data(test_data_config=cfg.model.test_ds) - trainer.test(model) - - # specifiy .json file to dump predictions. e.g. os.path.join(exp_dir, "output_nbest_file.json") - output_nbest_file = None - # specifiy .json file to dump predictions. e.g. os.path.join(exp_dir, "output_prediction_file.json") - output_prediction_file = None - inference_samples = 5 # for test purposes. 
To use entire inference dataset set to -1 - all_preds, all_nbest = model.inference( - cfg.model.test_ds.file, - output_prediction_file=output_prediction_file, - output_nbest_file=output_nbest_file, - num_samples=inference_samples, - ) - - for question_id in all_preds: - print(all_preds[question_id]) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/README.md b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/README.md deleted file mode 100644 index 9d2063eff181c3beedcd7eb684e81388b89cd182..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# SpellMapper - spellchecking model for ASR Customization -Paper: https://arxiv.org/abs/2306.02317 -This model was partly inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf. -The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. -Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). - -As initial data we use about 5 mln entities from [YAGO corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/yago-naga/yago/downloads/). These entities are short phrases from Wikipedia headings. -In order to get misspelled predictions we feed these data to TTS model and then to ASR model. -Having a "parallel" corpus of "correct + misspelled" phrases, we use statistical machine translation techniques to create a dictionary of possible ngram mappings with their respective frequencies. -We create an auxiliary algorithm that takes as input a sentence (ASR hypothesis) and a large custom dictionary (e.g. 5000 phrases) and selects top 10 candidate phrases that are probably contained in this sentence in a misspelled way. -The task of our final neural model is to predict which fragments in the ASR hypothesis should be replaced by which of top-10 candidate phrases if any. - -The pipeline consists of multiple steps: - -1. Download or generate training data. - See `https://github.com/bene-ges/nemo_compatible/tree/main/scripts/nlp/en_spellmapper/dataset_preparation` - -2. [Optional] Convert training dataset to tarred files. - `convert_dataset_to_tarred.sh` - -3. Train spellchecking model. - `run_training.sh` - or - `run_training_tarred.sh` - -4. Run evaluation. - - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh) - -5. Run inference. - `python run_infer.sh` diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py deleted file mode 100644 index c2f514f3e67e70a680d236901fc48b23c9b90698..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/checkpoint_to_nemo.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script converts checkpoint .ckpt to .nemo file. - -This script uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. -""" - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - SpellcheckingAsrCustomizationModel.load_from_checkpoint(cfg.checkpoint_path).save_to(cfg.target_nemo_path) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml deleted file mode 100644 index f8dca7b974e52288c607d42e7f8d9492bba5c324..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: &name spellchecking -lang: ??? # e.g. 'ru', 'en' - -# Pretrained Nemo Models -pretrained_model: null - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 # the number of training epochs - enable_checkpointing: false # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -model: - do_training: true - label_map: ??? # path/.../label_map.txt - semiotic_classes: ??? 
# path/.../semiotic_classes.txt - max_sequence_len: 128 - lang: ${lang} - hidden_size: 768 - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.1 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_ratio: 0.1 - last_epoch: -1 - - language_model: - pretrained_model_name: bert-base-uncased # For ru, try DeepPavlov/rubert-base-cased | For de or multilingual, try bert-base-multilingual-cased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -tokenizer: - tokenizer_name: ${model.transformer} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -# Data -data: - train_ds: - data_path: ??? # provide the full path to the file - batch_size: 8 - shuffle: true - num_workers: 3 - pin_memory: false - drop_last: false - - validation_ds: - data_path: ??? # provide the full path to the file. - batch_size: 8 - shuffle: false - num_workers: 3 - pin_memory: false - drop_last: false - - -# Inference -inference: - from_file: null # Path to the raw text, no labels required. Each sentence on a separate line - out_file: null # Path to the output file - batch_size: 16 # batch size for inference.from_file diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh deleted file mode 100644 index d4265eb4beb69cc190011ced068399a06c4fa642..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/convert_data_to_tarred.sh +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH="data_folder" - -## data_folder_example -## ├── tarred_data -## | └── (output) -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## ├── 1.tsv -## ├── ... 
-## └── 200.tsv - -## Each of {1-200}.tsv input files are 110'000 examples subsets of all.tsv (except for validation part), -## generated by https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh -## Note that in this example we use 110'000 as input and only pack 100'000 of them to tar file. -## This is because some input examples, e.g. too long, can be skipped during preprocessing, and we want all tar files to contain fixed equal number of examples. - -for part in {1..200} -do - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang="en" \ - data.train_ds.data_path=${DATA_PATH}/${part}.tsv \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - model.max_sequence_len=256 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - +output_tar_file=${DATA_PATH}/tarred_data/part${part}.tar \ - +take_first_n_lines=100000 -done diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py deleted file mode 100644 index 68c55ff51a4fbb5d8157bd85931bb5ba3867d5b8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create an index of custom vocabulary and save it to file. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import get_index, load_ngram_mappings - -parser = ArgumentParser(description="Create an index of custom vocabulary and save it to file") - -parser.add_argument( - "--input_name", required=True, type=str, help="Path to input file with custom vocabulary (plain text)" -) -parser.add_argument( - "--ngram_mappings", required=True, type=str, help="Path to input file with n-gram mapping vocabulary" -) -parser.add_argument("--output_name", required=True, type=str, help="Path to output file with custom vocabulary index") -parser.add_argument("--min_log_prob", default=-4.0, type=float, help="Threshold on log probability") -parser.add_argument( - "--max_phrases_per_ngram", - default=500, - type=int, - help="Threshold on number of phrases that can be stored for one n-gram key in index. 
Keys with more phrases are discarded.", -) -parser.add_argument( - "--max_misspelled_freq", default=125000, type=int, help="Threshold on maximum frequency of misspelled n-gram" -) - -args = parser.parse_args() - -# Load custom vocabulary -custom_phrases = set() -with open(args.input_name, "r", encoding="utf-8") as f: - for line in f: - phrase = line.strip() - custom_phrases.add(" ".join(list(phrase.replace(" ", "_")))) -print("Size of customization vocabulary:", len(custom_phrases)) - -# Load n-gram mappings vocabulary -ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=args.max_misspelled_freq) - -# Generate index of custom phrases -phrases, ngram2phrases = get_index( - custom_phrases, - ngram_mapping_vocab, - ban_ngram, - min_log_prob=args.min_log_prob, - max_phrases_per_ngram=args.max_phrases_per_ngram, -) - -# Save index to file -with open(args.output_name, "w", encoding="utf-8") as out: - for ngram in ngram2phrases: - for phrase_id, begin, size, logprob in ngram2phrases[ngram]: - phrase = phrases[phrase_id] - out.write(ngram + "\t" + phrase + "\t" + str(begin) + "\t" + str(size) + "\t" + str(logprob) + "\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py deleted file mode 100644 index d0bdc2c9bd30db35b2ae1c20f1d07ce561147fe0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to create a tarred dataset for SpellcheckingAsrCustomizationModel. - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: -1. Obtain a processed dataset -2. 
Run: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py \ - lang=${LANG} \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - model.language_model.pretrained_model_name=${LANGUAGE_MODEL} \ - model.label_map=${DATA_PATH}/label_map.txt \ - +output_tar_file=tarred/part1.tar \ - +take_first_n_lines=100000 - -""" -import pickle -import tarfile -from io import BytesIO - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - logging.info("Start creating tar file from " + cfg.data.train_ds.data_path + " ...") - _, model = instantiate_model_and_trainer( - cfg, MODEL, True - ) # instantiate model like for training because we may not have pretrained model - dataset = model._train_dl.dataset - archive = tarfile.open(cfg.output_tar_file, mode="w") - max_lines = int(cfg.take_first_n_lines) - for i in range(len(dataset)): - if i >= max_lines: - logging.info("Reached " + str(max_lines) + " examples") - break - ( - input_ids, - input_mask, - segment_ids, - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - character_pos_to_subword_pos, - labels_mask, - labels, - spans, - ) = dataset[i] - - # do not store masks as they are just arrays of 1 - content = { - "input_ids": input_ids, - "input_mask": input_mask, - "segment_ids": segment_ids, - "input_ids_for_subwords": input_ids_for_subwords, - "input_mask_for_subwords": input_mask_for_subwords, - "segment_ids_for_subwords": segment_ids_for_subwords, - "character_pos_to_subword_pos": character_pos_to_subword_pos, - "labels_mask": labels_mask, - "labels": labels, - "spans": spans, - } - b = BytesIO() - pickle.dump(content, b) - b.seek(0) - tarinfo = tarfile.TarInfo(name="example_" + str(i) + ".pkl") - tarinfo.size = b.getbuffer().nbytes - archive.addfile(tarinfo=tarinfo, fileobj=b) - - archive.close() - logging.info("Tar file " + cfg.output_tar_file + " created!") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/helpers.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/helpers.py deleted file mode 100644 index 2db11b0e7d96d99a0b491231ad137af00c88d5b8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/helpers.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
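
A minimal sketch of how the tar files written by `create_tarred_dataset.py` above could be read back; the tar path is an assumption (it matches the usage example in the script's docstring), while the member naming and the dictionary keys come from the script itself.

```
# Illustrative reader for a tar file produced by create_tarred_dataset.py.
import pickle
import tarfile

with tarfile.open("tarred/part1.tar", mode="r") as archive:
    member = archive.getmember("example_0.pkl")  # members are named example_<i>.pkl
    with archive.extractfile(member) as f:
        example = pickle.load(f)

# Each example is a dict with the arrays the model consumes, e.g. input_ids,
# input_mask, segment_ids, input_ids_for_subwords, character_pos_to_subword_pos,
# labels_mask, labels, spans.
print(sorted(example.keys()))
```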
- - -import os -from typing import Tuple - -import pytorch_lightning as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import SpellcheckingAsrCustomizationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector -from nemo.utils import logging - -__all__ = ["MODEL", "MODEL_NAMES", "instantiate_model_and_trainer"] - -MODEL = "spellchecking" -MODEL_NAMES = [MODEL] - - -def instantiate_model_and_trainer( - cfg: DictConfig, model_name: str, do_training: bool -) -> Tuple[pl.Trainer, SpellcheckingAsrCustomizationModel]: - """ Function for instantiating a model and a trainer - Args: - cfg: The config used to instantiate the model and the trainer. - model_name: A str indicates the model direction, currently only 'itn'. - do_training: A boolean flag indicates whether the model will be trained or evaluated. - - Returns: - trainer: A PyTorch Lightning trainer - model: A SpellcheckingAsrCustomizationModel - """ - - if model_name not in MODEL_NAMES: - raise ValueError(f"{model_name} is unknown model type") - - # Get configs for the corresponding models - trainer_cfg = cfg.get("trainer") - model_cfg = cfg.get("model") - pretrained_cfg = cfg.get("pretrained_model", None) - trainer = pl.Trainer(**trainer_cfg) - if not pretrained_cfg: - logging.info(f"Initializing {model_name} model") - if model_name == MODEL: - model = SpellcheckingAsrCustomizationModel(model_cfg, trainer=trainer) - else: - raise ValueError(f"{model_name} is unknown model type") - elif os.path.exists(pretrained_cfg): - logging.info(f"Restoring pretrained {model_name} model from {pretrained_cfg}") - save_restore_connector = NLPSaveRestoreConnector() - model = SpellcheckingAsrCustomizationModel.restore_from( - pretrained_cfg, save_restore_connector=save_restore_connector - ) - else: - logging.info(f"Loading pretrained model {pretrained_cfg}") - if model_name == MODEL: - if pretrained_cfg not in SpellcheckingAsrCustomizationModel.get_available_model_names(): - raise ( - ValueError( - f"{pretrained_cfg} not in the list of available Tagger models." - f"Select from {SpellcheckingAsrCustomizationModel.list_available_models()}" - ) - ) - model = SpellcheckingAsrCustomizationModel.from_pretrained(pretrained_cfg) - else: - raise ValueError(f"{model_name} is unknown model type") - - # Setup train and validation data - if do_training: - model.setup_training_data(train_data_config=cfg.data.train_ds) - model.setup_validation_data(val_data_config=cfg.data.validation_ds) - - logging.info(f"Model {model_name} -- Device {model.device}") - return trainer, model diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py deleted file mode 100644 index 871d5e5c0c0c8a236aac5e7f70d3231a462288c8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to postprocess SpellMapper results and generate an updated nemo ASR manifest. -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - update_manifest_with_spellmapper_corrections, -) - -parser = ArgumentParser(description="Postprocess SpellMapper results and generate an updated nemo ASR manifest") - -parser.add_argument("--input_manifest", required=True, type=str, help="Path to input nemo ASR manifest") -parser.add_argument( - "--field_name", default="pred_text", type=str, help="Name of json field with original ASR hypothesis text" -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to input file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--spellmapper_results", required=True, type=str, help="Path to input file with SpellMapper inference results" -) -parser.add_argument("--output_manifest", required=True, type=str, help="Path to output nemo ASR manifest") -parser.add_argument("--min_prob", default=0.5, type=float, help="Threshold on replacement probability") -parser.add_argument( - "--use_dp", - action="store_true", - help="Whether to use additional replacement filtering by using dynamic programming", -) -parser.add_argument( - "--replace_hyphen_to_space", - action="store_true", - help="Whether to use space instead of hyphen in replaced fragments", -) -parser.add_argument( - "--ngram_mappings", type=str, required=True, help="File with ngram mappings, only needed if use_dp=true" -) -parser.add_argument( - "--min_dp_score_per_symbol", - default=-1.5, - type=float, - help="Minimum dynamic programming sum score averaged by hypothesis length", -) - -args = parser.parse_args() - -update_manifest_with_spellmapper_corrections( - input_manifest_name=args.input_manifest, - short2full_name=args.short2full_name, - output_manifest_name=args.output_manifest, - spellmapper_results_name=args.spellmapper_results, - min_prob=args.min_prob, - replace_hyphen_to_space=args.replace_hyphen_to_space, - field_name=args.field_name, - use_dp=args.use_dp, - ngram_mappings=args.ngram_mappings, - min_dp_score_per_symbol=args.min_dp_score_per_symbol, -) - -print("Resulting manifest saved to: ", args.output_manifest) diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py deleted file mode 100644 index 6fd5e524390a16d9c91a55c700f7d904dfc2ff4c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to prepare input for SpellMapper inference from a nemo ASR manifest. -It splits sentences to shorter fragments, runs candidate retrieval and generates input in the required format. -It produces two output files: - 1. File with correspondence between sentence fragments and full sentences. - 2. File that will serve as input for SpellMapper inference. - -See "examples/nlp/spellchecking_asr_customization/run_infer.sh" for the whole inference pipeline. -""" - -from argparse import ArgumentParser - -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - extract_and_split_text_from_manifest, - get_candidates, - load_index, -) - -parser = ArgumentParser(description="Prepare input for SpellMapper inference from a nemo ASR manifest") -parser.add_argument("--manifest", required=True, type=str, help="Path to input manifest file") -parser.add_argument( - "--custom_vocab_index", required=True, type=str, help="Path to input file with custom vocabulary index" -) -parser.add_argument( - "--big_sample", - required=True, - type=str, - help="Path to input file with big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval", -) -parser.add_argument( - "--short2full_name", - required=True, - type=str, - help="Path to output file with correspondence between sentence fragments and full sentences", -) -parser.add_argument( - "--output_name", - required=True, - type=str, - help="Path to output file that will serve as input for SpellMapper inference", -) -parser.add_argument("--field_name", default="pred_text", type=str, help="Name of json field with ASR hypothesis text") -parser.add_argument("--len_in_words", default=16, type=int, help="Maximum fragment length in words") -parser.add_argument( - "--step_in_words", - default=8, - type=int, - help="Step in words for moving to next fragment. If less than len_in_words, fragments will intersect", -) - -args = parser.parse_args() - -# Split ASR hypotheses to shorter fragments, because SpellMapper can't handle arbitrarily long sequences. -# The correspondence between short and original fragments is saved to a file and will be used at post-processing. 
-extract_and_split_text_from_manifest( - input_name=args.manifest, - output_name=args.short2full_name, - field_name=args.field_name, - len_in_words=args.len_in_words, - step_in_words=args.step_in_words, -) - -# Load index of custom vocabulary from file -phrases, ngram2phrases = load_index(args.custom_vocab_index) - -# Load big sample of phrases to sample dummy candidates if there less than 10 are found by retrieval -big_sample_of_phrases = set() -with open(args.big_sample, "r", encoding="utf-8") as f: - for line in f: - phrase, freq = line.strip().split("\t") - if int(freq) > 50: # do not want to use frequent phrases as dummy candidates - continue - if len(phrase) < 6 or len(phrase) > 15: # do not want to use too short or too long phrases as dummy candidates - continue - big_sample_of_phrases.add(phrase) - -big_sample_of_phrases = list(big_sample_of_phrases) - -# Generate input for SpellMapper inference -out = open(args.output_name, "w", encoding="utf-8") -with open(args.short2full_name, "r", encoding="utf-8") as f: - for line in f: - short_sent, _ = line.strip().split("\t") - sent = "_".join(short_sent.split()) - letters = list(sent) - candidates = get_candidates(ngram2phrases, phrases, letters, big_sample_of_phrases) - if len(candidates) == 0: - continue - if len(candidates) != 10: - raise ValueError("expect 10 candidates, got: ", len(candidates)) - - # We add two columns with targets and span_info. - # They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample. - targets = [] - span_info = [] - for idx, c in enumerate(candidates): - if c[1] == -1: - continue - targets.append(str(idx + 1)) # targets are 1-based - start = c[1] - # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation) - end = min(c[1] + c[2], len(letters)) - span_info.append("CUSTOM " + str(start) + " " + str(end)) - out.write( - " ".join(letters) - + "\t" - + ";".join([x[0] for x in candidates]) - + "\t" - + " ".join(targets) - + "\t" - + ";".join(span_info) - + "\n" - ) -out.close() diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_infer.sh b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_infer.sh deleted file mode 100644 index b4bbdc4da375db794576e9adae90a97a26aa30b0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_infer.sh +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
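Note (editor's sketch, not part of the repository): prepare_input_from_manifest.py above splits each ASR hypothesis into overlapping fragments (len_in_words=16, step_in_words=8 by default), since SpellMapper cannot handle arbitrarily long sequences. The real logic lives in extract_and_split_text_from_manifest; the function below only illustrates the windowing idea with made-up names:

def split_into_fragments(text: str, len_in_words: int = 16, step_in_words: int = 8):
    """Return overlapping word windows; fragments intersect whenever step < length."""
    words = text.split()
    fragments = []
    for start in range(0, max(len(words) - step_in_words, 1), step_in_words):
        fragments.append(" ".join(words[start:start + len_in_words]))
    return fragments

# Toy usage: 10 words, window of 4, step of 2 -> 4 overlapping fragments.
print(split_into_fragments("w1 w2 w3 w4 w5 w6 w7 w8 w9 w10", 4, 2))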
- -## RUN INFERENCE ON NEMO MANIFEST AND CUSTOM VOCABULARY - -## Path to NeMo repository -NEMO_PATH=NeMo - -## Download model repo from Hugging Face (if clone doesn't work, run "git lfs install" and try again) -git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en -## Download repo with test data -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_evaluation - -## Files in model repo -PRETRAINED_MODEL=spellmapper_asr_customization_en/training_10m_5ep.nemo -NGRAM_MAPPINGS=spellmapper_asr_customization_en/replacement_vocab_filt.txt -BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt - -## Override these two files if you want to test on your own data -## File with input nemo ASR manifest -INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json -## File containing custom words and phrases (plain text) -CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.txt - -## Other files will be created -## File with index of custom vocabulary -INDEX="index.txt" -## File with short fragments and corresponding original sentences -SHORT2FULL="short2full.txt" -## File with input for SpellMapper inference -SPELLMAPPER_INPUT="spellmapper_input.txt" -## File with output of SpellMapper inference -SPELLMAPPER_OUTPUT="spellmapper_output.txt" -## File with output nemo ASR manifest -OUTPUT_MANIFEST="out_manifest.json" - - -# Create index of custom vocabulary -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_custom_vocab_index.py \ - --input_name ${CUSTOM_VOCAB} \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --output_name ${INDEX} \ - --min_log_prob -4.0 \ - --max_phrases_per_ngram 600 - -# Prepare input for SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \ - --manifest ${INPUT_MANIFEST} \ - --custom_vocab_index ${INDEX} \ - --big_sample ${BIG_SAMPLE} \ - --short2full_name ${SHORT2FULL} \ - --output_name ${SPELLMAPPER_INPUT} \ - --field_name "pred_text" \ - --len_in_words 16 \ - --step_in_words 8 - -# Run SpellMapper inference -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_MODEL} \ - model.max_sequence_len=512 \ - inference.from_file=${SPELLMAPPER_INPUT} \ - inference.out_file=${SPELLMAPPER_OUTPUT} \ - inference.batch_size=16 \ - lang=en - -# Postprocess and create output corrected manifest -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \ - --input_manifest ${INPUT_MANIFEST} \ - --short2full_name ${SHORT2FULL} \ - --output_manifest ${OUTPUT_MANIFEST} \ - --spellmapper_result ${SPELLMAPPER_OUTPUT} \ - --replace_hyphen_to_space \ - --field_name "pred_text" \ - --use_dp \ - --ngram_mappings ${NGRAM_MAPPINGS} \ - --min_dp_score_per_symbol -1.5 - -# Check WER of initial manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${INPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True - -# Check WER of corrected manifest -python ${NEMO_PATH}/examples/asr/speech_to_text_eval.py \ - dataset_manifest=${OUTPUT_MANIFEST} \ - use_cer=False \ - only_score_manifest=True diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training.sh b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training.sh deleted file mode 100644 index 85dddbb2a038ac9c9a91965d81381334aceb2c59..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## TRAIN WITH NON-TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -## Download repo with training data (very small example) -## If clone doesn't work, run "git lfs install" and try again -git clone https://huggingface.co/datasets/bene-ges/spellmapper_en_train_micro - -DATA_PATH=spellmapper_en_train_micro - -## Example of all files needed to run training with non-tarred data: -## spellmapper_en_train_micro -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## ├── test.tsv -## └── train.tsv - -## To generate files config.json, label_map.txt, semiotic_classes.txt - run generate_configs.sh -## Files "train.tsv" and "test.tsv" contain training examples. -## For data preparation see https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh - -## Note that training with non-tarred data only works on single gpu. It makes sense if you use 1-2 million examples or less. - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=8 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.lr=3e-5 \ - trainer.devices=[1] \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh deleted file mode 100644 index 655c3e23e6103a1fd1092303ed516ad6c2edba96..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -## TRAIN WITH TARRED DATA - -# Path to NeMo repository -NEMO_PATH=NeMo - -DATA_PATH=data_folder - -## data_folder_example -## ├── train_tarred -## | ├── part1.tar -## | ├── ... -## | └── part200.tar -## ├── config.json -##   ├── label_map.txt -##   ├── semiotic_classes.txt -## └── test.tsv -## To generate files config.json, label_map.txt, semiotic_classes.txt, run generate_configs.sh -## To prepare data, see ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/build_training_data.sh -## To convert data to tarred format, split all.tsv to pieces of 110'000 examples (except for validation part) and use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/dataset_preparation/convert_data_to_tarred.sh -## To run training with tarred data, use ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/run_training_tarred.sh - -## ATTENTION: How to calculate model.optim.sched.max_steps: -## Suppose, you have 2'000'000 training examples, and want to train for 5 epochs on 4 gpus with batch size 32. -## 5 (epochs) * 32 (bs) * 4 (gpus) -## 1 step consumes 128 examples (32(bs) * 4(gpus)) -## 1 epoch makes 2000000/128=15625 steps (updates) -## 5 epochs make 5*15625=78125 steps - -python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py \ - lang="en" \ - data.validation_ds.data_path=${DATA_PATH}/test.tsv \ - data.train_ds.data_path=${DATA_PATH}/train_tarred/part_OP_1..100_CL_.tar \ - data.train_ds.batch_size=32 \ - data.train_ds.num_workers=16 \ - +data.train_ds.use_tarred_dataset=true \ - data.train_ds.shuffle=false \ - data.validation_ds.batch_size=16 \ - model.max_sequence_len=512 \ - model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \ - model.language_model.config_file=${DATA_PATH}/config.json \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.sched.name=CosineAnnealing \ - +model.optim.sched.max_steps=195313 \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py deleted file mode 100644 index 593264f14a5d617589fa147ef059c7d5975daccf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to run inference with the SpellcheckingAsrCustomizationModel. - -An input line should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 
1-based ids of non-dummy candidates - 4. approximate start/end coordinates of non-dummy candidates (correspond to ids in third column) - -Example input (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 1 2 6 7 8 9 10 - CUSTOM 6 23;CUSTOM 4 10;CUSTOM 4 15;CUSTOM 56 62;CUSTOM 5 19;CUSTOM 28 31;CUSTOM 39 48 - -Each line in SpellMapper output is tab-separated and consists of 4 columns: - 1. ASR-hypothesis (same as in input) - 2. 10 candidates separated with semicolon (same as in input) - 3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability) - 4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes) - -Example output (in one line): - t h e _ t a r a s i c _ o o r d a _ i s _ a _ p a r t _ o f _ t h e _ a o r t a _ l o c a t e d _ i n _ t h e _ t h o r a x - h e p a t i c _ c i r r h o s i s;u r a c i l;c a r d i a c _ a r r e s t;w e a n;a p g a r;p s y c h o m o t o r;t h o r a x;t h o r a c i c _ a o r t a;a v f;b l o c k a d e d - 56 62 7 0.99998;4 20 8 0.95181;12 20 8 0.44829;4 17 8 0.99464;12 17 8 0.97645 - 8 8 8 0 8 8 8 8 8 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7 - - -USAGE Example: -1. Train a model, or use a pretrained checkpoint. -2. Run on a single file: - python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - inference.from_file=input.txt \ - inference.out_file=output.txt \ - inference.batch_size=16 \ - lang=en -or on multiple files: - python ${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \ - pretrained_model=${PRETRAINED_NEMO_CHECKPOINT} \ - model.max_sequence_len=512 \ - +inference.from_filelist=filelist.txt \ - +inference.output_folder=output_folder \ - inference.batch_size=16 \ - lang=en - -This script uses the `/examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. 
-""" - - -import os - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - if cfg.pretrained_model is None: - raise ValueError("A pre-trained model should be provided.") - _, model = instantiate_model_and_trainer(cfg, MODEL, False) - - if cfg.model.max_sequence_len != model.max_sequence_len: - model.max_sequence_len = cfg.model.max_sequence_len - model.builder._max_seq_length = cfg.model.max_sequence_len - input_filenames = [] - output_filenames = [] - - if "from_filelist" in cfg.inference and "output_folder" in cfg.inference: - filelist_file = cfg.inference.from_filelist - output_folder = cfg.inference.output_folder - with open(filelist_file, "r", encoding="utf-8") as f: - for line in f: - path = line.strip() - input_filenames.append(path) - folder, name = os.path.split(path) - output_filenames.append(os.path.join(output_folder, name)) - else: - text_file = cfg.inference.from_file - logging.info(f"Running inference on {text_file}...") - if not os.path.exists(text_file): - raise ValueError(f"{text_file} not found.") - input_filenames.append(text_file) - output_filenames.append(cfg.inference.out_file) - - dataloader_cfg = { - "batch_size": cfg.inference.get("batch_size", 8), - "num_workers": cfg.inference.get("num_workers", 4), - "pin_memory": cfg.inference.get("num_workers", False), - } - for input_filename, output_filename in zip(input_filenames, output_filenames): - if not os.path.exists(input_filename): - logging.info(f"Skip non-existing {input_filename}.") - continue - model.infer(dataloader_cfg, input_filename, output_filename) - logging.info(f"Predictions saved to {output_filename}.") - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py b/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py deleted file mode 100644 index 7ea9314d196dea3097785cc28d9d5ee0c490bed8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_train.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to train SpellMapper (SpellcheckingAsrCustomizationModel). -It uses the `examples/nlp/spellchecking_asr_customization/conf/spellchecking_asr_customization_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. 
Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: - See `examples/nlp/spellchecking_asr_customization/run_training.sh` for training on non-tarred data. - and - `examples/nlp/spellchecking_asr_customization/run_training_tarred.sh` for training on tarred data. - -One (non-tarred) training example should consist of 4 tab-separated columns: - 1. text of ASR-hypothesis - 2. texts of 10 candidates separated by semicolon - 3. 1-based ids of correct candidates, or 0 if none - 4. start/end coordinates of correct candidates (correspond to ids in third column) -Example (in one line): - a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o - d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y - 1 3 - CUSTOM 12 23;CUSTOM 28 41 -""" - -from helpers import MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="spellchecking_asr_customization_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - # Train the model - if cfg.model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Start training...') - trainer, model = instantiate_model_and_trainer(cfg, MODEL, True) - spellchecking_exp_manager = cfg.get('exp_manager', None) - exp_manager(trainer, spellchecking_exp_manager) - trainer.fit(model) - logging.info('Training finished!') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text2sparql/conf/text2sparql_config.yaml b/SoundScribe/SpeakerID/examples/nlp/text2sparql/conf/text2sparql_config.yaml deleted file mode 100644 index b9823e79b0507dc9a13c90c9443381196e82f6ff..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text2sparql/conf/text2sparql_config.yaml +++ /dev/null @@ -1,106 +0,0 @@ -# Text2Sparql with BART - -name: &name Text2Sparql - -trainer: - devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices - num_nodes: 1 - max_epochs: 2 # the number of training epochs - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - accelerator: gpu - strategy: ddp - gradient_clip_val: 0.0 - log_every_n_steps: 1 - val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch - enable_checkpointing: False # provided by exp_manager - logger: false # provided by exp_manager - -model: - nemo_path: null # exported .nemo path - max_seq_length: 150 - batch_size: 16 - convert_labels: true # true if Bart, false otherwise (converts pad_id to -100 for masked loss) - data_dir: null - - language_model: - pretrained_model_name: facebook/bart-base # huggingface end-to-end model name - pretrained_encoder_model_name: null # huggingface encoder model name - pretrained_decoder_model_name: null # huggingface decoder model name - lm_checkpoint: null - config: null - config_file: null # json file, precedence over config - - encoder_tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: 
null # tokenizer model for sentencepiece - special_tokens: null - add_special_tokens: true - - decoder_tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec - vocab_file: null # path to vocab file - tokenizer_model: null # tokenizer model for sentencepiece - special_tokens: null - add_special_tokens: true - - train_ds: - filepath: ${model.data_dir}/train.tsv # path to data file - shuffle: true - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - filepath: ${model.data_dir}/test_easy.tsv # path to data file - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - filepath: ${model.data_dir}/test_hard.tsv # path to data file - shuffle: false - num_samples: -1 - num_workers: 2 - drop_last: false - pin_memory: false - - optim: - name: adamw - lr: 4e-5 - weight_decay: 0.0 - - sched: - name: CosineAnnealing - warmup_steps: null - warmup_ratio: 0.06 - min_lr: 0.0 - last_epoch: -1 - - generate: - max_length: ${model.max_seq_length} - num_beams: 1 - length_penalty: 2.0 - early_stopping: true - repetition_penalty: 1.0 - do_sample: false - top_k: null - top_p: null - num_return_sequences: 1 - -exp_manager: - exp_dir: null # where to store logs and checkpoints - name: *name # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - -hydra: - run: - dir: . - job_logging: - root: - handlers: null \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/text2sparql/data/import_datasets.py b/SoundScribe/SpeakerID/examples/nlp/text2sparql/data/import_datasets.py deleted file mode 100644 index ce5f86393b95e13fe550788132ece2c9d3f54134..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text2sparql/data/import_datasets.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2020, MeetKai Inc. All rights reserved. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script downloads Text2Sparql data and processes it into NeMo's neural machine translation dataset format. 
- -Text2Sparql data consists of 3 files which are saved to source_data_dir: - - train_queries_v3.tsv - - test_easy_queries_v3.tsv - - test_hard_queries_v3.tsv - -After processing, the script saves them to the target_data_dir as: - - train.tsv - - test_easy.tsv - - test_hard.tsv - - -You may run it with: - -python import_datasets \ - --source_data_dir ./text2sparql_src \ - --target_data_dir ./text2sparql_tgt -""" - -import argparse -import csv -import os -from urllib.request import Request, urlopen - -from nemo.collections.nlp.data.data_utils.data_preprocessing import MODE_EXISTS_TMP, if_exist -from nemo.utils import logging - -base_url = "https://m.meetkai.com/public_datasets/knowledge/" -prefix_map = { - "train_queries_v3.tsv": "train.tsv", - "test_easy_queries_v3.tsv": "test_easy.tsv", - "test_hard_queries_v3.tsv": "test_hard.tsv", -} - - -def download_text2sparql(infold: str): - """Downloads text2sparql train, test_easy, and test_hard data - - Args: - infold: save directory path - """ - os.makedirs(infold, exist_ok=True) - - for prefix in prefix_map: - url = base_url + prefix - - logging.info(f"Downloading: {url}") - if if_exist(infold, [prefix]): - logging.info("** Download file already exists, skipping download") - else: - req = Request(url, headers={"User-Agent": "Mozilla/5.0"}) - with open(os.path.join(infold, prefix), "wb") as handle: - handle.write(urlopen(req, timeout=20).read()) - - -def process_text2sparql(infold: str, outfold: str, do_lower_case: bool): - """ Process and convert MeetKai's text2sparql datasets to NeMo's neural machine translation format. - - Args: - infold: directory path to raw text2sparql data containing - train.tsv, test_easy.tsv, test_hard.tsv - outfold: output directory path to save formatted data for NeuralMachineTranslationDataset - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - do_lower_case: if true, convert all sentences and labels to lower - """ - logging.info(f"Processing Text2Sparql dataset and storing at: {outfold}") - - os.makedirs(outfold, exist_ok=True) - - dataset_name = "Text2Sparql" - for prefix in prefix_map: - input_file = os.path.join(infold, prefix) - output_file = os.path.join(outfold, prefix_map[prefix]) - - if if_exist(outfold, [prefix_map[prefix]]): - logging.info(f"** {MODE_EXISTS_TMP.format(prefix_map[prefix], dataset_name, output_file)}") - continue - - if not if_exist(infold, [prefix]): - logging.info(f"** {prefix} of {dataset_name}" f" is skipped as it was not found") - continue - - assert input_file != output_file, "input file cannot equal output file" - with open(input_file, "r") as in_file: - with open(output_file, "w") as out_file: - reader = csv.reader(in_file, delimiter="\t") - - # replace headers - out_file.write("sentence\tlabel\n") - next(reader) - - for line in reader: - sentence = line[0] - label = line[1] - if do_lower_case: - sentence = sentence.lower() - label = label.lower() - out_file.write(f"{sentence}\t{label}\n") - - -if __name__ == "__main__": - # Parse the command-line arguments. 
- parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo's format") - parser.add_argument( - "--source_data_dir", required=True, type=str, help="Path to the folder containing the dataset files" - ) - parser.add_argument("--target_data_dir", required=True, type=str, help="Path to save the processed dataset") - parser.add_argument("--do_lower_case", action="store_true") - args = parser.parse_args() - - source_dir = args.source_data_dir - target_dir = args.target_data_dir - do_lower_case = args.do_lower_case - - download_text2sparql(infold=source_dir) - process_text2sparql(infold=source_dir, outfold=target_dir, do_lower_case=do_lower_case) diff --git a/SoundScribe/SpeakerID/examples/nlp/text2sparql/evaluate_text2sparql.py b/SoundScribe/SpeakerID/examples/nlp/text2sparql/evaluate_text2sparql.py deleted file mode 100644 index 52baa2a7e78c6046ec4e54569fd27e9bb2a1edd2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text2sparql/evaluate_text2sparql.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2020, MeetKai Inc. All rights reserved. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script contains an example on how to evaluate a NeuralMachineTranslationModel. -To load the example Text2Sparql dataset, please refer to ./data/import_datasets.py. -To train a model, please refer to text2sparql.py. - - -***Setting the configs*** -This script uses the `/examples/nlp/text2sparql/conf/text2sparql_config.yaml` config file by default. -You may update the config file from the file directly or by using the command line arguments. -Another other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. - -Please refer to text2sparql.py for detailed instructions on setting the configuration. 
- - -***How to run the script?*** -- To reload and evaluate the model, run: - -python evaluate_text2sparql.py \ - model.test_ds.filepath="$TGT_DATA_DIR"/test_easy.tsv \ - model.batch_size=16 \ - model.nemo_path=./NeMo_logs/bart.nemo \ - exp_manager.exp_dir=./NeMo_logs -""" - -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.text2sparql import Text2SparqlModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="text2sparql_config") -def main(cfg: DictConfig) -> None: - logging.info(f"Config:\n {OmegaConf.to_yaml(cfg)}") - trainer = pl.Trainer(devices=cfg.trainer.devices, accelerator=cfg.trainer.accelerator) - nmt_model = Text2SparqlModel.restore_from(restore_path=cfg.model.nemo_path) - nmt_model.setup_test_data(cfg.model.test_ds) - results = trainer.test(nmt_model) - - with open(cfg.model.test_ds.filepath, "r", encoding='utf-8') as f: - lines = f.readlines() - - lines[0] = lines[0].strip() + f"\tpredictions\n" - for i, res in enumerate(results[0]["texts"]): - lines[i + 1] = lines[i + 1].strip() + f"\t{res}\n" - - savepath = os.path.join(cfg.exp_manager.exp_dir, os.path.basename(cfg.model.test_ds.filepath)) - with open(savepath, "w", encoding='utf-8') as f: - f.writelines(lines) - logging.info(f"Predictions saved to {savepath}") - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text2sparql/text2sparql.py b/SoundScribe/SpeakerID/examples/nlp/text2sparql/text2sparql.py deleted file mode 100644 index 1353a3967735c9413480075042cfe8f5dfa64915..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text2sparql/text2sparql.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2020, MeetKai Inc. All rights reserved. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script contains an example on how to train and save a Text2SparqlModel. -Text2SparqlModel in NeMo supports sequence to sequence problems such as language translation -and text summarization, provided the data follows the format specified below. - - -***Data format*** -Text2SparqlModel requires the data to be stored in TAB separated files (.tsv) with two columns of -sentence and label, where the first line is a header of format: - sentence[TAB]label -And each line is of the format: - [SENTENCE][TAB][LABEL] - -If your dataset is stored in another format, you need to convert it to this format to use a -Text2SparqlModel. - - -***Setting the configs*** -This script uses the `/examples/nlp/text2sparql/conf/text2sparql_config.yaml` config file by default. -You may update the config file from the file directly or by using the command line arguments. -Another other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. - -A Text2SparqlModel's config file declares multiple import sections. 
They are: - - trainer: Arguments to be passed to PyTorch Lightning. - - model: All arguments that relate to the Model - language_model, tokenizers, datasets, optimizer, generate. - - exp_manager: Arguments to be passed to NeMo's experiment manager. - - hydra: Arguments to be passed to Hydra. - -If using text2sparql_config.yaml, you must first update the following fields in the config: - - model.nemo_path: Model save path. Eg. [PATH]/bart.nemo - - model.data_dir: Path to data directory. Alternatively, you can adjust the file paths directly: - - model.train_ds.filepath - - model.validation_ds.filepath - - model.test_ds.filepath - - exp_manager.exp_dir: Directory to log results from the experiment. - -It is highly recommended to also adjust these parameters as necessary: - - trainer.devices: Set to 0 to use CPU. Otherwise the number denotes the number of GPUs. - - trainer.max_epochs: Maximum number of epochs to train for. - - model.batch_size: 8 is sufficient to train a decent Bart model for Text2Sparql. - - model.max_seq_length: Maximum (tokenized) sequence length. 150 is sufficient for Text2Sparql. - - model.language_model.pretrained_model_name: End2end pretrained model name from huggingface. - - model.encoder_tokenizer.tokenizer_name: Pretrained tokenizer name from huggingface. - - model.decoder_tokenizer.tokenizer_name: The same as above, as the tokenizer will handle encoding and decoding. - - model.optim.lr: Learning rate. - -You can also specify an encoder and decoder rather than using an end2end model like Bart by defining these parameters: - - model.language_model.pretrained_encoder_model_name: Pretrained huggingface encoder model name. - - model.encoder_tokenizer.tokenizer_name: Pretrained huggingface encoder tokenizer name. - - model.language_model.pretrained_decoder_model_name: Pretrained huggingface decoder model name. - - model.decoder_tokenizer.tokenizer_name: Pretrained huggingface decoder tokenizer name. - - model.language_model.pretrained_model_name: Set this to null. 
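As a tiny illustration of the sentence[TAB]label layout described under ***Data format*** above (editor's sketch; the file name and rows are made up, not part of the Text2Sparql dataset):

import csv

rows = [
    ("who directed inception", "select ?director where { ... }"),
    ("list all lakes in finland", "select ?lake where { ... }"),
]
with open("toy_train.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["sentence", "label"])   # header line: sentence[TAB]label
    writer.writerows(rows)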
- - -***How to run the script?*** -- First, download the data to TGT_DATA_DIR (see ./data/import_datasets.py): - -SRC_DATA_DIR=./data/text2sparql_src -TGT_DATA_DIR=./data/text2sparql_tgt - -python ./data/import_datasets.py \ - --source_data_dir $SRC_DATA_DIR \ - --target_data_dir $TGT_DATA_DIR - -- And run the following to train and save the model: - -python text2sparql.py \ - model.train_ds.filepath="$TGT_DATA_DIR"/train.tsv \ - model.validation_ds.filepath="$TGT_DATA_DIR"/test_easy.tsv \ - model.test_ds.filepath="$TGT_DATA_DIR"/test_hard.tsv \ - model.batch_size=16 \ - model.nemo_path=./NeMo_logs/bart.nemo \ - exp_manager.exp_dir=./NeMo_logs -""" - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.text2sparql import Text2SparqlModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="text2sparql_config") -def main(cfg: DictConfig) -> None: - logging.info(f"Config:\n {OmegaConf.to_yaml(cfg)}") - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - nmt_model = Text2SparqlModel(cfg.model, trainer=trainer) - trainer.fit(nmt_model) - if cfg.model.nemo_path: - nmt_model.save_to(cfg.model.nemo_path) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml b/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml deleted file mode 100644 index 47180b59175cc75a831a32139a4bd9ef7095cbdb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Config file for text classification with pre-trained BERT models - -trainer: - devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] - num_nodes: 1 - max_epochs: 100 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - tensor_model_parallel_size: 1 # tensor model parallel size used in the LM model - seed: 1234 - nemo_path: null # filename to save the model and associated artifacts to .nemo file - use_lm_finetune: False # whether fine tune the language model - pseudo_token: '[PROMPT]' # pseudo prompt tokens - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null - - language_model: - nemo_file: null - - prompt_encoder: - template: [3, 3, 0] - dropout: 0.0 - num_layers: 2 - - dataset: - classes: ??? # The class labels, e.g. ['positive', 'neutral', 'negative'] - - train_ds: - file_path: null - batch_size: 64 - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 3 - drop_last: false - pin_memory: false - - validation_ds: - file_path: null - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 3 - drop_last: false - pin_memory: false - - test_ds: - file_path: null - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 3 - drop_last: false - pin_memory: false - - optim: - name: adam - lr: 1e-5 - # optimizer arguments - betas: [0.9, 0.999] - weight_decay: 0.0005 - - # scheduler setup - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # List of some sample queries for inference after training is done - infer_samples: [ - 'For example , net sales increased by 5.9 % from the first quarter , and EBITDA increased from a negative EUR 0.2 mn in the first quarter of 2009 .', - '8 May 2009 - Finnish liquid handling products and diagnostic test systems maker Biohit Oyj ( HEL : BIOBV ) said today ( 8 May 2009 ) its net loss narrowed to EUR0 .1 m ( USD0 .14 m ) for the first quarter of 2009 from EUR0 .4 m for the same period of 2008 .', - 'CHS Expo Freight is a major Finnish fair , exhibition and culture logistics company that provides logistics services to various events by land , air and sea .', - ] - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: "PTuneTextClassification" # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
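Note (editor's sketch, not part of the repository): configs such as ptune_text_classification_config.yaml above are normally consumed through hydra_runner, but they can also be loaded and overridden directly with OmegaConf. A small example, assuming a local copy of the file; the path and override values are illustrative:

from omegaconf import OmegaConf

cfg = OmegaConf.load("ptune_text_classification_config.yaml")  # assumed local copy of the config above
overrides = OmegaConf.from_dotlist(
    ["model.dataset.classes=[positive,neutral,negative]", "trainer.max_epochs=10"]
)
cfg = OmegaConf.merge(cfg, overrides)  # fills the mandatory ??? field model.dataset.classes
print(OmegaConf.to_yaml(cfg.model.dataset))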
diff --git a/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/text_classification_config.yaml b/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/text_classification_config.yaml deleted file mode 100644 index 164bf34ca8e6c6bdaae522c08b52c073edbf408e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_classification/conf/text_classification_config.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Config file for text classification with pre-trained BERT models - -trainer: - devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] - num_nodes: 1 - max_epochs: 100 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - nemo_path: text_classification_model.nemo # filename to save the model and associated artifacts to .nemo file - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - classifier_head: - num_output_layers: 2 - fc_dropout: 0.1 - - class_labels: - class_labels_file : null # optional to specify a file containing the list of the labels - - dataset: - num_classes: ??? # The number of classes. 0 < Label ", "") - outfiles[mode].write(f'{review}\t{label}\n') - - for mode in modes: - outfiles[mode].close() - - class_labels_file = open(os.path.join(outfold, 'label_ids.tsv'), 'w') - class_labels_file.write('negative\npositive\n') - class_labels_file.close() - - -def process_sst2(infold, outfold, uncased, splits=['train', 'dev']): - """Process sst2 dataset.""" - # "test" split doesn't have labels, so it is skipped - if not os.path.exists(infold): - link = 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip' - raise ValueError( - f'Data not found at {infold}. Please download SST-2 dataset from `{link}` and ' - f'extract it into the folder specified by `source_data_dir` argument.' 
- ) - - logging.info(f'Processing SST-2 dataset') - os.makedirs(outfold, exist_ok=True) - - def _read_tsv(input_file, quotechar=None): - """Read a tab separated value file.""" - with open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - for split in splits: - # Load input file. - input_file = os.path.join(infold, split + '.tsv') - lines = _read_tsv(input_file) - # Create output. - outfile = open(os.path.join(outfold, split + '.tsv'), 'w') - - # Copy lines, skip the header (line 0). - for line in lines[1:]: - text = line[0] - label = line[1] - # Lowercase when required. - if uncased: - text = text.lower() - # Write output. - outfile.write(f'{text}\t{label}\n') - # Close file. - outfile.close() - - class_labels_file = open(os.path.join(outfold, 'label_ids.tsv'), 'w') - class_labels_file.write('negative\npositive\n') - class_labels_file.close() - - logging.info(f'Result stored at {outfold}') - - -def process_chemprot(source_dir, target_dir, uncased, modes=['train', 'test', 'dev']): - if not os.path.exists(source_dir): - link = 'https://github.com/arwhirang/recursive_chemprot/tree/master/Demo/tree_LSTM/data' - raise ValueError(f'Data not found at {source_dir}. ' f'Please download ChemProt from {link}.') - - logging.info(f'Processing Chemprot dataset and store at {target_dir}') - os.makedirs(target_dir, exist_ok=True) - - naming_map = {'train': 'trainingPosit_chem', 'test': 'testPosit_chem', 'dev': 'developPosit_chem'} - - def _read_tsv(input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - outfiles = {} - label_mapping = {} - out_label_mapping = open(os.path.join(target_dir, 'label_mapping.tsv'), 'w') - for mode in modes: - outfiles[mode] = open(os.path.join(target_dir, mode + '.tsv'), 'w') - input_file = os.path.join(source_dir, naming_map[mode]) - lines = _read_tsv(input_file) - for line in lines: - text = line[1] - label = line[2] - if label == "True": - label = line[3] - if uncased: - text = text.lower() - if label not in label_mapping: - out_label_mapping.write(f'{label}\t{len(label_mapping)}\n') - label_mapping[label] = len(label_mapping) - label = label_mapping[label] - outfiles[mode].write(f'{text}\t{label}\n') - for mode in modes: - outfiles[mode].close() - out_label_mapping.close() - - -def process_thucnews(infold, outfold): - modes = ['train', 'test'] - train_size = 0.8 - if not os.path.exists(infold): - link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {infold}. 
' f'Please download THUCNews from {link}.') - - logging.info(f'Processing THUCNews dataset and store at {outfold}') - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') - categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] - for category in categories: - label = categories.index(category) - category_files = glob.glob(f'{infold}/{category}/*.txt') - test_num = int(len(category_files) * (1 - train_size)) - test_files = category_files[:test_num] - train_files = category_files[test_num:] - - for mode in modes: - logging.info(f'Processing {mode} data of the category {category}') - if mode == 'test': - files = test_files - else: - files = train_files - - if len(files) == 0: - logging.info(f'Skipping category {category} for {mode} mode') - continue - - for file in tqdm.tqdm(files): - with open(file, 'r', encoding='utf-8') as f: - news = f.read().strip().replace('\r', '') - news = news.replace('\n', '').replace('\t', ' ') - outfiles[mode].write(f'{news}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - -if __name__ == "__main__": - # Parse the command-line arguments. - parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo\'s format.") - parser.add_argument("--dataset_name", required=True, type=str, choices=['imdb', 'thucnews', 'chemprot']) - parser.add_argument( - "--source_data_dir", required=True, type=str, help='The path to the folder containing the dataset files.' - ) - parser.add_argument("--target_data_dir", required=True, type=str) - parser.add_argument("--do_lower_case", action='store_true') - args = parser.parse_args() - - dataset_name = args.dataset_name - do_lower_case = args.do_lower_case - source_dir = args.source_data_dir - target_dir = args.target_data_dir - - if not exists(source_dir): - raise FileNotFoundError(f"{source_dir} does not exist.") - - if dataset_name == 'imdb': - process_imdb(source_dir, target_dir, do_lower_case) - elif dataset_name == 'thucnews': - process_thucnews(source_dir, target_dir) - elif dataset_name == "chemprot": - process_chemprot(source_dir, target_dir, do_lower_case) - elif dataset_name == "sst-2": - process_sst2(source_dir, target_dir, do_lower_case) - else: - raise ValueError( - f'Dataset {dataset_name} is not supported.' - + "Please make sure that you build the preprocessing process for it. " - + "NeMo's format assumes that a data file has a header and each line of the file follows " - + "the format: text [TAB] label. Label is assumed to be an integer." - ) diff --git a/SoundScribe/SpeakerID/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py b/SoundScribe/SpeakerID/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py deleted file mode 100644 index ab3322f552c1141123af1cbda053eefd29fdd84f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script runs model parallel text classification evaluation. -""" -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.text_classification import TextClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="text_classification_config") -def main(cfg: DictConfig) -> None: - logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - # TODO: can we drop strict=False - model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer, strict=False) - model.setup_test_data(test_data_config=cfg.model.test_ds) - - trainer.test(model=model, ckpt_path=None) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_classification/text_classification_with_bert.py b/SoundScribe/SpeakerID/examples/nlp/text_classification/text_classification_with_bert.py deleted file mode 100644 index 01e8fae9bba5691b47fe24a2480bea880bde5816..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_classification/text_classification_with_bert.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script contains an example on how to train, evaluate and perform inference with the TextClassificationModel. -TextClassificationModel in NeMo supports text classification problems such as sentiment analysis or -domain/intent detection for dialogue systems, as long as the data follows the format specified below. - -***Data format*** -TextClassificationModel requires the data to be stored in TAB separated files (.tsv) with two columns of sentence and -label. Each line of the data file contains text sequences, where words are separated with spaces and label separated -with [TAB], i.e.: - -[WORD][SPACE][WORD][SPACE][WORD][TAB][LABEL] - -For example: - -hide new secretions from the parental units[TAB]0 -that loves its characters and communicates something rather beautiful about human nature[TAB]1 -... - -If your dataset is stored in another format, you need to convert it to this format to use the TextClassificationModel. - - -***Setting the configs*** -The model and the PT trainer are defined in a config file which declares multiple important sections. 
-The most important ones are:
-    model: All arguments that are related to the Model - language model, tokenizer, head classifier, optimizer,
-           schedulers, and datasets/data loaders.
-    trainer: Any argument to be passed to PyTorch Lightning, including number of epochs, number of GPUs,
-           precision level, etc.
-
-This script uses the `/examples/nlp/text_classification/conf/text_classification_config.yaml` config file by default.
-You may update the config file directly, or override its values with command line arguments.
-Another option is to select a different config file via the command line argument `--config-name=CONFIG_FILE_PATH`.
-
-You first need to set `model.dataset.num_classes` in the config file, which specifies the number of classes in the dataset.
-Notice that some config lines, including `model.dataset.num_classes`, have `???` as their value; this means that values
-for these fields are required to be specified by the user. We also need to set `model.train_ds.file_path`,
-`model.validation_ds.file_path`, and `model.test_ds.file_path` in the config file to the paths of the train, validation,
-and test files if they exist. We may do this by updating the config file or by setting them from the command line.
-
-
-***How to run the script?***
-For example, the following would train a model for 50 epochs on 2 GPUs on a classification task with 2 classes:
-
-# python text_classification_with_bert.py
-        model.dataset.num_classes=2
-        model.train_ds.file_path=PATH_TO_TRAIN_FILE
-        model.validation_ds.file_path=PATH_TO_VAL_FILE
-        trainer.max_epochs=50
-        trainer.devices=2
-
-After training is done, this script also reloads the last checkpoint, evaluates it on the dev set,
-and then performs inference on some sample queries.
-
-By default, this script uses the examples/nlp/text_classification/conf/text_classification_config.yaml config file, and
-you may update all the params in the config file from the command line. You may also use another config file like this:
-
-# python text_classification_with_bert.py --config-name=PATH_TO_CONFIG_FILE
-        model.dataset.num_classes=2
-        model.train_ds.file_path=PATH_TO_TRAIN_FILE
-        model.validation_ds.file_path=PATH_TO_VAL_FILE
-        trainer.max_epochs=50
-        trainer.devices=2
-
-***Load a saved model***
-This script saves the model after training into a '.nemo' checkpoint file specified by `nemo_path` of the model config.
-You may restore the saved model like this:
-    model = TextClassificationModel.restore_from(restore_path=NEMO_FILE_PATH)
-
-***Evaluating a saved model on another dataset***
-# If you want to evaluate the saved model on another dataset, you may restore the model and create a new data loader:
-    eval_model = TextClassificationModel.restore_from(restore_path=checkpoint_path)
-
-# Then, you may create a dataloader config for evaluation:
-    eval_config = OmegaConf.create(
-        {'file_path': cfg.model.test_ds.file_path, 'batch_size': 64, 'shuffle': False, 'num_workers': 3}
-    )
-    eval_model.setup_test_data(test_data_config=eval_config)
-
-# You need to create a new trainer:
-    eval_trainer = pl.Trainer(devices=1)
-    eval_model.set_trainer(eval_trainer)
-    eval_trainer.test(model=eval_model, verbose=False)
-"""
-import pytorch_lightning as pl
-from omegaconf import DictConfig, OmegaConf
-
-from nemo.collections.nlp.models.text_classification import TextClassificationModel
-from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
-from nemo.core.config import hydra_runner
-from nemo.utils import logging
-from nemo.utils.exp_manager import exp_manager
-
-
-@hydra_runner(config_path="conf", config_name="text_classification_config")
-def main(cfg: DictConfig) -> None:
-    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
-    try:
-        strategy = NLPDDPStrategy(find_unused_parameters=True)
-    except (ImportError, ModuleNotFoundError):
-        strategy = 'auto'
-
-    trainer = pl.Trainer(strategy=strategy, **cfg.trainer)
-    exp_manager(trainer, cfg.get("exp_manager", None))
-
-    if not cfg.model.train_ds.file_path:
-        raise ValueError("'train_ds.file_path' needs to be set for training!")
-
-    model = TextClassificationModel(cfg.model, trainer=trainer)
-    logging.info("===========================================================================================")
-    logging.info('Starting training...')
-    trainer.fit(model)
-    logging.info('Training finished!')
-    logging.info("===========================================================================================")
-
-    if cfg.model.nemo_path:
-        # '.nemo' file contains the last checkpoint and the params to initialize the model
-        model.save_to(cfg.model.nemo_path)
-        logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')
-
-    # We evaluate the trained model on the test set if test_ds is set in the config file
-    if cfg.model.test_ds.file_path:
-        logging.info("===========================================================================================")
-        logging.info("Starting the testing of the trained model on test set...")
-        trainer.test(model=model, ckpt_path=None, verbose=False)
-        logging.info("Testing finished!")
-        logging.info("===========================================================================================")
-
-    # Perform inference on a list of queries.
-    if "infer_samples" in cfg.model and cfg.model.infer_samples:
-        logging.info("===========================================================================================")
-        logging.info("Starting the inference on some sample queries...")
-
-        # max_seq_length=512 is the maximum length BERT supports.
- results = model.classifytext(queries=cfg.model.infer_samples, batch_size=16, max_seq_length=512) - logging.info('The prediction results of some sample queries with the trained model:') - for query, result in zip(cfg.model.infer_samples, results): - logging.info(f'Query : {query}') - logging.info(f'Predicted label: {result}') - - logging.info("Inference finished!") - logging.info("===========================================================================================") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml deleted file mode 100644 index 7211b4410078850788a5893230848be231e66141..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: &name itn -lang: ??? # e.g. 'ru', 'en' - -# Pretrained Nemo Models -pretrained_model: null - -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 # the number of training epochs - enable_checkpointing: false # provided by exp_manager - logger: false # provided by exp_manager - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -model: - do_training: true - label_map: ??? # path/.../label_map.txt - semiotic_classes: ??? # path/to/.../semiotic_classes.txt - max_sequence_len: 128 - lang: ${lang} - hidden_size: 768 - - optim: - name: adamw - lr: 3e-5 - weight_decay: 0.1 - - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - - # scheduler config override - warmup_ratio: 0.1 - last_epoch: -1 - - language_model: - pretrained_model_name: bert-base-uncased # For ru, try DeepPavlov/rubert-base-cased | For de or multilingual, try bert-base-multilingual-cased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -exp_manager: - exp_dir: nemo_experiments # where to store logs and checkpoints - name: training # name of experiment - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -tokenizer: - tokenizer_name: ${model.transformer} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - -# Data -data: - train_ds: - data_path: ??? # provide the full path to the file - batch_size: 8 - shuffle: true - num_workers: 3 - pin_memory: false - drop_last: false - - validation_ds: - data_path: ??? # provide the full path to the file. 
- batch_size: 8 - shuffle: false - num_workers: 3 - pin_memory: false - drop_last: false - - -# Inference -inference: - from_file: null # Path to the raw text, no labels required. Each sentence on a separate line - out_file: null # Path to the output file - batch_size: 16 # batch size for inference.from_file diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/corpus_errors.ru b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/corpus_errors.ru deleted file mode 100644 index e03168f8e8cd5ea8b70a548127a45d0db171c675..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/corpus_errors.ru +++ /dev/null @@ -1,643 +0,0 @@ -CARDINAL два 132 -CARDINAL два 122 -CARDINAL два 102 -CARDINAL два 22 -CARDINAL два 72 -CARDINAL два 12 -CARDINAL два 62 -CARDINAL два 42 -CARDINAL два 4 2 -CARDINAL два 92 -CARDINAL два 212 -CARDINAL два 605792 -CARDINAL два 512 -CARDINAL два 82 -CARDINAL два 422 -CARDINAL два 41615252 -CARDINAL два 192 -CARDINAL два 32 -CARDINAL два 232 -CARDINAL два 162 -CARDINAL два 152 -CARDINAL два 432 -CARDINAL два 52 -CARDINAL два 112 -CARDINAL два 61417272 -CARDINAL два 852 -CARDINAL два 33554432 -CARDINAL два 262 -CARDINAL два 302 -CARDINAL два 272 -CARDINAL два 252 -CARDINAL два 2002 -CARDINAL шестнадцать 768 16 -CARDINAL один 2041 -CARDINAL один 31 -CARDINAL один 11 -CARDINAL один 81 -CARDINAL один 71 -CARDINAL один 21 -CARDINAL один 64 1 -CARDINAL один 101 -CARDINAL один 61 -CARDINAL один 211 -CARDINAL один 51 -CARDINAL один 151 -CARDINAL один 441 -CARDINAL один 41 -CARDINAL один 91 -CARDINAL один 191 -CARDINAL один 281 -CARDINAL один 111 -CARDINAL один 141 -CARDINAL один 161 -CARDINAL три 13 -CARDINAL три 3-и -CARDINAL три 33 -CARDINAL три 53 -CARDINAL три 43 -CARDINAL три 63 -CARDINAL три 21993 -CARDINAL три 3и -CARDINAL три 83 -CARDINAL три 73 -CARDINAL три 23 -CARDINAL три 343 -CARDINAL три 103 -CARDINAL три 153 -CARDINAL три 203 -CARDINAL три 2003 -CARDINAL три 93 -CARDINAL три 123 -CARDINAL три 4 3 -CARDINAL три 133 -CARDINAL четыре 54 -CARDINAL четыре 64 -CARDINAL четыре 24 -CARDINAL четыре 224 -CARDINAL четыре 1024 -CARDINAL четыре 34 -CARDINAL четыре 14 -CARDINAL четыре 64 4 -CARDINAL четыре 134 -CARDINAL четыре 44 -CARDINAL четыре 124 -CARDINAL четыре 84 -CARDINAL четыре 144 -CARDINAL четыре 3744 -CARDINAL четыре 3024 -CARDINAL четыре 274 -CARDINAL четыре 104 -CARDINAL четыре 74 -CARDINAL четыре 16384 -CARDINAL четыре 864 -CARDINAL четыре 2304 -CARDINAL четыре 244 -CARDINAL четыре 1944 -CARDINAL четыре 114 -CARDINAL четыре 3 2014 -CARDINAL четыре 194 -CARDINAL четыре 294 -CARDINAL девяносто три 93-и -CARDINAL шестидесяти 60-и -CARDINAL шестидесяти 61 -CARDINAL одного 21 -CARDINAL одного 31 -CARDINAL одного 91 -CARDINAL шесть 86 -CARDINAL шесть 476 -CARDINAL шесть 296 -CARDINAL шесть 56 -CARDINAL шесть 36 -CARDINAL шесть 256 -CARDINAL шесть 16 -CARDINAL шесть 376 -CARDINAL шесть 236 -CARDINAL шесть 96 -CARDINAL шесть 26 -CARDINAL шесть 46 -CARDINAL шесть 106 -CARDINAL шесть 66 -CARDINAL шесть 576 -CARDINAL шесть 146 -CARDINAL шесть 76 -CARDINAL шесть 116 -CARDINAL шесть 186 -CARDINAL шесть 1536 -CARDINAL шесть 746 -CARDINAL шесть 306 -CARDINAL шесть 466 -CARDINAL шесть 316 -CARDINAL четырнадцати 14-и -CARDINAL четырнадцати 14и -CARDINAL семи 7-и -CARDINAL семи 77 -CARDINAL семи 37 -CARDINAL семи 7и -CARDINAL восемь 18 -CARDINAL восемь 28 -CARDINAL восемь 758 -CARDINAL восемь 208 -CARDINAL восемь 68 -CARDINAL восемь 
78 -CARDINAL восемь 58 -CARDINAL восемь 768 -CARDINAL восемь 128 -CARDINAL восемь 108 -CARDINAL восемь 138 -CARDINAL восемь 38 -CARDINAL восемь 258 -CARDINAL восемь 2448 -CARDINAL восемь 88 -CARDINAL восемь 48 -CARDINAL восемь 248 -CARDINAL восемь 2048 -CARDINAL восемь 858 -CARDINAL восемь 118 -CARDINAL восемь 288 -CARDINAL восемь 178 -CARDINAL восемь 238 -CARDINAL восемь 278 -CARDINAL восемь 9958 -CARDINAL восемь 488 -CARDINAL восемь 98 -CARDINAL двадцати восьми 28-и -CARDINAL семь 317 -CARDINAL семь 57 -CARDINAL семь 97 -CARDINAL семь 697 -CARDINAL семь 17 -CARDINAL семь 297 -CARDINAL семь 27 -CARDINAL семь 217 -CARDINAL семь 207 -CARDINAL семь 67 -CARDINAL семь 257 -CARDINAL семь 187 -CARDINAL семь 47 -CARDINAL семь 147 -CARDINAL семь 377 -CARDINAL семь 107 -CARDINAL семь 177 -CARDINAL семь 237 -CARDINAL семь 37 -CARDINAL семь 337 -CARDINAL семь 167 -CARDINAL семь 117 -CARDINAL пять 125 -CARDINAL пять 15 -CARDINAL пять 235 -CARDINAL пять 95 -CARDINAL пять 205 -CARDINAL пять 45 -CARDINAL пять 305 -CARDINAL пять 165 -CARDINAL пять 85 -CARDINAL пять 65 -CARDINAL пять 245 -CARDINAL пять 115 -CARDINAL пять 25 -CARDINAL пять 285 -CARDINAL пять 55 -CARDINAL пять 215 -CARDINAL пять 75 -CARDINAL пять 1315 -CARDINAL пять 185 -CARDINAL пять 35 -CARDINAL пять 395 -CARDINAL пять 255 -CARDINAL пять 345 -CARDINAL пять 275 -CARDINAL пять 1525 -CARDINAL пять 765 -CARDINAL пять 225 -CARDINAL пять 335 -CARDINAL пять 195 -CARDINAL пять 105 -CARDINAL пять 435 -CARDINAL пять 325 -CARDINAL пять 155 -CARDINAL пять 315 -CARDINAL пять 635 -CARDINAL пять 145 -CARDINAL пять 175 -CARDINAL пять 295 -CARDINAL пять 445 -CARDINAL пять 495 -CARDINAL пять 265 -CARDINAL пять 555 -CARDINAL пять 375 -CARDINAL девять 109 -CARDINAL девять 29 -CARDINAL девять 19 -CARDINAL девять 79 -CARDINAL девять 99 -CARDINAL девять 59 -CARDINAL девять 49 -CARDINAL девять 139 -CARDINAL девять 249 -CARDINAL девять 159 -CARDINAL девять 219 -CARDINAL девять 39 -CARDINAL девять 89 -CARDINAL одна 2011 -CARDINAL одна 11 -CARDINAL одна 61 -CARDINAL тридцати 31 -CARDINAL тридцати 30-и -CARDINAL ста 101 -CARDINAL тридцать три 33-и -CARDINAL две 192 -CARDINAL sil 4 -CARDINAL sil 1 -CARDINAL sil 2 -CARDINAL sil 5 -CARDINAL sil 7 -CARDINAL sil 3 -CARDINAL sil -2 -CARDINAL sil 6 -CARDINAL sil -9 -CARDINAL sil 8 -CARDINAL sil 9 -CARDINAL sil 0 -CARDINAL sil -3 -CARDINAL sil -0 -CARDINAL sil -5 -CARDINAL sil -8 -CARDINAL sil 24-и -CARDINAL sil -4 -CARDINAL sil 2-ти -CARDINAL sil 122-и -CARDINAL sil -6 -CARDINAL sil 4-и -CARDINAL sil 2-и -CARDINAL sil -1 -CARDINAL sil 24-ти -CARDINAL sil 2004и -CARDINAL sil 90 -CARDINAL sil 3-ти -CARDINAL sil 9810245394и -CARDINAL sil -7 -CARDINAL sil iii -CARDINAL sil 27-ти -CARDINAL sil 28-ти -CARDINAL sil -23-ти -CARDINAL sil 90-ти -CARDINAL sil 23-ти -CARDINAL sil 22 -CARDINAL sil 100 -CARDINAL sil 1340-и -CARDINAL sil 2и -CARDINAL sil 63ти -CARDINAL sil 1990и -CARDINAL sil 27 -CARDINAL sil 4-ти -CARDINAL sil 100-ти -CARDINAL sil 45 -CARDINAL sil 57-ти -CARDINAL sil 51-ти -CARDINAL sil 69-мя -CARDINAL sil 600-ти -CARDINAL sil 73-ти -CARDINAL sil 7-ти -CARDINAL sil -40и -CARDINAL sil 37-ти -CARDINAL sil 8-ти -CARDINAL sil 2002и -CARDINAL sil 70-ех -CARDINAL sil 46 -CARDINAL sil 40-ти -CARDINAL sil 1490и -CARDINAL sil 1924и -CARDINAL sil -07 -CARDINAL sil 12-мя -CARDINAL sil 34-ти -CARDINAL sil 552и -CARDINAL sil 1992и -CARDINAL sil 32-ти -CARDINAL sil 94и -CARDINAL sil 33-ти -CARDINAL sil 1982и -CARDINAL sil 62-ти -CARDINAL sil 1990-и -CARDINAL sil sil -CARDINAL минус пяти -5-и -CARDINAL пятидесяти 51 -CARDINAL 
пятидесяти 50-и -CARDINAL пятнадцати 15-и -CARDINAL тысячью девятистах восьмидесяти 1980и -CARDINAL ноль 100 -CARDINAL ноль 10 -CARDINAL ноль 3200 -CARDINAL ноль 480 -CARDINAL ноль 200 -CARDINAL ноль 800 -CARDINAL ноль 80 -CARDINAL ноль 50 -CARDINAL ноль 400 -CARDINAL ноль 60 -CARDINAL ноль 40 -CARDINAL ноль 110 -CARDINAL ноль 1080 -CARDINAL ноль 140 -CARDINAL ноль 2000 -CARDINAL ноль 1920 -CARDINAL ноль 510 -CARDINAL ноль 150 -CARDINAL ноль 90 -CARDINAL ноль 600 -CARDINAL ноль 8100 -CARDINAL ноль 4000 -CARDINAL ноль 170 -CARDINAL ноль 20 -CARDINAL ноль 580 -CARDINAL ноль 70 -CARDINAL ноль 160 -CARDINAL ноль 130 -CARDINAL ноль 1200 -CARDINAL ноль 320 -CARDINAL ноль 360 -CARDINAL ноль 250 -CARDINAL ноль 300 -CARDINAL ноль 350 -CARDINAL ноль 410 -CARDINAL ноль 720 -CARDINAL ноль 30 -CARDINAL ноль 120 -CARDINAL ноль 180 -CARDINAL ноль 310 -CARDINAL ноль 280 -CARDINAL ноль 1500 -CARDINAL ноль 1600 -CARDINAL ноль 210 -CARDINAL ноль 700 -CARDINAL ноль 240 -CARDINAL ноль 230 -CARDINAL ноль 540 -CARDINAL ноль 260 -CARDINAL ноль 1000 -CARDINAL ноль 520 -CARDINAL ноль 220 -CARDINAL ноль 500 -CARDINAL ноль 3000 -CARDINAL ноль 960 -CARDINAL ноль 390 -CARDINAL ноль 2440 -CARDINAL ноль 1220 -CARDINAL ноль 2500 -CARDINAL ноль 1250 -CARDINAL ноль 3050 -CARDINAL ноль 680 -CARDINAL ноль 640 -CARDINAL ноль 1050 -CARDINAL ноль 2400 -CARDINAL ноль 530 -CARDINAL ноль 620 -CARDINAL ноль 330 -CARDINAL ноль 290 -CARDINAL ноль 550 -CARDINAL ноль 1800 -CARDINAL ноль 4600 -CARDINAL ноль 177245385090 -CARDINAL ноль 1280 -CARDINAL ноль 7000 -CARDINAL ноль 2070 -CARDINAL ноль 10000 -CARDINAL ноль 1700 -CARDINAL ноль 750 -CARDINAL ноль 710 -CARDINAL ноль 900 -CARDINAL ноль 270 -CARDINAL ноль 1120 -CARDINAL ноль 490 -CARDINAL ноль 850 -CARDINAL ноль 450 -CARDINAL ноль 630 -CARDINAL ноль 790 -CARDINAL ноль 2880 -CARDINAL ноль 1150 -CARDINAL ноль 610 -CARDINAL одиннадцати 11-и -CARDINAL четырех 1024 -CARDINAL четырех 54 -CARDINAL ста семидесяти восьми 178-и -CARDINAL восьмидесяти девяти 89-и -CARDINAL тринадцати 13-и -CARDINAL ста сорока 141 -CARDINAL шести 6-и -CARDINAL шести 16 -CARDINAL шести 6и -CARDINAL шести 146 -CARDINAL шести 26 -CARDINAL двадцати пяти 25-и -CARDINAL двадцати пяти 25и -CARDINAL трех 53 -CARDINAL трех 233 -CARDINAL двенадцати 12-и -CARDINAL двенадцати 12и -CARDINAL сорок шесть 28 46 -CARDINAL двадцати 21 -CARDINAL двадцати 20и -CARDINAL двадцати 20-и -CARDINAL девятнадцати 19-и -CARDINAL двадцать пять 3 25 -CARDINAL двадцать три 23-и -CARDINAL двадцать три 2 23 -CARDINAL семнадцати 17-и -CARDINAL шестисот 601 -CARDINAL одной 21 -CARDINAL сорок три 11 43 -CARDINAL тридцать семь 1 37 -CARDINAL тридцать семь 5 37 -CARDINAL сто 1 100 -CARDINAL восемнадцати 18-и -CARDINAL десяти 10-и -CARDINAL десяти 10и -CARDINAL девяти 9-и -CARDINAL сорока 41 -CARDINAL ста пятидесяти 151 -CARDINAL ста пятидесяти 150-и -CARDINAL тридцати восьми 38и -CARDINAL тридцати восьми 38-и -CARDINAL тридцати девяти 39-и -CARDINAL нуля 30 -CARDINAL нуля 400 -CARDINAL нуля 200 -CARDINAL нуля 300 -CARDINAL нуля 40 -CARDINAL нуля 70 -CARDINAL нуля 100 -CARDINAL нуля 20 -CARDINAL нуля 10 -CARDINAL нуля 600 -CARDINAL нуля 2000 -CARDINAL нуля 90 -CARDINAL нуля 3000 -CARDINAL нуля 80 -CARDINAL нуля 1500 -CARDINAL нуля 130 -CARDINAL шестидесяти пяти 65-и -CARDINAL ста семидесяти 170-и -CARDINAL ста семидесяти 171 -CARDINAL пятидесяти пяти 55-и -CARDINAL ста двадцати 121 -CARDINAL ста двадцати 120-и -CARDINAL двадцати шести 26-и -CARDINAL шестнадцати 16-и -CARDINAL шестнадцати 16и -CARDINAL двух 12 -CARDINAL двух 192 -CARDINAL двух 52 
-CARDINAL двух 32 -CARDINAL восьми 8-и -CARDINAL восьми 8и -CARDINAL восьми 28 -CARDINAL двадцать 1 20 -CARDINAL двумя 32 -CARDINAL пяти 5-и -CARDINAL пяти 15 -CARDINAL пяти 85 -CARDINAL пяти 25 -CARDINAL пяти 5и -CARDINAL пяти 215 -CARDINAL пяти 65 -CARDINAL пяти 105 -CARDINAL пяти 55 -CARDINAL семидесяти 70-и -CARDINAL семидесяти 71 -CARDINAL девяноста 91 -CARDINAL пятьдесят 1 50 -CARDINAL двадцати семи 27-и -CARDINAL восьмидесяти 80-и -CARDINAL восьмидесяти 81 -CARDINAL ста двадцати пяти 125-и -CARDINAL сто пятьдесят 1 150 -CARDINAL тридцати семи 37-и -CARDINAL тридцати семи 37и -CARDINAL ста пятидесяти семи 157-и -CARDINAL сорока восьми 48-и -CARDINAL сорок 2 40 -CARDINAL сорок 1 40 -CARDINAL сто пять 1 105 -CARDINAL сорока девяти 49-и -CARDINAL двух тысяч 2001 -CARDINAL восемьдесят восемь 1 88 -CARDINAL сорок пять 37 45 -CARDINAL девяноста семи 97-и -CARDINAL девяноста восьми 98-и -CARDINAL четыреста пятьдесят 1 450 -CARDINAL восьмидесяти шести 86-и -CARDINAL двух тысяч семи 2007и -CARDINAL двух тысяч семи 2007-и -CARDINAL ста десяти 110-и -CARDINAL пятидесяти девяти 59-и -CARDINAL пятидесяти девяти 59и -CARDINAL пятидесяти восьми 58-и -CARDINAL семидесяти семи 77-и -CARDINAL семидесяти пяти 75-и -CARDINAL восемьсот 0800 -CARDINAL восьмидесяти семи 87-и -CARDINAL восьмидесяти семи 87и -CARDINAL шестидесяти девяти 69-и -CARDINAL трехсот пятидесяти 351 -CARDINAL шестидесяти семи 67-и -CARDINAL один восемь xviii -CARDINAL тридцати шести 36-и -CARDINAL девяноста шести 96-и -CARDINAL сорок семь 1 47 -CARDINAL пятидесяти шести 56-и -CARDINAL пятидесяти шести 56и -CARDINAL ста двадцати шести 126-и -CARDINAL ста тринадцати 113-и -CARDINAL тысячью девятистах девятнадцати 1919-и -CARDINAL тысячи семисот девяноста 1791 -CARDINAL сорок четыре 18 44 -CARDINAL ста тридцати шести 136-и -CARDINAL ста сорока пяти 145-и -CARDINAL тысячью девятистах восьмидесяти шести 1986-и -CARDINAL тысячью девятистах семидесяти 1970-и -CARDINAL двух тысяч шести 2006-и -CARDINAL двух тысяч шести 2006и -CARDINAL девяноста пяти 95-и -CARDINAL девяноста пяти 95и -CARDINAL ста восемнадцати 118-и -CARDINAL ста восемнадцати 118и -CARDINAL ста шестидесяти 161 -CARDINAL трехсот восьмидесяти 381 -CARDINAL тысячи 1001 -CARDINAL тысячи семисот пятидесяти 1751 -CARDINAL восьмидесяти пяти 85-и -CARDINAL тысячи семисот 1701 -CARDINAL сорока пяти 45-и -CARDINAL двадцати девяти 29-и -CARDINAL трехстах восьмидесяти 380-и -CARDINAL сорока шести 46-и -CARDINAL девяноста девяти 99-и -CARDINAL ста шестидесяти шести 166и -CARDINAL восьмью 68 -CARDINAL восьмью 128 -CARDINAL двухсот сорока 241 -CARDINAL тысячи девятисот тридцати 1931 -CARDINAL ста пяти 105-и -CARDINAL ста тридцати 131 -CARDINAL четырехсот шестидесяти 461 -CARDINAL тысячи девятисот девяноста 1991 -CARDINAL семидесяти шести 76и -CARDINAL семидесяти шести 76-и -CARDINAL тысячи девятисот двадцати 1921 -CARDINAL тремястами 300-и -CARDINAL пятидесяти семи 57-и -CARDINAL ста двенадцати 112-и -CARDINAL семидесяти восьми 78и -CARDINAL пятисот 501 -CARDINAL пятисот 1 500 -CARDINAL трехсот тридцати 331 -CARDINAL четырехсот сорока 441 -CARDINAL тысячи девятисот 1901 -CARDINAL двух тысяч тринадцати 2013и -CARDINAL четырехсот двадцати 421 -CARDINAL одни 1и -CARDINAL одни 1-и -CARDINAL трехсот двадцати 321 -CARDINAL семидесяти девяти 79-и -CARDINAL двухсот восьмидесяти 281 -CARDINAL трехсот семидесяти 371 -CARDINAL минус три -3-и -CARDINAL минус тринадцать 11-13 -CARDINAL тысячью восьмистах тридцати восьми 1838и -CARDINAL ста девяноста 191 -CARDINAL ста шестидесяти пяти 165-и -CARDINAL 
тысячи шестисот восьмидесяти 1681 -CARDINAL двухстах пятидесяти 250-и -CARDINAL семисот пятидесяти 751 -CARDINAL ста четырнадцати 114-и -CARDINAL пятисот пятидесяти 551 -CARDINAL двух тысяч четырнадцати 2014и -CARDINAL шестистах десяти 610-и -CARDINAL ста семнадцати 117-и -CARDINAL ста тридцати восьми 138-и -CARDINAL трехсот шестидесяти 361 -CARDINAL тысячью девятистах четырнадцати 1914и -CARDINAL шестьюстами 600-и -CARDINAL один четыре xiv -CARDINAL тысячи девятисот восьмидесяти 1981 -CARDINAL двухстах семидесяти пяти 275-и -CARDINAL три один xxxi -CARDINAL семьсот семьдесят три 773-и -CARDINAL два три xxiii -CARDINAL тысячи сорока 1041 -CARDINAL тысячью двухстах семидесяти 1270-и -CARDINAL четырехсот семидесяти 471 -CARDINAL двумястами 200-и -CARDINAL трехстах семидесяти 370-и -CARDINAL нулю 400 -CARDINAL нулю 200 -CARDINAL нулем 510 -CARDINAL нулем 10 -CARDINAL нулем 200 -CARDINAL нулем 100 -CARDINAL нулем 400 -CARDINAL тысячи восьмисот сорока 1841 -CARDINAL двухстах десяти 210-и -CARDINAL тысячи восьмисот двадцати 1821 -CARDINAL пятисот сорока 541 diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py deleted file mode 100644 index f5a53b1f331dbac722bd9c265bb9c7b0247fe96b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py +++ /dev/null @@ -1,522 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used after GIZA++ alignment to extract final alignments for each semiotic class. -""" - -import re -from argparse import ArgumentParser - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import ( - check_monotonicity, - fill_alignment_matrix, - get_targets, - get_targets_from_back, -) - - -parser = ArgumentParser(description='Extract final alignments from GIZA++ alignments') -parser.add_argument('--mode', type=str, required=True, help='tn or itn') -parser.add_argument('--giza_dir', type=str, required=True, help='Path to folder with GIZA++ alignment') -parser.add_argument( - '--giza_suffix', type=str, required=True, help='suffix of alignment files, e.g. \"Ahmm.5\", \"A3.final\"' -) -parser.add_argument('--out_filename', type=str, required=True, help='Output file') -parser.add_argument('--lang', type=str, required=True, help="Language") -args = parser.parse_args() - - -def main() -> None: - g = open(args.giza_dir + "/GIZA++." + args.giza_suffix, "r", encoding="utf-8") - f = open(args.giza_dir + "/GIZA++reverse." 
+ args.giza_suffix, "r", encoding="utf-8") - target_inner_delimiter = "" - if args.mode == "tn": - g, f = f, g - target_inner_delimiter = "_" - out = open(args.giza_dir + "/" + args.out_filename, "w", encoding="utf-8") - cache = {} - good_count, not_mono_count, not_covered_count, exception_count = 0, 0, 0, 0 - n = 0 - while True: - n += 3 - if n % 10000 == 0: - print(n, "lines processed") - fline1 = f.readline().strip() - fline2 = f.readline().strip() - fline3 = f.readline().strip() - gline1 = g.readline().strip() - gline2 = g.readline().strip() - gline3 = g.readline().strip() - if fline1 == "" and gline1 == "": - break - cache_key = fline1 + "\t" + fline2 + "\t" + gline1 + "\t" + gline2 - if cache_key in cache: - out.write(cache[cache_key] + "\n") - continue - if fline1 == "" or gline1 == "" or fline2 == "" or gline2 == "" or fline3 == "" or gline3 == "": - raise ValueError("Empty line: " + str(n)) - try: - matrix, srctokens, dsttokens = fill_alignment_matrix(fline2, fline3, gline2, gline3) - except Exception: - print(fline1) - print(fline2) - print(fline3) - print(gline1) - print(gline2) - print(gline3) - exception_count += 1 - out_str = "-exception:\t" + fline2 + "\t" + gline2 - out.write(out_str + "\n") - continue - else: - matrix[matrix <= 2] = 0 # leave only 1-to-1 alignment points - if check_monotonicity(matrix): - targets = get_targets(matrix, dsttokens, delimiter=target_inner_delimiter) - targets_from_back = get_targets_from_back(matrix, dsttokens, delimiter=target_inner_delimiter) - if len(targets) != len(srctokens): - raise ValueError( - "targets length doesn't match srctokens length: len(targets)=" - + str(len(targets)) - + "; len(srctokens)=" - + str(len(srctokens)) - ) - leftside_align = " ".join(targets) - rightside_align = " ".join(targets_from_back) - - rightside_align = rightside_align.replace(" _11100_", "_11 100_") - leftside_align = leftside_align.replace(" _11100_", "_11 100_") - - # _1 4000_ => _14 000_ - # 1 5,000 => 15 ,000 - rightside_align = re.sub(r"^_1 ([\d])(,?000)", r"_1\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r"^_1 ([\d])(,?000)", r"_1\g<1> \g<2>", leftside_align) - - # "_2 10 0_" => "_2 100_" - rightside_align = re.sub(r"([\d]) 10 0_", r"\g<1> 100_", rightside_align) - leftside_align = re.sub(r"([\d]) 10 0_", r"\g<1> 100_", leftside_align) - - if srctokens[0] in [ - "ten", - "twenty", - "thirty", - "forty", - "fifty", - "sixty", - "seventy", - "eighty", - "ninety", - ]: - # ten thousand sixty _1 00 60_ => _10 0 60_ - rightside_align = re.sub(r"^(_\d) 00 (\d)", r"\g<1>0 0 \g<2>", rightside_align) - leftside_align = re.sub(r"^(_\d) 00 (\d)", r"\g<1>0 0 \g<2>", leftside_align) - - # ten thousand sixty three _1 0, 06 3_ => _10 ,0 6 3_ - rightside_align = re.sub(r"([ _]\d) 0, 0(\d)", r"\g<1>0 ,0 \g<2>", rightside_align) - leftside_align = re.sub(r"([ _]\d) 0, 0(\d)", r"\g<1>0 ,0 \g<2>", leftside_align) - - # _3 0, 7 7 4=> _30 , 7 7 4_ - rightside_align = re.sub(r"(\d) 0, ", r"\g<1>0 , ", rightside_align) - leftside_align = re.sub(r"(\d) 0, ", r"\g<1>0 , ", leftside_align) - - # _1 1, 1 40_ => _11 , 1 40_ - rightside_align = re.sub(r"1 1, (\d)", r"11 , \g<1>", rightside_align) - leftside_align = re.sub(r"1 1, (\d)", r"11 , \g<1>", leftside_align) - - if re.match(r".+надцат", srctokens[0]) or srctokens[0] in [ - "ten", - "eleven", - "twelve", - "thirteen", - "fourteen", - "fifteen", - "sixteen", - "seventeen", - "eighteen", - "nineteen", - ]: - # "_1 12 14_" -> "_11 2 14_" - rightside_align = re.sub( - r"^(_1) () ([\d])([\d])", r"\g<1>\g<3> \g<2> 
\g<4>", rightside_align - ) - leftside_align = re.sub( - r"^(_1) () ([\d])([\d])", r"\g<1>\g<3> \g<2> \g<4>", leftside_align - ) - - # "_1 10 10_" -> "_11 0 10_" - rightside_align = re.sub(r"^_1 ([\d])0 ([\d] ?[\d])", r"_1\g<1> 0 \g<2>", rightside_align) - leftside_align = re.sub(r"^_1 ([\d])0 ([\d] ?[\d])", r"_1\g<1> 0 \g<2>", leftside_align) - - if args.giza_dir.endswith("decimal") and args.lang == "ru": - # "_1 0, 5_" => "_10 , 5_" #десять целых и пять десятых - rightside_align = re.sub( - r"(\d) () ([0123456789])(,) ([\d])", r"\g<1>\g<3> \g<2> \g<4> \g<5>", rightside_align - ) - leftside_align = re.sub( - r"(\d) () ([0123456789])(,) ([\d])", r"\g<1>\g<3> \g<2> \g<4> \g<5>", leftside_align - ) - - if args.giza_dir.endswith("decimal") and args.lang == "en": - # "_7 0. 7_" => _70 . 7_ - rightside_align = re.sub(r"^(_\d) 0\. (\d)", r"\g<1>0 . \g<2>", rightside_align) - leftside_align = re.sub(r"^(_\d) 0\. (\d)", r"\g<1>0 . \g<2>", leftside_align) - - if args.giza_dir.endswith("money") and args.lang == "en": - # "_1 , 000__£<<" => "_1 ,000_ _£<<" - rightside_align = re.sub(r"(\d) , 000_(_[£$€])", r"\g<1> ,000_ \g<2>", rightside_align) - leftside_align = re.sub(r"(\d) , 000_(_[£$€])", r"\g<1> ,000_ \g<2>", leftside_align) - - if args.giza_dir.endswith("money"): - # "_5 000000__иен_" => "_5 000000_ _иен_" - rightside_align = re.sub( - r"([\d]) 000000_(_[^\d])", r"\g<1> 000000_ \g<2>", rightside_align - ) - leftside_align = re.sub(r"([\d]) 000000_(_[^\d])", r"\g<1> 000000_ \g<2>", leftside_align) - - # _5_ _m__£<< => "_5_ _m_ _£<<" - rightside_align = re.sub( - r"([\d]_) (_[mk]_)(_[^\d])", r"\g<1> \g<2> \g<3>", rightside_align - ) - leftside_align = re.sub(r"([\d]_) (_[mk]_)(_[^\d])", r"\g<1> \g<2> \g<3>", leftside_align) - - # "_3 0__m__£<<" => "_30 _m_ _£<<" - rightside_align = re.sub( - r"([\d]) 0_(_[mk]_)(_[^\d])", r"\g<1>0 \g<2> \g<3>", rightside_align - ) - leftside_align = re.sub( - r"([\d]) 0_(_[mk]_)(_[^\d])", r"\g<1>0 \g<2> \g<3>", leftside_align - ) - - # "_15 000__руб._" => "_15 000_ _руб._" - rightside_align = re.sub(r"([\d]) (000_)(_[^\d])", r"\g<1> \g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) (000_)(_[^\d])", r"\g<1> \g<2> \g<3>", leftside_align) - - # "_2 5 0 000__$<<" => "_2 50 000_ _$<<" - rightside_align = re.sub(r"([\d]) 0 000_(_[^\d])", r"\g<1>0 000_ \g<2>", rightside_align) - leftside_align = re.sub(r"([\d]) 0 000_(_[^\d])", r"\g<1>0 000_ \g<2>", leftside_align) - - # "_5 0 0000__$_" => "_500 000_ _$_" - rightside_align = re.sub(r"([\d]) 0 0000_(_[^\d])", r"\g<1>00 000_ \g<2>", rightside_align) - leftside_align = re.sub(r"([\d]) 0 0000_(_[^\d])", r"\g<1>00 000_ \g<2>", leftside_align) - - # "_1 000__руб._" => "_1000_ _руб._" - rightside_align = re.sub(r"_1 000_(_[^\d])", r"_1000_ \g<1>", rightside_align) - leftside_align = re.sub(r"_1 000_(_[^\d])", r"_1000_ \g<1>", leftside_align) - - # replace cases like "2 0__января" with "20_ _января" - leftside_align = re.sub(r"([\d]) (00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", leftside_align) - rightside_align = re.sub(r"([\d]) (00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", rightside_align) - - # "_3 0__september_ _2 014_" => "_30_ _september_ _2 014_" - # "_3 00__тыс.__руб._" => "_300_ _тыс.__руб._" - leftside_align = re.sub( - r"([\d]) (00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", leftside_align - ) - rightside_align = re.sub( - r"([\d]) (00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", rightside_align - ) - - # "_october_ _2 0,2 015_" => "_october_ _20 ,2 015_" - leftside_align = re.sub(r"([\d]) (0),(\d)", r"\g<1>\g<2> ,\g<3>", leftside_align) 
- rightside_align = re.sub(r"([\d]) (0),(\d)", r"\g<1>\g<2> ,\g<3>", rightside_align) - - # "_3 0_.10. _1 9 4 3_" => "_30_ .10. _1 9 4 3_" - leftside_align = re.sub(r"([\d]) (0_)(\.[\d])", r"\g<1>\g<2> \g<3>", leftside_align) - rightside_align = re.sub(r"([\d]) (0_)(\.[\d])", r"\g<1>\g<2> \g<3>", rightside_align) - - # replace cases like "_1 0000_" with "_10 000_" - # replace cases like "_5 00000_" with "_500 000_" - rightside_align = re.sub(r"([\d]) ([0][0]?)(000000000_)", r"\g<1>\g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) ([0][0]?)(000000000_)", r"\g<1>\g<2> \g<3>", leftside_align) - rightside_align = re.sub(r"([\d]) ([0][0]?)(000000_)", r"\g<1>\g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) ([0][0]?)(000000_)", r"\g<1>\g<2> \g<3>", leftside_align) - rightside_align = re.sub(r"([\d]) ([0][0]?)(000_)", r"\g<1>\g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) ([0][0]?)(000_)", r"\g<1>\g<2> \g<3>", leftside_align) - - # "_4 00,000_" -> "_400 ,000_" - rightside_align = re.sub(r"([\d]) ([0][0]?),(000_)", r"\g<1>\g<2> ,\g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) ([0][0]?),(000_)", r"\g<1>\g<2> ,\g<3>", leftside_align) - - # "_9 3 ,0__²_> _км_" => "_9 3 ,0__²_> _км_" - rightside_align = re.sub(r"([\d]) (,00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) (,00?_)(_[^\d])", r"\g<1>\g<2> \g<3>", leftside_align) - - # "_0 , 01__г_" => "_0 , 01 _г_" - rightside_align = re.sub( - r"(,) 01_(_[^\d])", r"\g<1> 01_ \g<2>", rightside_align - ) - leftside_align = re.sub( - r"(,) 01_(_[^\d])", r"\g<1> 01_ \g<2>", leftside_align - ) - - # "_0 , 7 6 1__км_" => "_0 , 7 6 1_ _км_" - rightside_align = re.sub( - r"(,) (\d) (\d) 1_(_[^\d])", - r"\g<1> \g<2> \g<3> 1_ \g<4>", - rightside_align, - ) - leftside_align = re.sub( - r"(,) (\d) (\d) 1_(_[^\d])", - r"\g<1> \g<2> \g<3> 1_ \g<4>", - leftside_align, - ) - - # "_5 0000__рублей_" => "_50 000_ рублей" - rightside_align = re.sub( - r"([\d]) ([0][0]?)(000_)(_)", r"\g<1>\g<2> \g<3> \g<4>", rightside_align - ) - leftside_align = re.sub( - r"([\d]) ([0][0]?)(000_)(_)", r"\g<1>\g<2> \g<3> \g<4>", leftside_align - ) - - # "_1 115_" -> "_1 1 15_" - rightside_align = re.sub(r" ([1])([1][\d])", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" ([1])([1][\d])", r"\g<1> \g<2>", leftside_align) - - # "_1 990-х_" -> "_1 9 90-х_" - rightside_align = re.sub(r" (9)(90)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (9)(90)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (8)(80)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (8)(80)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (7)(70)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (7)(70)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (6)(60)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (6)(60)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (5)(50)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (5)(50)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (4)(40)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (4)(40)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (3)(30)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (3)(30)", r"\g<1> \g<2>", leftside_align) - rightside_align = re.sub(r" (2)(20)", r"\g<1> \g<2>", rightside_align) - leftside_align = re.sub(r" (2)(20)", r"\g<1> \g<2>", 
leftside_align) - - # восемь ноль ноль ноль ноль ноль ноль ноль _8 0 0 0 0 0 0 0_ - # _8 0000000_ - rightside_align = re.sub( - r" 0000000_", - r"0 0 0 0 0 0 0_", - rightside_align, - ) - leftside_align = re.sub( - r" 0000000_", - r"0 0 0 0 0 0 0_", - leftside_align, - ) - - # _8 000000_ - rightside_align = re.sub( - r" 000000_", r"0 0 0 0 0 0_", rightside_align - ) - leftside_align = re.sub( - r" 000000_", r"0 0 0 0 0 0_", leftside_align - ) - - # _8 00000_ - rightside_align = re.sub(r" 00000_", r"0 0 0 0 0_", rightside_align) - leftside_align = re.sub(r" 00000_", r"0 0 0 0 0_", leftside_align) - - # _8 0000_ - rightside_align = re.sub(r" 0000_", r"0 0 0 0_", rightside_align) - leftside_align = re.sub(r" 0000_", r"0 0 0 0_", leftside_align) - - # _8 000_ - rightside_align = re.sub(r" 000_", r"0 0 0_", rightside_align) - leftside_align = re.sub(r" 000_", r"0 0 0_", leftside_align) - - # "_2 010/11" => "_2 0 10 /11" - rightside_align = re.sub( - r" (0)([1][\d])/([\d])", r"\g<1> \g<2> /\g<3>", rightside_align - ) - leftside_align = re.sub( - r" (0)([1][\d])/([\d])", r"\g<1> \g<2> /\g<3>", leftside_align - ) - - # "_2 0 11/12_" => "_2 0 11 /12_" - rightside_align = re.sub(r" ([\d]+)/([\d])", r"\g<1> /\g<2>", rightside_align) - leftside_align = re.sub(r" ([\d]+)/([\d])", r"\g<1> /\g<2>", leftside_align) - - # "_2 0 1 0/2 0 11_" => "_2 0 10 /2 0 11_" - rightside_align = re.sub(r"([\d]) ([\d]+)/([\d])", r"\g<1>\g<2> /\g<3>", rightside_align) - leftside_align = re.sub(r"([\d]) ([\d]+)/([\d])", r"\g<1>\g<2> /\g<3>", leftside_align) - - # "_5 0%_" => "_50 %_" - # "_1 00%_" => "_100 %_" - # "_1 00,00%_" => "_100,00 %_" - rightside_align = re.sub(r"([\d]) ([0,]+)%", r"\g<1>\g<2> %", rightside_align) - leftside_align = re.sub(r"([\d]) ([0,]+)%", r"\g<1>\g<2> %", leftside_align) - - # ATTENTION: keep the order of next two rules - # "_2 0½_" => "_20 ½_" - rightside_align = re.sub(r"([\d]) ([\d]+)½", r"\g<1>\g<2> ½", rightside_align) - leftside_align = re.sub(r"([\d]) ([\d]+)½", r"\g<1>\g<2> ½", leftside_align) - # "_1 ½_ " => "_1 ½_" #одна целая и одна вторая - rightside_align = re.sub( - r"([\d]) (_?½_)? ", - r"\g<1> \g<2>", - rightside_align, - ) - leftside_align = re.sub( - r"([\d]) (_?½_)? 
", - r"\g<1> \g<2>", - leftside_align, - ) - - if args.lang == "en" and srctokens[-1] == "half": - # _2 1/ 2_ => _2 ½_ - rightside_align = re.sub(r"(\d) 1/ 2_$", r"\g<1> ½_", rightside_align) - leftside_align = re.sub(r"(\d) 1/ 2_$", r"\g<1> ½_", leftside_align) - - # "_1 50_ _тыс.__руб._" => "_1 50_ _тыс._ _руб._" - rightside_align = re.sub(r"_ (_[^\d]+_)(_[^\d]+_)", r"_ \g<1> \g<2>", rightside_align) - leftside_align = re.sub(r"_ (_[^\d]+_)(_[^\d]+_)", r"_ \g<1> \g<2>", leftside_align) - - # _1000 000__$_ => "_1000000_ _$_" - rightside_align = re.sub(r"_1000 000_(_[^\d])", r"_1000000_ \g<1>", rightside_align) - leftside_align = re.sub(r"_1000 000_(_[^\d])", r"_1000000_ \g<1>", leftside_align) - - if args.giza_dir.endswith("date") and args.lang == "en": - # "_1 2_ _november_ _2 014_" => " _12_ _november_ _2 014_" - if srctokens[0] == "the": - leftside_align = re.sub(r"^_1 (\d_)", r" _1\g<1>", leftside_align) - rightside_align = re.sub(r"^_1 (\d_)", r" _1\g<1>", rightside_align) - - # " _12,2012_" => "_12_ ,20 12_" - leftside_align = re.sub(r"^ _12,2012_", r"_12_ ,20 12_", leftside_align) - rightside_align = re.sub(r"^ _12,2012_", r"_12_ ,20 12_", rightside_align) - - # " _1,20 14_" => "_1 ,20 14_" - leftside_align = re.sub(r"^ _1,(\d)", r"_1 ,\g<1>", leftside_align) - rightside_align = re.sub(r"^ _1,(\d)", r"_1 ,\g<1>", rightside_align) - - # "_2 1,20 14_" => "_2 1 ,20 14_" - leftside_align = re.sub(r" 1,(\d)", r"1 ,\g<1>", leftside_align) - rightside_align = re.sub(r" 1,(\d)", r"1 ,\g<1>", rightside_align) - - # _11,19 9 7_ => _11 ,19 9 7_ - leftside_align = re.sub(r" _11,(\d)", r"_11 ,\g<1>", leftside_align) - rightside_align = re.sub(r" _11,(\d)", r"_11 ,\g<1>", rightside_align) - - if len(srctokens) >= 2 and srctokens[-2] == "twenty": - # " _12,200 9_" => "_12 ,20 09_" - leftside_align = re.sub( - r"^ _12,200 (\d_)", r"_12_ ,20 0\g<1>", leftside_align - ) - rightside_align = re.sub( - r"^ _12,200 (\d_)", r"_12_ ,20 0\g<1>", rightside_align - ) - - # "_april_ _2 015_" => "_april_ _20 15_" - leftside_align = re.sub(r"2 0(\d\d_)$", r"20 \g<1>", leftside_align) - rightside_align = re.sub(r"2 0(\d\d_)$", r"20 \g<1>", rightside_align) - elif len(srctokens) >= 2 and srctokens[-2] == "thousand": - # " _12,200 9_" => "_12 ,2 00 9_" - leftside_align = re.sub( - r"^ _12,200 (\d_)", r"_12_ ,2 00 \g<1>", leftside_align - ) - rightside_align = re.sub( - r"^ _12,200 (\d_)", r"_12_ ,2 00 \g<1>", rightside_align - ) - - # thirtieth twenty fifteen _3 0th__,20 15_ => _30th_ _,20 15_ - leftside_align = re.sub(r"(\d) 0th_(_,\d)", r"\g<1>0th_ \g<2>", leftside_align) - rightside_align = re.sub(r"(\d) 0th_(_,\d)", r"\g<1>0th_ \g<2>", rightside_align) - - if args.giza_dir.endswith("date") and args.lang == "ru": - # тысяча девятьсот шестидесятого года _1 9 6 0_ => _1 9 60_ - if srctokens[-1] == "года": - leftside_align = re.sub(r"(\d) 0_", r"\g<1>0_ ", leftside_align) - rightside_align = re.sub(r"(\d) 0_", r"\g<1>0_ ", rightside_align) - - if args.giza_dir.endswith("time"): - if srctokens[-1] == "hundred": - # fifteen hundred _15:00_ - rightside_align = re.sub(r" (_\d\d:)00_", r"\g<1> 00_", rightside_align) - leftside_align = re.sub(r" (_\d\d:)00_", r"\g<1> 00_", leftside_align) - - # !! 
Do not change the order of next two rules - # twenty one hundred _2 1:00_ - rightside_align = re.sub(r"(_\d) (\d:)00_ ", r"\g<1> \g<2> 00_", rightside_align) - leftside_align = re.sub(r"(_\d) (\d:)00_ ", r"\g<1> \g<2> 00_", leftside_align) - # twenty hundred _2 0:00_ - rightside_align = re.sub(r"(_\d) (\d:)00_", r"\g<1>\g<2> 00_", rightside_align) - leftside_align = re.sub(r"(_\d) (\d:)00_", r"\g<1>\g<2> 00_", leftside_align) - - if srctokens[-1] == "o'clock": - # nine o'clock _09:00_ => "_09:00_ " - rightside_align = re.sub(r"^ ([^ ])$", r"\g<1> ", rightside_align) - leftside_align = re.sub(r"^ ([^ ])$", r"\g<1> ", leftside_align) - - # "_1 1:3 3_" => "_11: 3 3_" - rightside_align = re.sub(r"_(\d) (\d:)(\d)", r"\g<1>\g<2> \g<3>", rightside_align) - leftside_align = re.sub(r"_(\d) (\d:)(\d)", r"\g<1>\g<2> \g<3>", leftside_align) - - ban = False - if args.giza_dir.endswith("ordinal"): - if dsttokens[0] == "_—": # тысяча девятьсот сорок пятом _— 1 9 4 5_ - ban = True - - # ban roman numbers with at least two symbols, because we do not split them to parts - for t in rightside_align.split(): - if re.match(r"^_?[ivxl][ivxl]+_?$", t): - ban = True - - # ban cases like "_11/05/2013_", "_2005-11-25_", because they are source of incorrect alignments - if args.giza_dir.endswith("date") and args.lang == "en": - if "/" in rightside_align or "-" in rightside_align: - ban = True - - # ban brackets - if "(" in rightside_align or ")" in rightside_align: - ban = True - - if ban: - out_str = ( - "ban:\t" - + " ".join(srctokens) - + "\t" - + " ".join(dsttokens) - + "\t" - + leftside_align - + "\t" - + rightside_align - ) - else: - out_str = ( - "good:\t" - + " ".join(srctokens) - + "\t" - + " ".join(dsttokens) - + "\t" - + leftside_align - + "\t" - + rightside_align - ) - out.write(out_str + "\n") - cache[cache_key] = out_str - else: - out_str = "-mon:\t" + " ".join(srctokens) + "\t" + " ".join(dsttokens) - out.write(out_str + "\n") - cache[cache_key] = out_str - not_mono_count += 1 - - f.close() - g.close() - out.close() - - -# Main code -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/filter_sentences_with_errors.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/filter_sentences_with_errors.py deleted file mode 100644 index 3376a28905dde0f9b434049c7328f1e8a8579688..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/filter_sentences_with_errors.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script is used to filter sentences containing bad examples from Google TN Dataset. 
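The error vocabulary this script expects is a plain TSV with one erroneous conversion per line, in the same `class<TAB>spoken<TAB>written` layout as the corpus_errors.ru entries shown earlier. A minimal sketch of how such a file maps onto the tuple keys used by the code below (the file name is only an assumption for illustration):

```
# Illustrative only: build the same kind of lookup set that main() below constructs
# from --errors_vocab_filename; the path here is a hypothetical example.
error_vcb = set()
with open("corpus_errors.ru", "r", encoding="utf-8") as f:
    for line in f:
        cls, spoken, written = line.strip().split("\t")
        error_vcb.add((cls, spoken, written))

# A dataset line "CARDINAL<TAB>132<TAB>два" produces the key ("CARDINAL", "два", "132");
# if that key is present in error_vcb, filter_file() discards the whole sentence.
```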
-""" - -from argparse import ArgumentParser -from os import listdir, mkdir -from os.path import exists, isfile, join -from typing import Set - -parser = ArgumentParser(description="Filter Google TN Dataset by error vocabulary") -parser.add_argument( - "--data_dir", required=True, type=str, help='Path to data directory with files like output-00000-of-00100.tsv' -) -parser.add_argument( - "--out_dir", required=True, type=str, help='Output data directory, same files (with some sentences filtered)' -) -parser.add_argument("--errors_vocab_filename", required=True, type=str, help='File with error vocabulary') -parser.add_argument("--lang", required=True, type=str, help="Language") -args = parser.parse_args() - - -def filter_file(inp_filename: str, out_filename: str, error_vcb: Set) -> None: - """Filter out whole sentences containing bad itn conversions. The output format is the same as input. - - Args: - inp_filename: Name of input file in Google TN Dataset format. - out_filename: Name of output file in Google TN Dataset format. - error_vcb: Set of tuples with erroneous conversion, e.g. ("CARDINAL", "two", "132") - """ - out = open(out_filename, "w", encoding="utf-8") - sent_lines = [] - sent_is_ok = True - with open(inp_filename, "r", encoding="utf-8") as f: - for line in f: - sent_lines.append(line.strip()) - if line.startswith(""): - if sent_is_ok and len(sent_lines) > 1: # there should be at least one line except - out.write("\n".join(sent_lines) + "\n") - sent_lines = [] - sent_is_ok = True - else: - cls, written, spoken = line.strip().split("\t") - k = (cls, spoken.casefold(), written.casefold()) - if k in error_vcb: - sent_is_ok = False - out.close() - - -def main() -> None: - if not exists(args.data_dir): - raise ValueError(f"Data dir {args.data_dir} does not exist") - - # load errors vocabulary - error_vcb = set() - with open(args.errors_vocab_filename, "r", encoding="utf-8") as f: - for line in f: - cls, spoken, written = line.strip().split("\t") - k = (cls, spoken, written) - error_vcb.add(k) - - for subdir in listdir(args.data_dir): - mkdir(join(args.out_dir, subdir)) - for filename in listdir(join(args.data_dir, subdir)): - if not filename.startswith('output'): - continue - inp_filename = join(args.data_dir, subdir, filename) - out_filename = join(args.out_dir, subdir, filename) - if not isfile(inp_filename): - continue - filter_file(inp_filename, out_filename, error_vcb) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py deleted file mode 100644 index bfeb1d3508b4c1eb0e0043e082b8bc01ad03f5dd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This script can be used to get the label vocabulary from the train and dev labeled files.
-"""
-
-import sys
-from argparse import ArgumentParser
-from collections import Counter
-
-parser = ArgumentParser(description="Get label vocab")
-parser.add_argument("--train_filename", required=True, type=str, help='File with training data')
-parser.add_argument("--dev_filename", required=True, type=str, help='File with development data')
-parser.add_argument("--out_filename", required=True, type=str, help='Output file')
-args = parser.parse_args()
-
-vocab = Counter()
-
-n = 0
-for fn in [args.train_filename, args.dev_filename]:
-    with open(fn, "r", encoding="utf-8") as f:
-        for line in f:
-            parts = line.strip().split("\t")
-            if len(parts) < 2:
-                print("Warning: bad format in line: " + str(n) + ": " + line, file=sys.stderr)
-                continue
-            tags = parts[1].split(" ")
-            for t in tags:
-                if t == "<SELF>":
-                    vocab["KEEP"] += 1
-                elif t == "<DELETE>":
-                    vocab["DELETE"] += 1
-                else:
-                    vocab["DELETE|" + t] += 1
-            n += 1
-
-print("len(vocab)=", len(vocab))
-with open(args.out_filename, "w", encoding="utf-8") as out:
-    out.write("KEEP\n")
-    out.write("DELETE\n")
-    for t, freq in vocab.most_common(10000000):
-        if t == "KEEP":
-            continue
-        if t == "DELETE":
-            continue
-        out.write(t + "\n")
diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py
deleted file mode 100644
index 33608b529c7016c8c43b66d68ba4ae6c82987e39..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This script can be used to combine joined GIZA++ alignments and the Google Text Normalization dataset
-to produce a training corpus for the ThutmoseTaggerModel.
-""" - -import glob -import os -from argparse import ArgumentParser -from collections import Counter -from typing import Dict, Optional, TextIO, Tuple - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import get_src_and_dst_for_alignment -from nemo.utils import logging - -parser = ArgumentParser(description="Produce data for the ThutmoseTaggerModel") -parser.add_argument( - "--mode", - required=True, - type=str, - help='Mode, one of ["get_replacement_vocab", "filter_by_vocab", "get_labeled_corpus"]', -) -parser.add_argument( - "--data_dir", required=True, type=str, help='Path to data directory with files like output-00000-of-00100.tsv' -) -parser.add_argument( - "--giza_dir", required=True, type=str, help='Path to directory with class folders like ordinal, date etc' -) -parser.add_argument( - "--alignment_filename", required=True, type=str, help='Name of alignment file, like "itn.out", "itn.out.vocab2000"' -) -parser.add_argument("--out_filename", required=True, type=str, help='Output file') -parser.add_argument("--vocab_filename", required=True, type=str, help='Vocab name') -parser.add_argument("--lang", required=True, type=str, help="Language") -args = parser.parse_args() - - -def process_file_itn(inputname: str, out: TextIO, keys2replacements: Dict[str, str]) -> None: - """Processes one file in Google TN Dataset format to get the labeled data for ThutmoseTaggerModel - - Args: - inputname: name of input file - out: output stream - keys2replacements: Mapping from (semiotic class, spoken, written) to the segmented written form, - which is aligned one-to-one to spoken words (this is the result obtained from Giza++ alignment pipeline) - - """ - words = [] - tags = [] - semiotic_info = [] - sent_is_ok = True - with open(inputname, "r", encoding="utf-8") as f: - for line in f: - if line.startswith(""): - if sent_is_ok and len(words) > 0: - out.write(" ".join(words) + "\t" + " ".join(tags) + "\t" + ";".join(semiotic_info) + "\n") - words = [] - tags = [] - semiotic_info = [] - sent_is_ok = True - else: - cls, written, spoken = line.strip().split("\t") - if spoken == "sil": - continue - if spoken == "": - words.append(written.casefold()) - tags.append("") - continue - src, dst, same_begin, same_end = get_src_and_dst_for_alignment( - cls.casefold(), written, spoken, args.lang - ) - same_from_begin = [] if same_begin == "" else same_begin.split(" ") - same_from_end = [] if same_end == "" else same_end.split(" ") - key = cls.casefold() + "\t" + src + "\t" + dst - if key in keys2replacements: - replacements = keys2replacements[key].split(" ") - spoken_words = dst.split(" ") - for w, r in zip( - same_from_begin + spoken_words + same_from_end, same_from_begin + replacements + same_from_end - ): - words.append(w) - if cls == "LETTERS" or cls == "PLAIN": - if w == r: - tags.append("") - else: - tags.append(r) - elif w == r.replace("_", ""): - tags.append("") - else: - tags.append(r) - semiotic_info.append( - cls - + " " - + str(len(words) - len(spoken_words) - len(same_from_begin) - len(same_from_end)) - + " " - + str(len(words)) - ) - else: - sent_is_ok = False - - -def process_line(semiotic_class: str, line: str) -> Optional[Tuple[str, str, str, int]]: - """A helper function to read the file with alignment results""" - - parts = line.strip().split("\t") - if len(parts) != 6: - return None - freq = int(parts[0]) - if parts[1] != "good:": - return None - - src, dst, leftside_align, rightside_align = parts[2], parts[3], parts[4], parts[5] - align = rightside_align - if semiotic_class 
== "letters" or semiotic_class == "plain": - align = leftside_align - - return src, dst, align, freq - - -def get_replacement_vocab() -> None: - """Loops through the files with alignment results in each semiotic class subfolder, counts frequencies of different - replacement segments. - """ - - full_vocab = Counter() - alignment_files = glob.glob(args.giza_dir + "/*/" + args.alignment_filename) - for fn in alignment_files: - fn_parts = fn.split("/") - if len(fn_parts) < 2: - raise ValueError("Bad filename: " + fn) - semiotic_class = fn_parts[-2] - class_vocab = Counter() - with open(fn, "r", encoding="utf-8") as f: - for line in f: - t = process_line(semiotic_class, line) - if t is None: - continue - src, dst, replacement, freq = t - inputs = src.split(" ") - replacements = replacement.split(" ") - if len(inputs) != len(replacements): - raise ValueError("Length mismatch in: " + line) - for inp, rep in zip(inputs, replacements): - if inp == rep: # skip same words - continue - full_vocab[rep] += freq - class_vocab[rep] += freq - with open(args.vocab_filename + "." + semiotic_class, "w", encoding="utf-8") as out: - for k, v in class_vocab.most_common(1000000000): - out.write(k + "\t" + str(v) + "\n") - - with open(args.vocab_filename, "w", encoding="utf-8") as out: - for k, v in full_vocab.most_common(1000000000): - out.write(k + "\t" + str(v) + "\n") - - -def filter_by_vocab() -> None: - """Given a restricted vocabulary of replacements, - loops through the files with alignment results in each semiotic class subfolder, - discards the examples containing a replacement which is not in our restricted vocabulary. - """ - - if not os.path.exists(args.vocab_filename): - raise ValueError(f"Alignments dir {args.giza_dir} does not exist") - # load vocab from file - vocab = {} - with open(args.vocab_filename, "r", encoding="utf-8") as f: - for line in f: - k, v = line.strip().split("\t") - vocab[k] = int(v) - print("len(vocab)=", len(vocab)) - alignment_files = glob.glob(args.giza_dir + "/*/" + args.alignment_filename) - for fn in alignment_files: - fn_parts = fn.split("/") - if len(fn_parts) < 2: - raise ValueError("Bad filename: " + fn) - semiotic_class = fn_parts[-2] - out = open(args.giza_dir + "/" + semiotic_class + "/" + args.out_filename, "w", encoding="utf-8") - with open(fn, "r", encoding="utf-8") as f: - for line in f: - t = process_line(semiotic_class, line) - if t is None: - continue - src, dst, replacement, freq = t - ok = True - for s, r in zip(src.split(" "), replacement.split(" ")): - if s != r and r not in vocab: - ok = False - if ok: - out.write(semiotic_class + "\t" + src + "\t" + dst + "\t" + replacement + "\n") - out.close() - - -def get_labeled_corpus() -> None: - """Loops through the files with alignment results in each semiotic class subfolder, - collects a mapping from (semiotic class, spoken, written) to the segmented written form, - which is aligned one-to-one to spoken words. - Then loops through the files in Google TN Dataset format to get the labeled data for ThutmoseTaggerModel. - It extracts the whole sentences and substitutes the semiotic spans to their aligned form from the dictionary. 
- """ - - if not os.path.exists(args.data_dir): - raise ValueError(f"Data dir {args.data_dir} does not exist") - - keys2replacements = {} - alignment_files = glob.glob(args.giza_dir + "/*/" + args.alignment_filename) - if len(alignment_files) == 0: - raise ValueError("Did not found any such files: " + args.giza_dir + "/*/" + args.alignment_filename) - for af in alignment_files: - with open(af, "r", encoding="utf-8") as f: - for line in f: - cls, src, dst, replacements = line.strip().split("\t") - key = cls + "\t" + dst + "\t" + src - if key in keys2replacements and keys2replacements[key] != replacements: - logging.warning("keys2replacements[key] != replacements", keys2replacements[key], replacements) - keys2replacements[key] = replacements - print("size of phrase-to-replacements dictionary =", len(keys2replacements)) - out = open(args.out_filename, "w", encoding="utf-8") - input_paths = sorted([os.path.join(args.data_dir, f) for f in os.listdir(args.data_dir)]) - for inputname in input_paths: - process_file_itn(inputname, out, keys2replacements) - out.close() - - -def main() -> None: - if not os.path.exists(args.giza_dir): - raise ValueError(f"Alignments dir {args.giza_dir} does not exist") - - if args.mode == "get_replacement_vocab": - get_replacement_vocab() - elif args.mode == "filter_by_vocab": - filter_by_vocab() - elif args.mode == "get_labeled_corpus": - get_labeled_corpus() - else: - raise ValueError("unknown mode: " + args.mode) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py deleted file mode 100644 index 9fe64c1105b8b5680b9561071435c59c048f0156..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used to after google_data_preprocessing_before_alignment.py -to obtain separate "parallel" corpora for each semiotic class. - -USAGE Example: -1. Download the Google TN dataset from https://www.kaggle.com/google-nlu/text-normalization -2. Unzip the English subset (e.g., by running `tar zxvf en_with_types.tgz`). - Then there will a folder named `en_with_types`. -3. Run python google_data_preprocessing_before_alignment.py - which will produce a file data.tsv in its --output-dir -4. [Optional]. sort -u and rewrite data.tsv -5. Clone https://github.com/moses-smt/giza-pp.git, run "make" from its root folder. -6. 
Run this script - python ${NEMO}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py \ - --data_dir=<--output-dir from the previous step> \ - --out_dir= \ - --giza_dir=/.../giza-pp/GIZA++-v2 \ - --mckls_binary=/.../giza-pp/mkcls-v2/mkcls \ - --lang={en,ru} - - -Each corpus will be stored within <--data-dir> in the subdirectory with the name of the semiotic class, - containing files ready to be fed to Giza++: - src - written form, tokenized as characters - dst - spoken form, tokenized as words - run.sh - script for running Giza++ - -""" -from argparse import ArgumentParser -from collections import Counter -from os import listdir, mkdir -from os.path import isdir, join -from shutil import rmtree - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import get_src_and_dst_for_alignment - -parser = ArgumentParser(description='Split corpus to subcorpora for giza alignment') -parser.add_argument('--data_dir', type=str, required=True, help='Path to folder with data') -parser.add_argument('--out_dir', type=str, required=True, help='Path to output folder') -parser.add_argument('--giza_dir', type=str, required=True, help='Path to folder with GIZA++ binaries') -parser.add_argument('--mckls_binary', type=str, required=True, help='Path to mckls binary') -parser.add_argument('--lang', type=str, required=True, help='Language') -args = parser.parse_args() - - -def prepare_subcorpora_from_data() -> None: - """Preprocess a corpus in Google TN Dataset format, extract TN-ITN phrase pairs, prepare input for GIZA++ alignment. - """ - semiotic_vcb = Counter() - cache_vcb = {} - filenames = [] - for fn in listdir(args.data_dir + "/train"): - filenames.append(args.data_dir + "/train/" + fn) - for fn in listdir(args.data_dir + "/dev"): - filenames.append(args.data_dir + "/dev/" + fn) - for fn in filenames: - with open(fn, "r", encoding="utf-8") as f: - # Loop through each line of the file - for line in f: - parts = line.strip().split("\t") - if len(parts) < 3: - continue - if len(parts) != 3: - raise ValueError("Expect 3 parts, got " + str(len(parts))) - semiotic_class, written, spoken = parts[0], parts[1].strip(), parts[2].strip() - if spoken == "": - continue - semiotic_class = semiotic_class.casefold() - semiotic_vcb[semiotic_class] += 1 - classdir = join(args.out_dir, semiotic_class) - if not isdir(classdir): - mkdir(classdir) - src, dst, _, _ = get_src_and_dst_for_alignment(semiotic_class, written, spoken, args.lang) - if src == "" or dst == "": - continue - if len(src.split(" ")) >= 100: - continue - if semiotic_class not in cache_vcb: - cache_vcb[semiotic_class] = Counter() - cache_vcb[semiotic_class][(src, dst)] += 1 - for sem in semiotic_vcb: - classdir = join(args.out_dir, sem) - if not isdir(classdir): - raise ValueError("No such directory: " + classdir) - print(classdir, " has ", semiotic_vcb[sem], " instances") - with open(join(classdir, "run.sh"), "w") as out: - out.write("GIZA_PATH=\"" + args.giza_dir + "\"\n") - out.write("MKCLS=\"" + args.mckls_binary + "\"\n") - out.write("\n") - out.write("${GIZA_PATH}/plain2snt.out src dst\n") - out.write("${MKCLS} -m2 -psrc -c15 -Vsrc.classes opt >& mkcls1.log\n") - out.write("${MKCLS} -m2 -pdst -c15 -Vdst.classes opt >& mkcls2.log\n") - out.write("${GIZA_PATH}/snt2cooc.out src.vcb dst.vcb src_dst.snt > src_dst.cooc\n") - out.write( - "${GIZA_PATH}/GIZA++ -S src.vcb -T dst.vcb -C src_dst.snt -coocurrencefile src_dst.cooc -p0 0.98 -o GIZA++ >& GIZA++.log\n" - ) - out.write("##reverse direction\n") - 
out.write("${GIZA_PATH}/snt2cooc.out dst.vcb src.vcb dst_src.snt > dst_src.cooc\n") - out.write( - "${GIZA_PATH}/GIZA++ -S dst.vcb -T src.vcb -C dst_src.snt -coocurrencefile dst_src.cooc -p0 0.98 -o GIZA++reverse >& GIZA++reverse.log\n" - ) - out_src = open(join(classdir, "src"), 'w', encoding="utf-8") - out_dst = open(join(classdir, "dst"), 'w', encoding="utf-8") - out_freq = open(join(classdir, "freq"), 'w', encoding="utf-8") - for src, dst in cache_vcb[sem]: - freq = cache_vcb[sem][(src, dst)] - out_src.write(src + "\n") - out_dst.write(dst + "\n") - out_freq.write(str(freq) + "\n") - out_freq.close() - out_dst.close() - out_src.close() - - -# Main code -if __name__ == '__main__': - for name in listdir(args.out_dir): - path = join(args.out_dir, name) - if isdir(path): - rmtree(path) - - # Processing - prepare_subcorpora_from_data() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py deleted file mode 100644 index e3bf0103390ffbffcf870729533a9268e8168995..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used to sample each label from the labeled files. 
-""" - -import sys -from argparse import ArgumentParser -from collections import Counter - -parser = ArgumentParser(description="Sample labels") -parser.add_argument("--filename", required=True, type=str, help='File with input data') -parser.add_argument("--max_count", required=True, type=int, help='Count') -args = parser.parse_args() - - -vocab = Counter() - -out_sample = open(args.filename + ".sample_" + str(args.max_count), "w", encoding="utf-8") -out_rest = open(args.filename + ".rest_" + str(args.max_count), "w", encoding="utf-8") - -n = 0 -with open(args.filename, "r", encoding="utf-8") as f: - for line in f: - parts = line.strip().split("\t") - if len(parts) < 2: - print("Warning: bad format in line: " + str(n) + ": " + line, file=sys.stderr) - continue - - tags = parts[1].split(" ") - ok = False - for t in tags: - if t not in vocab: - vocab[t] = 0 - if vocab[t] < args.max_count: - ok = True - vocab[t] += 1 - if ok: - out_sample.write(line) - else: - out_rest.write(line) - n += 1 - -out_sample.close() -out_rest.close() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval.py deleted file mode 100644 index 69521f9dde404a8a3a675293e82a8b761fc59cac..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used to compare the inference output of Thutmose tagger with multi_reference file - -USAGE Example: - python eval.py \ - --inference_file= \ - --reference_file= \ - --print_other_errors - -The inference file is a tsv file in which the first column contains the predicted sentence text. -The reference file is a tsv file in which - the first column contains the input sentence text, - the second column contains the reference sentence text (taken from Google TN dataset) - the third column (optional) contains additional acceptable references for semiotic spans in this sentence. - E.g. - mizoguchi akiko september twenty ten mizoguchi akiko september 2010 DATE 2 5 | sept 2010 | sep. 2010 ... - (to get a reference file see the last steps in examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh, - starting from ".../examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py" - ) - -The script outputs the following metrics: - Word Error Rate (WER) - an automatic metric commonly used in ASR. - It does not take into account additional references. - Sentence accuracy: - The sentence is regarded as correct if its characters (without spaces) match to the reference, - It takes into account additional references. - - If at least one digital character doesn't match this sentence is regarded as containing Digit Error. 
- If all digital character match, but at least one non-digital character doesn't match - this sentence is regarded as containing Other Error. -""" - - -import re -from argparse import ArgumentParser - -from nemo.collections.asr.metrics.wer import word_error_rate - -parser = ArgumentParser(description="Compare inference output with multi-reference") -parser.add_argument("--inference_file", type=str, required=True, help="Path to inference file") -parser.add_argument( - "--print_other_errors", - action='store_true', - help="Whether to print other errors, if false only digit errors will be printed", -) -parser.add_argument("--reference_file", type=str, required=True, help="Path to reference file") -args = parser.parse_args() - -# Main code -if __name__ == "__main__": - inputs = [] - references = [] # list(size=len(inputs)) of lists - skip_ids = set() # sentences ids to be skipped during evaluation - with open(args.reference_file, "r", encoding="utf-8") as f: - for line in f: - multi_references = [] - parts = line.strip().split("\t") - if len(parts) < 2 or len(parts) > 3: - raise ValueError("Bad format: " + line) - words = parts[0].split() - inputs.append(words) - if len(parts) == 3: # there are non-trivial semiotic spans - multi_references.append("") - input_position = 0 - if "TELEPHONE" in parts[2] or "ELECTRONIC" in parts[2]: - skip_ids.add(len(references)) - spans = parts[2].split(";") - multi_references_updated = [] - for span in spans: - span_parts = span.split(" | ") - try: - sem, begin, end = span_parts[0].split(" ") - except Exception: - print("error: ", line) - continue - begin = int(begin) - end = int(end) - for ref in multi_references: - if len(span_parts) > 20 or len(multi_references_updated) > 20000: - print("warning: too many references: ", inputs[-1]) - break - for tr_variant in span_parts[1:]: - multi_references_updated.append( - ref - + " " - + " ".join(inputs[-1][input_position:begin]) # copy needed words from input - + " " - + tr_variant - ) - multi_references = multi_references_updated[:] # copy - multi_references_updated = [] - input_position = end - for i in range(len(multi_references)): # copy needed words from the input end - multi_references[i] += " " + " ".join(inputs[-1][input_position : len(inputs[-1])]) - # the last reference added is the actual one - multi_references.append(parts[1]) - references.append(multi_references) - - predictions = [] - predicted_tags = [] - predicted_semiotic = [] - # load predictions - with open(args.inference_file, "r", encoding="utf-8") as f: - for line in f: - parts = line.strip().split("\t") - if len(parts) == 1: - predictions.append(parts[0].casefold()) - predicted_tags.append([]) - continue - if len(parts) != 5: - raise ValueError("Bad format: " + line) - prediction, inp_str, tag_str, tags_with_swap_str, semiotic = parts - predictions.append(prediction.casefold()) - tags = tag_str.split(" ") - predicted_tags.append(tags) - predicted_semiotic.append(semiotic) - - sentences_with_errors_on_digits = 0 - correct_sentences_disregarding_space = 0 - - if len(inputs) != len(predictions) or len(inputs) != len(references): - raise ValueError( - "Length mismatch: len(inputs)=" - + str(len(inputs)) - + "; len(predictions)=" - + str(len(predictions)) - + "; len(references)=" - + str(len(references)) - ) - - refs_for_wer = [] - preds_for_wer = [] - for i in range(len(inputs)): - ok_digit = False - ok_all = False - if i in skip_ids: - continue - refs_for_wer.append(references[i][-1]) - preds_for_wer.append(predictions[i]) - for ref in 
references[i]: - ref_digit_fragments = re.findall(r"\d+", ref) - pred_digit_fragments = re.findall(r"\d+", predictions[i]) - if "".join(pred_digit_fragments) == "".join(ref_digit_fragments): - ok_digit = True - if predictions[i].replace("_", "").replace(" ", "") == ref.replace("_", "").replace(" ", ""): - ok_all = True - if not ok_digit: - print("digit error:") - print("\tinput=", " ".join(inputs[i])) - print("\ttags=", " ".join(predicted_tags[i])) - print("\tpred=", predictions[i]) - print("\tsemiotic=", predicted_semiotic[i]) - print("\tref=", references[i][-1]) # last reference is actual reference - sentences_with_errors_on_digits += 1 - elif ok_all: - correct_sentences_disregarding_space += 1 - elif args.print_other_errors: - print("other error:") - print("\tinput=", " ".join(inputs[i])) - print("\ttags=", " ".join(predicted_tags[i])) - print("\tpred=", predictions[i]) - print("\tsemiotic=", predicted_semiotic[i]) - print("\tref=", references[i][-1]) # last reference is actual reference - - wer = word_error_rate(refs_for_wer, preds_for_wer) - print("WER: ", wer) - print( - "Sentence accuracy: ", - correct_sentences_disregarding_space / (len(inputs) - len(skip_ids)), - correct_sentences_disregarding_space, - ) - print( - "digit errors: ", - sentences_with_errors_on_digits / (len(inputs) - len(skip_ids)), - sentences_with_errors_on_digits, - ) - print( - "other errors: ", - (len(inputs) - len(skip_ids) - correct_sentences_disregarding_space - sentences_with_errors_on_digits) - / (len(inputs) - len(skip_ids)), - len(inputs) - len(skip_ids) - correct_sentences_disregarding_space - sentences_with_errors_on_digits, - ) diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py deleted file mode 100644 index 1ed55ecac948fb03cae35935bf29747fabc14617..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used to compare the inference output of Thutmose tagger with multi_reference file. -The additional report is stored to a separate file for each semiotic class. - -USAGE Example: - python eval_per_class.py \ - --inference_file= \ - --reference_file= \ - --output_file= - -The inference file is a tsv file in which the first column contains the predicted sentence text. -The reference file is a tsv file in which - the first column contains the input sentence text, - the second column contains the reference sentence text (taken from Google TN dataset) - the third column (optional) contains additional acceptable references for semiotic spans in this sentence. - E.g. - mizoguchi akiko september twenty ten mizoguchi akiko september 2010 DATE 2 5 | sept 2010 | sep. 2010 ... 
- -The script generates: - a file with report on accuracy per semiotiotic class (output_file). - files (.) with sentences, containing errors in this semiotic span. - -""" -import glob -import os -from argparse import ArgumentParser -from collections import Counter - -parser = ArgumentParser(description="Compare inference output with multi-reference, print report per class") -parser.add_argument("--inference_file", type=str, required=True, help="Path to inference file 1") -parser.add_argument("--reference_file", type=str, required=True, help="Path to reference file") -parser.add_argument("--output_file", type=str, required=True, help="Path to output file") -args = parser.parse_args() - -# Main code -if __name__ == '__main__': - - # delete all class-specific reports, as they are created in the append mode - for f in glob.glob(args.output_file + ".*"): - os.remove(f) - - total_count = Counter() - correct_count = Counter() - - f_ref = open(args.reference_file, "r", encoding="utf-8") - f_infer = open(args.inference_file, "r", encoding="utf-8") - f_out = open(args.output_file, "w", encoding="utf-8") - lines_ref = f_ref.readlines() - lines_infer = f_infer.readlines() - f_ref.close() - f_infer.close() - if len(lines_ref) != len(lines_infer): - raise ValueError( - "Number of lines doesn't match: len(lines_ref)=" - + str(len(lines_ref)) - + "; len(lines_infer)=" - + str(len(lines_infer)) - ) - for i in range(len(lines_infer)): - _, inp_str, _, tag_with_swap_str, semiotic = lines_infer[i].strip().split("\t") - input_words = inp_str.split(" ") - predicted_tags = tag_with_swap_str.split(" ") - predicted_words = predicted_tags[:] - for k in range(len(predicted_tags)): - t = predicted_tags[k] - if t == "": - predicted_words[k] = input_words[k] - elif t == "": - predicted_words[k] = "" - else: - predicted_words[k] = predicted_words[k].replace(">", "").replace("<", "") - - parts = lines_ref[i].strip().split("\t") - if len(parts) < 2 or len(parts) > 3: - raise ValueError("Bad format: " + lines_ref[i]) - if len(parts) == 3: # there are non-trivial semiotic spans - spans = parts[2].split(";") - for span in spans: - span_parts = span.split(" | ") - try: - sem, begin, end = span_parts[0].split(" ") - except Exception: - print("error: ", lines_ref[i]) - continue - begin = int(begin) - end = int(end) - - ok = False - predicted_span = " ".join(predicted_words[begin:end]).replace("_", " ").replace(" ", "").casefold() - input_span = " ".join(input_words[begin:end]) - total_count[sem] += 1 - for tr_variant in span_parts[1:]: - ref_span = tr_variant.replace("_", " ").replace(" ", "").casefold() - if ref_span == predicted_span: - ok = True - correct_count[sem] += 1 - break - if not ok: - out_sem = open(args.output_file + "." 
+ sem, "a", encoding="utf-8") - out_sem.write( - "error: pred=" - + " ".join(predicted_words[begin:end]) - + "; inp=" - + input_span - + "; ref=" - + span - + "\n" - ) - out_sem.write("\tinput=" + " ".join(input_words) + "\n") - out_sem.write("\ttags=" + " ".join(predicted_tags) + "\n") - out_sem.write("\tpred=" + " ".join(predicted_words) + "\n") - out_sem.write("\tsemiotic=" + semiotic + "\n") - out_sem.write("\tref=" + parts[1] + "\n") - out_sem.close() - - f_out.write("class\ttotal\tcorrect\terrors\taccuracy\n") - for sem in total_count: - f_out.write( - sem - + "\t" - + str(total_count[sem]) - + "\t" - + str(correct_count[sem]) - + "\t" - + str(total_count[sem] - correct_count[sem]) - + "\t" - + str(correct_count[sem] / total_count[sem]) - + "\n" - ) - f_out.close() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py deleted file mode 100644 index 43b0dd5dc4dcf57062dbc18c977cb74c31a34b66..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -""" -This script can be used to construct a vocabulary of multiple references -""" -from argparse import ArgumentParser -from collections import Counter -from os import listdir - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import spoken_preprocessing - -parser = ArgumentParser(description="Get reference vocabulary from corpus (it will be used in testing)") -parser.add_argument("--data_dir", type=str, required=True, help="Path to folder with data") -parser.add_argument("--out_filename", type=str, required=True, help="Path to output file") -args = parser.parse_args() - -if __name__ == "__main__": - - vcb = {} - filenames = [] - for fn in listdir(args.data_dir + "/train"): - filenames.append(args.data_dir + "/train/" + fn) - for fn in listdir(args.data_dir + "/dev"): - filenames.append(args.data_dir + "/dev/" + fn) - for fn in filenames: - print("Processing ", fn) - with open(fn, "r", encoding="utf-8") as f: - for line in f: - parts = line.strip().split("\t") - if len(parts) < 3: - continue - if len(parts) != 3: - raise ValueError("Expect 3 parts, got " + str(len(parts))) - semiotic_class, written, spoken = parts[0], parts[1].strip().casefold(), parts[2].strip().casefold() - spoken = spoken_preprocessing(spoken) - if spoken == "": - continue - if spoken == "" or written == "": - continue - if len(spoken.split(" ")) >= 100: - continue - k = (semiotic_class, spoken) - if k not in vcb: - vcb[k] = Counter() - vcb[k][written] += 1 - - with open(args.out_filename, "w", encoding="utf-8") as out: - for sem, spoken in vcb: - for written in vcb[(sem, spoken)]: - out.write(sem + "\t" + spoken + "\t" + written + "\t" + str(vcb[(sem, spoken)][written]) + "\n") - out.write(sem + "\t" + spoken + "\t" + spoken + "\t1\n") diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py deleted file mode 100644 index 10cb9162202a50a5ca9046231e5a92615f333167..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script can be used to prepare test corpus for the ThutmoseTaggerModel from Google Text Normalization dataset. 
-""" - -import os -import re -from argparse import ArgumentParser -from collections import Counter -from typing import Dict, TextIO, Tuple - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import spoken_preprocessing - -parser = ArgumentParser(description="Text Normalization Data Preprocessing for English") -parser.add_argument( - "--data_dir", required=True, type=str, help="Path to data directory with files like output-00000-of-00100.tsv" -) -parser.add_argument("--reference_vocab", required=True, type=str, help="Multi Reference vocabulary") -parser.add_argument("--output_file", required=True, type=str, help="Output file") -parser.add_argument( - "--sampling_count", required=True, type=int, help="Number of examples per class, you want, use -1 for all examples" -) -args = parser.parse_args() - - -def process_file( - inputname: str, - out: TextIO, - out_raw: TextIO, - reference_vcb: Dict[Tuple[str, str], Dict[str, int]], - sampling_vcb: Dict[str, int], -) -> None: - words = [] - reference_words = [] # size may be different - semiotic_info = [] - raw_lines = [] - sent_ok = True if args.sampling_count == -1 else False - with open(inputname, "r", encoding="utf-8") as f: - for line in f: - if line.startswith(""): - if len(words) > 0 and sent_ok: - out.write( - " ".join(words) + "\t" + " ".join(reference_words) + "\t" + ";".join(semiotic_info) + "\n" - ) - out_raw.write("\n".join(raw_lines) + "\n" + line) - words = [] - reference_words = [] - semiotic_info = [] - raw_lines = [] - sent_ok = True if args.sampling_count == -1 else False - else: - raw_lines.append(line.strip()) - cls, written, spoken = line.strip().split("\t") - spoken = spoken_preprocessing(spoken) - written = written.casefold() - references = set() - if spoken == "sil": - continue - if spoken == "": - words.append(written) - reference_words.append(written) - # if reference is , but the word has itn conversions in our dictionary, add them - for cls in ["CARDINAL", "ORDINAL", "DATE"]: # date, ex sixties -> 60s - k = (cls, written) - if k in reference_vcb: - for tr_variant in reference_vcb[k]: - references.add(tr_variant) - semiotic_info.append( - cls - + " " - + str(len(words) - 1) - + " " - + str(len(words)) - + " | " - + " | ".join(references) - ) - break - continue - - spoken_words = spoken.split() - words.extend(spoken_words) - - k = (cls, spoken) - if k in reference_vcb: - for tr_variant in reference_vcb[k]: - references.add(tr_variant) - references.add(spoken) - references.add(written) - for tr_variant in list(references): - # 6,51 km² => 6,51 km 2 - (tr_variant2, n2) = re.subn(r"²", " 2", tr_variant) - (tr_variant3, n3) = re.subn(r"³", " 3", tr_variant) - if n2 > 0: - references.add(tr_variant2) - if n3 > 0: - references.add(tr_variant3) - - semiotic_info.append( - cls - + " " - + str(len(words) - len(spoken_words)) - + " " - + str(len(words)) - + " | " - + " | ".join(list(references)) - ) - reference_words.append(written.casefold()) - - if cls not in sampling_vcb: - sampling_vcb[cls] = 0 - if sampling_vcb[cls] < args.sampling_count: - sent_ok = True - sampling_vcb[cls] += 1 - - -def main() -> None: - if not os.path.exists(args.data_dir): - raise ValueError(f"Data dir {args.data_dir} does not exist") - reference_vcb = {} - with open(args.reference_vocab, "r", encoding="utf-8") as f: - for line in f: - sem, spoken, written, freq = line.strip().split("\t") - k = (sem, spoken) - if k not in reference_vcb: - reference_vcb[k] = {} - reference_vcb[k][written] = int(freq) - sampling_vcb = Counter() - out = 
open(args.output_file, "w", encoding="utf-8") - out_raw = open(args.output_file + ".raw", "w", encoding="utf-8") - input_paths = sorted([os.path.join(args.data_dir, f) for f in os.listdir(args.data_dir)]) - for inputname in input_paths: - process_file(inputname, out, out_raw, reference_vcb, sampling_vcb) - out.close() - out_raw.close() - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/helpers.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/helpers.py deleted file mode 100644 index 347b05b25fba3e87fbb06950ffae10cace84a237..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/helpers.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from typing import Tuple - -import pytorch_lightning as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import ThutmoseTaggerModel -from nemo.utils import logging - -__all__ = ["ITN_MODEL", "MODEL_NAMES", "instantiate_model_and_trainer"] - -ITN_MODEL = "itn" -MODEL_NAMES = [ITN_MODEL] - - -def instantiate_model_and_trainer( - cfg: DictConfig, model_name: str, do_training: bool -) -> Tuple[pl.Trainer, ThutmoseTaggerModel]: - """ Function for instantiating a model and a trainer - Args: - cfg: The config used to instantiate the model and the trainer. - model_name: A str indicates the model direction, currently only 'itn'. - do_training: A boolean flag indicates whether the model will be trained or evaluated. - - Returns: - trainer: A PyTorch Lightning trainer - model: A ThutmoseTaggerModel - """ - - if model_name not in MODEL_NAMES: - raise ValueError(f"{model_name} is unknown model type") - - # Get configs for the corresponding models - trainer_cfg = cfg.get("trainer") - model_cfg = cfg.get("model") - pretrained_cfg = cfg.get("pretrained_model", None) - trainer = pl.Trainer(**trainer_cfg) - if not pretrained_cfg: - logging.info(f"Initializing {model_name} model") - if model_name == ITN_MODEL: - model = ThutmoseTaggerModel(model_cfg, trainer=trainer) - else: - raise ValueError(f"{model_name} is unknown model type") - elif os.path.exists(pretrained_cfg): - logging.info(f"Restoring pretrained {model_name} model from {pretrained_cfg}") - model = ThutmoseTaggerModel.restore_from(pretrained_cfg) - else: - logging.info(f"Loading pretrained model {pretrained_cfg}") - if model_name == ITN_MODEL: - if pretrained_cfg not in ThutmoseTaggerModel.get_available_model_names(): - raise ( - ValueError( - f"{pretrained_cfg} not in the list of available Tagger models." 
- f"Select from {ThutmoseTaggerModel.list_available_models()}" - ) - ) - model = ThutmoseTaggerModel.from_pretrained(pretrained_cfg) - else: - raise ValueError(f"{model_name} is unknown model type") - - # Setup train and validation data - if do_training: - model.setup_training_data(train_data_config=cfg.data.train_ds) - model.setup_validation_data(val_data_config=cfg.data.validation_ds) - - logging.info(f"Model {model_name} -- Device {model.device}") - return trainer, model diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/install_requirements.sh b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/install_requirements.sh deleted file mode 100644 index f54a6cb3f8faa04d2aa419dfd26e1cfc4881e9ad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/install_requirements.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -git clone https://github.com/moses-smt/giza-pp.git giza-pp -cd giza-pp -make -cd .. diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py deleted file mode 100644 index 0516f3bf5a8f352489414bb5aca30bfc5982e39f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to run itn inference with the ThutmoseTaggerModel. - -The inference works on a raw file (no labels required). -Each line of the input file represents a single example for inference. - Specify inference.from_file and inference.batch_size parameters. - -USAGE Example: -1. Train a model, or use a pretrained checkpoint. -2. Run: - export TOKENIZERS_PARALLELISM=false - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ - pretrained_model=./training.nemo \ - inference.from_file=./input.txt \ - inference.out_file=./output.txt \ - model.max_sequence_len=1024 #\ - inference.batch_size=128 - -This script uses the `/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. 
-""" - - -import os - -from helpers import ITN_MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.data.text_normalization_as_tagging.utils import spoken_preprocessing -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="thutmose_tagger_itn_config") -def main(cfg: DictConfig) -> None: - logging.debug(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - if cfg.pretrained_model is None: - raise ValueError("A pre-trained model should be provided.") - _, model = instantiate_model_and_trainer(cfg, ITN_MODEL, False) - - text_file = cfg.inference.from_file - logging.info(f"Running inference on {text_file}...") - if not os.path.exists(text_file): - raise ValueError(f"{text_file} not found.") - - with open(text_file, "r", encoding="utf-8") as f: - lines = f.readlines() - - batch_size = cfg.inference.get("batch_size", 8) - - batch, all_preds = [], [] - for i, line in enumerate(lines): - s = spoken_preprocessing(line) # this is the same input transformation as in corpus preparation - batch.append(s.strip()) - if len(batch) == batch_size or i == len(lines) - 1: - outputs = model._infer(batch) - for x in outputs: - all_preds.append(x) - batch = [] - if len(all_preds) != len(lines): - raise ValueError( - "number of input lines and predictions is different: predictions=" - + str(len(all_preds)) - + "; lines=" - + str(len(lines)) - ) - out_file = cfg.inference.out_file - with open(f"{out_file}", "w", encoding="utf-8") as f_out: - f_out.write("\n".join(all_preds)) - logging.info(f"Predictions saved to {out_file}.") - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py deleted file mode 100644 index 36fe97d2341b8eea4600bcb196e1023c1fa0ad18..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This script contains an example on how to train a ThutmoseTaggerModel for inverse text normalization(ITN). - -This script uses the `/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml` -config file by default. The other option is to set another config file via command -line arguments by `--config-name=CONFIG_FILE_PATH'. Probably it is worth looking -at the example config file to see the list of parameters used for training. - -USAGE Example: -1. Obtain a processed dataset -2. 
Run: - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_train.py \ - lang=${LANG} \ - data.validation_ds.data_path=${DATA_PATH}/valid.tsv \ - data.train_ds.data_path=${DATA_PATH}/train.tsv \ - data.train_ds.batch_size=128 \ - data.train_ds.num_workers=8 \ - model.language_model.pretrained_model_name=${LANGUAGE_MODEL} \ - model.label_map=${DATA_PATH}/label_map.txt \ - model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt \ - model.optim.lr=3e-5 \ - trainer.devices=[1] \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.strategy=ddp \ - trainer.max_epochs=5 - -Information on the arguments: - -Most arguments in the example config file are quite self-explanatory (e.g., -`model.optim.lr` refers to the learning rate for training the model). - -Some arguments we want to mention are: - -+ lang: The language of the dataset. -+ model.language_model.pretrained_model_name: This is the backbone BERT model (depends on the language) -e.g. bert-base-uncased (English), DeepPavlov/rubert-base-cased (Russian) -""" - -from helpers import ITN_MODEL, instantiate_model_and_trainer -from omegaconf import DictConfig, OmegaConf - -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="thutmose_tagger_itn_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - logging.info(f'Config Params: {OmegaConf.to_yaml(cfg)}') - - # Train the model - if cfg.model.do_training: - logging.info( - "================================================================================================" - ) - logging.info('Start training...') - trainer, model = instantiate_model_and_trainer(cfg, ITN_MODEL, True) - thutmose_tagger_exp_manager = cfg.get('exp_manager', None) - exp_manager(trainer, thutmose_tagger_exp_manager) - trainer.fit(model) - logging.info('Training finished!') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh deleted file mode 100644 index 54fcdd268a5e0eabfe63c62a27b3a57d74a4c4d9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh +++ /dev/null @@ -1,334 +0,0 @@ -#!/bin/bash - -## This bash-script reproduces the pipeline of data preparation for the Thutmose Tagger model (tagger-based ITN model) - -## In order to use it, you need: -## 1. install and compile GIZA++ -## git clone https://github.com/moses-smt/giza-pp.git giza-pp -## cd giza-pp -## make -## 2. Download Google TN Dataset -## https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish -## 3. install NeMo -## git clone https://github.com/NVIDIA/NeMo -## 4. Specify the following paths - -## path to NeMo repository, e.g. /home/user/nemo -NEMO_PATH= -## path to GIZA++, e.g. /home/user/giza-pp/GIZA++-v2 -GIZA_BIN_DIR= -## path to MCKLS_BINARY, e.g. /home/user/giza-pp/mkcls-v2/mkcls -MCKLS_BINARY= -## initial unzipped Google Text Normalization Dataset, e.g. 
/home/user/data/en_with_types -GOOGLE_CORPUS_DIR= - - -## corpus language -CORPUS_LANG=en - -WORK_DIR=`pwd` # directory from which this bash-script is run -echo "Working directory:" ${WORK_DIR} - -## names of working subfolders -CORPUS_DIR=${WORK_DIR}/corpus -ALIGNMENT_DIR=${WORK_DIR}/alignment - -## read the data and split it into train, dev, test -## files in test folder is truncated to match the default test dataset from the paper on Google TN Dataset -## option --add_test_full=true creates additional test_full folder, which is not truncated -python ${NEMO_PATH}/examples/nlp/duplex_text_normalization/data/data_split.py \ - --data_dir=${GOOGLE_CORPUS_DIR} \ - --output_dir=${CORPUS_DIR} \ - --lang=${CORPUS_LANG} \ - --add_test_full - -## we need only output-00099-of-00100.tsv as the final test data -rm ${CORPUS_DIR}/test/output-00095-of-00100.tsv ${CORPUS_DIR}/test/output-00096-of-00100.tsv ${CORPUS_DIR}/test/output-00097-of-00100.tsv ${CORPUS_DIR}/test/output-00098-of-00100.tsv - -## This script extracts all unique ITN phrase-pairs from the Google TN dataset, tokenizes them and stores in separate -## folders for each semiotic class. In each folder we generate a bash script for running the alignment. -mkdir ${ALIGNMENT_DIR} -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py \ - --data_dir=${CORPUS_DIR} \ - --out_dir=${ALIGNMENT_DIR} \ - --giza_dir=${GIZA_BIN_DIR} \ - --mckls_binary=${MCKLS_BINARY} \ - --lang=${CORPUS_LANG} - -##exclude punct class -rm -r ${ALIGNMENT_DIR}/punct - -## for better GIZA++ alignments mix in examples from other classes -## they will append to the tail of "src" and "dst" files and they will not have corresponding freqs in "freq" file -## all these appended lines will be skipped in the get_replacement_vocab step -for fn in "src" "dst" -do - cat ${ALIGNMENT_DIR}/money/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} > ${ALIGNMENT_DIR}/money/${fn}.new - - cat ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/measure/${fn}.new - - cat ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/fraction/${fn}.new - - cat ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/decimal/${fn}.new - -done - -for c in "decimal" "fraction" "measure" "money" -do - mv ${ALIGNMENT_DIR}/${c}/src.new ${ALIGNMENT_DIR}/${c}/src - mv ${ALIGNMENT_DIR}/${c}/dst.new ${ALIGNMENT_DIR}/${c}/dst -done - -for subfolder in ${ALIGNMENT_DIR}/* -do - echo ${subfolder} - chmod +x ${subfolder}/run.sh -done - -## Run alignment using multiple processes -for subfolder in ${ALIGNMENT_DIR}/* -do - cd ${subfolder} - ./run.sh & -done -wait - -## Extract final alignments for each semiotic class -for subfolder in ${ALIGNMENT_DIR}/* -do - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py \ - --mode=itn \ - --giza_dir=${subfolder} \ - --out_filename=itn.out \ - --giza_suffix=A3.final \ - --lang=en & -done -wait - -## add column with frequencies of phrase pairs in the corpus -for subfolder in ${ALIGNMENT_DIR}/* -do - paste -d"\t" ${subfolder}/freq 
${subfolder}/itn.out > ${subfolder}/itn.out2 - awk 'BEGIN {FS="\t"} match($3, " "){print $0}' < ${subfolder}/itn.out2 | sort -rn > ${subfolder}/itn.debug -done - -## loop through the obtained alignments and collect vocabularies (for each semiotic class) -## of all possible replacement fragments (aka tags) -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=get_replacement_vocab \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.out2 \ - --data_dir="" \ - --vocab_filename=${WORK_DIR}/replacement_vocab_full.txt \ - --out_filename="" \ - --lang=${CORPUS_LANG} - -## Here we put some voluntary thresholds on how many tags we take. -## Tags with low frequencies are likely to be derived from sporadic alignment mistakes -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.verbatim | head -n 108 > ${WORK_DIR}/replacement_vocab_verbatim.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.time | head -n 148 > ${WORK_DIR}/replacement_vocab_time.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.telephone | head -n 52 > ${WORK_DIR}/replacement_vocab_telephone.txt -head -n 0 ${WORK_DIR}/replacement_vocab_full.txt.plain > ${WORK_DIR}/replacement_vocab_plain.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.ordinal | head -n 251 > ${WORK_DIR}/replacement_vocab_ordinal.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.money | grep -v "a__" | head -n 532 > ${WORK_DIR}/replacement_vocab_money.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.measure | head -n 488 > ${WORK_DIR}/replacement_vocab_measure.txt -head -n 257 ${WORK_DIR}/replacement_vocab_full.txt.letters > ${WORK_DIR}/replacement_vocab_letters.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.fraction | head -n 169 > ${WORK_DIR}/replacement_vocab_fraction.txt -head -n 276 ${WORK_DIR}/replacement_vocab_full.txt.electronic > ${WORK_DIR}/replacement_vocab_electronic.txt -head -n 73 ${WORK_DIR}/replacement_vocab_full.txt.digit > ${WORK_DIR}/replacement_vocab_digit.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.decimal | head -n 149 > ${WORK_DIR}/replacement_vocab_decimal.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.date | grep -v "[0-9]-[0-9]" | grep -v "[0-9]\,[0-9]" | grep -v "[0-9]\.[0-9]" | grep -v "[0-9]\/[0-9]" | head -n 554 > ${WORK_DIR}/replacement_vocab_date.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.cardinal | head -n 402 > ${WORK_DIR}/replacement_vocab_cardinal.txt -head -n 137 ${WORK_DIR}/replacement_vocab_full.txt.address > ${WORK_DIR}/replacement_vocab_address.txt - -## concatenate all tags in a single vocabulary (repetitions don't matter) -cat ${WORK_DIR}/replacement_vocab_address.txt \ - ${WORK_DIR}/replacement_vocab_cardinal.txt \ - ${WORK_DIR}/replacement_vocab_date.txt \ - ${WORK_DIR}/replacement_vocab_decimal.txt \ - ${WORK_DIR}/replacement_vocab_digit.txt \ - ${WORK_DIR}/replacement_vocab_electronic.txt \ - ${WORK_DIR}/replacement_vocab_fraction.txt \ - ${WORK_DIR}/replacement_vocab_letters.txt \ - ${WORK_DIR}/replacement_vocab_measure.txt \ - ${WORK_DIR}/replacement_vocab_money.txt \ - ${WORK_DIR}/replacement_vocab_ordinal.txt \ - ${WORK_DIR}/replacement_vocab_plain.txt \ - ${WORK_DIR}/replacement_vocab_telephone.txt \ - ${WORK_DIR}/replacement_vocab_time.txt \ - ${WORK_DIR}/replacement_vocab_verbatim.txt > ${WORK_DIR}/replacement_vocab.select.txt - -## Here we loop once again through the alignments and discard those examples that are not fully covered -## by 
our restricted tag vocabulary -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=filter_by_vocab \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.out2 \ - --data_dir="" \ - --vocab_filename=${WORK_DIR}/replacement_vocab.select.txt \ - --out_filename=itn.select.out \ - --lang=${CORPUS_LANG} - -## We now have a large collection of ITN phrase conversions that we know how to tag. -## Once again we loop through the Google TN dataset and create tag-labeled datasets, containing full sentences. -## If a sentence contains something that we do not know how to tag, we discard the whole sentence. -for subset in "train" "dev" -do - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=get_labeled_corpus \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.select.out \ - --data_dir=${CORPUS_DIR}/${subset} \ - --vocab_filename="" \ - --out_filename=${CORPUS_DIR}/${subset}.labeled \ - --lang=${CORPUS_LANG} -done - -## Loop through the obtained datasets and get final tag vocabulary -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py \ - --train_filename=${CORPUS_DIR}/train.labeled \ - --dev_filename=${CORPUS_DIR}/dev.labeled \ - --out_filename=${CORPUS_DIR}/label_map.txt - -## The full dataset is very large, while some tags occur rarely. So we can try some sampling. -## Here we try to sample sentences that contain at least one tag that we have not yet seen at least for N times -## This script will split the input dataset into two parts: sampled sentences and the rest sentences. -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py \ - --filename=${CORPUS_DIR}/dev.labeled \ - --max_count=10 - -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py \ - --filename=${CORPUS_DIR}/train.labeled \ - --max_count=500 - -## Here we create final train and dev datasets, mixing sampled sentences and some quantity of the rest sentences. 
-mkdir ${WORK_DIR}/datasets -DATASET=${WORK_DIR}/datasets/itn_sample500k_rest1500k_select_vocab -mkdir $DATASET -cat ${CORPUS_DIR}/train.labeled.sample_500 > ${DATASET}/train.tsv -head -n 1500000 ${CORPUS_DIR}/train.labeled.rest_500 >> ${DATASET}/train.tsv -cat ${CORPUS_DIR}/dev.labeled.sample_10 > ${DATASET}/valid.tsv -head -n 12000 ${CORPUS_DIR}/dev.labeled.rest_10 >> ${DATASET}/valid.tsv -cp ${DATASET}/valid.tsv ${DATASET}/test.tsv - -## The model will also need a file with semiotic classes and label map (derived from tag vocabulary) -echo "ADDRESS" > ${CORPUS_DIR}/semiotic_classes.txt -echo "CARDINAL" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "DATE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "DECIMAL" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "DIGIT" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "ELECTRONIC" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "FRACTION" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "LETTERS" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "MEASURE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "MONEY" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "ORDINAL" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "PLAIN" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "PUNCT" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "TELEPHONE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "TIME" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "VERBATIM" >> ${CORPUS_DIR}/semiotic_classes.txt - -cp ${CORPUS_DIR}/label_map.txt ${WORK_DIR}/datasets/label_map.txt -cp ${CORPUS_DIR}/semiotic_classes.txt ${WORK_DIR}/datasets/semiotic_classes.txt - -## Now all data is ready to train the model. - -## We also prepare the test data to test the model after the training. -## The test data knows nothing about alignment, it only contains input sentences and references. - -## The original test data from Google TN Dataset contains a single reference for each ITN span. -## In order to take into account more than one acceptable variant, -## we prepare a dictionary of multiple possible references. -## The following script maps the whole input text of ITN span to the list of different conversions that occurred -## with this input anywhere in the Google TN Dataset. 
-python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py \ - --data_dir=${CORPUS_DIR} \ - --out_filename=${CORPUS_DIR}/reference_vocab.txt - -## Filter some errors from the obtained multi-reference vocabulary -grep -P "[\d] m[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MEASURE" | grep -v -P "^MONEY" > ${CORPUS_DIR}/reference_vocab.bad1 -grep -P "[\d] a[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MEASURE" > ${CORPUS_DIR}/reference_vocab.bad2 -grep -P "[\d] b[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MEASURE" | grep -v -P "^MONEY" > ${CORPUS_DIR}/reference_vocab.bad3 -grep -P "[\d] i[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MEASURE" > ${CORPUS_DIR}/reference_vocab.bad4 -grep -P "[\d] i\-[\t]" ${CORPUS_DIR}/reference_vocab.txt > ${CORPUS_DIR}/reference_vocab.bad5 -grep -P "[\d] us[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MONEY" | grep -v -P "TELEPHONE" > ${CORPUS_DIR}/reference_vocab.bad6 -grep -P "[\d] u\.s\.[\t]" ${CORPUS_DIR}/reference_vocab.txt | grep -v -P "^MEASURE" | grep -v -P "^MONEY" > ${CORPUS_DIR}/reference_vocab.bad7 -cat ${CORPUS_DIR}/reference_vocab.bad* > ${CORPUS_DIR}/reference_vocab.bad -grep -Fvxf ${CORPUS_DIR}/reference_vocab.bad ${CORPUS_DIR}/reference_vocab.txt > ${CORPUS_DIR}/reference_vocab.filt - -## Generate the "default" test data for Google TN Dataset (same as usually used in papers) -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py \ - --data_dir=${CORPUS_DIR}/test \ - --reference_vocab=${CORPUS_DIR}/reference_vocab.filt \ - --output_file=${WORK_DIR}/datasets/test.labeled \ - --sampling_count=-1 -awk 'BEGIN {FS="\t"}{print $1}' < ${WORK_DIR}/datasets/test.labeled > ${WORK_DIR}/datasets/test.input -awk 'BEGIN {FS="\t"}{print $1 "\t" $3}' < ${WORK_DIR}/datasets/test.labeled > ${WORK_DIR}/datasets/test.input_ref - -## Generate the "hard" test data for Google TN Dataset. -## We try to sample at least 1000 examples per semiotic class. 
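-## (Clarification, assuming the script's sampling semantics: --sampling_count=-1 above means no subsampling of the default test split, whereas --sampling_count=1000 below requests the per-class sample size from the larger test_full split.)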
-python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py \ - --data_dir=${CORPUS_DIR}/test_full \ - --reference_vocab=${CORPUS_DIR}/reference_vocab.txt \ - --output_file=${WORK_DIR}/datasets/test1000.labeled \ - --sampling_count=1000 -awk 'BEGIN {FS="\t"}{print $1}' < ${WORK_DIR}/datasets/test1000.labeled > ${WORK_DIR}/datasets/test1000.input -awk 'BEGIN {FS="\t"}{print $1 "\t" $3}' < ${WORK_DIR}/datasets/test1000.labeled > ${WORK_DIR}/datasets/test1000.input_ref - -## After we have train a model, we can run inference and evaluation like below - -##export TOKENIZERS_PARALLELISM=false -##PRETRAINED_MODEL=./nemo_experiments/training.nemo -### run inference on default Google Dataset test -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ -# pretrained_model=${PRETRAINED_MODEL} \ -# inference.from_file=${DATA_PATH}/test.input \ -# inference.out_file=./final_test.output \ -# model.max_sequence_len=1024 #\ -# inference.batch_size=128 -# -### run inference on "hard" test (sample of at least 1000 examples of each semiotic class) -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ -# pretrained_model=${PRETRAINED_MODEL} \ -# inference.from_file=${DATA_PATH}/test1000.input \ -# inference.out_file=./final_test1000.output \ -# model.max_sequence_len=1024 \ -# inference.batch_size=128 -# -### compare inference results to the reference -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval.py \ -# --reference_file=${DATA_PATH}/test.labeled \ -# --inference_file=final_test.output \ -# > final_test.report -# -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval.py \ -# --reference_file=${DATA_PATH}/test1000.labeled \ -# --inference_file=final_test1000.output \ -# --print_other_errors \ -# > final_test1000.report -# -### compare inference results to the reference, get separate report per semiotic class -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py \ -# --reference_file=${DATA_PATH}/test.labeled \ -# --inference_file=final_test.output \ -# --output_file=per_class.report -# -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py \ -# --reference_file=${DATA_PATH}/test1000.labeled \ -# --inference_file=final_test1000.output \ -# --output_file=per_class1000.report diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_ru.sh b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_ru.sh deleted file mode 100644 index b16fb00d24b262a84a43edb1d94dcb84c4478431..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/prepare_dataset_ru.sh +++ /dev/null @@ -1,331 +0,0 @@ -#!/bin/bash - -## This bash-script reproduces the pipeline of data preparation for the Thutmose Tagger model (tagger-based ITN model) - -## In order to use it, you need: -## 1. install and compile GIZA++ -## git clone https://github.com/moses-smt/giza-pp.git giza-pp -## cd giza-pp -## make -## 2. Download Google TN Dataset -## https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish -## 3. install NeMo -## git clone https://github.com/NVIDIA/NeMo -## 4. Specify the following paths - -## path to NeMo repository, e.g. /home/user/nemo -NEMO_PATH= -## path to GIZA++, e.g. 
/home/user/giza-pp/GIZA++-v2 -GIZA_BIN_DIR= -## path to MCKLS_BINARY, e.g. /home/user/giza-pp/mkcls-v2/mkcls -MCKLS_BINARY= -## initial unzipped Google Text Normalization Dataset, e.g. /home/user/data/ru_with_types -GOOGLE_CORPUS_DIR= - - -## corpus language -CORPUS_LANG=ru - -WORK_DIR=`pwd` # directory from which this bash-script is run -echo "Working directory:" ${WORK_DIR} - -## names of working subfolders -CORPUS_DIR=${WORK_DIR}/corpus -ALIGNMENT_DIR=${WORK_DIR}/alignment - -## read the data and split it into train, dev, test -## files in the test folder are truncated to match the default test dataset from the paper on the Google TN Dataset -## option --add_test_full=true creates an additional test_full folder, which is not truncated -python ${NEMO_PATH}/examples/nlp/duplex_text_normalization/data/data_split.py \ - --data_dir=${GOOGLE_CORPUS_DIR} \ - --output_dir=${CORPUS_DIR}_tmp \ - --lang=${CORPUS_LANG} \ - --add_test_full - -## we need only output-00099-of-00100.tsv as the final test data -rm ${CORPUS_DIR}_tmp/test/output-00095-of-00100.tsv ${CORPUS_DIR}_tmp/test/output-00096-of-00100.tsv ${CORPUS_DIR}_tmp/test/output-00097-of-00100.tsv ${CORPUS_DIR}_tmp/test/output-00098-of-00100.tsv - -## apply a blacklist of erroneous conversions to remove the sentences containing them from the corpus -mkdir ${CORPUS_DIR} -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/filter_sentences_with_errors.py \ - --data_dir=${CORPUS_DIR}_tmp \ - --out_dir=${CORPUS_DIR} \ - --lang=${CORPUS_LANG} \ - --errors_vocab_filename=${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/corpus_errors.${CORPUS_LANG} - -rm -r ${CORPUS_DIR}_tmp - -## This script extracts all unique ITN phrase-pairs from the Google TN dataset, tokenizes them and stores them in separate -## folders for each semiotic class. In each folder we generate a bash script for running the alignment.
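-## (Illustrative layout, inferred from the commands below: each class subfolder, e.g. ${ALIGNMENT_DIR}/cardinal, is expected to contain parallel src and dst files with the two sides of the phrase pairs, a freq file with their corpus frequencies, and a run.sh script that launches the GIZA++ alignment.)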
-mkdir ${ALIGNMENT_DIR} -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_for_alignment.py \ - --data_dir=${CORPUS_DIR} \ - --out_dir=${ALIGNMENT_DIR} \ - --giza_dir=${GIZA_BIN_DIR} \ - --mckls_binary=${MCKLS_BINARY} \ - --lang=${CORPUS_LANG} - -##exclude punct class -rm -r ${ALIGNMENT_DIR}/punct - -## for better GIZA++ alignments mix in examples from other classes -## they will append to the tail of "src" and "dst" files and they will not have corresponding freqs in "freq" file -## all these appended lines will be skipped in the get_replacement_vocab step -for fn in "src" "dst" -do - cat ${ALIGNMENT_DIR}/money/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} > ${ALIGNMENT_DIR}/money/${fn}.new - - cat ${ALIGNMENT_DIR}/time/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} > ${ALIGNMENT_DIR}/time/${fn}.new - - cat ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/measure/${fn}.new - - cat ${ALIGNMENT_DIR}/fraction/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/fraction/${fn}.new - - cat ${ALIGNMENT_DIR}/decimal/${fn} \ - ${ALIGNMENT_DIR}/cardinal/${fn} \ - ${ALIGNMENT_DIR}/measure/${fn} \ - ${ALIGNMENT_DIR}/money/${fn} > ${ALIGNMENT_DIR}/decimal/${fn}.new -done - -for c in "decimal" "fraction" "measure" "time" "money" -do - mv ${ALIGNMENT_DIR}/${c}/src.new ${ALIGNMENT_DIR}/${c}/src - mv ${ALIGNMENT_DIR}/${c}/dst.new ${ALIGNMENT_DIR}/${c}/dst -done - -for subfolder in ${ALIGNMENT_DIR}/* -do - echo ${subfolder} - chmod +x ${subfolder}/run.sh -done - -## Run alignment using multiple processes -for subfolder in ${ALIGNMENT_DIR}/* -do - cd ${subfolder} - ./run.sh & -done -wait - -## Extract final alignments for each semiotic class -for subfolder in ${ALIGNMENT_DIR}/* -do - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/extract_giza_alignments.py \ - --mode=itn \ - --giza_dir=${subfolder} \ - --out_filename=itn.out \ - --giza_suffix=A3.final \ - --lang=ru & -done -wait - -## add column with frequencies of phrase pairs in the corpus -for subfolder in ${ALIGNMENT_DIR}/* -do - paste -d"\t" ${subfolder}/freq ${subfolder}/itn.out > ${subfolder}/itn.out2 - awk 'BEGIN {FS="\t"} match($3, " "){print $0}' < ${subfolder}/itn.out2 | sort -rn > ${subfolder}/itn.debug -done - -## loop through the obtained alignments and collect vocabularies (for each semiotic class) -## of all possible replacement fragments (aka tags) -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=get_replacement_vocab \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.out2 \ - --data_dir="" \ - --vocab_filename=${WORK_DIR}/replacement_vocab_full.txt \ - --out_filename="" \ - --lang=${CORPUS_LANG} - -## Here we put some voluntary thresholds on how many tags we take. 
-## Tags with low frequencies are likely to be derived from sporadic alignment mistakes -head -n 57 ${WORK_DIR}/replacement_vocab_full.txt.verbatim > ${WORK_DIR}/replacement_vocab_verbatim.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.time | head -n 106 > ${WORK_DIR}/replacement_vocab_time.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.telephone | head -n 134 > ${WORK_DIR}/replacement_vocab_telephone.txt -head -n 763 ${WORK_DIR}/replacement_vocab_full.txt.plain > ${WORK_DIR}/replacement_vocab_plain.txt -head -n 1397 ${WORK_DIR}/replacement_vocab_full.txt.ordinal > ${WORK_DIR}/replacement_vocab_ordinal.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.money | grep -v '0\$' | head -n 294 > ${WORK_DIR}/replacement_vocab_money.txt -grep -v "0__" ${WORK_DIR}/replacement_vocab_full.txt.measure | head -n 508 > ${WORK_DIR}/replacement_vocab_measure.txt -cp ${WORK_DIR}/replacement_vocab_full.txt.letters ${WORK_DIR}/replacement_vocab_letters.txt -head -n 163 ${WORK_DIR}/replacement_vocab_full.txt.fraction > ${WORK_DIR}/replacement_vocab_fraction.txt -head -n 262 ${WORK_DIR}/replacement_vocab_full.txt.electronic > ${WORK_DIR}/replacement_vocab_electronic.txt -cp ${WORK_DIR}/replacement_vocab_full.txt.digit ${WORK_DIR}/replacement_vocab_digit.txt -head -n 270 ${WORK_DIR}/replacement_vocab_full.txt.decimal > ${WORK_DIR}/replacement_vocab_decimal.txt -head -n 271 ${WORK_DIR}/replacement_vocab_full.txt.date > ${WORK_DIR}/replacement_vocab_date.txt -head -n 455 ${WORK_DIR}/replacement_vocab_full.txt.cardinal > ${WORK_DIR}/replacement_vocab_cardinal.txt - -## concatenate all tags in a single vocabulary (repetitions don't matter) -cat ${WORK_DIR}/replacement_vocab_cardinal.txt \ - ${WORK_DIR}/replacement_vocab_date.txt \ - ${WORK_DIR}/replacement_vocab_decimal.txt \ - ${WORK_DIR}/replacement_vocab_digit.txt \ - ${WORK_DIR}/replacement_vocab_electronic.txt \ - ${WORK_DIR}/replacement_vocab_fraction.txt \ - ${WORK_DIR}/replacement_vocab_letters.txt \ - ${WORK_DIR}/replacement_vocab_measure.txt \ - ${WORK_DIR}/replacement_vocab_money.txt \ - ${WORK_DIR}/replacement_vocab_ordinal.txt \ - ${WORK_DIR}/replacement_vocab_plain.txt \ - ${WORK_DIR}/replacement_vocab_telephone.txt \ - ${WORK_DIR}/replacement_vocab_time.txt \ - ${WORK_DIR}/replacement_vocab_verbatim.txt > ${WORK_DIR}/replacement_vocab.select.txt - -## Here we loop once again through the alignments and discard those examples that are not fully covered -## by our restricted tag vocabulary -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=filter_by_vocab \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.out2 \ - --data_dir="" \ - --vocab_filename=${WORK_DIR}/replacement_vocab.select.txt \ - --out_filename=itn.select.out \ - --lang=${CORPUS_LANG} - -## We now have a large collection of ITN phrase conversions that we know how to tag. -## Once again we loop through the Google TN dataset and create tag-labeled datasets, containing full sentences. -## If a sentence contains something that we do not know how to tag, we discard the whole sentence. 
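-## Optional sanity check (illustrative): after the loop below finishes, you can see how many sentences survived the filtering, e.g. -# wc -l ${CORPUS_DIR}/train.labeled ${CORPUS_DIR}/dev.labeled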
-for subset in "train" "dev" -do - python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/prepare_corpora_after_alignment.py \ - --mode=get_labeled_corpus \ - --giza_dir=${ALIGNMENT_DIR} \ - --alignment_filename=itn.select.out \ - --data_dir=${CORPUS_DIR}/${subset} \ - --vocab_filename="" \ - --out_filename=${CORPUS_DIR}/${subset}.labeled \ - --lang=${CORPUS_LANG} -done - -## Loop through the obtained datasets and get final tag vocabulary -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/get_label_vocab.py \ - --train_filename=${CORPUS_DIR}/train.labeled \ - --dev_filename=${CORPUS_DIR}/dev.labeled \ - --out_filename=${CORPUS_DIR}/label_map.txt - -## The full dataset is very large, while some tags occur rarely. So we can try some sampling. -## Here we try to sample sentences that contain at least one tag that we have not yet seen at least for N times -## This script will split the input dataset into two parts: sampled sentences and the rest sentences. -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py \ - --filename=${CORPUS_DIR}/dev.labeled \ - --max_count=10 - -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/dataset_preparation/sample_each_label.py \ - --filename=${CORPUS_DIR}/train.labeled \ - --max_count=500 - -## Here we create final train and dev datasets, mixing sampled sentences and some quantity of the rest sentences. -mkdir ${WORK_DIR}/datasets -DATASET=${WORK_DIR}/datasets/itn_sample500k_rest1500k_select_vocab -mkdir $DATASET -cat ${CORPUS_DIR}/train.labeled.sample_500 > ${DATASET}/train.tsv -head -n 1500000 ${CORPUS_DIR}/train.labeled.rest_500 >> ${DATASET}/train.tsv -cat ${CORPUS_DIR}/dev.labeled.sample_10 > ${DATASET}/valid.tsv -head -n 12000 ${CORPUS_DIR}/dev.labeled.rest_10 >> ${DATASET}/valid.tsv -cp ${DATASET}/valid.tsv ${DATASET}/test.tsv - -echo "CARDINAL" > ${CORPUS_DIR}/semiotic_classes.txt -echo "DATE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "DECIMAL" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "DIGIT" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "ELECTRONIC" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "FRACTION" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "LETTERS" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "MEASURE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "MONEY" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "ORDINAL" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "PLAIN" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "PUNCT" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "TELEPHONE" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "TIME" >> ${CORPUS_DIR}/semiotic_classes.txt -echo "VERBATIM" >> ${CORPUS_DIR}/semiotic_classes.txt - -cp ${CORPUS_DIR}/label_map.txt ${WORK_DIR}/datasets/label_map.txt -cp ${CORPUS_DIR}/semiotic_classes.txt ${WORK_DIR}/datasets/semiotic_classes.txt - -## Now all data is ready to train the model. - -## We also prepare the test data to test the model after the training. -## The test data knows nothing about alignment, it only contains input sentences and references. - -## The original test data from Google TN Dataset contains a single reference for each ITN span. -## In order to take into account more than one acceptable variant, -## we prepare a dictionary of multiple possible references. -## The following script maps the whole input text of ITN span to the list of different conversions that occurred -## with this input anywhere in the Google TN Dataset. 
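-## (Note: the resulting ${CORPUS_DIR}/reference_vocab.txt is passed below as --reference_vocab to prepare_corpora_for_testing.py; unlike the English pipeline above, no extra filtering of the reference vocabulary is applied here.)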
-python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py \ - --data_dir=${CORPUS_DIR} \ - --out_filename=${CORPUS_DIR}/reference_vocab.txt - -## Generate the "default" test data for Google TN Dataset (same as usually used in papers) -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py \ - --data_dir=${CORPUS_DIR}/test \ - --reference_vocab=${CORPUS_DIR}/reference_vocab.txt \ - --output_file=${WORK_DIR}/datasets/test.labeled \ - --sampling_count=-1 -awk 'BEGIN {FS="\t"}{print $1}' < ${WORK_DIR}/datasets/test.labeled > ${WORK_DIR}/datasets/test.input -awk 'BEGIN {FS="\t"}{print $1 "\t" $3}' < ${WORK_DIR}/datasets/test.labeled > ${WORK_DIR}/datasets/test.input_ref - -## Generate the "hard" test data for Google TN Dataset. -## We try to sample at least 1000 examples per semiotic class. -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/prepare_corpora_for_testing.py \ - --data_dir=${CORPUS_DIR}/test_full \ - --reference_vocab=${CORPUS_DIR}/reference_vocab.txt \ - --output_file=${WORK_DIR}/datasets/test1000.labeled \ - --sampling_count=1000 -awk 'BEGIN {FS="\t"}{print $1}' < ${WORK_DIR}/datasets/test1000.labeled > ${WORK_DIR}/datasets/test1000.input -awk 'BEGIN {FS="\t"}{print $1 "\t" $3}' < ${WORK_DIR}/datasets/test1000.labeled > ${WORK_DIR}/datasets/test1000.input_ref - -## After we have train a model, we can run inference and evaluation like below - -##export TOKENIZERS_PARALLELISM=false -##PRETRAINED_MODEL=./nemo_experiments/training.nemo -### run inference on default Google Dataset test -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ -# pretrained_model=${PRETRAINED_MODEL} \ -# inference.from_file=${DATA_PATH}/test.input \ -# inference.out_file=./final_test.output \ -# model.max_sequence_len=1024 #\ -# inference.batch_size=128 -# -### run inference on "hard" test (sample of at least 1000 examples of each semiotic class) -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ -# pretrained_model=${PRETRAINED_MODEL} \ -# inference.from_file=${DATA_PATH}/test1000.input \ -# inference.out_file=./final_test1000.output \ -# model.max_sequence_len=1024 \ -# inference.batch_size=128 -# -### compare inference results to the reference -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval.py \ -# --reference_file=${DATA_PATH}/test.labeled \ -# --inference_file=final_test.output \ -# > final_test.report -# -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval.py \ -# --reference_file=${DATA_PATH}/test1000.labeled \ -# --inference_file=final_test1000.output \ -# --print_other_errors \ -# > final_test1000.report -# -### compare inference results to the reference, get separate report per semiotic class -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py \ -# --reference_file=${DATA_PATH}/test.labeled \ -# --inference_file=final_test.output \ -# --output_file=per_class.report -# -#python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py \ -# --reference_file=${DATA_PATH}/test1000.labeled \ -# --inference_file=final_test1000.output \ -# --output_file=per_class1000.report diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/readme.txt b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/readme.txt deleted file mode 100644 
index e439df0b0b21480ecce0bb71e6c87fe79bcd4c87..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/readme.txt +++ /dev/null @@ -1,7 +0,0 @@ -Thuthmose-tagger is a single-pass model for inverse text normalization (ITN). - -prepare_dataset_en.sh - English data preparation -prepare_dataset_ru.sh - Russian data preparation -normalization_as_tagging_train.py - Training a model -run_infer.sh - Inference and evaluation - diff --git a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/run_infer.sh b/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/run_infer.sh deleted file mode 100644 index 5c9d70b6bd9f3205db5046fa5660b474426c4492..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/text_normalization_as_tagging/run_infer.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -## This bash-script demonstrates how to run inference and evaluation for the Thutmose Tagger model (tagger-based ITN model) - -## In order to use it, you need: -## 1. install NeMo -## git clone https://github.com/NVIDIA/NeMo -## 2. Specify the following paths - -## path to NeMo repository, e.g. /home/user/nemo -NEMO_PATH= - -## name or local path to pretrained model, e.g. ./nemo_experiments/training.nemo -PRETRAINED_MODEL= - -## path to input and reference files -# (see the last steps in examples/nlp/text_normalization_as_tagging/prepare_dataset_en.sh, -# starting from "python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/get_multi_reference_vocab.py" -#) -INPUT_FILE= -REFERENCE_FILE= - - -export TOKENIZERS_PARALLELISM=false - -### run inference on default Google Dataset test -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \ - pretrained_model=${PRETRAINED_MODEL} \ - inference.from_file=${INPUT_FILE} \ - inference.out_file=./final_test.output \ - model.max_sequence_len=1024 #\ - inference.batch_size=128 - -### compare inference results to the reference -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval.py \ - --reference_file=${REFERENCE_FILE} \ - --inference_file=final_test.output \ - > final_test.report - -### compare inference results to the reference, get separate report per semiotic class -python ${NEMO_PATH}/examples/nlp/text_normalization_as_tagging/evaluation/eval_per_class.py \ - --reference_file=${REFERENCE_FILE} \ - --inference_file=final_test.output \ - --output_file=per_class.report diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml b/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml deleted file mode 100644 index cc374f538c934a3fa736fd81e193a906a32d442d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Punctuation and capitalization model with pretrained BERT-like models - -pretrained_model: null # pretrained Punctuation and Capitalization model from list_available_models(), for example: -# punctuation_en_bert or punctuation_en_distilbert -# or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 3 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - class_labels: - punct_labels_file: punct_label_ids.csv - capit_labels_file: capit_label_ids.csv - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. - use_tarred_dataset: false - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` are stored. - ds_item: ??? - - text_file: text_train.txt - labels_file: labels_train.txt - # Permutes batches every epoch - shuffle: true - num_samples: -1 - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. 
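- # (Illustrative: with the default value of 1 below, the buffer holds a single batch, so effectively no shuffling happens; a larger value, e.g. 100, would shuffle among that many buffered batches at the cost of extra memory.)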
- tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - ds_item: ??? # expected format: [PATH_TO_DEV1,PATH_TO_DEV2] (Note no space between the paths and square brackets) - - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - # See comment above `model.train_ds.tokens_in_batch` parameter for explanation. - tokens_in_batch: 15000 - max_seq_length: 512 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml b/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml deleted file mode 100644 index e727d22aca548f9ce976b56cd50f99c8f9806b7d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Punctuation and capitalization lexical audio model with pretrained BERT-like models and Encoder-Decoder-like models. -pretrained_model: null # pretrained Punctuation and Capitalization Lexical Audio model from list_available_models(), for example: -# -# or your_model.nemo -trainer: - devices: -1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - strategy: ddp - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, - # LR schedulers, apex, etc. - log_every_n_steps: 50 - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: Punctuation_and_Capitalization_Lexical_Audio # The name of your model - create_tensorboard_logger: true # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - checkpoint_callback_params: - save_top_k: 3 - monitor: "val_loss" - mode: "min" - save_best_model: true - resume_from_checkpoint: null - -model: - audio_encoder: - pretrained_model: stt_en_conformer_ctc_medium # You can choose any pretrained ASR model from list_available_models() of EncDecCTCModel. - freeze: - is_enabled: false # If set to True weights of audio encoder will not be updated during training. - d_model: 256 # Input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - d_ff: 1024 # Hidden dimension of PositionwiseFeedForward - num_layers: 4 # Number of additional Conformer layers - adapter: - enable: false # If set to True will enable adapters for audio encoder. - config: - # For more details see `nemo.collections.common.parts.LinearAdapter` class - in_features: -1 # Will be replaced with size of audio encoder - dim: 128 # Hidden dimension of the feed forward network. - activation: 'swish' # Str name for an activation function. - fusion: - num_layers: 4 # Number of layers to use in fusion - num_attention_heads: 4 # Number of attention heads to use in fusion - inner_size: 2048 # Fusion inner size - - class_labels: - punct_labels_file: punct_label_ids.txt - capit_labels_file: capit_label_ids.txt - - common_dataset_parameters: - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: true - punct_label_ids: null - capit_label_ids: null - label_vocab_dir: null - - train_ds: - # Tarred dataset is recommended if all dataset cannot be loaded in memory. 
Use script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` for tarred dataset - # creation. - use_tarred_dataset: false - - # A path to directory where `tar_metadata_file` or `text_file` and `labels_file` and `audio_file` are stored. - ds_item: ??? - text_file: text_train.txt - labels_file: labels_train.txt - audio_file: audio_train.txt - - use_audio: true # Has to be set to true to use it for lexical audio model. - use_bucketing: true # If set to true batches will be sorted by length of audios and packed in batches limited by `tokens_in_batch`. Otherwise, provide `batch_size` parameter. - # If set to true audios will be loaded to memory during __init__ call of `BertPunctuationCapitalizationDataset`, consumes more RAM. - # Otherwise, audios will be loaded during `collate_fn` call of `BertPunctuationCapitalizationDataset`. - preload_audios: true - - # A max number of source text tokens in a batch. Examples are sorted by number of tokens in a source text before - # batching. Examples which number of tokens do not differ much are added to the batch. This procedure reduces - # number of pad tokens in a batch. A number of examples in a batch varies: longer input sequences -> less - # examples in a batch. - tokens_in_batch: 2048 - max_seq_length: 512 - - sample_rate: 16000 # Target sample rate of audios can be used for downsampling or upsamling. - num_workers: 0 - - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # Path to tarred dataset metadata file. Required if tarred dataset is used. Metadata file is a JSON file which - # contains total number of batches in the dataset, a list of paths to tar files and paths to label vocabularies. - # Metadata file is create by script - # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` - tar_metadata_file: null - # Controls batch shuffling in tarred dataset. `tar_shuffle_n` is a size of shuffled batch buffer. Mind that this - # shuffling only permutes batches and doesn't exchange samples between batches. Proper shuffling is turned on in - # regular dataset. - tar_shuffle_n: 1 - - validation_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. 
- tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - test_ds: - # if evaluation data is not in the model.train_ds.ds_item as the training data or multiple datasets are used for - # evaluation is needed, specify ds_item, otherwise by default model.train_ds.ds_item is used - # See `train_ds` section for more details on tarred dataset - use_tarred_dataset: false - # expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (Note no space between the paths and square - # brackets) - ds_item: ??? - - text_file: text_dev.txt - labels_file: labels_dev.txt - audio_file: audio_dev.txt - - use_audio: true - use_bucketing: false - preload_audios: false - - shuffle: false - num_samples: -1 - batch_size: 32 - # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null, - # number of jobs is equal to the number of CPU cores. - # WARNING: can cause deadlocks with tokenizers, which use multiprocessing (e.g. SentencePiece) - n_jobs: 0 - - # For more details see `train_ds` section. - tar_metadata_file: null - - sample_rate: 16000 - num_workers: 0 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - punct_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: True - - capit_head: - num_fc_layers: 1 - fc_dropout: 0.1 - activation: 'relu' - use_transformer_init: true - - optim: - name: adam - lr: 1e-4 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/token_classification_config.yaml b/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/token_classification_config.yaml deleted file mode 100644 index 05024c781dab43f452a84b253e557131cf7e826c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/conf/token_classification_config.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Token Classification tasks (for example, Named Entity Recognition) with pretrained BERT-like models - -pretrained_model: null # pretrained TokenClassification model from list_available_models() or path to a .nemo file, -# for example: ner_en_bert or your_model.nemo -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 5 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - gradient_clip_val: 0.0 - precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 - accelerator: gpu - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" - name: token_classification_model # The name of your model - create_tensorboard_logger: true # Whether you want exp_manager to create a tb logger - create_checkpoint_callback: true # Whether you want exp_manager to create a model checkpoint callback - -model: - label_ids: null # will be filled during training - class_labels: - class_labels_file: label_ids.csv # will be generated during training and saved in .nemo file - dataset: - data_dir: ??? # /path/to/data - class_balancing: null # choose from [null, weighted_loss]. Weighted_loss enables the weighted class balancing of the loss, may be used for handling unbalanced classes - max_seq_length: 128 - pad_label: 'O' - ignore_extra_tokens: false - ignore_start_end: false - use_cache: false - # shared among dataloaders - num_workers: 2 - pin_memory: false - drop_last: false - - train_ds: - text_file: text_train.txt - labels_file: labels_train.txt - shuffle: true - num_samples: -1 - batch_size: 64 - - validation_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - test_ds: - text_file: text_dev.txt - labels_file: labels_dev.txt - shuffle: false - num_samples: -1 - batch_size: 64 - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - - head: - num_fc_layers: 2 - fc_dropout: 0.5 - activation: 'relu' - use_transformer_init: True - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -hydra: - run: - dir: . - job_logging: - root: - handlers: null diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py deleted file mode 100644 index d74c2d8bc19aad03abf6cb1e0929086831c959ff..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import multiprocessing as mp -from pathlib import Path - -from nemo.collections.nlp.data.token_classification.punctuation_capitalization_tarred_dataset import ( - DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME, - DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME, - METADATA_CAPIT_LABEL_VOCAB_KEY, - METADATA_PUNCT_LABEL_VOCAB_KEY, - build_label_ids_from_list_of_labels, - check_labels_for_being_unique_before_building_label_ids, - check_tar_file_prefix, - create_tarred_dataset, -) - - -""" -A tarred dataset allows to train on large amounts without storing it all into memory simultaneously. In case of -punctuation and capitalization model, tarred dataset is a directory which contains metadata file, tar files with -batches, punct_label_vocab.csv and capit_label_vocab.csv files. - -A metadata file is a JSON file with 4 fields: 'num_batches', 'tar_files', 'punct_label_vocab_file', -'capit_label_vocab_file'. 'num_batches' (int) is a total number of batches in tarred dataset. 'tar_files' is a list of -paths to tar files relative to directory containing the metadata file. 'punct_label_vocab_file' and -'capit_label_vocab_file' are paths to .csv files containing all unique punctuation and capitalization labels. Each -label in these files is written in a separate line. The first labels in both files are equal and serve for padding and -as neutral labels. - -Every tar file contains objects written using `webdataset.TarWriter`. Each object is a dictionary with two items: -'__key__' and 'batch.pyd'. '__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains -'input_ids', 'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, -'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are arrays with -ids of labels. Metadata file should be passed to constructor of -`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of -the class will handle iteration and constructing masks and token types for BERT model. - -Example of usage: - -python create_punctuation_capitalization_tarred_dataset.py \ - --text \ - --labels \ - --output_dir \ - --lines_per_dataset_fragment 10000 \ - --tokens_in_batch 8000 \ - --num_batches_per_tarfile 5 \ - --tokenizer_name char \ - --vocab_file -""" - - -def get_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"A tarred dataset allows to train on large amounts without storing it all into memory " - f"simultaneously. In case of punctuation and capitalization model, tarred dataset is a directory which " - f"contains metadata file, tar files with batches, {DEFAULT_PUNCT_LABEL_VOCAB_FILE_NAME} and " - f"{DEFAULT_CAPIT_LABEL_VOCAB_FILE_NAME} files. 
A metadata file is a JSON file with 4 fields: 'num_batches', " - f"'tar_files', '{METADATA_PUNCT_LABEL_VOCAB_KEY}', '{METADATA_CAPIT_LABEL_VOCAB_KEY}'. 'num_batches' (int) is " - f"a total number of batches in tarred dataset. 'tar_files' is a list of paths to tar files relative " - f"to directory containing the metadata file. '{METADATA_PUNCT_LABEL_VOCAB_KEY}' and " - f"'{METADATA_CAPIT_LABEL_VOCAB_KEY}' are paths to .csv files containing all unique punctuation and " - f"capitalization labels. Each label in these files is written in a separate line. The first labels in both " - f"files are equal and serve for padding and as neutral labels. Every tar file contains objects written " - f"using `webdataset.TarWriter`. Each object is a dictionary with two items: '__key__' and 'batch.pyd'. " - f"'__key__' is a name of a batch and 'batch.pyd' is a pickled dictionary which contains 'input_ids', " - f"'subtokens_mask', 'punct_labels', 'capit_labels'. 'input_ids' is an array containing ids of source tokens, " - f"'subtokens_mask' is a boolean array showing first tokens in words, 'punct_labels' and 'capit_labels' are " - f"arrays with ids of labels. Metadata file should be passed to constructor of " - "`nemo.collections.nlp.data.token_classification.PunctuationCapitalizationTarredDataset` and the instance of " - "the class will handle iteration and constructing masks and token types for BERT model.", - ) - parser.add_argument( - "--text", - "-t", - help="Path to source lowercased text without punctuation. Number of lines in `--text` file has to be equal " - "to number of lines in `--labels` file.", - type=Path, - required=True, - ) - parser.add_argument( - "--audio_file", - type=Path, - required=False, - help="Path to source file which contains paths to audio one path per line. " - "Number of lines in `--audio_file` has to be equal to number of lines in `--labels` file", - ) - parser.add_argument( - "--use_audio", - required=False, - action="store_true", - help="If set to `True` script creates lexical audio dataset which can be used with `PunctuationCapitalizationLexicalAudioModel`.", - ) - parser.add_argument( - "--sample_rate", - type=int, - required=False, - help="Target sample rate of audios. Can be used for downsampling or upsampling.", - ) - parser.add_argument( - "--labels", - "-L", - type=Path, - required=True, - help="Path to file with labels in the format described here " - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#" - "nemo-data-format . Number of lines in `--labels` file has to be equal to the number of lines in `--text` " - "file.", - ) - parser.add_argument( - "--output_dir", - "-o", - type=Path, - required=True, - help="Path to directory where .tar files, metadata file, label id files are stored.", - ) - parser.add_argument( - "--max_seq_length", - "-s", - type=int, - default=512, - help="Maximum number of subtokens in an input sequence. A source sequence which contain too many subtokens are " - "clipped to `--max_seq_length - 2` subtokens and then [CLS] token is prepended to the clipped sequence and " - "[SEP] token is appended to the clipped sequence. The clipping is performed via removal of subtokens in the " - "end of a source sequence.", - ) - parser.add_argument( - "--tokens_in_batch", - "-b", - type=int, - default=15000, - help="Maximum number of tokens in a batch including [CLS], [SEP], [UNK], and [PAD] tokens. 
Before packing into " - "batches source sequences are sorted by number of tokens in order to reduce number of pad tokens. So the " - "number of sequences in a batch may be different.", - ) - parser.add_argument( - "--lines_per_dataset_fragment", - type=int, - default=10 ** 6, - help="A number of lines processed by one worker during creation of tarred dataset. A worker tokenizes " - "`--lines_per_dataset_fragment` lines and keeps in RAM tokenized text labels before packing them into " - "batches. Reducing `--lines_per_dataset_fragment` leads to reducing of the amount of memory required by this " - "script.", - ) - parser.add_argument( - "--num_batches_per_tarfile", - type=int, - default=1000, - help="A number of batches saved in a tar file. If you increase `--num_batches_per_tarfile`, then there will " - "be less tar files in the dataset. There cannot be less then `--num_batches_per_tarfile` batches in a tar " - "file, and all excess batches are removed. Maximum number of discarded batches is " - "`--num_batches_per_tarfile - 1`.", - ) - parser.add_argument( - "--tokenizer_name", - "-T", - default="bert-base-uncased", - help="Name of the tokenizer used for tokenization of source sequences. Possible options are 'sentencepiece', " - "'word', 'char', HuggingFace tokenizers. For more options see function " - "`nemo.collections.nlp.modules.common.get_tokenizer`. The tokenizer has to have properties `cls_id`, " - "`pad_id`, `sep_id`, `unk_id`.", - ) - parser.add_argument( - "--tokenizer_model", "-m", type=Path, help="Path to tokenizer model required for 'sentencepiece' tokenizer." - ) - parser.add_argument( - "--vocab_file", - "-v", - type=Path, - help="Path to vocabulary file which can be used in 'word', 'char', and HuggingFace tokenizers.", - ) - parser.add_argument( - "--merges_file", "-M", type=Path, help="Path to merges file which can be used in HuggingFace tokenizers." - ) - parser.add_argument( - "--special_token_names", - "-n", - nargs="+", - help="Names of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--special_token_values", - "-V", - nargs="+", - help="Values of special tokens which may be passed to constructors of 'char', 'word', 'sentencepiece', and " - "HuggingFace tokenizers.", - ) - parser.add_argument( - "--use_fast_tokenizer", "-f", action="store_true", help="Whether to use fast HuggingFace tokenizer." - ) - parser.add_argument( - "--pad_label", - "-P", - default='O', - help="Pad label both for punctuation and capitalization. This label is also is used for marking words which " - "do not need punctuation and capitalization. It is also a neutral label used for marking words which do " - "not require punctuation and capitalization.", - ) - punct = parser.add_mutually_exclusive_group(required=False) - punct.add_argument( - "--punct_labels", - "-p", - nargs="+", - help="All punctuation labels EXCEPT PAD LABEL. Punctuation labels are strings separated by spaces. " - "Alternatively you can use parameter `--punct_label_vocab_file`. If none of parameters `--punct_labels` " - "and `--punct_label_vocab_file` are provided, then punctuation label ids will be inferred from `--labels` " - "file.", - ) - punct.add_argument( - "--punct_label_vocab_file", - type=Path, - help="A path to file with punctuation labels. These labels include pad label. Pad label has to be the first " - "label in the file. Each label is written on separate line. 
Alternatively you can use `--punct_labels` " - "parameter. If none of parameters `--punct_labels` and `--punct_label_vocab_file` are provided, then " - "punctuation label ids will be inferred from `--labels` file.", - ) - capit = parser.add_mutually_exclusive_group(required=False) - capit.add_argument( - "--capit_labels", - "-c", - nargs="+", - help="All capitalization labels EXCEPT PAD LABEL. Capitalization labels are strings separated by spaces. " - "Alternatively you can use parameter `--capit_label_vocab_file`. If none of parameters `--capit_labels` " - "and `--capit_label_vocab_file` are provided, then capitalization label ids will be inferred from `--labels` " - "file.", - ) - capit.add_argument( - "--capit_label_vocab_file", - type=Path, - help="A path to file with capitalization labels. These labels include pad label. Pad label has to be the " - "first label in the file. Each label is written on separate line. Alternatively you can use `--capit_labels` " - "parameter. If none of parameters `--capit_labels` and `--capit_label_vocab_file` are provided, then " - "capitalization label ids will be inferred from `--labels` file.", - ) - parser.add_argument( - "--tar_file_prefix", - "-x", - default="punctuation_capitalization", - help="A string from which tar file names start. It can contain only characters 'A-Z', 'a-z', '0-9', '_', '-', " - "'.'.", - ) - parser.add_argument( - "--n_jobs", - "-j", - type=int, - default=mp.cpu_count(), - help="Number of workers for creating tarred dataset. By default it is equal to the number of CPU cores.", - ) - args = parser.parse_args() - for name in [ - "text", - "labels", - "output_dir", - "tokenizer_model", - "vocab_file", - "merges_file", - "punct_label_vocab_file", - "capit_label_vocab_file", - ]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - if args.special_token_names is not None or args.special_token_values is not None: - if args.special_token_names is None: - parser.error( - "If you provide parameter `--special_token_values` you have to provide parameter " - "`--special_token_names`." - ) - if args.special_token_values is None: - parser.error( - "If you provide parameter `--special_token_names` you have to provide parameter " - "`--special_token_values`." - ) - if len(args.special_token_names) != len(args.special_token_values): - parser.error( - f"Parameters `--special_token_names` and `--special_token_values` have to have equal number of values " - f"whereas parameter `--special_token_names` has {len(args.special_token_names)} values and parameter " - f"`--special_token_values` has {len(args.special_token_values)} values." - ) - if len(set(args.special_token_names)) != len(args.special_token_names): - for i in range(len(args.special_token_names) - 1): - if args.special_token_names[i] in args.special_token_names[i + 1 :]: - parser.error( - f"Values of parameter `--special_token_names` has to be unique. Found duplicate value " - f"'{args.special_token_names[i]}'." 
- ) - if args.punct_labels is not None: - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.punct_labels, '--pad_label', '--punct_labels', parser.error - ) - check_labels_for_being_unique_before_building_label_ids( - args.pad_label, args.capit_labels, '--pad_label', '--capit_labels', parser.error - ) - check_tar_file_prefix(args.tar_file_prefix, parser.error, '--tar_file_prefix') - return args - - -def main() -> None: - args = get_args() - if args.special_token_names is None: - special_tokens = None - else: - special_tokens = dict(zip(args.special_token_names, args.special_token_values)) - - if args.punct_labels is not None: - punct_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.punct_labels) - else: - punct_label_ids = None - - if args.capit_labels is not None: - capit_label_ids = build_label_ids_from_list_of_labels(args.pad_label, args.capit_labels) - else: - capit_label_ids = None - - create_tarred_dataset( - args.text, - args.labels, - args.output_dir, - args.max_seq_length, - args.tokens_in_batch, - args.lines_per_dataset_fragment, - args.num_batches_per_tarfile, - args.tokenizer_name, - tokenizer_model=args.tokenizer_model, - vocab_file=args.vocab_file, - merges_file=args.merges_file, - special_tokens=special_tokens, - use_fast_tokenizer=args.use_fast_tokenizer, - pad_label=args.pad_label, - punct_label_ids=punct_label_ids, - capit_label_ids=capit_label_ids, - punct_label_vocab_file=args.punct_label_vocab_file, - capit_label_vocab_file=args.capit_label_vocab_file, - tar_file_prefix=args.tar_file_prefix, - n_jobs=args.n_jobs, - audio_file=args.audio_file, - sample_rate=args.sample_rate, - use_audio=args.use_audio, - ) - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_libritts_data.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_libritts_data.py deleted file mode 100644 index 86a5d01eb9dcfb99c97bed3a3d07bd5d804fb64c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_libritts_data.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This script downloads and unpacks LibriTTS data. And prepares it for punctuation and capitalization lexical audio model. -Data is being downloaded from www.openslr.org and then extracted via tar. -The script gathers text from every *.normalized.txt file inside of archive into single file with text and file with audio filepaths. 
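To illustrate the paired outputs just described, a line-aligned text file and audio-filepath file, a downstream consumer might read them together as in the sketch below (not part of the script; the `dev` names assume the `dev_clean` subset processed later in this file):

```
# Line i of dev.txt holds the transcript for the audio path on line i of audio_dev.txt.
with open("dev.txt") as texts, open("audio_dev.txt") as audios:
    for transcript, audio_path in zip(texts, audios):
        print(audio_path.strip(), "->", transcript.strip())
```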
-""" -import argparse -import glob -import os -import re -import shutil -import subprocess -import tarfile - -from tqdm import tqdm - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = { - 'train_clean_100': "https://www.openslr.org/resources/60/train-clean-100.tar.gz", - 'train_clean_360': "https://www.openslr.org/resources/60/train-clean-360.tar.gz", - 'train_other_500': "https://www.openslr.org/resources/60/train-other-500.tar.gz", - 'dev_clean': "https://www.openslr.org/resources/60/dev-clean.tar.gz", - 'dev_other': "https://www.openslr.org/resources/60/dev-other.tar.gz", - 'test_clean': "https://www.openslr.org/resources/60/test-clean.tar.gz", - 'test_other': "https://www.openslr.org/resources/60/test-other.tar.gz", -} - - -def __extract_file(filepath, data_dir): - try: - tar = tarfile.open(filepath) - tar.extractall(data_dir) - tar.close() - except Exception: - print(f"Error while extracting {filepath}. Already extracted?") - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - return 1 - else: - logging.info(f'{destination} found. Skipping download') - return 0 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Prepare LibriTTS dataset for punctuation capitalization lexical audio model training/evaluating.' - ) - parser.add_argument("--data_sets", default="dev_clean", type=str, help="List of subsets separated by comma") - parser.add_argument("--data_dir", required=True, type=str, help="Path to dir where data will be stored") - parser.add_argument( - "--clean", "-c", action="store_true", help="If set to True will delete all files except produced .txt and .wav" - ) - args = parser.parse_args() - - data_dir = args.data_dir - - if not os.path.exists(data_dir): - os.makedirs(data_dir) - - for subset in args.data_sets.split(','): - logging.info(f'Downloading {subset} subset') - if __maybe_download_file(data_dir + f'/{subset}.tar.gz', subset): - logging.info(f'Extracting {subset} subset') - __extract_file(data_dir + f'/{subset}.tar.gz', data_dir) - - logging.info(f'Processing data') - - splits = set([split.split('_')[0] for split in args.data_sets.split(',')]) - for split in splits: - os.makedirs(f'{data_dir}/audio/{split}', exist_ok=True) - with open(f'{data_dir}/{split}.txt', 'w') as text_data, open( - f'{data_dir}/audio_{split}.txt', 'w' - ) as audio_data: - for file in tqdm(glob.glob(f'{data_dir}/LibriTTS/{split}*/*/*/*.wav'), desc=f'Processing {split}'): - with open(file[:-4] + '.normalized.txt', 'r') as source_file: - lines = source_file.readlines() - text = lines[0] - text = re.sub(r"[^a-zA-Z\d,?!.']", ' ', text) - text = re.sub(' +', ' ', text) - shutil.copy(file.strip(), (f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip()) - text_data.write(text.strip() + "\n") - audio_data.write((f'{data_dir}/audio/{split}/' + file.split('/')[-1]).strip() + "\n") - create_text_and_labels(f'{data_dir}/', f'{data_dir}/{split}.txt') - logging.info(f'Processed {split} subset') - - if args.clean: - shutil.rmtree(f'{data_dir}/LibriTTS') - for tar in glob.glob(f'{data_dir}/**.tar.gz'): - os.remove(tar) diff --git 
a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_tatoeba_data.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_tatoeba_data.py deleted file mode 100644 index 6a4cd23b249d13a9f90f6714ed2b1a1e0bf90db4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/get_tatoeba_data.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import os -import random -import re -import subprocess - -from nemo.collections.nlp.data.token_classification.token_classification_utils import create_text_and_labels -from nemo.utils import logging - -URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} - - -def __maybe_download_file(destination: str, source: str): - """ - Downloads source to destination if not exists. - If exists, skips download - Args: - destination: local filepath - source: url of resource - """ - source = URL[source] - if not os.path.exists(destination): - logging.info(f'Downloading {source} to {destination}') - subprocess.run(['wget', '-O', destination, source]) - else: - logging.info(f'{destination} found. Skipping download') - - -def __process_english_sentences( - in_file: str, out_file: str, percent_to_cut: float = 0, num_to_combine: int = 1, num_samples: int = -1 -): - """ - Extract English sentences from the Tatoeba dataset. - - Expected in_file format - that - contain letters and punctuation marks (,.?). - Chop and combine sentences. - Args: - in_file: local filepath to the tatoeba dataset. - Format: id [TAB] region_name [TAB] sentence, - for example: "1276\teng\tLet's try something.\n" - out_file: local filepath to the clean dataset - percent_to_cut: Percent of sentences to cut in the middle - to get examples of incomplete sentences. 
- This could be useful since ASR output not always - represents a complete sentence - num_to_combine: Number of sentences to combine into - a single example - num_samples: Number of samples in the final dataset - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - in_file = open(in_file, 'r') - out_file = open(out_file, 'w') - lines_to_combine = [] - samples_count = 0 - - for line in in_file: - line = line.split('\t') - # use only English sentences - if line[1] == 'eng': - line = line[2].strip() - if re.match("^[A-Z][A-Za-z.,'?\s]+$", line): # nopep8 - # chop some sentences in the middle - if percent_to_cut > 0: - line = line.split() - if random.random() < percent_to_cut: - line = line[: len(line) // 2] - line = ' '.join(line) - - # combine multiple sentences into a single example - # to make it harder for the model to learn eos punctuation - if len(lines_to_combine) >= num_to_combine: - if samples_count == num_samples: - return - out_file.write(' '.join(lines_to_combine) + '\n') - lines_to_combine = [] - samples_count += 1 - lines_to_combine.append(line) - - if len(lines_to_combine) > 0 and (samples_count < num_samples or num_samples < 0): - out_file.write(' '.join(lines_to_combine) + '\n') - - -def __split_into_train_dev(in_file: str, train_file: str, dev_file: str, percent_dev: float): - """ - Create train and dev split of the dataset. - Args: - in_file: local filepath to the dataset - train_file: local filepath to the train dataset - dev_file: local filepath to the dev dataset - percent_dev: Percent of the sentences in the dev set - """ - if not os.path.exists(in_file): - raise FileNotFoundError(f'{in_file} not found.') - - lines = open(in_file, 'r').readlines() - train_file = open(train_file, 'w') - dev_file = open(dev_file, 'w') - - dev_size = int(len(lines) * percent_dev) - train_file.write(' '.join(lines[:-dev_size])) - dev_file.write(' '.join(lines[-dev_size:])) - - -def __delete_file(file_to_del: str): - """ - Deletes the file - Args: - file_to_del: local filepath to the file to delete - """ - if os.path.exists(file_to_del): - os.remove(file_to_del) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare tatoeba dataset') - parser.add_argument("--data_dir", required=True, type=str) - parser.add_argument("--dataset", default='tatoeba', type=str) - parser.add_argument("--num_samples", default=-1, type=int, help='-1 to use the whole dataset') - parser.add_argument("--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle') - parser.add_argument( - "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example' - ) - parser.add_argument("--percent_dev", default=0.2, type=float, help='Size of the dev set, float') - parser.add_argument("--clean_dir", action='store_true') - args = parser.parse_args() - - if not os.path.exists(args.data_dir): - os.makedirs(args.data_dir) - - if args.dataset != 'tatoeba': - raise ValueError("Unsupported dataset.") - - logging.info(f'Downloading tatoeba dataset') - tatoeba_dataset = os.path.join(args.data_dir, 'sentences.csv') - __maybe_download_file(tatoeba_dataset, args.dataset) - - logging.info(f'Processing English sentences...') - clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt') - __process_english_sentences( - tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples - ) - - train_file = os.path.join(args.data_dir, 'train.txt') - dev_file 
= os.path.join(args.data_dir, 'dev.txt') - - logging.info( - f'Splitting the {args.dataset} dataset into train and dev sets' + ' and creating labels and text files' - ) - __split_into_train_dev(clean_eng_sentences, train_file, dev_file, args.percent_dev) - - logging.info(f'Creating text and label files for training') - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'train.txt')) - create_text_and_labels(args.data_dir, os.path.join(args.data_dir, 'dev.txt')) - - if args.clean_dir: - logging.info(f'Cleaning up {args.data_dir}') - __delete_file(clean_eng_sentences) - __delete_file(tatoeba_dataset) - __delete_file(train_file) - __delete_file(dev_file) - logging.info(f'Processing of the {args.dataset} is complete') diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/import_from_iob_format.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/data/import_from_iob_format.py deleted file mode 100644 index 4a6f15442b98422769ae3fa74b17b72cd86e65ff..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/import_from_iob_format.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -from nemo.utils import logging - - -def __convert_data(in_file: str, out_text_f: str, out_labels_f: str, max_length: int): - """ - Convert data from the IOB format to NeMo accepted format described below. - in_file should be in the IOB format, see example here: - https://www.clips.uantwerpen.be/conll2003/ner/. - - Args: - in_file: input file name - out_text_f: output file with text - out_labels_f: output file with labels - max_length: use -1 to leave the examples' length as is, otherwise long examples will be split into multiple - examples - After the conversion, the dataset is split into 2 files: text.txt - and labels.txt. - Each line of the text.txt file contains text sequences, where words - are separated with spaces. The labels.txt file contains corresponding - labels for each word in text.txt, the labels are separated with spaces. - Each line of the files should follow the format: - [WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and - [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). 
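As a toy walk-through of the conversion just described (an illustration only, not the script's implementation), a single IOB-tagged sentence maps to one line of text.txt and one line of labels.txt:

```
# One hypothetical IOB-tagged sentence: each entry is "word label".
iob_lines = [
    "Jennifer B-PER", "is O", "from O", "New B-LOC",
    "York I-LOC", "City I-LOC", ". O",
]
words = [line.split()[0] for line in iob_lines]
labels = [line.split()[-1] for line in iob_lines]
print(" ".join(words))   # -> Jennifer is from New York City .
print(" ".join(labels))  # -> B-PER O O B-LOC I-LOC I-LOC O
```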
- - """ - in_file = open(in_file, 'r') - - if max_length == -1: - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - for line in in_file: - if line == '\n': - out_text.write(line) - out_labels.write(line) - else: - line = line.split() - out_text.write(line[0] + ' ') - out_labels.write(line[-1] + ' ') - - else: - words = [] - labels = [] - with open(out_text_f, 'w') as out_text, open(out_labels_f, 'w') as out_labels: - lines = in_file.readlines() - for line_id, line in enumerate(lines): - logging.info(f"{line_id} {len(lines)}") - contends = line.strip() - if len(contends) == 0: - assert len(words) == len(labels) - if len(words) > max_length: - # split if the sentence is longer than max_length - while len(words) > max_length: - tmplabel = labels[:max_length] - for iidx in range(len(tmplabel)): - if tmplabel.pop() == 'O': - break - l = ' '.join([label for label in labels[: len(tmplabel) + 1] if len(label) > 0]) - w = ' '.join([word for word in words[: len(tmplabel) + 1] if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = words[len(tmplabel) + 1 :] - labels = labels[len(tmplabel) + 1 :] - - if len(words) == 0: - continue - l = ' '.join([label for label in labels if len(label) > 0]) - w = ' '.join([word for word in words if len(word) > 0]) - - out_text.write(w + "\n") - out_labels.write(l + "\n") - words = [] - labels = [] - continue - - word = line.strip().split()[0] - label = line.strip().split()[-1] - words.append(word) - labels.append(label) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Convert data from IOB format to the format compatible with \ - nlp/examples/token_classification/scripts/token_classification_train.py and \ - token_classification_evaluate.py' - ) - parser.add_argument("--data_file", required=True, type=str, help='path to a file in IOB format') - parser.add_argument( - "--max_length", - default=-1, - type=int, - help='use -1 to leave the examples\'s length as is, ' - 'otherwise long examples will be split into multiple examples', - ) - args = parser.parse_args() - - data_dir, basename = os.path.split(args.data_file) - prefix = os.path.splitext(basename)[0] - if not os.path.exists(args.data_file): - raise FileNotFoundError(f"{args.data_file} not found") - - logging.info(f'Processing {args.data_file}') - out_text = os.path.join(data_dir, 'text_' + prefix + '.txt') - out_labels = os.path.join(data_dir, 'labels_' + prefix + '.txt') - - __convert_data(args.data_file, out_text, out_labels, args.max_length) - logging.info(f'Processing of the {args.data_file} is complete') diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py deleted file mode 100644 index 78a0763d3b54a7330d0dfe187abff0cdcf2e1dd2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -The script converts raw text to the NeMo format for punctuation and capitalization task. - -Raw Data Format ---------------- - -The Punctuation and Capitalization model can work with any text dataset, although it is recommended to balance the data, especially for the punctuation task. -Before pre-processing the data to the format expected by the model, the data should be split into train.txt and dev.txt (and optionally test.txt). -Each line in the **train.txt/dev.txt/test.txt** should represent one or more full and/or truncated sentences. - -Example of the train.txt/dev.txt file: - When is the next flight to New York? - The next flight is ... - .... - - -The `source_data_dir` structure should look like this: - . - |--sourced_data_dir - |-- dev.txt - |-- train.txt - - - -NeMo Data Format for training the model ---------------------------------------- - -The punctuation and capitalization model expects the data in the following format: - -The training and evaluation data is divided into 2 files: text.txt and labels.txt. \ -Each line of the **text.txt** file contains text sequences, where words are separated with spaces, i.e. - -[WORD] [SPACE] [WORD] [SPACE] [WORD], for example: - when is the next flight to new york - the next flight is ... - ... - -The **labels.txt** file contains corresponding labels for each word in text.txt, the labels are separated with spaces. \ -Each label in labels.txt file consists of 2 symbols: - -* the first symbol of the label indicates what punctuation mark should follow the word (where O means no punctuation needed); -* the second symbol determines if a word needs to be capitalized or not (where U indicates that the word should be upper-cased, and O - no capitalization needed.) - -By default, the following punctuation marks are considered: commas, periods, and question marks; the rest punctuation marks were removed from the data. -This can be changed by introducing new labels in the labels.txt files - -Each line of the labels.txt should follow the format: [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt). \ -For example, labels for the above text.txt file should be: - - OU OO OO OO OO OO OU ?U - OU OO OO OO ... - ... - -The complete list of all possible labels for this task used in this tutorial is: OO, ,O, .O, ?O, OU, ,U, .U, ?U. 
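To make the label format above concrete, here is a deliberately simplified toy re-implementation (an assumption for illustration, not NeMo's `create_text_and_labels`) applied to the example sentence from this docstring:

```
def to_text_and_labels(sentence, marks=",.?"):
    """Lower-case each word and emit a two-symbol label: punctuation mark (or O), then U/O for capitalization."""
    words, labels = [], []
    for token in sentence.split():
        punct = token[-1] if token[-1] in marks else "O"
        word = token.rstrip(marks)
        capit = "U" if word[:1].isupper() else "O"
        words.append(word.lower())
        labels.append(punct + capit)
    return " ".join(words), " ".join(labels)


print(to_text_and_labels("When is the next flight to New York?"))
# -> ('when is the next flight to new york', 'OU OO OO OO OO OO OU ?U')
```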
- -Converting Raw data to NeMo format ----------------------------------- - -To pre-process the raw text data, stored under :code:`sourced_data_dir` (see the :ref:`raw_data_format_punct` -section), run the following command: - - python examples/nlp/token_classification/data/prepare_data_for_punctuation_capitalization.py \ - -s \ - -o - -""" - -import argparse -import os - -from get_tatoeba_data import create_text_and_labels - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Prepare data for punctuation and capitalization tasks') - parser.add_argument("-s", "--source_file", required=True, type=str, help="Path to the source file") - parser.add_argument("-o", "--output_dir", required=True, type=str, help="Path to the output directory") - parser.add_argument( - "-p", - "--marks", - required=False, - type=str, - help="Punctuation marks to consider for dataset", - default=[",", ".", "?"], - nargs="+", - ) - args = parser.parse_args() - - if not os.path.exists(args.source_file): - raise ValueError(f'{args.source_file} was not found') - - os.makedirs(args.output_dir, exist_ok=True) - create_text_and_labels(args.output_dir, args.source_file, "".join(args.marks)) - - print(f'Processing of the {args.source_file} is complete') diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuate_capitalize_infer.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuate_capitalize_infer.py deleted file mode 100644 index 8fdb3ab5a1ed20d8ec142a1082b63d6f62585a4d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuate_capitalize_infer.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -from pathlib import Path -from typing import Dict, List, Union - -import torch.cuda - -from nemo.collections.nlp.models import PunctuationCapitalizationLexicalAudioModel, PunctuationCapitalizationModel - - -""" -This script is for restoring punctuation and capitalization. - -Usage example: - -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest - -Usage example for lexical audio model: -python punctuate_capitalize.py \ - --input_manifest \ - --output_manifest \ - --use_audio - - - is a path to NeMo ASR manifest. Usually it is an output of - NeMo/examples/asr/transcribe_speech.py but can be a manifest with 'text' key. Alternatively you can use - --input_text parameter for passing text for inference. - is a path to NeMo ASR manifest into which script output will be written. Alternatively - you can use parameter --output_text. - -For more details on this script usage look in argparse help. 
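As a rough sketch of the manifest input described above (one JSON object per line with a 'text' or 'pred_text' key; 'audio_filepath' is only needed together with `--use_audio`), such a file could be produced like this; the file name and sentences are invented:

```
import json

entries = [
    {"text": "when is the next flight to new york"},
    {"text": "the next flight is in two hours"},
]
with open("input_manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```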
-""" - - -def get_args() -> argparse.Namespace: - default_model_parameter = "pretrained_name" - default_model = "punctuation_en_bert" - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="The script is for restoring punctuation and capitalization in text or text and audio. To use text and audio use '--use_audio'. Long strings are split into " - "segments of length `--max_seq_length`. `--max_seq_length` is the length which includes [CLS] and [SEP] " - "tokens. If `--use_audio` is set, samples with texts longer than `--max_seq_length` will be ignored. Parameter `--step` controls segments overlapping. `--step` is a distance between beginnings of " - "consequent segments. Model outputs for tokens near the borders of tensors are less accurate and can be " - "discarded before final predictions computation. Parameter `--margin` is number of discarded outputs near " - "segments borders. Probabilities of tokens in overlapping parts of segments multiplied before selecting the " - "best prediction. Default values of parameters `--max_seq_length`, `--step`, and `--margin` are optimal for " - "IWSLT 2019 test dataset.", - ) - parser.add_argument( - '--use_audio', - required=False, - action="store_true", - help="If set `PunctuationCapitalizationLexicalAudioModel` will be used for inference", - ) - input_ = parser.add_mutually_exclusive_group(required=True) - input_.add_argument( - "--input_manifest", - "-m", - type=Path, - help="Path to the file with NeMo manifest which needs punctuation and capitalization. If the first element " - "of manifest contains key 'pred_text', 'pred_text' values are passed for tokenization. Otherwise 'text' " - "values are passed for punctuation and capitalization. Exactly one parameter of `--input_manifest` and " - "`--input_text` should be provided.", - ) - input_.add_argument( - "--input_text", - "-t", - type=Path, - help="Path to file with text which needs punctuation and capitalization. Exactly one parameter of " - "`--input_manifest` and `--input_text` should be provided.", - ) - parser.add_argument( - '--audio_file', - required=False, - type=Path, - help="Path to file with paths to audio. One path per row. Required if '--input_text' provided. Else 'audio_filepath' from manifest will be used.", - ) - output = parser.add_mutually_exclusive_group(required=True) - output.add_argument( - "--output_manifest", - "-M", - type=Path, - help="Path to output NeMo manifest. Text with restored punctuation and capitalization will be saved in " - "'pred_text' elements if 'pred_text' key is present in the input manifest. Otherwise text with restored " - "punctuation and capitalization will be saved in 'text' elements. Exactly one parameter of `--output_manifest` " - "and `--output_text` should be provided.", - ) - output.add_argument( - "--output_text", - "-T", - type=Path, - help="Path to file with text with restored punctuation and capitalization. Exactly one parameter of " - "`--output_manifest` and `--output_text` should be provided.", - ) - model = parser.add_mutually_exclusive_group(required=False) - model.add_argument( - "--pretrained_name", - "-p", - help=f"The name of NGC pretrained model. No more than one of parameters `--pretrained_name`, `--model_path`" - f"should be provided. 
If neither of parameters `--pretrained_name` and `--model_path` are provided, then the " - f"script is run with `--{default_model_parameter}={default_model}`.", - choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()] - + [m.pretrained_model_name for m in PunctuationCapitalizationLexicalAudioModel.list_available_models()], - ) - model.add_argument( - "--model_path", - "-P", - type=Path, - help=f"Path to .nemo checkpoint of punctuation and capitalization model. No more than one of parameters " - f"`--pretrained_name` and `--model_path` should be provided. If neither of parameters `--pretrained_name` and " - f"`--model_path` are provided, then the script is run with `--{default_model_parameter}={default_model}`.", - ) - parser.add_argument( - "--max_seq_length", - "-L", - type=int, - default=64, - help="Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] tokens.", - ) - parser.add_argument( - "--step", - "-s", - type=int, - default=8, - help="Relative shift of consequent segments into which long queries are split. Long queries are split into " - "segments which can overlap. Parameter `step` controls such overlapping. Imagine that queries are " - "tokenized into characters, `max_seq_length=5`, and `step=2`. In such a case query 'hello' is tokenized " - "into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.", - ) - parser.add_argument( - "--margin", - "-g", - type=int, - default=16, - help="A number of subtokens in the beginning and the end of segments which output probabilities are not used " - "for prediction computation. The first segment does not have left margin and the last segment does not have " - "right margin. For example, if input sequence is tokenized into characters, `max_seq_length=5`, `step=1`, " - "and `margin=1`, then query 'hello' will be tokenized into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], " - "['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments are passed to the " - "model. Before final predictions computation, margins are removed. In the next list, subtokens which logits " - "are not used for final predictions computation are marked with asterisk: `[['[CLS]'*, 'h', 'e', 'l'*, " - "'[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]`.", - ) - parser.add_argument( - "--batch_size", "-b", type=int, default=128, help="Number of segments which are processed simultaneously.", - ) - parser.add_argument( - "--save_labels_instead_of_text", - "-B", - action="store_true", - help="If this option is set, then punctuation and capitalization labels are saved instead text with restored " - "punctuation and capitalization. Labels are saved in format described here " - "https://docs.nvidia.com/deeplearning/nemo/" - "user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format", - ) - parser.add_argument( - "--device", - "-d", - choices=['cpu', 'cuda'], - help="Which device to use. If device is not set and CUDA is available, then GPU will be used. 
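Circling back to the `--step` help above, the sketch below (assuming character-level 'tokens', exactly as in that help text) shows how a query is split into overlapping segments with `max_seq_length=5` and `step=2`:

```
query = list("hello")
max_seq_length, step = 5, 2
body = max_seq_length - 2  # room left once [CLS] and [SEP] are added
segments = [
    ["[CLS]"] + query[start:start + body] + ["[SEP]"]
    for start in range(0, len(query) - body + 1, step)
]
print(segments)
# -> [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]
```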
If device is " - "not set and CUDA is not available, then CPU is used.", - ) - parser.add_argument( - "--sample_rate", - type=int, - default=16000, - help="Target sample rate for audios if `--use_audio` was passed", - required=False, - ) - args = parser.parse_args() - if args.input_manifest is None and args.output_manifest is not None: - parser.error("--output_manifest requires --input_manifest") - if args.use_audio and (args.input_manifest is None and args.audio_file is None): - parser.error("--use_audio and --input_text require --audio_file") - if args.pretrained_name is None and args.model_path is None: - setattr(args, default_model_parameter, default_model) - for name in ["input_manifest", "input_text", "output_manifest", "output_text", "model_path", "audio_file"]: - if getattr(args, name) is not None: - setattr(args, name, getattr(args, name).expanduser()) - return args - - -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result - - -def main() -> None: - args = get_args() - if args.pretrained_name is None: - model = ( - PunctuationCapitalizationModel.restore_from(args.model_path) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - else: - model = ( - PunctuationCapitalizationModel.from_pretrained(args.pretrained_name) - if not args.use_audio - else PunctuationCapitalizationLexicalAudioModel.restore_from(args.model_path) - ) - if args.device is None: - if torch.cuda.is_available(): - model = model.cuda() - else: - model = model.cpu() - else: - model = model.to(args.device) - if args.input_manifest is None: - texts = [] - audios = [] - with args.input_text.open() as f: - for line in f: - texts.append(line.strip()) - if args.use_audio: - with args.audio_file.open() as f: - for line in f: - audios.append(line.strip()) - else: - manifest = load_manifest(args.input_manifest) - text_key = "pred_text" if "pred_text" in manifest[0] else "text" - texts = [] - audios = [] - for item in manifest: - texts.append(item[text_key]) - if args.use_audio: - audios.append(item["audio_filepath"]) - if args.use_audio: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - audio_queries=audios, - target_sr=args.sample_rate, - ) - else: - processed_texts = model.add_punctuation_capitalization( - texts, - batch_size=args.batch_size, - max_seq_length=args.max_seq_length, - step=args.step, - margin=args.margin, - return_labels=args.save_labels_instead_of_text, - ) - if args.output_manifest is None: - args.output_text.parent.mkdir(exist_ok=True, parents=True) - with args.output_text.open('w') as f: - for t in processed_texts: - f.write(t + '\n') - else: - args.output_manifest.parent.mkdir(exist_ok=True, parents=True) - with args.output_manifest.open('w') as f: - for item, t in zip(manifest, processed_texts): - item[text_key] = t - f.write(json.dumps(item) + '\n') - - -if __name__ == "__main__": - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py deleted file mode 100644 index 
149a9a4515e242ee6f169e2d19d5c33f363c3ba2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_lexical_audio_train_evaluate.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationLexicalAudioConfig, -) -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_lexical_audio_model import ( - PunctuationCapitalizationLexicalAudioModel, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model with lexical and acoustic features. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, audio encoder, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
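To give a feel for how such command-line overrides behave, here is a tiny OmegaConf sketch (the section names mirror the config described above; the values and file names are made up, and this is not the training script itself):

```
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "trainer": {"devices": 1, "max_epochs": 1, "precision": 16},
        "model": {"train_ds": {"text_file": "text_train.txt", "labels_file": "labels_train.txt"}},
    }
)
# Equivalent of a command-line override such as trainer.max_epochs=3.
cfg.merge_with_dotlist(["trainer.max_epochs=3"])
print(cfg.trainer.max_epochs)  # -> 3
```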
- -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= - -To use BERT-like pretrained P&C models' weights to initialize lexical encoder, use: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.restore_lexical_encoder_from= - - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - pretrained_model= \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.train_ds.audio_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.validation_ds.audio_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_lexical_audio_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model== \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= \ - model.test_ds.audio_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_lexical_audio_config") -def main(cfg: DictConfig) -> None: - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like here - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationLexicalAudioConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to be `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationLexicalAudioModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationLexicalAudioModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationLexicalAudioModel.get_available_model_names(): - model = PunctuationCapitalizationLexicalAudioModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from ' - f'{PunctuationCapitalizationLexicalAudioModel.list_available_models()}' - ) - 
model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py deleted file mode 100644 index 137ee3a30622a3c57cdca1d095512eea2ce84121..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import PunctuationCapitalizationModel -from nemo.collections.nlp.models.token_classification.punctuation_capitalization_config import ( - PunctuationCapitalizationConfig, -) -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script show how to train a Punctuation and Capitalization Model. -More details on the task and data format could be found in tutorials/nlp/Punctuation_and_Capitalization.ipynb - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. - trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
- -Additional default parameters could be found in PunctuationCapitalizationDataConfigBase from -/nemo/collections/nlp/data/token_classification/punctuation_capitalization_dataset.py, -use `+` to modify their values via command line, e.g.: `+model.train_ds.num_workers=2` - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model training *** - -To run this script and train the model from scratch, use: - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - -To use one of the pretrained versions of the model and finetune it, run: - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - ~model.test_ds - - pretrained_model - pretrained PunctuationCapitalization model from list_available_models() or - path to a .nemo file, for example: punctuation_en_bert or model.nemo - -If you wish to perform testing after training set `do_testing` to `true: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item= \ - model.train_ds.text_file= \ - model.train_ds.labels_file= \ - model.validation_ds.ds_item= \ - model.validation_ds.text_file= \ - model.validation_ds.labels_file= \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -Set `do_training` to `false` and `do_testing` to `true` to perform evaluation without training: - python punctuation_capitalization_train_evaluate.py \ - +do_testing=true \ - +do_training=false \ - pretrained_model=punctuation_en_bert \ - model.test_ds.ds_item= \ - model.test_ds.text_file= \ - model.test_ds.labels_file= - -""" - - -@hydra_runner(config_path="conf", config_name="punctuation_capitalization_config") -def main(cfg: DictConfig) -> None: - torch.manual_seed(42) - cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationConfig()), cfg) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - if not cfg.do_training and not cfg.do_testing: - raise ValueError("At least one of config parameters `do_training` and `do_testing` has to `true`.") - if cfg.do_training: - if cfg.model.get('train_ds') is None: - raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.') - if cfg.do_testing: - if cfg.model.get('test_ds') is None: - raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.') - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = PunctuationCapitalizationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(): - model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Config parameter `pretrained_model` should contain a path to the pre-trained .nemo file or a model ' - f'name from ' - f'{[m.pretrained_model_name for m in 
PunctuationCapitalizationModel.list_available_models()]}. ' - f'Provided `pretrained_model="{cfg.pretrained_model}"` is neither a valid path, nor a valid model ' - f'name.' - ) - model.update_config_after_restoring_from_checkpoint( - class_labels=cfg.model.class_labels, - common_dataset_parameters=cfg.model.common_dataset_parameters, - train_ds=cfg.model.get('train_ds') if cfg.do_training else None, - validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None, - test_ds=cfg.model.get('test_ds') if cfg.do_testing else None, - optim=cfg.model.get('optim') if cfg.do_training else None, - ) - model.set_trainer(trainer) - if cfg.do_training: - model.setup_training_data() - model.setup_multiple_validation_data(cfg.model.validation_ds) - model.setup_optimization() - else: - model.setup_multiple_test_data(cfg.model.test_ds) - if cfg.do_training: - trainer.fit(model) - if cfg.do_testing: - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_evaluate.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_evaluate.py deleted file mode 100644 index b69212f59de4e18e6fd2465e70d46f4c2d36bedc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_evaluate.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This script shows how to perform evaluation and runs inference of a few examples. - -More details on Token Classification model could be found in tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Setting the configs *** - -This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. 
- -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Evaluation *** - -The script runs two types of evaluation: - * model.test() - this eval will use the config setting for evaluation such as model.dataset.max_seq_length - * model.evaluate_from_file(): - * disregards model.dataset.max_seq_length and evaluates all the tokens, BERT max seq length - 512 tokens after tokenization - * creates confusion matrix - * saves predictions and labels (if provided) - -To run the script: - - python token_classification_evaluate.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or your_model.nemo - -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - - if not hasattr(cfg.model, 'test_ds'): - raise ValueError(f'model.test_ds was not found in the config, skipping evaluation') - - trainer = pl.Trainer( - devices=1, - precision=cfg.trainer.precision, - logger=False, - enable_checkpointing=False, - accelerator=cfg.trainer.accelerator, - ) - exp_dir = exp_manager(trainer, cfg.exp_manager) - - if not cfg.pretrained_model: - raise ValueError( - 'To run evaluation and inference script a pre-trained model or .nemo file must be provided.' - f'Choose from {TokenClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"' - ) - - if os.path.exists(cfg.pretrained_model): - model = TokenClassificationModel.restore_from(cfg.pretrained_model) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir is None: - logging.error( - 'No dataset directory provided. Skipping evaluation. ' - 'To run evaluation on a file, specify path to the directory that contains test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.' - ) - elif not os.path.exists(data_dir): - logging.error(f'{data_dir} is not found, skipping evaluation on the test set.') - else: - model.update_data_dir(data_dir=data_dir) - model._cfg.dataset = cfg.model.dataset - - if not hasattr(cfg.model, 'test_ds'): - logging.error(f'model.test_ds was not found in the config, skipping evaluation') - elif model.prepare_test(trainer): - model.setup_test_data(cfg.model.test_ds) - trainer.test(model) - - model.evaluate_from_file( - text_file=os.path.join(data_dir, cfg.model.test_ds.text_file), - labels_file=os.path.join(data_dir, cfg.model.test_ds.labels_file), - output_dir=exp_dir, - add_confusion_matrix=True, - normalize_confusion_matrix=True, - ) - else: - logging.error('Skipping the evaluation. 
The trainer is not setup properly.') - - # run an inference on a few examples - queries = ['we bought four shirts from the nvidia gear store in santa clara.', 'Nvidia is a company.'] - results = model.add_predictions(queries, output_file='predictions.txt') - - for query, result in zip(queries, results): - logging.info(f'Query : {query}') - logging.info(f'Result: {result.strip()}\n') - - logging.info(f'Results are saved at {exp_dir}') - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_train.py b/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_train.py deleted file mode 100644 index 56c1487cf9c5b2437243e5b660743e8677fa9dc0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/token_classification/token_classification_train.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import TokenClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -""" -This scripts shows how to train a Token Classification model. - -The Token Classification model supports Named Entity Recognition task and other token level classification tasks, -as long as the data follows the format specified below. - -More details on how to use this script could be found in -tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb - -*** Data Format *** -Token Classification Model requires the data to be split into 2 files: text.txt and labels.txt. -Each line of the text.txt file contains text sequences, where words are separated with spaces, i.e.: -[WORD] [SPACE] [WORD] [SPACE] [WORD]. -The labels.txt file contains corresponding labels for each word in text.txt, the labels are separated with spaces, i.e.: -[LABEL] [SPACE] [LABEL] [SPACE] [LABEL]. - -Example of a text.txt file: -Jennifer is from New York City . -She likes ... -... - -Corresponding labels.txt file: -B-PER O O B-LOC I-LOC I-LOC O -O O ... -... - -*** Preparing the dataset *** - -To convert an IOB format data to the format required for training, run -examples/nlp/token_classification/data/import_from_iob_format.py on your train and dev files, as follows: - -python examples/nlp/token_classification/data/import_from_iob_format.py --data_file PATH_TO_IOB_FORMAT_DATAFILE - -*** Setting the configs *** - -The model and the PT trainer are defined in a config file which declares multiple important sections. -The most important ones are: - model: All arguments that are related to the Model - language model, tokenizer, token classifier, optimizer, - schedulers, and datasets/data loaders. 
- trainer: Any argument to be passed to PyTorch Lightning including number of epochs, number of GPUs, - precision level, etc. -This script uses the `/examples/nlp/token_classification/conf/token_classification_config.yaml` config file -by default. You may update the config file from the file directly. -The other option is to set another config file via command line arguments by `--config-name=CONFIG_FILE_PATH'. - -For more details about the config files and different ways of model restoration, see tutorials/00_NeMo_Primer.ipynb - -*** Model Training *** - -To train TokenClassification model from scratch with the default config file, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - trainer.max_epochs= \ - trainer.devices=[] - -To use one of the pretrained versions of the model specify a `pretrained_model` arg with either -TokenClassification model from list_available_models() or path to a .nemo file, for example: -ner_en_bert or model.nemo, run: - - python token_classification_train.py pretrained_model=ner_en_bert - -To use one of the pretrained versions of the model and fine-tune it, run: - - python token_classification_train.py \ - model.dataset.data_dir= \ - pretrained_model=ner_en_bert - - - a directory that contains test_ds.text_file and test_ds.labels_file (see the config) -pretrained_model - pretrained TokenClassification model from list_available_models() or - path to a .nemo file, for example: ner_en_bert or model.nemo - -For more ways of restoring a pre-trained model, see tutorials/00_NeMo_Primer.ipynb -""" - - -@hydra_runner(config_path="conf", config_name="token_classification_config") -def main(cfg: DictConfig) -> None: - try: - strategy = NLPDDPStrategy(find_unused_parameters=True) - except (ImportError, ModuleNotFoundError): - strategy = 'auto' - - trainer = pl.Trainer(strategy=strategy, **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - if not cfg.pretrained_model: - logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') - model = TokenClassificationModel(cfg.model, trainer=trainer) - else: - if os.path.exists(cfg.pretrained_model): - # TODO: can we drop strict=False? 
- model = TokenClassificationModel.restore_from(cfg.pretrained_model, trainer=trainer, strict=False) - elif cfg.pretrained_model in TokenClassificationModel.get_available_model_names(): - model = TokenClassificationModel.from_pretrained(cfg.pretrained_model) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo file or choose from {TokenClassificationModel.list_available_models()}' - ) - - data_dir = cfg.model.dataset.get('data_dir', None) - if data_dir: - if not os.path.exists(data_dir): - raise ValueError(f'{data_dir} is not found at') - - # we can also do finetuning of the pretrained model but it will require - # setup the data dir to get class weights statistics - model.update_data_dir(data_dir=data_dir) - # finally, setup train and validation Pytorch DataLoaders - model.setup_training_data() - model.setup_validation_data() - # then we're setting up loss, use model.dataset.class_balancing, - # if you want to add class weights to the CrossEntropyLoss - model.setup_loss(class_balancing=cfg.model.dataset.class_balancing) - logging.info(f'Using config file of the pretrained model') - else: - raise ValueError( - 'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file \ - with "model.dataset.data_dir" argument' - ) - - trainer.fit(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml b/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml deleted file mode 100644 index 64fd5f8542f3536d3311037138db63cfd50839bc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Config file for Zero Shot Intent Recognition (BERT model trained NLI) -trainer: - devices: 1 # the number of gpus, 0 for CPU - num_nodes: 1 - max_epochs: 1 - max_steps: -1 # precedence over max_epochs - accumulate_grad_batches: 1 # accumulates grads every k batches - precision: 16 - accelerator: gpu - strategy: ddp - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - -model: - dataset: - data_dir: ??? # /path/to/data - sentence_1_column: 8 # index of the column containing the premise or sentence 1 - sentence_2_column: 9 # index of the column containing the hypothesis or sentence 2 - label_column: -1 # index of the column containing labels. Labels should be "entailment", "contradiction", and "neutral". 
- class_balancing: null # null or 'weighted_loss'. 'weighted_loss' enables the weighted class balancing of the loss, may be used for handling unbalanced classes - use_cache: true # uses a cache to store the processed dataset, you may use it for large datasets for speed up - num_classes: 3 - max_seq_length: 128 - do_lower_case: true # true for uncased models, false for cased models, will be set automatically if pre-trained tokenizer model is used - - train_ds: - file_name: train.tsv - batch_size: 64 - shuffle: true - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - validation_ds: - file_name: dev_matched.tsv - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - test_ds: - file_name: null - batch_size: 64 - shuffle: false - num_samples: -1 # number of samples to be considered, -1 means all the dataset - num_workers: 2 - drop_last: false - pin_memory: false - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null # only necessary for adding transformer/bert-specific special tokens to tokenizer if the tokenizer does not already have these inherently. - - language_model: - pretrained_model_name: bert-base-uncased - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - classifier_head: - num_output_layers: 2 - fc_dropout: 0.1 - - optim: - name: adam - lr: 5e-5 - weight_decay: 0.00 - - sched: - name: WarmupAnnealing - # Scheduler params - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - # pytorch lightning args - monitor: val_loss - reduce_on_plateau: false - -exp_manager: - exp_dir: null # exp_dir for your experiment, if None, defaults to "./NeMo_experiments" - name: "ZeroShotIntentRecognition" # The name of your model - create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger - create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - -pretrained_model: # pretrained ZeroShotIntent model to be used for inference (.nemo file) \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py b/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py deleted file mode 100644 index eca8f1ef87c652c04ae754204e6686bb30cac37f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_infer.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - - # initialize the model using the config file - if cfg.pretrained_model and os.path.exists(cfg.pretrained_model): - model = ZeroShotIntentModel.restore_from(cfg.pretrained_model, strict=False) - else: - raise ValueError('Provide path to the pre-trained .nemo checkpoint') - - # predicting an intent of a query - queries = [ - "I'd like a veggie burger and fries", - "Turn off the lights in the living room", - ] - - candidate_labels = ['Food order', 'Play music', 'Request for directions', 'Change lighting', 'Calendar query'] - - predictions = model.predict(queries, candidate_labels, batch_size=4, multi_label=True) - - logging.info('The prediction results of some sample queries with the trained model:') - for query in predictions: - logging.info(json.dumps(query, indent=4)) - logging.info("Inference finished!") - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py b/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py deleted file mode 100644 index 5b91049e965db0b5f642f59ff7448c5405168333..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/nlp/zero_shot_intent_recognition/zero_shot_intent_train.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.nlp.models import ZeroShotIntentModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="zero_shot_intent_config") -def main(cfg: DictConfig) -> None: - logging.info(f'Config Params:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - # initialize the model using the config file - model = ZeroShotIntentModel(cfg.model, trainer=trainer) - - # training - logging.info("================================================================================================") - logging.info('Starting training...') - trainer.fit(model) - logging.info('Training finished!') - if cfg.model.nemo_path: - model.save_to(cfg.model.nemo_path) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/README.md b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/README.md deleted file mode 100644 index ac11e43f2ace0b012d9a98d8952cafe68b338188..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# NeMo End-to-End Speech Intent Classification and Slot Filling on SLURP Dataset - -## Introduction -This example shows how to train an end-to-end model for spoken language understanding on the SLURP dataset [2]. The model is an encoder-decoder framework, where the encoder is a Conformer-large [3] model initialized from [here](https://ngc.nvidia.com/models/nvidia:nemo:stt_en_conformer_ctc_large), while the decoder is a Transformer decoder [4] that is randomly initialized. The model is trained by minimizing the negative log-likelihood (NLL) loss with teacher forcing. - -## Results - -We present the main results of our models in the following table. -| | | | **Intent (Scenario_Action)** | | **Entity** | | | **SLURP Metrics** | | -|--------------------------------------------------|----------------|--------------------------|------------------------------|---------------|------------|--------|--------------|-------------------|---------------------| -| **Model** | **Params (M)** | **Pretrained** | **Accuracy** | **Precision** | **Recall** | **F1** | **Precision** | **Recall** | **F1** | -| NeMo-Conformer-Transformer-Large | 127 | NeMo ASR-Set 3.0 | 90.14 | 86.46 | 82.29 | 84.33 | 84.31 | 80.33 | 82.27 | -| NeMo-Conformer-Transformer-Large | 127 | NeMo SSL-LL60kh | 89.04 | 73.19 | 71.8 | 72.49 | 77.9 | 76.65 | 77.22 | -| NeMo-Conformer-Transformer-Large | 127 | None | 72.56 | 43.19 | 43.5 | 43.34 | 53.59 | 53.92 | 53.76 | -| NeMo-Conformer-Transformer-XLarge | 617 | NeMo SSL-LL60kh | 91.04 | 76.67 | 74.36 | 75.49 | 82.44 | 80.14 | 81.28 | - -Note: LL60kh refers to the Libri-Light dataset [7]. - -## Usage -Please install [NeMo](https://github.com/NVIDIA/NeMo) [1] before proceeding. **All following scripts are run under the current directory of this README.md file**. - -### Data Preparation -1. Under the current directory, run the following script to download and process data. -```bash -python ../../../scripts/dataset_processing/process_slurp_data.py \ - --data_dir="./slurp_data" \ - --text_key="semantics" \ - --suffix="slu" -``` - -2. 
Download evaluation code: -```bash -wget https://github.com/pswietojanski/slurp/raw/master/scripts/evaluation/util.py -P eval_utils/evaluation -wget https://github.com/pswietojanski/slurp/raw/master/scripts/evaluation/metrics/distance.py -P eval_utils/evaluation/metrics -wget https://github.com/pswietojanski/slurp/raw/master/scripts/evaluation/metrics/metrics.py -P eval_utils/evaluation/metrics -``` - - -### Building Tokenizers -1. Build the tokenizer for slu by running: -```bash -DATA_ROOT="./slurp_data" -python ../../../scripts/tokenizers/process_asr_text_tokenizer.py \ - --manifest="${DATA_ROOT}/train_slu.json,${DATA_ROOT}/train_synthetic_slu.json" \ - --data_root="${DATA_ROOT}/tokenizers_slu/" \ - --vocab_size=58 \ - --tokenizer="spe" \ - --spe_type="unigram" \ - --log \ - --spe_bos \ - --spe_eos \ - --spe_pad -``` - - -### Training -Run with the default config that uses ASR-pretrained encoder on NeMo ASR-set 3.0. The default batch size is set to 16 for a GPU with 32GB memory, please adjust it to your own case. - -```bash -DATA_DIR="./slurp_data" -EXP_NAME="slurp_conformer_transformer_large" -CUDA_VISIBLE_DEVICES=0 python run_speech_intent_slot_train.py \ - --config-path="./configs" --config-name=conformer_transformer_large_bpe \ - model.train_ds.manifest_filepath="[${DATA_DIR}/train_slu.json,${DATA_DIR}/train_synthetic_slu.json]" \ - model.validation_ds.manifest_filepath="${DATA_DIR}/devel_slu.json" \ - model.test_ds.manifest_filepath="${DATA_DIR}/test_slu.json" \ - model.tokenizer.dir="${DATA_DIR}/tokenizers_slu/tokenizer_spe_unigram_v58_pad_bos_eos" \ - model.train_ds.batch_size=16 \ - model.validation_ds.batch_size=16 \ - model.test_ds.batch_size=16 \ - trainer.devices=1 \ - trainer.max_epochs=100 \ - model.optim.sched.warmup_steps=2000 \ - exp_manager.name=$EXP_NAME \ - exp_manager.resume_if_exists=true \ - exp_manager.resume_ignore_no_checkpoint=true -``` - - -### Evaluation -After training, we can evaluate the model by running the following script, which will first perform checkpoint averaging and then run beam search with the averaged checkpoint on the test set. -```bash -DATA_DIR="./slurp_data" -EXP_NAME="slurp_conformer_transformer_large" -CKPT_DIR="./nemo_experiments/${EXP_NAME}/checkpoints" -CKPT_AVG_DIR="../../../examples/slu/speech_intent_slot/${CKPT_DIR}" - -python ../../../scripts/checkpoint_averaging/checkpoint_averaging.py $CKPT_AVG_DIR - -NEMO_MODEL="${CKPT_DIR}/${EXP_NAME}-averaged.nemo" -CUDA_VISIBLE_DEVICES=0 python run_speech_intent_slot_eval.py \ - dataset_manifest="${DATA_DIR}/test_slu.json" \ - model_path=$NEMO_MODEL \ - batch_size=32 \ - num_workers=8 \ - sequence_generator.type="beam" \ - sequence_generator.beam_size=32 \ - sequence_generator.temperature=1.25 \ - only_score_manifest=false -``` - -### Using Encoder Finetuned on SLURP Speech Recognition -To learn how to finetune the Conformer encoder on SLURP ASR, please refer to the tutorials at -- [Finetuning CTC models on other languages](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb) -- [Self-Supervised pre-training for ASR](https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Self_Supervised_Pre_Training.ipynb) - - -## Pretrained Models -The pretrained models and directions on how to use them are available [here](https://ngc.nvidia.com/catalog/models/nvidia:nemo:slu_conformer_transformer_large_slurp). 
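If it helps to sanity-check the published checkpoint outside of the full evaluation pipeline, a minimal loading sketch is shown below. It uses the same `SLUIntentSlotBPEModel.from_pretrained` / `restore_from` calls that `eval_utils/inference.py` relies on; the NGC identifier `slu_conformer_transformer_large_slurp` is taken from the catalog link above and assumed to be accepted by `from_pretrained`, and the local `.nemo` path is only a placeholder.

```python
# Minimal sketch: load the pretrained SLURP intent/slot model for inference.
from nemo.collections.asr.models import SLUIntentSlotBPEModel

# Download the published checkpoint from NGC by name (assumed identifier), ...
model = SLUIntentSlotBPEModel.from_pretrained(model_name="slu_conformer_transformer_large_slurp")

# ... or restore a locally trained / checkpoint-averaged model instead (placeholder path).
# model = SLUIntentSlotBPEModel.restore_from(restore_path="checkpoints/slurp_conformer_transformer_large-averaged.nemo")

model.eval()  # inference only; use run_speech_intent_slot_eval.py for full SLURP scoring
```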
- - -## Reference -[1] [NVIDIA NeMo Toolkit](https://github.com/NVIDIA/NeMo) - -[2] [SLURP: A Spoken Language Understanding Resource Package](https://arxiv.org/abs/2011.13205) - -[3] [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100) - -[4] [Attention Is All You Need](https://arxiv.org/abs/1706.03762?context=cs) - -[5] [Integration of Pre-trained Networks with Continuous Token Interface for End-to-End Spoken Language Understanding](https://arxiv.org/abs/2104.07253) - -[6] [SpeechBrain SLURP Recipe](https://github.com/speechbrain/speechbrain/tree/develop/recipes/SLURP) - -[7] [Libri-Light: A Benchmark for ASR with Limited or No Supervision](https://arxiv.org/abs/1912.07875) - -## Acknowledgments -The evaluation code is borrowed from the official [SLURP package](https://github.com/pswietojanski/slurp/tree/master/scripts/evaluation), and some data processing code is adapted from [SpeechBrain SLURP Recipe](https://github.com/speechbrain/speechbrain/tree/develop/recipes/SLURP). diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/configs/conformer_transformer_large_bpe.yaml b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/configs/conformer_transformer_large_bpe.yaml deleted file mode 100644 index 5d309f3cd193c1c10463494fefeb3afada6a6efd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/configs/conformer_transformer_large_bpe.yaml +++ /dev/null @@ -1,211 +0,0 @@ -# Example config for speech intent classification and slot filling with Conformer-Transformer architecture. - -name: "Conformer-Transformer-BPE" - -pretrained_encoder: - name: stt_en_conformer_ctc_large - freeze: false - -model: - sample_rate: 16000 - log_prediction: true # enables logging sample predictions in the output during training - skip_nan_grad: false - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 16 # 16 for 32GB GPUs - shuffle: true - num_workers: 8 - pin_memory: true - use_start_end_token: true - trim_silence: false - max_duration: 11.0 - min_duration: 0.0 - # tarred datasets - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - # bucketing params - bucketing_strategy: "synced_randomized" - bucketing_batch_size: null - - validation_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - batch_size: 32 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: true - - test_ds: - manifest_filepath: null - sample_rate: ${model.sample_rate} - batch_size: 32 # you may increase batch_size if your memory allows - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: true - - # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - log: true - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - pad_value: 0.0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - # you may use lower time_masks for smaller models to have a faster convergence - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 # SSL conformer-large have only 17 layers - d_model: 512 - - # Sub-sampling params - subsampling: striding # vggnet or striding, vggnet may give better results but needs more memory - subsampling_factor: 4 # must be power of 2 - subsampling_conv_channels: -1 # -1 sets it to d_model - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [-1, -1] # -1 means unlimited context - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 31 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - embedding: - _target_: nemo.collections.asr.modules.transformer.TransformerEmbedding - vocab_size: -1 - hidden_size: ${model.encoder.d_model} - max_sequence_length: 512 - num_token_types: 1 - embedding_dropout: 0.0 - learn_positional_encodings: false - - decoder: - _target_: nemo.collections.asr.modules.transformer.TransformerDecoder - num_layers: 3 - hidden_size: ${model.encoder.d_model} - inner_size: 2048 - num_attention_heads: 8 - attn_score_dropout: 0.0 - attn_layer_dropout: 0.0 - ffn_dropout: 0.0 - - classifier: - _target_: nemo.collections.common.parts.MultiLayerPerceptron - hidden_size: ${model.encoder.d_model} - num_classes: -1 - num_layers: 1 - activation: 'relu' - log_softmax: true - - loss: - label_smoothing: 0.0 - - sequence_generator: - type: greedy # choices=[greedy, topk, beam] - max_sequence_length: ${model.embedding.max_sequence_length} - temperature: 1.0 # for top-k sampling - beam_size: 1 # K for top-k sampling, N for beam search - len_pen: 0 # for beam-search - - optim_param_groups: - encoder: - lr: 0.0002 - - optim: - name: adamw - lr: 0.0003 - # optimizer arguments - betas: [0.9, 0.98] - # less necessity for weight_decay as we already have large augmentations with SpecAug - # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations 
are used - weight_decay: 0.0 - - # scheduler setup - sched: - name: CosineAnnealing # WarmupAnnealing - warmup_steps: 2000 - warmup_ratio: null - min_lr: 1e-5 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 100 - max_steps: -1 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp_find_unused_parameters_true - accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 20 # Interval of logging. - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints - save_best_model: false - - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - # you need to set these two to True to continue the training - resume_if_exists: false - resume_ignore_no_checkpoint: false - - # You may use this section to create a W&B logger - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/evaluator.py b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/evaluator.py deleted file mode 100644 index e56711edf21520e0f1534b1573782ee1b651c129..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/evaluator.py +++ /dev/null @@ -1,178 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ast -from typing import Dict, List, Tuple, Union - -from .evaluation.metrics.metrics import ErrorMetric - - -def parse_semantics_str2dict(semantics_str: Union[List[str], str, Dict]) -> Tuple[Dict, bool]: - """ - This function parse the input string to a valid python dictionary for later evaluation. 
- Part of this function is adapted from - https://github.com/speechbrain/speechbrain/blob/develop/recipes/SLURP/direct/train_with_wav2vec2.py#L110-L127 - """ - invalid = False - if isinstance(semantics_str, dict): - return semantics_str, invalid - if isinstance(semantics_str, list): - semantics_str = " ".join(semantics_str) - - try: - if "|" in semantics_str: - semantics_str = semantics_str.replace("|", ",") - _dict = ast.literal_eval(semantics_str) - if not isinstance(_dict, dict): - _dict = { - "scenario": "none", - "action": "none", - "entities": [], - } - invalid = True - except Exception: # need this if the output is not a valid dict - _dict = { - "scenario": "none", - "action": "none", - "entities": [], - } - invalid = True - - if "scenario" not in _dict or not isinstance(_dict["scenario"], str): - _dict["scenario"] = "none" - invalid = True - if "action" not in _dict or not isinstance(_dict["action"], str): - _dict["action"] = "none" - invalid = True - if "entities" not in _dict: - _dict["entities"] = [] - invalid = True - else: - - def _parse_entity(item: Dict): - error = False - for key in ["type", "filler"]: - if key not in item or not isinstance(item[key], str): - item[key] = "none" - error = True - return item, error - - for i, x in enumerate(_dict["entities"]): - item, entity_error = _parse_entity(x) - invalid = invalid or entity_error - _dict["entities"][i] = item - - return _dict, invalid - - -class SLURPEvaluator: - """ - Evaluator class for calculating SLURP metrics - """ - - def __init__(self, average_mode: str = 'micro') -> None: - if average_mode not in ['micro', 'macro']: - raise ValueError(f"Only supports 'micro' or 'macro' average, but got {average_mode} instead.") - self.average_mode = average_mode - self.scenario_f1 = ErrorMetric.get_instance(metric="f1", average=average_mode) - self.action_f1 = ErrorMetric.get_instance(metric="f1", average=average_mode) - self.intent_f1 = ErrorMetric.get_instance(metric="f1", average=average_mode) - self.span_f1 = ErrorMetric.get_instance(metric="span_f1", average=average_mode) - self.distance_metrics = {} - for distance in ['word', 'char']: - self.distance_metrics[distance] = ErrorMetric.get_instance( - metric="span_distance_f1", average=average_mode, distance=distance - ) - self.slu_f1 = ErrorMetric.get_instance(metric="slu_f1", average=average_mode) - self.invalid = 0 - self.total = 0 - - def reset(self): - self.scenario_f1 = ErrorMetric.get_instance(metric="f1", average=self.average_mode) - self.action_f1 = ErrorMetric.get_instance(metric="f1", average=self.average_mode) - self.intent_f1 = ErrorMetric.get_instance(metric="f1", average=self.average_mode) - self.span_f1 = ErrorMetric.get_instance(metric="span_f1", average=self.average_mode) - self.distance_metrics = {} - for distance in ['word', 'char']: - self.distance_metrics[distance] = ErrorMetric.get_instance( - metric="span_distance_f1", average=self.average_mode, distance=distance - ) - self.slu_f1 = ErrorMetric.get_instance(metric="slu_f1", average=self.average_mode) - self.invalid = 0 - self.total = 0 - - def update(self, predictions: Union[List[str], str], groundtruth: Union[List[str], str]) -> None: - if isinstance(predictions, str): - predictions = [predictions] - if isinstance(groundtruth, str): - groundtruth = [groundtruth] - - for pred, truth in zip(predictions, groundtruth): - pred, syntax_error = parse_semantics_str2dict(pred) - truth, _ = parse_semantics_str2dict(truth) - self.scenario_f1(truth["scenario"], pred["scenario"]) - self.action_f1(truth["action"], 
pred["action"]) - self.intent_f1(f"{truth['scenario']}_{truth['action']}", f"{pred['scenario']}_{pred['action']}") - self.span_f1(truth["entities"], pred["entities"]) - for distance, metric in self.distance_metrics.items(): - metric(truth["entities"], pred["entities"]) - - self.total += 1 - self.invalid += int(syntax_error) - - def compute(self, aggregate=True) -> Dict: - scenario_results = self.scenario_f1.get_metric() - action_results = self.action_f1.get_metric() - intent_results = self.intent_f1.get_metric() - entity_results = self.span_f1.get_metric() - word_dist_results = self.distance_metrics['word'].get_metric() - char_dist_results = self.distance_metrics['char'].get_metric() - self.slu_f1(word_dist_results) - self.slu_f1(char_dist_results) - slurp_results = self.slu_f1.get_metric() - - if not aggregate: - return { - "scenario": scenario_results, - "action": action_results, - "intent": intent_results, - "entity": entity_results, - "word_dist": word_dist_results, - "char_dist": char_dist_results, - "slurp": slurp_results, - "invalid": self.invalid, - "total": self.total, - } - - scores = dict() - scores["invalid"] = self.invalid - scores["total"] = self.total - self.update_scores_dict(scenario_results, scores, "scenario") - self.update_scores_dict(action_results, scores, "action") - self.update_scores_dict(intent_results, scores, "intent") - self.update_scores_dict(entity_results, scores, "entity") - self.update_scores_dict(word_dist_results, scores, "word_dist") - self.update_scores_dict(char_dist_results, scores, "char_dist") - self.update_scores_dict(slurp_results, scores, "slurp") - - return scores - - def update_scores_dict(self, source: Dict, target: Dict, tag: str = '') -> Dict: - scores = source['overall'] - p, r, f1 = scores[:3] - target[f"{tag}_p"] = p - target[f"{tag}_r"] = r - target[f"{tag}_f1"] = f1 - return target diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/inference.py b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/inference.py deleted file mode 100644 index d83d48b688fc0bdeaba6e0be0ea605e3be4231a7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/eval_utils/inference.py +++ /dev/null @@ -1,240 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import contextlib -import glob -import json -import os -from dataclasses import dataclass, is_dataclass -from pathlib import Path -from typing import List, Optional - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf -from tqdm.auto import tqdm - -from nemo.collections.asr.models import SLUIntentSlotBPEModel -from nemo.collections.asr.parts.utils.slu_utils import SequenceGeneratorConfig -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@dataclass -class InferenceConfig: - # Required configs - model_path: Optional[str] = None # Path to a .nemo file - pretrained_name: Optional[str] = None # Name of a pretrained model - audio_dir: Optional[str] = None # Path to a directory which contains audio files - dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest - - # General configs - output_filename: Optional[str] = None - batch_size: int = 32 - num_workers: int = 8 - - # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA - # device anyway, and do inference on CPU only if CUDA device is not found. - # If `cuda` is a negative number, inference will be on CPU only. - cuda: Optional[int] = None - amp: bool = False - audio_type: str = "wav" - - # Recompute model transcription, even if the output folder exists with scores. - overwrite_transcripts: bool = True - - # Decoding strategy for semantic outputs - sequence_generator: SequenceGeneratorConfig = SequenceGeneratorConfig(type="greedy") - - -def slurp_inference(model, path2manifest: str, batch_size: int = 4, num_workers: int = 0,) -> List[str]: - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # We will store transcriptions here - hypotheses = [] - # Model's mode and device - mode = model.training - device = next(model.parameters()).device - dither_value = model.preprocessor.featurizer.dither - pad_to_value = model.preprocessor.featurizer.pad_to - - try: - model.preprocessor.featurizer.dither = 0.0 - model.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - model.eval() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - - config = { - 'manifest_filepath': path2manifest, - 'batch_size': batch_size, - 'num_workers': num_workers, - } - - temporary_datalayer = model._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", ncols=80): - predictions = model.predict( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - hypotheses += predictions - - del predictions - del test_batch - - finally: - # set mode back to its original value - model.train(mode=mode) - model.preprocessor.featurizer.dither = dither_value - model.preprocessor.featurizer.pad_to = pad_to_value - logging.set_verbosity(logging_level) - return hypotheses - - -@hydra_runner(config_name="InferenceConfig", schema=InferenceConfig) -def run_inference(cfg: InferenceConfig) -> InferenceConfig: - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - if cfg.model_path is None and cfg.pretrained_name is None: - raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!") - if cfg.audio_dir is None and cfg.dataset_manifest is None: - raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!") - - # setup GPU - if cfg.cuda is None: - if torch.cuda.is_available(): - device = [0] # use 0th CUDA device - accelerator = 'gpu' - 
else: - device = 1 - accelerator = 'cpu' - else: - device = [cfg.cuda] - accelerator = 'gpu' - - map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') - - # setup model - if cfg.model_path is not None: - # restore model from .nemo file path - logging.info(f"Restoring model : {cfg.model_path}") - model = SLUIntentSlotBPEModel.restore_from(restore_path=cfg.model_path, map_location=map_location) - model_name = os.path.splitext(os.path.basename(cfg.model_path))[0] - else: - # restore model by name - model = SLUIntentSlotBPEModel.from_pretrained(model_name=cfg.pretrained_name, map_location=map_location) - model_name = cfg.pretrained_name - - trainer = pl.Trainer(devices=device, accelerator=accelerator) - model.set_trainer(trainer) - model = model.eval() - - # Setup decoding strategy - model.set_decoding_strategy(cfg.sequence_generator) - - # get audio filenames - if cfg.audio_dir is not None: - filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)) - else: - # get filenames from manifest - filepaths = [] - if os.stat(cfg.dataset_manifest).st_size == 0: - logging.error(f"The input dataset_manifest {cfg.dataset_manifest} is empty. Exiting!") - return None - - manifest_dir = Path(cfg.dataset_manifest).parent - with open(cfg.dataset_manifest, 'r') as f: - has_two_fields = [] - for line in f: - item = json.loads(line) - if "offset" in item and "duration" in item: - has_two_fields.append(True) - else: - has_two_fields.append(False) - audio_file = Path(item['audio_filepath']) - if not audio_file.is_file() and not audio_file.is_absolute(): - audio_file = manifest_dir / audio_file - filepaths.append(str(audio_file.absolute())) - - logging.info(f"\nStart inference with {len(filepaths)} files...\n") - - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield - - # Compute output filename - if cfg.output_filename is None: - # create default output filename - if cfg.audio_dir is not None: - cfg.output_filename = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + '.json' - else: - cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{model_name}.json') - - # if transcripts should not be overwritten, and already exists, skip re-transcription step and return - if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename): - logging.info( - f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`" - f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text." 
- ) - - return cfg - - # transcribe audio - with autocast(): - with torch.no_grad(): - predictions = slurp_inference( - model=model, - path2manifest=cfg.dataset_manifest, - batch_size=cfg.batch_size, - num_workers=cfg.num_workers, - ) - - logging.info(f"Finished transcribing {len(filepaths)} files !") - - logging.info(f"Writing transcriptions into file: {cfg.output_filename}") - - # write audio transcriptions - with open(cfg.output_filename, 'w', encoding='utf-8') as f: - if cfg.audio_dir is not None: - for idx, text in enumerate(predictions): - item = {'audio_filepath': filepaths[idx], 'pred_text': text} - f.write(json.dumps(item) + "\n") - else: - with open(cfg.dataset_manifest, 'r') as fr: - for idx, line in enumerate(fr): - item = json.loads(line) - item['pred_text'] = predictions[idx] - f.write(json.dumps(item) + "\n") - - logging.info("Finished writing predictions !") - return cfg - - -if __name__ == '__main__': - run_inference() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_eval.py b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_eval.py deleted file mode 100644 index 8ed0abead342c7f3ec8b398ecdd9d4f904acc318..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_eval.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from dataclasses import dataclass, is_dataclass -from pathlib import Path -from typing import Optional - -import torch -from eval_utils.evaluation.util import format_results -from eval_utils.evaluator import SLURPEvaluator -from eval_utils.inference import InferenceConfig, run_inference -from omegaconf import MISSING, OmegaConf, open_dict - -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -@dataclass -class EvaluationConfig(InferenceConfig): - dataset_manifest: str = MISSING - output_filename: Optional[str] = "evaluation_transcripts.json" - average: str = "micro" - full: bool = False - errors: bool = False - table_layout: str = "fancy_grid" - only_score_manifest: bool = False - - -@hydra_runner(config_name="EvaluationConfig", schema=EvaluationConfig) -def main(cfg: EvaluationConfig): - torch.set_grad_enabled(False) - - cfg.output_filename = str(Path(Path(cfg.model_path).parent) / Path("predictions.json")) - - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - if cfg.audio_dir is not None: - raise RuntimeError( - "Evaluation script requires ground truth labels to be passed via a manifest file. " - "If manifest file is available, submit it via `dataset_manifest` argument." 
- ) - - if not os.path.exists(cfg.dataset_manifest): - raise FileNotFoundError(f"The dataset manifest file could not be found at path : {cfg.dataset_manifest}") - - if not cfg.only_score_manifest: - # Transcribe speech into an output directory - transcription_cfg = run_inference(cfg) # type: EvaluationConfig - - # Release GPU memory if it was used during transcription - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - logging.info("Finished transcribing speech dataset. Computing metrics..") - - else: - cfg.output_filename = cfg.dataset_manifest - transcription_cfg = cfg - - ground_truth_text = [] - predicted_text = [] - invalid_manifest = False - - with open(transcription_cfg.output_filename, 'r') as f: - for line in f: - data = json.loads(line) - - if 'pred_text' not in data: - invalid_manifest = True - break - - ground_truth_text.append(data['text']) - predicted_text.append(data['pred_text']) - - # Test for invalid manifest supplied - if invalid_manifest: - raise ValueError( - f"Invalid manifest provided: {transcription_cfg.output_filename} does not " - f"contain value for `pred_text`." - ) - - # Compute the metrics - evaluator = SLURPEvaluator(cfg.average) - evaluator.update(predictions=predicted_text, groundtruth=ground_truth_text) - results = evaluator.compute(aggregate=False) - total = results["total"] - invalid = results["invalid"] - slurp_f1 = results["slurp"]["overall"][2] - - print("-------------- Results --------------") - print( - format_results( - results=results["scenario"], - label="scenario", - full=cfg.full, - errors=cfg.errors, - table_layout=cfg.table_layout, - ), - "\n", - ) - - print( - format_results( - results=results["action"], label="action", full=cfg.full, errors=cfg.errors, table_layout=cfg.table_layout - ), - "\n", - ) - - print( - format_results( - results=results["intent"], - label="intent (scen_act)", - full=cfg.full, - errors=cfg.errors, - table_layout=cfg.table_layout, - ), - "\n", - ) - - print( - format_results( - results=results["entity"], - label="entities", - full=cfg.full, - errors=cfg.errors, - table_layout=cfg.table_layout, - ), - "\n", - ) - - print( - format_results( - results=results["word_dist"], - label="entities (word distance)", - full=cfg.full, - errors=cfg.errors, - table_layout=cfg.table_layout, - ), - "\n", - ) - - print( - format_results( - results=results["char_dist"], - label="entities (char distance)", - full=cfg.full, - errors=cfg.errors, - table_layout=cfg.table_layout, - ), - "\n", - ) - - print( - format_results( - results=results["slurp"], label="SLU F1", full=cfg.full, errors=cfg.errors, table_layout=cfg.table_layout - ), - "\n", - ) - - print(f"Found {invalid} out of {total} predictions that have syntax error.") - - # Inject the metric name and score into the config, and return the entire config - with open_dict(cfg): - cfg.metric_name = "slurp_f1" - cfg.metric_value = slurp_f1 - - return cfg - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_train.py b/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_train.py deleted file mode 100644 index d8989bf01cdd0c26a6707cc867dae37990d4ffd6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/slu/speech_intent_slot/run_speech_intent_slot_train.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -# Preparing the Tokenizer for the dataset -Use the `process_asr_text_tokenizer.py` script under /scripts/tokenizers/ in order to prepare the tokenizer. - -```sh -python /scripts/tokenizers/process_asr_text_tokenizer.py \ - --manifest= - OR - --data_file= \ - --data_root="" \ - --vocab_size= \ - --tokenizer=<"spe" or "wpe"> \ - --no_lower_case \ - --spe_type=<"unigram", "bpe", "char" or "word"> \ - --spe_character_coverage=1.0 \ - --log -``` - -# Training the model -```sh -python run_speech_intent_slot_train.py \ - # (Optional: --config-path= --config-name=) \ - model.train_ds.manifest_filepath= \ - model.validation_ds.manifest_filepath= \ - model.tokenizer.dir= \ - model.tokenizer.type= \ - trainer.devices=-1 \ - trainer.accelerator="gpu" \ - trainer.strategy="ddp" \ - trainer.max_epochs=100 \ - model.optim.name="adamw" \ - model.optim.lr=0.001 \ - model.optim.betas=[0.9,0.999] \ - model.optim.weight_decay=0.0001 \ - model.optim.sched.warmup_steps=2000 - exp_manager.create_wandb_logger=True \ - exp_manager.wandb_logger_kwargs.name="" \ - exp_manager.wandb_logger_kwargs.project="" -``` - -# Fine-tune a model - -For documentation on fine-tuning this model, please visit - -https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations - -# Pretrained Models - -For documentation on existing pretrained models, please visit - -https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_intent_slot/results.html - -""" - -from pathlib import Path - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf - -from nemo.collections.asr.models import ASRModel, SLUIntentSlotBPEModel, SpeechEncDecSelfSupervisedModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="./configs/", config_name="conformer_transformer_large_bpe") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = SLUIntentSlotBPEModel(cfg=cfg.model, trainer=trainer) - - # Init encoder from pretrained model - pretrained_encoder_name = cfg.pretrained_encoder.name - if pretrained_encoder_name is not None: - if Path(pretrained_encoder_name).is_file(): - logging.info(f"Loading pretrained encoder from local: {pretrained_encoder_name}") - pretraind_model = ASRModel.restore_from( - restore_path=pretrained_encoder_name, map_location=torch.device("cpu") - ) - model.encoder.load_state_dict(pretraind_model.encoder.state_dict(), strict=False) - del pretraind_model - else: - logging.info(f"Loading pretrained encoder from NGC: {pretrained_encoder_name}") - if pretrained_encoder_name.startswith("ssl_"): - model_cls = SpeechEncDecSelfSupervisedModel - elif pretrained_encoder_name.startswith("stt_"): - model_cls = ASRModel - else: - raise ValueError(f"Unknown pretrained encoder: 
{pretrained_encoder_name}") - pretraind_model = model_cls.from_pretrained( - model_name=pretrained_encoder_name, map_location=torch.device("cpu") - ) - model.encoder.load_state_dict(pretraind_model.encoder.state_dict(), strict=False) - del pretraind_model - else: - logging.info("Not using pretrained encoder.") - - if cfg.pretrained_encoder.freeze: - logging.info("Freezing encoder...") - model.encoder.freeze() - else: - model.encoder.unfreeze() - - trainer.fit(model) - - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if model.prepare_test(trainer): - trainer.test(model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/README.md b/SoundScribe/SpeakerID/examples/speaker_tasks/README.md deleted file mode 100644 index 3ee67d71fddcc195b7c268cc4b4a43a50d72eeea..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/README.md +++ /dev/null @@ -1,8 +0,0 @@ -Speaker tasks are broadly classified into two categories: -- [Speaker Recognition](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html) -- [Speaker Diarization](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html) - -**Speaker Recognition** is a research area that addresses two major tasks: speaker identification (what is the identity of the speaker?) and speaker verification (is the speaker who they claim to be?). **Speaker Diarization**, in contrast, is the task of segmenting audio recordings by speaker label (who speaks when?). - -In the *recognition* folder we provide scripts for training, inference, and verification of audio samples. -In the *diarization* folder we provide scripts for speaker diarization inference using pretrained VAD models (optional) and speaker embedding extractor models. diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/README.md b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/README.md deleted file mode 100644 index 2753f3b2b2129e7320d640298d570ad10694646c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/README.md +++ /dev/null @@ -1,326 +0,0 @@ -# Speaker Diarization - -Documentation for speaker-related tasks can be found at: - - [Speaker Diarization](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html) - - [Speaker Identification and Verification](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html) - - -## Features of NeMo Speaker Diarization -- Provides pretrained speaker embedding extractor models and VAD models. -- Does not need to be tuned on a dev set, while generally showing better performance than AHC+PLDA methods. -- Estimates the number of speakers in the given session. -- Provides an example script for ASR transcription with speaker labels.
- -## Supported Pretrained Speaker Embedding Extractor models -- [titanet_large](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large) -- [ecapa_tdnn](https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn) -- [speakerverification_speakernet](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speakerverification_speakernet) - -## Supported Pretrained VAD models -- [vad_multilingual_marblenet](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenet) -- [vad_marblenet](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_marblenet) -- [vad_telephony_marblenet](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_telephony_marblenet) - -## Supported ASR models -QuartzNet, CitriNet and Conformer-CTC models are supported. -Recommended models on NGC: -- [stt_en_quartznet15x5](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_quartznet15x5) -- [stt_en_conformer_ctc_large](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large) -- [stt_en_citrinet_1024](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_citrinet_1024) - -## Performance - -#### Clustering Diarizer -Diarization Error Rate (DER) table of `titanet_large.nemo` model on well known evaluation datasets. - -| Evaluation Condition | AMI(Lapel) | AMI(MixHeadset) | CH109 | NIST SRE 2000 | -|:--------------------------------------:|:--------------:|:-------------------:|:--------:|:-------------:| -| Domain Configuration | Meeting | Meeting |Telephonic| Telephonic | -| Oracle VAD
Known # of Speakers | 1.28 | 1.07 | 0.56 | 5.62 | -| Oracle VAD
Unknown # of Speakers | 1.28 | 1.4 | 0.88 | 4.33 | - -* All models were tested using the domain specific `.yaml` files which can be found in `conf/inference/` folder. -* The above result is based on the oracle Voice Activity Detection (VAD) result. -* This result is based on [titanet_large.nemo](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large) model. - -#### Neural Diarizer -Multi-scale Diarization Decoder (MSDD) model [Multi-scale Diarization decoder](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/model.html) -Diarization Error Rate (DER) table of [diar_msdd_telephonic.nemo](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic) model on telephonic speech datasets. - -| CH109| Forgiving | Fair | Full | -|-------------------------------------:|---------------------------------|----------------------------------|---------------------------------| -| (collar, ignore_overlap)| (0.25, True) | (0.25, True) | (0.0, False) | -| False Alarm | - | 0.62% | 1.80% | -| Miss | - | 2.47% | 5.96% | -| Confusion | - | 0.43% | 2.10% | -| DER | **0.58%** | **3.52%** | **9.86%** | - - -| CALLHOME | Forgiving | Fair | Full | -|-------------------------------------:|---------------------------------|----------------------------------|---------------------------------| -| (collar, ignore_overlap)| (0.25, True) | (0.25, True) | (0.0, False) | -| False Alarm | - | 1.05% | 2.24% | -| Miss | - | 7.62% | 11.09% | -| Confusion | - | 4.06% | 6.03% | -| DER | **4.15%** | **12.73%** | **19.37%** | - -* Evaluation setting: Oracle VAD
Unknown number of speakers (max. 8) -* Clustering parameter: `max_rp_threshold=0.15` -* All models were tested using the domain specific `.yaml` files which can be found in `conf/inference/` folder. -* The above result is based on the oracle Voice Activity Detection (VAD) result. - -## Run Speaker Diarization on Your Audio Files - -#### Example script for clustering diarizer: with system-VAD -```bash - python clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_output' \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path= \ - diarizer.speaker_embeddings.model_path= -``` - -#### Example script for neural diarizer: with system-VAD -```bash - python neural_diarizer/multiscale_diar_decoder_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_output' \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path= \ - diarizer.speaker_embeddings.model_path= \ - diarizer.msdd_model.model_path= \ -``` - -If you have oracle VAD files and groundtruth RTTM files for evaluation: -Provide rttm files in the input manifest file and enable oracle_vad as shown below. -```bash -... - diarizer.oracle_vad=True \ -... -``` - -#### Arguments -      To run speaker diarization on your audio recordings, you need to prepare the following file. - -- **`diarizer.manifest_filepath`: ** Path to manifest file - -Example: `manifest.json` - -```bash -{"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, label: "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath"="/path/to/uem/filepath"} -``` -Mandatory fields are `audio_filepath`, `offset`, `duration`, `label:"infer"` and `text: ` , and the rest are optional keys which can be passed based on the type of evaluation - -Some of important options in config file: - -- **`diarizer.vad.model_path`: voice activity detection model name or path to the model** - -Specify the name of VAD model, then the script will download the model from NGC. Currently, we have 'vad_multilingual_marblenet', 'vad_marblenet' and 'vad_telephony_marblenet' as options for VAD models. - -`diarizer.vad.model_path='vad_multilingual_marblenet'` - - -Instead, you can also download the model from [vad_multilingual_marblenet](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenet), [vad_marblenet](https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_marblenet) and [vad_telephony_marblenet](https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_telephony_marblenet) and specify the full path name to the model as below. - -`diarizer.vad.model_path='path/to/vad_multilingual_marblenet.nemo'` - -- **`diarizer.speaker_embeddings.model_path`: speaker embedding model name** - -Specify the name of speaker embedding model, then the script will download the model from NGC. Currently, we support 'titanet_large', 'ecapa_tdnn' and 'speakerverification_speakernet'. - -`diarizer.speaker_embeddings.model_path='titanet_large'` - -You could also download *.nemo files from [this link](https://ngc.nvidia.com/catalog/models?orderBy=scoreDESC&pageNumber=0&query=SpeakerNet&quickFilter=&filters=) and specify the full path name to the speaker embedding model file (`*.nemo`). 
- -`diarizer.speaker_embeddings.model_path='path/to/titanet_large.nemo'` - - -- **`diarizer.speaker_embeddings.parameters.multiscale_weights`: multiscale diarization weights** - -The multiscale diarization system employs multiple scales at the same time to obtain a finer temporal resolution. To use the multiscale feature, at least two scales and their corresponding scale weights should be provided. The scales should be provided in descending order, from the longest scale to the base scale (the shortest). If multiple scales are provided, `multiscale_weights` must also be provided as a list. The following example shows how the multiscale parameters are specified, along with the recommended values; see also the sketch after the multiscale example below. - -- **`diarizer.msdd_model.model_path`: neural diarizer (multiscale diarization decoder) name** - -If you want to use a neural diarizer model (e.g., an MSDD model), specify its name and the script will download the model from NGC. Currently, we support 'diar_msdd_telephonic'. - -Note that you should not specify a scale setting that does not match the MSDD model you are using. For example, the `diar_msdd_telephonic` model is based on 5 scales, as specified in its model config. - -`diarizer.msdd_model.model_path='diar_msdd_telephonic'` - -You could also download [diar_msdd_telephonic](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic) -and specify the full path to the MSDD model file (`*.nemo`). - -`diarizer.msdd_model.model_path='path/to/diar_msdd_telephonic.nemo'` - - -#### Example script: single-scale and multiscale - -Single-scale setting: -```bash - python offline_diar_infer.py \ - ... ... - parameters.window_length_in_sec=1.5 \ - parameters.shift_length_in_sec=0.75 \ - parameters.multiscale_weights=null \ -``` - -Multiscale setting (base scale: window length 0.5 s, shift length 0.25 s): - -```bash - python offline_diar_infer.py \ - ... ... - parameters.window_length_in_sec=[1.5,1.0,0.5] \ - parameters.shift_length_in_sec=[0.75,0.5,0.25] \ - parameters.multiscale_weights=[0.33,0.33,0.33] \ -``` - -
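To make the role of `multiscale_weights` more concrete, the sketch below shows one way per-scale cosine-similarity affinities can be fused with the scale weights before clustering. This is an illustration of the idea only, not the NeMo implementation: it assumes the embeddings from every scale have already been mapped onto the base-scale segments, and the function and variable names are made up for the example.

```python
# Illustrative sketch: fuse per-scale cosine-similarity affinity matrices with
# multiscale weights. Assumes each scale's embeddings are aligned to the base scale,
# so every array has shape [num_base_segments, emb_dim].
import numpy as np

def fused_affinity(per_scale_embs, multiscale_weights):
    weights = np.asarray(multiscale_weights, dtype=float)
    weights = weights / weights.sum()          # normalize so the fused affinity stays in [-1, 1]
    num_segments = per_scale_embs[0].shape[0]
    fused = np.zeros((num_segments, num_segments))
    for w, emb in zip(weights, per_scale_embs):
        unit = emb / np.linalg.norm(emb, axis=1, keepdims=True)  # unit-normalize rows
        fused += w * (unit @ unit.T)                             # weighted cosine affinity
    return fused

# Example: three scales, ten base-scale segments, 192-dim embeddings
rng = np.random.default_rng(0)
embs = [rng.standard_normal((10, 192)) for _ in range(3)]
print(fused_affinity(embs, [0.33, 0.33, 0.33]).shape)  # (10, 10)
```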
- -## Run Speech Recognition with Speaker Diarization - -Using the script `offline_diar_with_asr_infer.py`, you can transcribe your audio recording with speaker labels as shown below: - -``` -[00:03.34 - 00:04.46] speaker_0: back from the gym oh good how's it going -[00:04.46 - 00:09.96] speaker_1: oh pretty well it was really crowded today yeah i kind of assumed everyone would be at the shore uhhuh -[00:12.10 - 00:13.97] speaker_0: well it's the middle of the week or whatever so -``` - -Currently, NeMo offline diarization inference supports QuartzNet English model and ConformerCTC ASR models (e.g.,`QuartzNet15x5Base-En`, `stt_en_conformer_ctc_large`). - -#### Example script - -```bash -python offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_asr_output' \ - diarizer.speaker_embeddings.model_path= \ - diarizer.asr.model_path= \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.asr.parameters.asr_based_vad=True -``` -If you have reference rttm files or oracle number of speaker information, you can provide those file paths and number of speakers in the manifest file path and pass `diarizer.clustering.parameters.oracle_num_speakers=True` as shown in the following example. - -```bash -python offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_asr_output' \ - diarizer.speaker_embeddings.model_path= \ - diarizer.asr.model_path= \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.clustering.parameters.oracle_num_speakers=True -``` - -#### Output folders - -The above script will create a folder named `./demo_asr_output/`. -For example, in `./demo_asr_output/`, you can check the results as below. - -```bash -./asr_with_diar -├── pred_rttms - └── my_audio1.json - └── my_audio1.txt - └── my_audio1.rttm - └── my_audio1_gecko.json -│ -└── speaker_outputs - └── oracle_vad_manifest.json - └── subsegments_scale2_cluster.label - └── subsegments_scale0.json - └── subsegments_scale1.json - └── subsegments_scale2.json -... -``` - -`my_audio1.json` file contains word-by-word json output with speaker label and time stamps. We also provide a json output file for [gecko](https://gong-io.github.io/gecko/) tool, where you can visualize the diarization result along with the ASR output. - -Example: `./demo_asr_output/pred_rttms/my_audio1.json` -```bash -{ - "status": "Success", - "session_id": "my_audio1", - "transcription": "back from the gym oh good ...", - "speaker_count": 2, - "words": [ - { - "word": "back", - "start_time": 0.44, - "end_time": 0.56, - "speaker_label": "speaker_0" - }, -... - { - "word": "oh", - "start_time": 1.74, - "end_time": 1.88, - "speaker_label": "speaker_1" - }, - { - "word": "good", - "start_time": 2.08, - "end_time": 3.28, - "speaker_label": "speaker_1" - }, -``` - -`*.txt` files in `pred_rttms` folder contain transcriptions with speaker labels and corresponding time. 
- -Example: `./demo_asr_output/pred_rttms/my_audio1.txt` -``` -[00:03.34 - 00:04.46] speaker_0: back from the gym oh good how's it going -[00:04.46 - 00:09.96] speaker_1: pretty well it was really crowded today yeah i kind of assumed everylonewould be at the shore uhhuh -[00:12.10 - 00:13.97] speaker_0: well it's the middle of the week or whatever so -[00:13.97 - 00:15.78] speaker_1: but it's the fourth of july mm -[00:16.90 - 00:21.80] speaker_0: so yeah people still work tomorrow do you have to work tomorrow did you drive off yesterday -``` - -In `speaker_outputs` folder we have three kinds of files as follows: - - - `oracle_vad_manifest.json` file contains oracle VAD labels that are extracted from RTTM files. - - `subsegments_scale.json` is a manifest file for subsegments, which includes segment-by-segment start and end time with original wav file path. In multi-scale mode, this file is generated for each ``. - - `subsegments_scale_cluster.label` file contains the estimated cluster labels for each segment. This file is only generated for the base scale index in multi-scale diarization mode. - - -### Optional Features for Speech Recognition with Speaker Diarization - -#### Beam Search Decoder - -Beam-search decoder can be applied to CTC based ASR models. To use this feature, [pyctcdecode](https://github.com/kensho-technologies/pyctcdecode) should be installed. [pyctcdecode](https://github.com/kensho-technologies/pyctcdecode) supports word timestamp generation and can be applied to speaker diarization. pyctcdecode also requires [KenLM](https://github.com/kpu/kenlm) and KenLM is recommended to be installed using PyPI. Install pyctcdecode in your environment with the following commands: -``` -pip install pyctcdecode -pip install https://github.com/kpu/kenlm/archive/master.zip -``` -You should provide a trained KenLM language model to use pyctcdecode. Binary or `.arpa` format can be provided to hydra configuration as below. - -```bash - python offline_diar_with_asr_infer.py \ - ... ... - diarizer.asr.ctc_decoder_parameters.pretrained_language_model="/path/to/kenlm_language_model.binary" -``` -You can download publicly available language models (`.arpa` files) at [KALDI Tedlium Language Models](https://kaldi-asr.org/models/m5). Download [4-gram Big ARPA](https://kaldi-asr.org/models/5/4gram_big.arpa.gz) and provide the model path. - -The following CTC decoder parameters can be modified to optimize the performance. -`diarizer.asr.ctc_decoder_parameters.beam_width` (default: 32) -`diarizer.asr.ctc_decoder_parameters.alpha` (default: 0.5) -`diarizer.asr.ctc_decoder_parameters.beta` (default: 2.5) - -#### Realign Words with a Language Model (Experimental) - -Diarization result with ASR transcript can be enhanced by applying a language model. To use this feature, python package [arpa](https://pypi.org/project/arpa/) should be installed. -``` -pip install arpa -``` -`diarizer.asr.realigning_lm_parameters.logprob_diff_threshold` can be modified to optimize the diarization performance (default value is 1.2). The lower the threshold, the more changes are expected to be seen in the output transcript. - -`arpa` package also uses KenLM language models as in pyctcdecode. You can download publicly available [4-gram Big ARPA](https://kaldi-asr.org/models/5/4gram_big.arpa.gz) model and provide the model path to hydra configuration as follows. - - -```bash -python offline_diar_with_asr_infer.py \ - ... ... 
- diarizer.asr.realigning_lm_parameters.logprob_diff_threshold=1.2 \ - diarizer.asr.realigning_lm_parameters.arpa_language_model="/path/to/4gram_big.arpa"\ -``` diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py deleted file mode 100644 index 35077a5fe41525d636a3a124deff34df53f0dfc9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from omegaconf import OmegaConf -from pytorch_lightning import seed_everything - -from nemo.collections.asr.models import ClusteringDiarizer -from nemo.core.config import hydra_runner -from nemo.utils import logging - -""" -This script demonstrates how to use run speaker diarization. -Usage: - python offline_diar_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_output' \ - diarizer.speaker_embeddings.model_path= \ - diarizer.vad.model_path='vad_marblenet' \ - diarizer.speaker_embeddings.parameters.save_embeddings=False - -Check out whole parameters in ./conf/offline_diarization.yaml and their meanings. -For details, have a look at /tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb -""" - -seed_everything(42) - - -@hydra_runner(config_path="../conf/inference", config_name="diar_infer_meeting.yaml") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - sd_model = ClusteringDiarizer(cfg=cfg).to(cfg.device) - sd_model.diarize() - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py deleted file mode 100644 index da9671a2fc430cce4eb209dd1db855fe1e455f14..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from omegaconf import OmegaConf - -from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps -from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR -from nemo.core.config import hydra_runner -from nemo.utils import logging - - -""" -This script demonstrates how to run offline speaker diarization with asr. -Usage: -python offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath= \ - diarizer.out_dir='demo_asr_output' \ - diarizer.speaker_embeddings.model_path= \ - diarizer.asr.model_path= \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.speaker_embeddings.parameters.save_embeddings=False - -Check out whole parameters in ./conf/offline_diarization_with_asr.yaml and their meanings. -For details, have a look at /tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb -Currently, the following NGC models are supported: - - stt_en_quartznet15x5 - stt_en_citrinet* - stt_en_conformer_ctc* - -""" - - -@hydra_runner(config_path="../conf/inference", config_name="diar_infer_meeting.yaml") -def main(cfg): - - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - # ASR inference for words and word timestamps - asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer) - asr_model = asr_decoder_ts.set_asr_model() - word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model) - - # Create a class instance for matching ASR and diarization results - asr_diar_offline = OfflineDiarWithASR(cfg.diarizer) - asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset - - # Diarization inference for speaker labels - diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp) - trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp) - - # If RTTM is provided and DER evaluation - if diar_score is not None: - metric, mapping_dict, _ = diar_score - - # Get session-level diarization error rate and speaker counting error - der_results = OfflineDiarWithASR.gather_eval_results( - diar_score=diar_score, - audio_rttm_map_dict=asr_diar_offline.AUDIO_RTTM_MAP, - trans_info_dict=trans_info_dict, - root_path=asr_diar_offline.root_path, - ) - - # Calculate WER and cpWER if reference CTM files exist - wer_results = OfflineDiarWithASR.evaluate( - hyp_trans_info_dict=trans_info_dict, - audio_file_list=asr_diar_offline.audio_file_list, - ref_ctm_file_list=asr_diar_offline.ctm_file_list, - ) - - # Print average DER, WER and cpWER - OfflineDiarWithASR.print_errors(der_results=der_results, wer_results=wer_results) - - # Save detailed session-level evaluation results in `root_path`. - OfflineDiarWithASR.write_session_level_result_in_csv( - der_results=der_results, - wer_results=wer_results, - root_path=asr_diar_offline.root_path, - csv_columns=asr_diar_offline.csv_columns, - ) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml deleted file mode 100644 index 09e4bb378c69b0ecfcd5609ad137054ce4814a28..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# This YAML file is created for all types of offline speaker diarization inference tasks in `/example/speaker_tasks/diarization` folder. 
-# The inference parameters for VAD, speaker embedding extractor, clustering module, MSDD module, ASR decoder are all included in this YAML file. -# All the keys under `diarizer` key (`vad`, `speaker_embeddings`, `clustering`, `msdd_model`, `asr`) can be selectively used for its own purpose and also can be ignored if the module is not used. -# The configurations in this YAML file is optimized to show balanced performances on various types of domain. VAD is optimized on multilingual ASR datasets and diarizer is optimized on DIHARD3 development set. -# An example line in an input manifest file (`.json` format): -# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"} -name: &name "ClusterDiarizer" - -num_workers: 1 -sample_rate: 16000 -batch_size: 64 -device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu) -verbose: True # enable additional logging - -diarizer: - manifest_filepath: ??? - out_dir: ??? - oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps - collar: 0.25 # Collar value for scoring - ignore_overlap: True # Consider or ignore overlap segments while scoring - - vad: - model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name - external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set - - parameters: # Tuned by detection error rate (false alarm + miss) on multilingual ASR evaluation datasets - window_length_in_sec: 0.63 # Window length in sec for VAD context input - shift_length_in_sec: 0.08 # Shift length in sec for generate frame level VAD prediction - smoothing: False # False or type of smoothing method (eg: median) - overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter - onset: 0.5 # Onset threshold for detecting the beginning and end of a speech - offset: 0.3 # Offset threshold for detecting the end of a speech - pad_onset: 0.2 # Adding durations before each speech segment - pad_offset: 0.2 # Adding durations after each speech segment - min_duration_on: 0.5 # Threshold for small non_speech deletion - min_duration_off: 0.5 # Threshold for short speech segment deletion - filter_speech_first: True - - speaker_embeddings: - model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet) - parameters: - window_length_in_sec: [1.9,1.2,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - shift_length_in_sec: [0.95,0.6,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25] - multiscale_weights: [1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33] - save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if clustering result is used for other models, such as `msdd_model`. - - clustering: - parameters: - oracle_num_speakers: False # If True, use num of speakers value provided in manifest file. - max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored. 
- enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated. - max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. - sparse_search_volume: 10 # The higher the number, the more values will be examined with more time. - maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. - - msdd_model: - model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) - parameters: - use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used. - infer_batch_size: 25 # Batch size for MSDD inference. - sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps. - seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False. - split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference. - diar_window_length: 50 # The length of split short sequence when split_infer is True. - overlap_infer_spk_limit: 5 # If the estimated number of speakers are larger than this number, overlap speech is not estimated. - - asr: - model_path: null # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes. - parameters: - asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference. - asr_based_vad_threshold: 1.0 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR based VAD. - asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null. - decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model. - word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2]. - word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'. - fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature. - colored_text: False # If True, use colored text to distinguish speakers in the output transcript. - print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript. - break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars) - - ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode) - pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file. - beam_width: 32 - alpha: 0.5 - beta: 2.5 - - realigning_lm_parameters: # Experimental feature - arpa_language_model: null # Provide a KenLM language model in .arpa format. - min_number_of_words: 3 # Min number of words for the left context. - max_number_of_words: 10 # Max number of words for the right context. - logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses. 
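As a quick way to sanity-check the configuration above, the YAML can also be loaded directly with OmegaConf and handed to `ClusteringDiarizer`, mirroring what `offline_diar_infer.py` does through Hydra. This is a minimal sketch; the paths are placeholders, and the required fields are the ones marked `???` in the YAML.

```python
# Sketch: run the clustering diarizer from diar_infer_general.yaml without Hydra.
# Paths are placeholders for your own manifest and output directory.
from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer

cfg = OmegaConf.load("conf/inference/diar_infer_general.yaml")
cfg.diarizer.manifest_filepath = "manifest.json"  # required field (??? in the YAML)
cfg.diarizer.out_dir = "diar_output"              # required field (??? in the YAML)

sd_model = ClusteringDiarizer(cfg=cfg)  # offline_diar_infer.py additionally calls .to(cfg.device)
sd_model.diarize()
```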
- diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml deleted file mode 100644 index 2d53d0916dde9e294dfa4cff78a1794b6bc31e6e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# This YAML file is created for all types of offline speaker diarization inference tasks in `/example/speaker_tasks/diarization` folder. -# The inference parameters for VAD, speaker embedding extractor, clustering module, MSDD module, ASR decoder are all included in this YAML file. -# All the keys under `diarizer` key (`vad`, `speaker_embeddings`, `clustering`, `msdd_model`, `asr`) can be selectively used for its own purpose and also can be ignored if the module is not used. -# The configurations in this YAML file is suitable for 3~5 speakers participating in a meeting and may not show the best performance on other types of dialogues. -# An example line in an input manifest file (`.json` format): -# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"} -name: &name "ClusterDiarizer" - -num_workers: 1 -sample_rate: 16000 -batch_size: 64 -device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu) -verbose: True # enable additional logging - -diarizer: - manifest_filepath: ??? - out_dir: ??? - oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps - collar: 0.25 # Collar value for scoring - ignore_overlap: True # Consider or ignore overlap segments while scoring - - vad: - model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name - external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set - - parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set) - window_length_in_sec: 0.63 # Window length in sec for VAD context input - shift_length_in_sec: 0.01 # Shift length in sec for generate frame level VAD prediction - smoothing: False # False or type of smoothing method (eg: median) - overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter - onset: 0.9 # Onset threshold for detecting the beginning and end of a speech - offset: 0.5 # Offset threshold for detecting the end of a speech - pad_onset: 0 # Adding durations before each speech segment - pad_offset: 0 # Adding durations after each speech segment - min_duration_on: 0 # Threshold for small non_speech deletion - min_duration_off: 0.6 # Threshold for short speech segment deletion - filter_speech_first: True - - speaker_embeddings: - model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet) - parameters: - window_length_in_sec: [3.0,2.5,2.0,1.5,1.0,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - shift_length_in_sec: [1.5,1.25,1.0,0.75,0.5,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. 
ex) 0.75 or [0.75,0.5,0.25] - multiscale_weights: [1,1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33] - save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if clustering result is used for other models, such as `msdd_model`. - - clustering: - parameters: - oracle_num_speakers: False # If True, use num of speakers value provided in manifest file. - max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored. - enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated. - max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. - sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. - maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. - - msdd_model: - model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) - parameters: - use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used. - infer_batch_size: 25 # Batch size for MSDD inference. - sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps. - seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False. - split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference. - diar_window_length: 50 # The length of split short sequence when split_infer is True. - overlap_infer_spk_limit: 5 # If the estimated number of speakers are larger than this number, overlap speech is not estimated. - - asr: - model_path: stt_en_conformer_ctc_large # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes. - parameters: - asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference. - asr_based_vad_threshold: 1.0 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR based VAD. - asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null. - decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model. - word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2]. - word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'. - fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature. - colored_text: False # If True, use colored text to distinguish speakers in the output transcript. - print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript. 
- break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars) - - ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode) - pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file. - beam_width: 32 - alpha: 0.5 - beta: 2.5 - - realigning_lm_parameters: # Experimental feature - arpa_language_model: null # Provide a KenLM language model in .arpa format. - min_number_of_words: 3 # Min number of words for the left context. - max_number_of_words: 10 # Max number of words for the right context. - logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses. - diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml deleted file mode 100644 index c9d7cdf32f45d544e372209d1bd7fc0da18674dd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# This YAML file is created for all types of offline speaker diarization inference tasks in `/example/speaker_tasks/diarization` folder. -# The inference parameters for VAD, speaker embedding extractor, clustering module, MSDD module, ASR decoder are all included in this YAML file. -# All the keys under `diarizer` key (`vad`, `speaker_embeddings`, `clustering`, `msdd_model`, `asr`) can be selectively used for its own purpose and also can be ignored if the module is not used. -# The configurations in this YAML file is suitable for telephone recordings involving 2~8 speakers in a session and may not show the best performance on the other types of acoustic conditions or dialogues. -# An example line in an input manifest file (`.json` format): -# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/file"} -name: &name "ClusterDiarizer" - -num_workers: 1 -sample_rate: 16000 -batch_size: 64 -device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu) -verbose: True # enable additional logging - -diarizer: - manifest_filepath: ??? - out_dir: ??? - oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps - collar: 0.25 # Collar value for scoring - ignore_overlap: True # Consider or ignore overlap segments while scoring - - vad: - model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name - external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. 
Only one of model_path or external_vad_manifest should be set - - parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set) - window_length_in_sec: 0.15 # Window length in sec for VAD context input - shift_length_in_sec: 0.01 # Shift length in sec for generate frame level VAD prediction - smoothing: "median" # False or type of smoothing method (eg: median) - overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter - onset: 0.1 # Onset threshold for detecting the beginning and end of a speech - offset: 0.1 # Offset threshold for detecting the end of a speech - pad_onset: 0.1 # Adding durations before each speech segment - pad_offset: 0 # Adding durations after each speech segment - min_duration_on: 0 # Threshold for small non_speech deletion - min_duration_off: 0.2 # Threshold for short speech segment deletion - filter_speech_first: True - - speaker_embeddings: - model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet) - parameters: - window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25] - multiscale_weights: [1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33] - save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if clustering result is used for other models, such as `msdd_model`. - - clustering: - parameters: - oracle_num_speakers: False # If True, use num of speakers value provided in manifest file. - max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored. - enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated. - max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. - sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. - maj_vote_spk_count: False # If True, take a majority vote on multiple p-values to estimate the number of speakers. - - msdd_model: - model_path: diar_msdd_telephonic # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD) - parameters: - use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used. - infer_batch_size: 25 # Batch size for MSDD inference. - sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps. - seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False. - split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference. - diar_window_length: 50 # The length of split short sequence when split_infer is True. - overlap_infer_spk_limit: 5 # If the estimated number of speakers are larger than this number, overlap speech is not estimated. - - asr: - model_path: stt_en_conformer_ctc_large # Provide NGC cloud ASR model name. 
stt_en_conformer_ctc_* models are recommended for diarization purposes. - parameters: - asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference. - asr_based_vad_threshold: 1.0 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR based VAD. - asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null. - decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model. - word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2]. - word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'. - fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature. - colored_text: False # If True, use colored text to distinguish speakers in the output transcript. - print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript. - break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars) - - ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode) - pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file. - beam_width: 32 - alpha: 0.5 - beta: 2.5 - - realigning_lm_parameters: # Experimental feature - arpa_language_model: null # Provide a KenLM language model in .arpa format. - min_number_of_words: 3 # Min number of words for the left context. - max_number_of_words: 10 # Max number of words for the right context. - logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses. - diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_5scl_15_05_50Povl_256x3x32x2.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_5scl_15_05_50Povl_256x3x32x2.yaml deleted file mode 100644 index 03cbfd363ed7c404e331c28196f55b1f97d78bd0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_5scl_15_05_50Povl_256x3x32x2.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Mutiscale diarization decoder (MSDD) is a speaker diarization model based on initializing clustering and multiscale segmentation input. -# Model name convention for MSDD: msdd_scl___Povl_xxx -# (Example) `msdd_5scl_15_05_50Povl_256x3x32x2.yaml` has 5 scales, the longest scale is 1.5 sec, the shortest scale is 0.5 sec, with 50 percent overlap, hidden layer size is 256, 3 LSTM layers, 32 CNN channels, 2 repeated Conv layers -# MSDD model checkpoint (.ckpt) and NeMo file (.nemo) contain speaker embedding model (TitaNet) and the speaker model is loaded along with standalone MSDD moodule. -# Note that MSDD models require more than one scale. Thus, the parameters in diarizer.speaker_embeddings.parameters should have more than one scale to function as a MSDD model. 
-# Example: a manifest line for training -# {"audio_filepath": "/path/to/audio01.wav", "offset": 390.83, "duration": 13.45, "text": "-", "num_speakers": 2, "rttm_filepath": "/path/to/audio01.rttm"} -name: "MultiscaleDiarDecoder" -sample_rate: 16000 -num_workers: 20 -batch_size: 7 - -model: - diarizer: - out_dir: null - oracle_vad: True # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps - speaker_embeddings: - model_path: ??? # .nemo local model path or pretrained model name (titanet_large is recommended) - parameters: - window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25] - multiscale_weights: [1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33] - save_embeddings: True # Save embeddings as pickle file for each audio input. - - num_workers: ${num_workers} - max_num_of_spks: 2 # Number of speakers per model. This is currently fixed at 2. - scale_n: 5 # Number of scales for MSDD model and initializing clustering. - soft_label_thres: 0.5 # Threshold for creating discretized speaker label from continuous speaker label in RTTM files. - emb_batch_size: 0 # If this value is bigger than 0, corresponding number of embedding vectors are attached to torch graph and trained. - - train_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: ${batch_size} - emb_batch_size: ${model.emb_batch_size} - shuffle: True - - validation_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - emb_batch_size: ${model.emb_batch_size} - shuffle: False - - test_ds: - manifest_filepath: null - emb_dir: null - sample_rate: 16000 - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - shuffle: False - seq_eval_mode: False - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: ${sample_rate} - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - msdd_module: - _target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module - num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 2. - hidden_size: 256 # Hidden layer size for linear layers in MSDD module - num_lstm_layers: 3 # Number of stacked LSTM layers - dropout_rate: 0.5 # Dropout rate - cnn_output_ch: 32 # Number of filters in a conv-net layer. - conv_repeat: 2 # Determins the number of conv-net layers. Should be greater or equal to 1. - emb_dim: 192 # Dimension of the speaker embedding vectors - scale_n: ${model.scale_n} # Number of scales for multiscale segmentation input - weighting_scheme: 'conv_scale_weight' # Type of weighting algorithm. Options: ('conv_scale_weight', 'attn_scale_weight') - context_vector_type: 'cos_sim' # Type of context vector: options. 
Options: ('cos_sim', 'elem_prod') - - loss: - _target_: nemo.collections.asr.losses.bce_loss.BCELoss - weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5]) - - optim: - name: adam - lr: .001 - weight_decay: 0.001 - - sched: - name: CosineAnnealing - min_lr: 0.00001 - -trainer: - devices: 1 # number of gpus (devices) - accelerator: gpu - max_epochs: 200 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - strategy: ddp - accumulate_grad_batches: 1 - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - create_wandb_logger: False - checkpoint_callback_params: - monitor: "val_loss" - mode: "min" - save_top_k: 30 - every_n_epochs: 1 - wandb_logger_kwargs: - name: null - project: null diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_6scl_30_05_50Povl_256x3x32x2.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_6scl_30_05_50Povl_256x3x32x2.yaml deleted file mode 100644 index e4daf972e024f9a64abf7f0c74a6fc993cbeb3c1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/conf/neural_diarizer/msdd_6scl_30_05_50Povl_256x3x32x2.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Mutiscale diarization decoder (MSDD) is a speaker diarization model based on initializing clustering and multiscale segmentation input. -# Model name convention for MSDD: msdd_scl___Povl_xxx -# Example: `msdd_6scl_30_05_50Povl_256x3x32x2.yaml` has 6 scales, the longest scale is 3.0 sec, the shortest scale is 0.5 sec, with 50 percent overlap, hidden layer size is 256, 3 LSTM layers, 32 CNN channels, 2 repeated Conv layers -# MSDD model checkpoint (.ckpt) and NeMo file (.nemo) contain speaker embedding model (TitaNet) and the speaker model is loaded along with standalone MSDD moodule. -# Note that MSDD models require more than one scale. Thus, the parameters in diarizer.speaker_embeddings.parameters should have more than one scale to function as a MSDD model. -# Example: a manifest line for training -# {"audio_filepath": "/path/to/audio01.wav", "offset": 390.83, "duration": 13.45, "text": "-", "num_speakers": 2, "rttm_filepath": "/path/to/audio01.rttm"} -name: "MultiscaleDiarDecoder" -sample_rate: 16000 -num_workers: 20 -batch_size: 7 - -model: - diarizer: - out_dir: null - oracle_vad: True # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps - speaker_embeddings: - model_path: ??? # .nemo local model path or pretrained model name (titanet_large is recommended) - parameters: - window_length_in_sec: [3.0,2.5,2.0,1.5,1.0,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - shift_length_in_sec: [1.5,1.25,1.0,0.75,0.5,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25] - multiscale_weights: [1,1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33] - save_embeddings: True # Save embeddings as pickle file for each audio input. - - num_workers: ${num_workers} - max_num_of_spks: 2 # Number of speakers per model. 
This is currently fixed at 2. - scale_n: 6 # Number of scales for MSDD model and initializing clustering. - soft_label_thres: 0.5 # Threshold for creating discretized speaker label from continuous speaker label in RTTM files. - emb_batch_size: 0 # If this value is bigger than 0, corresponding number of embedding vectors are attached to torch graph and trained. - - train_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: ${batch_size} - emb_batch_size: ${model.emb_batch_size} - shuffle: True - - validation_ds: - manifest_filepath: ??? - emb_dir: ??? - sample_rate: ${sample_rate} - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - emb_batch_size: ${model.emb_batch_size} - shuffle: False - - test_ds: - manifest_filepath: null - emb_dir: null - sample_rate: 16000 - num_spks: ${model.max_num_of_spks} - soft_label_thres: ${model.soft_label_thres} - labels: null - batch_size: 2 - shuffle: False - seq_eval_mode: False - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: ${sample_rate} - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - msdd_module: - _target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module - num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 2. - hidden_size: 256 # Hidden layer size for linear layers in MSDD module - num_lstm_layers: 3 # Number of stacked LSTM layers - dropout_rate: 0.5 # Dropout rate - cnn_output_ch: 32 # Number of filters in a conv-net layer. - conv_repeat: 2 # Determins the number of conv-net layers. Should be greater or equal to 1. - emb_dim: 192 # Dimension of the speaker embedding vectors - scale_n: ${model.scale_n} # Number of scales for multiscale segmentation input - weighting_scheme: 'conv_scale_weight' # Type of weighting algorithm. Options: ('conv_scale_weight', 'attn_scale_weight') - context_vector_type: 'cos_sim' # Type of context vector: options. Options: ('cos_sim', 'elem_prod') - - loss: - _target_: nemo.collections.asr.losses.bce_loss.BCELoss - weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5]) - - optim: - name: adam - lr: .001 - weight_decay: 0.001 - - sched: - name: CosineAnnealing - min_lr: 0.00001 - -trainer: - devices: 1 # number of gpus (devices) - accelerator: gpu - max_epochs: 200 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - strategy: ddp - accumulate_grad_batches: 1 - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - create_wandb_logger: False - checkpoint_callback_params: - monitor: "val_loss" - mode: "min" - save_top_k: 30 - every_n_epochs: 1 - wandb_logger_kwargs: - name: null - project: null diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py deleted file mode 100644 index 984b5ce934641d1d3a611f6ab8b06fbb23f8d165..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl -from omegaconf import OmegaConf -from pytorch_lightning import seed_everything - -from nemo.collections.asr.models import EncDecDiarLabelModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -""" -Example training session (single GPU training on telephonic datasets) - -python ./multiscale_diar_decoder.py --config-path='../conf/neural_diarizer' --config-name='msdd_5scl_15_05_50Povl_256x3x32x2.yaml' \ - trainer.devices=1 \ - model.base.diarizer.speaker_embeddings.model_path="titanet_large" \ - model.train_ds.manifest_filepath="" \ - model.validation_ds.manifest_filepath="" \ - model.train_ds.emb_dir="" \ - model.validation_ds.emb_dir="" \ - exp_manager.name='sample_train' \ - exp_manager.exp_dir='./msdd_exp' -""" - -seed_everything(42) - - -@hydra_runner(config_path="../conf/neural_diarizer", config_name="msdd_5scl_15_05_50Povl_256x3x32x2.yaml") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - msdd_model = EncDecDiarLabelModel(cfg=cfg.model, trainer=trainer) - trainer.fit(msdd_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py b/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py deleted file mode 100644 index 05d1b3cd13040c3de2bd5e826bf9e0ef20fe54da..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.models.msdd_models import NeuralDiarizer -from nemo.core.config import hydra_runner - - -""" -Run the entire speaker diarization pipeline: VAD, clustering diarizer for initializing clustering then Multi-scale Diarization Decoder (MSDD). -python multiscale_diar_decoder_infer.py --config-path='../conf/inference' --config-name='diar_infer_telephonic.yaml' \ - diarizer.vad.model_path= \ - diarizer.msdd_model.model_path= \ - diarizer.oracle_vad=False \ - diarizer.manifest_filepath= \ - diarizer.out_dir= \ -""" - - -@hydra_runner(config_path="../conf/inference", config_name="diar_infer_telephonic.yaml") -def main(cfg): - diarizer_model = NeuralDiarizer(cfg=cfg).to(cfg.device) - diarizer_model.diarize() - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/README.md b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/README.md deleted file mode 100644 index 0e0f5ae3b4fc6f47da266989f76257784cf6615b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/README.md +++ /dev/null @@ -1,105 +0,0 @@ -# Speaker Recognition - -Documentation section for speaker related tasks can be found at: - - [Speaker Identification and Verification](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_recognition/intro.html) - - [Speaker Diarization](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speaker_diarization/intro.html) - -## Performance -| MODEL | type | EER (%)
Voxceleb-O (veri_test2.txt) |
-|:------------------------------:|:---------------------:|:--------------------------------------:|
-| [speakerverification_speakernet](https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet) | xvector | 1.96 |
-| [ecapa_tdnn](https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn) | channel-attention | 0.92 |
-| [titanet_large](https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_large) | channel-attention | 0.66 |
-
-## Training
-Speaker Recognition models can be trained in a similar way to other models in NeMo, using train and dev manifest files. Steps on how to create manifest files for VoxCeleb are provided below.
-We provide three model configurations based on TitaNet, SpeakerNet and a modified ECAPA_TDNN, with a pretrained model provided for each of them.
-
-For training the titanet_large (channel-attention) model:
-```bash
-python speaker_reco.py --config-path='conf' --config-name='titanet-large.yaml'
-```
-
-For training the speakernet (x-vector) model:
-```bash
-python speaker_reco.py --config-path='conf' --config-name='SpeakerNet_verification_3x2x256.yaml'
-```
-
-For training the ecapa_tdnn (channel-attention) model:
-```bash
-python speaker_reco.py --config-path='conf' --config-name='ecapa_tdnn.yaml'
-```
-For a step-by-step tutorial, see the [notebook](https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb).
-
-### Fine Tuning
-To fine-tune a pretrained .nemo speaker recognition model:
-```bash
-python speaker_reco_finetune.py --config-path='conf' --config-name='titanet-finetune.yaml'
-```
-For fine-tuning tips, see this [tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb).
-
-## Inference
-We provide generic scripts for manifest file creation, embedding extraction, VoxCeleb evaluation and speaker ID inference. Most of the steps are common and differ only slightly based on your end application.
-
-Here we explain the process for VoxCeleb EER calculation on the VoxCeleb-O cleaned [trial file](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt).
-
-### Manifest Creation
-We first generate a manifest file to get embeddings. The embeddings are then used by the `voxceleb_eval.py` script to compute EER.
-
-```bash
-# create list of files from voxceleb1 test folder (40 speaker test set)
-find -iname '*.wav' > voxceleb1_test_files.txt
-python /scripts/speaker_tasks/filelist_to_manifest.py --filelist voxceleb1_test_files.txt --id -3 --out voxceleb1_test_manifest.json
-```
-### Embedding Extraction
-Now, using the manifest file created above, we can extract embeddings to the `data` folder using:
-```bash
-python extract_speaker_embeddings.py --manifest=voxceleb1_test_manifest.json --model_path='titanet_large' --embedding_dir='./'
-```
-If you have a single audio file, you may also use the following one-liner to get its embedding:
-
-```python
-speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
-embs = speaker_model.get_embedding('audio_path')
-```
-
-### Voxceleb Evaluation
-```bash
-python voxceleb_eval.py --trial_file='/path/to/trial/file' --emb='./embeddings/voxceleb1_test_manifest_embeddings.pkl'
-```
-The above command reports the performance of the model on the VoxCeleb-O cleaned trial file.
-
-### SpeakerID inference
-Using data from an enrollment set, one can infer labels on a test set using various backends such as cosine similarity or a neural classifier.
-
-To infer speaker labels using the cosine_similarity backend:
-```bash
-python speaker_identification_infer.py data.enrollment_manifest= data.test_manifest= backend.backend_model=cosine_similarity
-```
-Refer to `conf/speaker_identification_infer.yaml` for more options.
-
-## Voxceleb Data Preparation
-
-The scripts we provide for data preparation are generic and can be applied to any dataset with a few path changes; a minimal sketch of the manifest format they read and write is shown below.
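Each manifest line is a standalone JSON object with `audio_filepath`, `offset`, `duration` and `label` fields, the same layout illustrated in `conf/speaker_identification_infer.yaml`. The short Python sketch below writes such a file from a plain list of wav paths; the label-from-path convention shown here is an illustrative assumption and not a substitute for `filelist_to_manifest.py`.

```python
import json

# Illustrative sketch: build a JSON-lines manifest from a list of wav files.
# The list and output names mirror the VoxCeleb commands in this section.
# Assumes the speaker label is the third path component from the end (the
# grandparent directory of each wav), matching the `--id -3` argument
# passed to filelist_to_manifest.py.
wav_list = "voxceleb12.txt"               # one wav path per line
out_manifest = "voxceleb12_manifest.json"

with open(wav_list, "r") as f_in, open(out_manifest, "w") as f_out:
    for path in (line.strip() for line in f_in if line.strip()):
        entry = {
            "audio_filepath": path,
            "offset": 0,
            "duration": None,               # null -> use the whole file
            "label": path.split('/')[-3],   # e.g. the speaker-id directory
        }
        f_out.write(json.dumps(entry) + "\n")
```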
-For VoxCeleb datasets, we first download the datasets individually and make a list of audio files. Then we use the script to generate manifest files for training and validation. -Download [voxceleb1](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) and [voxceleb2](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html) data. - -Once downloaded and uncompressed, use programs such as ffmpeg to convert audio files from m4a format to wav format. -Refer to the following sample command -```bash -ffmpeg -v 8 -i -f wav -acodec pcm_s16le -``` - -Generate a list file that contains paths to all the dev audio files from voxceleb1 and voxceleb2 using find command as shown below: -```bash -find -iname '*.wav' > voxceleb1_dev.txt -find -iname '*.wav' > voxceleb2_dev.txt -cat voxceleb1_dev.txt voxceleb2_dev.txt > voxceleb12.txt -``` - -This list file is now used to generate training and validation manifest files using a script provided in `/scripts/speaker_tasks/`. This script has optional arguments to split the whole manifest file in to train and dev and also chunk audio files to smaller segments for robust training (for testing, we don't need this). - -```bash -python /scripts/speaker_tasks/filelist_to_manifest.py --filelist voxceleb12.txt --id -3 --out voxceleb12_manifest.json --split --create_segments -``` -This creates `train.json, dev.json` in the current working directory. diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml deleted file mode 100644 index 35c0b65c6ade7c9df7b077860b284a56d29d3d29..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: &name "SpeakerNet" -sample_rate: &sample_rate 16000 -repeat: &rep 2 -dropout: &drop 0.5 -separable: &separable True -n_filters: &n_filters 512 - -model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 64 - shuffle: True - is_tarred: False - tarred_audio_filepaths: null - tarred_shard_strategy: "scatter" - - validation_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 128 - shuffle: False - - test_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 1 - shuffle: False - embedding_dir: '.' 
- - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.02 - sample_rate: *sample_rate - window_stride: 0.01 - window: "hann" - features: &n_mels 64 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels - activation: relu - conv_mask: true - - jasper: - - filters: *n_filters - repeat: 1 - kernel: [3] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [7] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [11] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [15] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: &enc_feat_out 1500 - repeat: 1 - kernel: [1] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: *separable - - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: 7205 - pool_mode: 'xvector' - emb_sizes: [512,512] - - loss: - _target_: from nemo.collections.common.losses.CrossEntropyLoss # you could also use AngularSoftmaxLoss - - optim: - name: novograd - # _target_: nemo.core.optim.optimizers.Novograd - lr: .008 - # optimizer arguments - args: - name: auto - # _target_: nemo.core.config.optimizers.NovogradParams - betas: [0.95, 0.5] - weight_decay: 0.001 - - # scheduler setup - sched: - name: CosineAnnealing - iters_per_batch: 1 # computed at runtime - max_steps: -1 # computed at runtime or explicitly set here - - # scheduler config override - args: - name: auto - # _target_: nemo.core.config.schedulers.CosineAnnealingParams - warmup_steps: null - warmup_ratio: 0.1 - min_lr: 0.0 - last_epoch: -1 - -trainer: - devices: 1 # number of gpus - max_epochs: 200 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - - -exp_manager: - exp_dir: null - name: *name - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_verification_3x2x256.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_verification_3x2x256.yaml deleted file mode 100644 index dbb06bfa0a4d5d9017c565a1c1131a5597792d58..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/SpeakerNet_verification_3x2x256.yaml +++ /dev/null @@ -1,133 +0,0 @@ -name: &name "SpeakerNet" -sample_rate: &sample_rate 16000 -repeat: &rep 2 -dropout: &drop 0.5 -separable: &separable True -n_filters: &n_filters 256 - -model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 64 - shuffle: True - is_tarred: False - tarred_audio_filepaths: null - tarred_shard_strategy: "scatter" - - validation_ds: - manifest_filepath: ??? 
- sample_rate: 16000 - labels: null - batch_size: 128 - shuffle: False - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.02 - sample_rate: *sample_rate - window_stride: 0.01 - window: "hann" - features: &n_mels 64 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels - activation: relu - conv_mask: true - - jasper: - - filters: *n_filters - repeat: 1 - kernel: [3] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [7] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [11] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: *n_filters - repeat: *rep - kernel: [15] - stride: [1] - dilation: [1] - dropout: *drop - residual: true - separable: *separable - - - filters: &enc_feat_out 1500 - repeat: 1 - kernel: [1] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: *separable - - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: 7205 - pool_mode: 'xvector' - emb_sizes: 256 - - loss: - _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss # you could also use cross-entrophy loss - scale: 30 - margin: 0.2 - - optim: - name: sgd - lr: .006 - weight_decay: 0.001 - momentum: 0.9 - - # scheduler setup - sched: - name: CosineAnnealing - warmup_ratio: 0.1 - min_lr: 0.0 - -trainer: - devices: 1 # number of gpus - max_epochs: 200 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - -exp_manager: - exp_dir: null - name: *name - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/ecapa_tdnn.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/ecapa_tdnn.yaml deleted file mode 100644 index 3636d77dd3b06635c439f4ca7b8c1554a45467e1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/ecapa_tdnn.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: "ECAPA_TDNN" - -model: - - sample_rate: 16000 - - train_ds: - manifest_filepath: ??? - sample_rate: ${model.sample_rate} - labels: null - batch_size: 64 - shuffle: True - augmentor: - noise: - manifest_path: null - prob: 0.5 - min_snr_db: 0 - max_snr_db: 15 - - speed: - prob: 0.5 - sr: ${model.sample_rate} - resample_type: 'kaiser_fast' - min_speed_rate: 0.95 - max_speed_rate: 1.05 - - validation_ds: - manifest_filepath: ??? 
- sample_rate: ${model.sample_rate} - labels: null - batch_size: 128 - shuffle: False - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: ${model.sample_rate} - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - stft_conv: false - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 3 - freq_width: 4 - time_masks: 5 - time_width: 0.03 - - - encoder: - _target_: nemo.collections.asr.modules.ECAPAEncoder - feat_in: ${model.preprocessor.features} - filters: [1024,1024,1024,1024,3072] - kernel_sizes: [5,3,3,3,1] - dilations: [1,1,1,1,1] - scale: 8 - - - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: 3072 - num_classes: 7205 - pool_mode: 'attention' #xvector,tap or attention - emb_sizes: 192 - - loss: - _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss # you could also use cross-entrophy loss - scale: 30 - margin: 0.2 - - optim: - name: sgd - lr: 0.08 - weight_decay: 0.0002 - - # scheduler setup - sched: - name: CosineAnnealing - warmup_ratio: 0.1 - min_lr: 0.0001 - -trainer: - devices: 1 # number of gpus (trained on four nodes - each node has 8 gpus) - max_epochs: 250 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - deterministic: False - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/speaker_identification_infer.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/speaker_identification_infer.yaml deleted file mode 100644 index 31d1f2fe1a7106752fad7b265bd1dec1b95b72e2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/speaker_identification_infer.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: &name "SpeakerIdentificationInfer" - -data: - enrollment_manifest: ??? - test_manifest: ??? - out_manifest: './infer_output.json' - sample_rate: 16000 - -backend: - backend_model: cosine_similarity # supported backends are cosine_similarity and neural_classifier - - cosine_similarity: - model_path: titanet_large # or path to .nemo file - batch_size: 32 - - neural_classifier: - model_path: ??? 
# path to neural model trained/finetuned with enrollment dataset - batch_size: 32 - -# json manifest line example -# -# enrollment_manifest: -# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": ""} -# -# test_manifest: -# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer"} -# diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml deleted file mode 100644 index ca3b2afc0670475442a479f8d7b8724a9d022123..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-finetune.yaml +++ /dev/null @@ -1,163 +0,0 @@ -name: &name "TitaNet-Finetune" -sample_rate: &sample_rate 16000 - -init_from_pretrained_model: - speaker_tasks: - name: 'titanet_large' - include: ["preprocessor","encoder"] - exclude: ["decoder.final"] # Add specific layer names here to exlude or just ["decoder"] if to exclude all of decoder pretrained weights - -model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 64 - shuffle: True - is_tarred: False - tarred_audio_filepaths: null - tarred_shard_strategy: "scatter" - augmentor: - speed: - prob: 0.3 - sr: *sample_rate - resample_type: 'kaiser_fast' - min_speed_rate: 0.95 - max_speed_rate: 1.05 - - validation_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 128 - shuffle: False - - model_defaults: - filters: 1024 - repeat: 3 - dropout: 0.1 - separable: true - se: true - se_context_size: -1 - kernel_size_factor: 1.0 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: *sample_rate - window_stride: 0.01 - window: "hann" - features: &n_mels 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels - activation: relu - conv_mask: true - - jasper: - - filters: ${model.model_defaults.filters} - repeat: 1 - kernel: [3] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [7] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [11] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [15] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: &enc_feat_out 3072 - repeat: 1 - kernel: [1] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: 
${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: ??? - pool_mode: 'attention' - emb_sizes: 192 - - loss: - _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss # you could also use cross-entrophy loss - scale: 30 - margin: 0.2 - - optim_param_groups: - encoder: - lr: .001 - - optim: - name: adamw - lr: .0001 #(original titanet-large was trained with 0.08 lr) - weight_decay: 0.0002 - - # scheduler setup - sched: - name: CosineAnnealing - warmup_ratio: 0.1 - min_lr: 0.0 - -trainer: - devices: 1 # number of gpus (original titanet-large was trained on 4 nodes with 8 gpus each) - max_epochs: 10 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - -exp_manager: - exp_dir: null - name: *name - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-large.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-large.yaml deleted file mode 100644 index e4859678a44e1cff7a6531781fb2b4a102d39867..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-large.yaml +++ /dev/null @@ -1,167 +0,0 @@ -name: &name "TitaNet-L" -sample_rate: &sample_rate 16000 - -model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 64 - shuffle: True - is_tarred: False - tarred_audio_filepaths: null - tarred_shard_strategy: "scatter" - augmentor: - noise: - manifest_path: null - prob: 0.5 - min_snr_db: 0 - max_snr_db: 15 - - speed: - prob: 0.3 - sr: *sample_rate - resample_type: 'kaiser_fast' - min_speed_rate: 0.95 - max_speed_rate: 1.05 - - validation_ds: - manifest_filepath: ??? 
- sample_rate: 16000 - labels: null - batch_size: 128 - shuffle: False - - model_defaults: - filters: 1024 - repeat: 3 - dropout: 0.1 - separable: true - se: true - se_context_size: -1 - kernel_size_factor: 1.0 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: *sample_rate - window_stride: 0.01 - window: "hann" - features: &n_mels 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 3 - freq_width: 4 - time_masks: 5 - time_width: 0.03 - - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels - activation: relu - conv_mask: true - - jasper: - - filters: ${model.model_defaults.filters} - repeat: 1 - kernel: [3] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [7] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [11] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [15] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: &enc_feat_out 3072 - repeat: 1 - kernel: [1] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - decoder: - _target_: nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: 7205 - pool_mode: 'attention' - emb_sizes: 192 - - loss: - _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss # you could also use cross-entrophy loss - scale: 30 - margin: 0.2 - - optim: - name: sgd - lr: .006 #(original titanet-large was trained with 0.08 lr) - weight_decay: 0.0002 - momentum: 0.9 - - # scheduler setup - sched: - name: CosineAnnealing - warmup_ratio: 0.1 - min_lr: 0.0 - -trainer: - devices: 1 # number of gpus (original titanet-large was trained on 4 nodes with 8 gpus each) - max_epochs: 250 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. 
- val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - -exp_manager: - exp_dir: null - name: *name - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-small.yaml b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-small.yaml deleted file mode 100644 index 5f5bd3f60e52810106ee7c959e90e3a0680fdc00..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/conf/titanet-small.yaml +++ /dev/null @@ -1,172 +0,0 @@ -name: &name "TitaNet-S" -sample_rate: &sample_rate 16000 - -model: - train_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 64 - shuffle: True - is_tarred: False - tarred_audio_filepaths: null - tarred_shard_strategy: "scatter" - augmentor: - noise: - manifest_path: null - prob: 0.5 - min_snr_db: 5 - max_snr_db: 15 - - impulse: - manifest_path: null - prob: 0.5 - - speed: - prob: 0.5 - sr: *sample_rate - resample_type: 'kaiser_fast' - min_speed_rate: 0.95 - max_speed_rate: 1.05 - - validation_ds: - manifest_filepath: ??? - sample_rate: 16000 - labels: null - batch_size: 128 - shuffle: False - - model_defaults: - filters: 256 - repeat: 3 - dropout: 0.1 - separable: true - se: true - se_context_size: -1 - kernel_size_factor: 1.0 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - normalize: "per_feature" - window_size: 0.025 - sample_rate: *sample_rate - window_stride: 0.01 - window: "hann" - features: &n_mels 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 3 - freq_width: 4 - time_masks: 5 - time_width: 0.03 - - encoder: - _target_: nemo.collections.asr.modules.ConvASREncoder - feat_in: *n_mels - activation: relu - conv_mask: true - - jasper: - - filters: ${model.model_defaults.filters} - repeat: 1 - kernel: [3] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [7] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [11] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: ${model.model_defaults.filters} - repeat: ${model.model_defaults.repeat} - kernel: [15] - stride: [1] - dilation: [1] - dropout: ${model.model_defaults.dropout} - residual: true - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - - filters: &enc_feat_out 3072 - repeat: 1 - kernel: [1] - stride: [1] - dilation: [1] - dropout: 0.0 - residual: false - separable: ${model.model_defaults.separable} - se: ${model.model_defaults.se} - se_context_size: ${model.model_defaults.se_context_size} - - decoder: - _target_: 
nemo.collections.asr.modules.SpeakerDecoder - feat_in: *enc_feat_out - num_classes: 7205 - pool_mode: 'attention' - emb_sizes: 192 - - loss: - _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss - scale: 30 - margin: 0.2 - - optim: - name: sgd - lr: .006 #(original titanet-large was trained with 0.08 lr) - weight_decay: 0.001 - momentum: 0.9 - - # scheduler setup - sched: - name: CosineAnnealing - warmup_ratio: 0.1 - min_lr: 0.0 - -trainer: - devices: 1 # number of gpus (original titanet-large was trained on 4 nodes with 8 gpus each) - max_epochs: 250 - max_steps: -1 # computed at runtime if not set - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - deterministic: True - enable_checkpointing: False - logger: False - log_every_n_steps: 1 # Interval of logging. - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - gradient_clip_val: 1.0 - -exp_manager: - exp_dir: null - name: *name - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/extract_speaker_embeddings.py b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/extract_speaker_embeddings.py deleted file mode 100644 index e67dc7d5aec37eec4b71246a9e4f4b4d271ac41f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/extract_speaker_embeddings.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
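# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script that follows): the
# helper below saves `<manifest prefix>_embeddings.pkl` into an `embeddings/`
# folder, keyed by the '@'-joined last three path components of each audio
# file. A minimal, hedged way to consume that pickle and score one trial pair
# with cosine similarity, mirroring voxceleb_eval.py, is shown here; the
# pickle path and the two dictionary keys are illustrative assumptions.
import pickle as pkl

import numpy as np

with open('./embeddings/voxceleb1_test_manifest_embeddings.pkl', 'rb') as f:
    embs = pkl.load(f)  # dict: 'speaker@session@file.wav' -> np.ndarray

x = embs['id10270@x6uYqmx31kE@00001.wav']  # hypothetical keys
y = embs['id10270@8jEAjG6SegY@00008.wav']
cos = float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
score = (cos + 1) / 2  # map from [-1, 1] to [0, 1], as voxceleb_eval.py does
# ---------------------------------------------------------------------------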
- -""" -This is a helper script to extract speaker embeddings based on manifest file -Usage: -python extract_speaker_embeddings.py --manifest=/path/to/manifest/file' ---model_path='/path/to/.nemo/file'(optional) ---embedding_dir='/path/to/embedding/directory' - -Args: ---manifest: path to manifest file containing audio_file paths for which embeddings need to be extracted ---model_path(optional): path to .nemo speaker verification model file to extract embeddings, if not passed SpeakerNet-M model would - be downloaded from NGC and used to extract embeddings ---embeddings_dir(optional): path to directory where embeddings need to stored default:'./' - - -""" - -import json -import os -import pickle as pkl -from argparse import ArgumentParser - -import numpy as np -import torch - -from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel -from nemo.collections.asr.parts.utils.speaker_utils import embedding_normalize -from nemo.utils import logging - - -def get_embeddings(speaker_model, manifest_file, batch_size=1, embedding_dir='./', device='cuda'): - """ - save embeddings to pickle file - Args: - speaker_model: NeMo model - manifest_file: path to the manifest file containing the audio file path from which the - embeddings should be extracted - batch_size: batch_size for inference - embedding_dir: path to directory to store embeddings file - device: compute device to perform operations - """ - - all_embs, _, _, _ = speaker_model.batch_inference(manifest_file, batch_size=batch_size, device=device) - all_embs = np.asarray(all_embs) - all_embs = embedding_normalize(all_embs) - out_embeddings = {} - - with open(manifest_file, 'r', encoding='utf-8') as manifest: - for i, line in enumerate(manifest.readlines()): - line = line.strip() - dic = json.loads(line) - uniq_name = '@'.join(dic['audio_filepath'].split('/')[-3:]) - out_embeddings[uniq_name] = all_embs[i] - - embedding_dir = os.path.join(embedding_dir, 'embeddings') - if not os.path.exists(embedding_dir): - os.makedirs(embedding_dir, exist_ok=True) - - prefix = manifest_file.split('/')[-1].rsplit('.', 1)[-2] - - name = os.path.join(embedding_dir, prefix) - embeddings_file = name + '_embeddings.pkl' - pkl.dump(out_embeddings, open(embeddings_file, 'wb')) - logging.info("Saved embedding files to {}".format(embedding_dir)) - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "--manifest", type=str, required=True, help="Path to manifest file", - ) - parser.add_argument( - "--model_path", - type=str, - default='titanet_large', - required=False, - help="path to .nemo speaker verification model file to extract embeddings, if not passed SpeakerNet-M model would be downloaded from NGC and used to extract embeddings", - ) - parser.add_argument( - "--batch_size", type=int, default=1, required=False, help="batch size", - ) - parser.add_argument( - "--embedding_dir", - type=str, - default='./', - required=False, - help="path to directory where embeddings need to stored default:'./'", - ) - args = parser.parse_args() - torch.set_grad_enabled(False) - - if args.model_path.endswith('.nemo'): - logging.info(f"Using local speaker model from {args.model_path}") - speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.model_path) - elif args.model_path.endswith('.ckpt'): - speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.model_path) - else: - speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large") - logging.info(f"using pretrained titanet_large speaker 
model from NGC") - - device = 'cuda' - if not torch.cuda.is_available(): - device = 'cpu' - logging.warning("Running model on CPU, for faster performance it is adviced to use atleast one NVIDIA GPUs") - - get_embeddings( - speaker_model, args.manifest, batch_size=args.batch_size, embedding_dir=args.embedding_dir, device=device - ) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_identification_infer.py b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_identification_infer.py deleted file mode 100644 index 90f930fcbfa6ceede3455069c548f212a53d5c12..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_identification_infer.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json - -import numpy as np -import torch -from omegaconf import OmegaConf -from pytorch_lightning import seed_everything - -from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataset -from nemo.collections.asr.models import EncDecSpeakerLabelModel -from nemo.collections.asr.parts.features import WaveformFeaturizer -from nemo.core.config import hydra_runner -from nemo.utils import logging - -seed_everything(42) - - -@hydra_runner(config_path="conf", config_name="speaker_identification_infer") -def main(cfg): - - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - enrollment_manifest = cfg.data.enrollment_manifest - test_manifest = cfg.data.test_manifest - out_manifest = cfg.data.out_manifest - sample_rate = cfg.data.sample_rate - - backend = cfg.backend.backend_model.lower() - - featurizer = WaveformFeaturizer(sample_rate=sample_rate) - dataset = AudioToSpeechLabelDataset(manifest_filepath=enrollment_manifest, labels=None, featurizer=featurizer) - enroll_id2label = dataset.id2label - - if backend == 'cosine_similarity': - model_path = cfg.backend.cosine_similarity.model_path - batch_size = cfg.backend.cosine_similarity.batch_size - if model_path.endswith('.nemo'): - speaker_model = EncDecSpeakerLabelModel.restore_from(model_path) - else: - speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path) - - enroll_embs, _, enroll_truelabels, _ = speaker_model.batch_inference( - enrollment_manifest, batch_size, sample_rate, device=device, - ) - - test_embs, _, _, _ = speaker_model.batch_inference(test_manifest, batch_size, sample_rate, device=device,) - - # length normalize - enroll_embs = enroll_embs / (np.linalg.norm(enroll_embs, ord=2, axis=-1, keepdims=True)) - test_embs = test_embs / (np.linalg.norm(test_embs, ord=2, axis=-1, keepdims=True)) - - # reference embedding - reference_embs = [] - keyslist = list(enroll_id2label.values()) - for label_id in keyslist: - indices = np.where(enroll_truelabels == label_id) - embedding = (enroll_embs[indices].sum(axis=0).squeeze()) / len(indices) - 
reference_embs.append(embedding) - - reference_embs = np.asarray(reference_embs) - - scores = np.matmul(test_embs, reference_embs.T) - matched_labels = scores.argmax(axis=-1) - - elif backend == 'neural_classifier': - model_path = cfg.backend.neural_classifier.model_path - batch_size = cfg.backend.neural_classifier.batch_size - - if model_path.endswith('.nemo'): - speaker_model = EncDecSpeakerLabelModel.restore_from(model_path) - else: - speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path) - - if speaker_model.decoder.final.out_features != len(enroll_id2label): - raise ValueError( - "number of labels mis match. Make sure you trained or finetuned neural classifier with labels from enrollement manifest_filepath" - ) - - _, test_logits, _, _ = speaker_model.batch_inference(test_manifest, batch_size, sample_rate, device=device,) - matched_labels = test_logits.argmax(axis=-1) - - with open(test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2: - lines = f1.readlines() - for idx, line in enumerate(lines): - line = line.strip() - item = json.loads(line) - item['infer'] = enroll_id2label[matched_labels[idx]] - json.dump(item, f2) - f2.write('\n') - - logging.info("Inference labels have been written to {} manifest file".format(out_manifest)) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco.py b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco.py deleted file mode 100644 index a8acd4de4a3f05aff00861b3c3f8a2da372265d8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf -from pytorch_lightning import seed_everything - -from nemo.collections.asr.models import EncDecSpeakerLabelModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -""" -Basic run (on GPU for 10 epochs for 2 class training): -EXP_NAME=sample_run -python ./speaker_reco.py --config-path='conf' --config-name='SpeakerNet_recognition_3x2x512.yaml' \ - trainer.max_epochs=10 \ - model.train_ds.batch_size=64 model.validation_ds.batch_size=64 \ - model.train_ds.manifest_filepath="" model.validation_ds.manifest_filepath="" \ - model.test_ds.manifest_filepath="" \ - trainer.devices=1 \ - model.decoder.params.num_classes=2 \ - exp_manager.name=$EXP_NAME +exp_manager.use_datetime_version=False \ - exp_manager.exp_dir='./speaker_exps' - -See https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb for notebook tutorial - -Optional: Use tarred dataset to speech up data loading. - Prepare ONE manifest that contains all training data you would like to include. Validation should use non-tarred dataset. 
- Note that it's possible that tarred datasets impacts validation scores because it drop values in order to have same amount of files per tarfile; - Scores might be off since some data is missing. - - Use the `convert_to_tarred_audio_dataset.py` script under /speech_recognition/scripts in order to prepare tarred audio dataset. - For details, please see TarredAudioToClassificationLabelDataset in /nemo/collections/asr/data/audio_to_label.py -""" - -seed_everything(42) - - -@hydra_runner(config_path="conf", config_name="SpeakerNet_verification_3x2x256.yaml") -def main(cfg): - - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer) - - # save labels to file - if log_dir is not None: - with open(os.path.join(log_dir, 'labels.txt'), 'w') as f: - if speaker_model.labels is not None: - for label in speaker_model.labels: - f.write(f'{label}\n') - - trainer.fit(speaker_model) - - if not trainer.fast_dev_run: - model_path = os.path.join(log_dir, '..', 'spkr.nemo') - speaker_model.save_to(model_path) - - torch.distributed.destroy_process_group() - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if trainer.is_global_zero: - trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator, strategy=cfg.trainer.strategy) - if speaker_model.prepare_test(trainer): - trainer.test(speaker_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco_finetune.py b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco_finetune.py deleted file mode 100644 index 884e5a60bc594c78f016284cc9c205bf352cbd53..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/speaker_reco_finetune.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf -from pytorch_lightning import seed_everything - -from nemo.collections.asr.models import EncDecSpeakerLabelModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -seed_everything(42) - - -@hydra_runner(config_path="conf", config_name="titanet-finetune.yaml") -def main(cfg): - - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(**cfg.trainer) - log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer) - speaker_model.maybe_init_from_pretrained_checkpoint(cfg) - - # save labels to file - if log_dir is not None: - with open(os.path.join(log_dir, 'labels.txt'), 'w') as f: - if speaker_model.labels is not None: - for label in speaker_model.labels: - f.write(f'{label}\n') - - trainer.fit(speaker_model) - - torch.distributed.destroy_process_group() - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if trainer.is_global_zero: - trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator, strategy=cfg.trainer.strategy) - if speaker_model.prepare_test(trainer): - trainer.test(speaker_model) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/voxceleb_eval.py b/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/voxceleb_eval.py deleted file mode 100644 index bf21c62c57095f6303b50a365fa69cddccb9cb89..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/speaker_tasks/recognition/voxceleb_eval.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import pickle as pkl -import sys - -import numpy as np -from scipy.interpolate import interp1d -from scipy.optimize import brentq -from sklearn.metrics import roc_curve -from tqdm import tqdm - - -""" -This script faciliates to get EER % based on cosine-smilarity -for Voxceleb dataset. 
- -Args: - trial_file str: path to voxceleb trial file - emb : path to pickle file of embeddings dictionary (generated from spkr_get_emb.py) - save_kaldi_emb: if required pass this argument to save kaldi embeddings for KALDI PLDA training later - Note: order of audio files in manifest file should match the embeddings -""" - - -def get_acc(trial_file='', emb='', save_kaldi_emb=False): - - trial_score = open('trial_score.txt', 'w') - dirname = os.path.dirname(trial_file) - with open(emb, 'rb') as f: - emb = pkl.load(f) - trial_embs = [] - keys = [] - all_scores = [] - all_keys = [] - - # for each trials in trial file - with open(trial_file, 'r') as f: - tmp_file = f.readlines() - for line in tqdm(tmp_file): - line = line.strip() - truth, x_speaker, y_speaker = line.split() - - x_speaker = x_speaker.split('/') - x_speaker = '@'.join(x_speaker) - - y_speaker = y_speaker.split('/') - y_speaker = '@'.join(y_speaker) - - X = emb[x_speaker] - Y = emb[y_speaker] - - if save_kaldi_emb and x_speaker not in keys: - keys.append(x_speaker) - trial_embs.extend([X]) - - if save_kaldi_emb and y_speaker not in keys: - keys.append(y_speaker) - trial_embs.extend([Y]) - - score = np.dot(X, Y) / ((np.dot(X, X) * np.dot(Y, Y)) ** 0.5) - score = (score + 1) / 2 - - all_scores.append(score) - trial_score.write(str(score) + "\t" + truth) - truth = int(truth) - all_keys.append(truth) - - trial_score.write('\n') - trial_score.close() - - if save_kaldi_emb: - np.save(dirname + '/all_embs_voxceleb.npy', np.asarray(trial_embs)) - np.save(dirname + '/all_ids_voxceleb.npy', np.asarray(keys)) - print("Saved KALDI PLDA related embeddings to {}".format(dirname)) - - return np.asarray(all_scores), np.asarray(all_keys) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--trial_file", help="path to voxceleb trial file", type=str, required=True) - parser.add_argument("--emb", help="path to numpy file of embeddings", type=str, required=True) - parser.add_argument( - "--save_kaldi_emb", - help=":save kaldi embeddings for KALDI PLDA training later", - required=False, - action='store_true', - ) - - args = parser.parse_args() - trial_file, emb, save_kaldi_emb = args.trial_file, args.emb, args.save_kaldi_emb - - y_score, y = get_acc(trial_file=trial_file, emb=emb, save_kaldi_emb=save_kaldi_emb) - fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1) - - eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) - sys.stdout.write("{0:.2f}\n".format(eer * 100)) diff --git a/SoundScribe/SpeakerID/examples/tts/aligner.py b/SoundScribe/SpeakerID/examples/tts/aligner.py deleted file mode 100644 index e32c0444ca687e10cfc86eee290bacc7bc265019..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/aligner.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import AlignerModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path='conf', config_name='aligner') -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get('exp_manager', None)) - model = AlignerModel(cfg=cfg.model, trainer=trainer) - trainer.callbacks.extend([pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()]) # noqa - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/aligner_heteronym_disambiguation.py b/SoundScribe/SpeakerID/examples/tts/aligner_heteronym_disambiguation.py deleted file mode 100644 index c97d3db5f24f03dd662f5839f170065bcbe10e41..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/aligner_heteronym_disambiguation.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import re - -import librosa -import soundfile as sf -import torch - -from nemo.collections.tts.models import AlignerModel -from nemo.collections.tts.parts.utils.tts_dataset_utils import general_padding - - -""" -G2P disambiguation using an Aligner model's input embedding distances. - -Does not handle OOV and leaves them as graphemes. - -The output will have each token's phonemes (or graphemes) bracketed, e.g. -<\"><, ><, >< >< ><.\"> - -Example: -python aligner_heteronym_disambiguation.py \ - --model= \ - --manifest= \ - --out= \ - --confidence=0.02 \ - --verbose -""" - - -def get_args(): - """Retrieve arguments for disambiguation. - """ - parser = argparse.ArgumentParser("G2P disambiguation using Aligner input embedding distances.") - # TODO(jocelynh): Make this required=False with default download from NGC once ckpt uploaded - parser.add_argument('--model', required=True, type=str, help="Path to Aligner model checkpoint (.nemo file).") - parser.add_argument( - '--manifest', - required=True, - type=str, - help="Path to data manifest. Each entry should contain the path to the audio file as well as the text in graphemes.", - ) - parser.add_argument( - '--out', required=True, type=str, help="Path to output file where disambiguations will be written." - ) - parser.add_argument( - '--sr', - required=False, - default=22050, - type=int, - help="Target sample rate to load the dataset. Should match what the model was trained on.", - ) - parser.add_argument( - '--heteronyms', - required=False, - type=str, - default='../../scripts/tts_dataset_files/heteronyms-052722', - help="Heteronyms file to specify which words should be disambiguated. 
All others will use default pron.", - ) - parser.add_argument( - '--confidence', required=False, type=float, default=0.0, help="Confidence threshold to keep a disambiguation." - ) - parser.add_argument( - '--verbose', - action='store_true', - help="If set to True, logs scores for each disambiguated word in disambiguation_logs.txt.", - ) - args = parser.parse_args() - return args - - -def load_and_prepare_audio(aligner, audio_path, target_sr, device): - """Loads and resamples audio to target sample rate (if necessary), and preprocesses for Aligner input. - """ - # Load audio and get length for preprocessing - audio_data, orig_sr = sf.read(audio_path) - if orig_sr != target_sr: - audio_data = librosa.core.resample(audio_data, orig_sr=orig_sr, target_sr=target_sr) - - audio = torch.tensor(audio_data, dtype=torch.float, device=device).unsqueeze(0) - audio_len = torch.tensor(audio_data.shape[0], device=device).long().unsqueeze(0) - - # Generate spectrogram - spec, spec_len = aligner.preprocessor(input_signal=audio, length=audio_len) - - return spec, spec_len - - -def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, heteronyms, log_file=None): - """Retrieves and disambiguate all candidate sentences for disambiguation of a given some text. - - Assumes that the max number of candidates per word is a reasonable batch size. - - Note: This could be sped up if multiple words' candidates were batched, but this is conceptually easier. - """ - # Grab original G2P result - aligner_g2p = aligner.tokenizer.g2p - base_g2p = aligner_g2p(text) - - # Tokenize text - words = [word for word, _ in aligner_g2p.word_tokenize_func(text)] - - ### Loop Through Words ### - result_g2p = [] - word_start_idx = 0 - - has_heteronym = False - - for word in words: - # Retrieve the length of the word in the default G2P conversion - g2p_default_len = len(aligner_g2p(word)) - - # Check if word needs to be disambiguated - if word in heteronyms: - has_heteronym = True - - # Add candidate for each ambiguous pronunciation - word_candidates = [] - candidate_prons_and_lengths = [] - - for pron in aligner_g2p.phoneme_dict[word]: - # Replace graphemes in the base G2P result with the current variant - candidate = base_g2p[:word_start_idx] + pron + base_g2p[word_start_idx + g2p_default_len :] - candidate_tokens = aligner.tokenizer.encode_from_g2p(candidate) - - word_candidates.append(candidate_tokens) - candidate_prons_and_lengths.append((pron, len(pron))) - - ### Inference ### - num_candidates = len(word_candidates) - - # If only one candidate, just convert and continue - if num_candidates == 1: - has_heteronym = False - result_g2p.append(f"<{' '.join(candidate_prons_and_lengths[0][0])}>") - word_start_idx += g2p_default_len - continue - - text_len = [len(toks) for toks in word_candidates] - text_len_in = torch.tensor(text_len, device=device).long() - - # Have to pad text tokens in case different pronunciations have different lengths - max_text_len = max(text_len) - text_stack = [] - for i in range(num_candidates): - padded_tokens = general_padding( - torch.tensor(word_candidates[i], device=device).long(), text_len[i], max_text_len - ) - text_stack.append(padded_tokens) - text_in = torch.stack(text_stack) - - # Repeat spectrogram and spec_len tensors to match batch size - spec_in = spec.repeat([num_candidates, 1, 1]) - spec_len_in = spec_len.repeat([num_candidates]) - - with torch.no_grad(): - soft_attn, _ = aligner(spec=spec_in, spec_len=spec_len_in, text=text_in, text_len=text_len_in) - - # Need embedding 
distances and duration preds to calculate mean distance for just the one word - text_embeddings = aligner.embed(text_in).transpose(1, 2) - l2_dists = aligner.alignment_encoder.get_dist(keys=text_embeddings, queries=spec_in).sqrt() - - durations = aligner.alignment_encoder.get_durations(soft_attn, text_len_in, spec_len_in).int() - - # Retrieve average embedding distances - min_dist = float('inf') - max_dist = 0.0 - best_candidate = None - for i in range(num_candidates): - candidate_mean_dist = aligner.alignment_encoder.get_mean_distance_for_word( - l2_dists=l2_dists[i], - durs=durations[i], - start_token=word_start_idx + (1 if aligner.tokenizer.pad_with_space else 0), - num_tokens=candidate_prons_and_lengths[i][1], - ) - if log_file: - log_file.write(f"{candidate_prons_and_lengths[i][0]} -- {candidate_mean_dist}\n") - - if candidate_mean_dist < min_dist: - min_dist = candidate_mean_dist - best_candidate = candidate_prons_and_lengths[i][0] - if candidate_mean_dist > max_dist: - max_dist = candidate_mean_dist - - # Calculate confidence score. If below threshold, skip and use graphemes. - disamb_conf = (max_dist - min_dist) / ((max_dist + min_dist) / 2.0) - if disamb_conf < confidence: - if log_file: - log_file.write(f"Below confidence threshold: {best_candidate} ({disamb_conf})\n") - - has_heteronym = False - result_g2p.append(f"<{' '.join(aligner_g2p(word))}>") - word_start_idx += g2p_default_len - continue - - # Otherwise, can write disambiguated word - if log_file: - log_file.write(f"best candidate: {best_candidate} (confidence: {disamb_conf})\n") - - result_g2p.append(f"<{' '.join(best_candidate)}>") - else: - if re.search("[a-zA-Z]", word) is None: - # Punctuation or space - result_g2p.append(f"<{word}>") - elif word in aligner_g2p.phoneme_dict: - # Take default pronunciation for everything else in the dictionary - result_g2p.append(f"<{' '.join(aligner_g2p.phoneme_dict[word][0])}>") - else: - # OOV - result_g2p.append(f"<{' '.join(aligner_g2p(word))}>") - - # Advance to phoneme index of next word - word_start_idx += g2p_default_len - - if log_file and has_heteronym: - log_file.write(f"{text}\n") - log_file.write(f"===\n{''.join(result_g2p)}\n===\n") - log_file.write(f"===============================\n") - - return result_g2p, has_heteronym - - -def disambiguate_dataset( - aligner, manifest_path, out_path, sr, heteronyms, confidence, device, verbose, heteronyms_only=True -): - """Disambiguates the phonemes for all words with ambiguous pronunciations in the given manifest. 
- """ - log_file = open('disambiguation_logs.txt', 'w') if verbose else None - - with open(out_path, 'w') as f_out: - with open(manifest_path, 'r') as f_in: - count = 0 - - for line in f_in: - # Retrieve entry and base G2P conversion for full text - entry = json.loads(line) - # Set punct_post_process=True in order to preserve words with apostrophes - text = aligner.normalizer.normalize(entry['text'], punct_post_process=True) - text = aligner.tokenizer.text_preprocessing_func(text) - - # Load and preprocess audio - audio_path = entry['audio_filepath'] - spec, spec_len = load_and_prepare_audio(aligner, audio_path, sr, device) - - # Get pronunciation candidates and disambiguate - disambiguated_text, has_heteronym = disambiguate_candidates( - aligner, text, spec, spec_len, confidence, device, heteronyms, log_file - ) - - # Skip writing entry if user only wants samples with heteronyms - if heteronyms_only and not has_heteronym: - continue - - # Save entry with disambiguation - entry['disambiguated_text'] = ''.join(disambiguated_text) - f_out.write(f"{json.dumps(entry)}\n") - - count += 1 - if count % 100 == 0: - print(f"Finished {count} entries.") - - print(f"Finished all entries, with a total of {count}.") - if log_file: - log_file.close() - - -def main(): - args = get_args() - - # Check file paths from arguments - if not os.path.exists(args.model): - print("Could not find model checkpoint file: ", args.model) - if not os.path.exists(args.manifest): - print("Could not find data manifest file: ", args.manifest) - if os.path.exists(args.out): - print("Output file already exists: ", args.out) - overwrite = input("Is it okay to overwrite it? (Y/N): ") - if overwrite.lower() != 'y': - print("Not overwriting output file, quitting.") - quit() - if not os.path.exists(args.heteronyms): - print("Could not find heteronyms list: ", args.heteronyms) - - # Read heteronyms list, one per line - heteronyms = set() - with open(args.heteronyms, 'r') as f_het: - for line in f_het: - heteronyms.add(line.strip().lower()) - - # Load model - print("Restoring Aligner model from checkpoint...") - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - aligner = AlignerModel.restore_from(args.model, map_location=device) - - # Disambiguation - print("Beginning disambiguation...") - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - disambiguate_dataset(aligner, args.manifest, args.out, args.sr, heteronyms, args.confidence, device, args.verbose) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/audio_codec.py b/SoundScribe/SpeakerID/examples/tts/audio_codec.py deleted file mode 100644 index 800edfb7fb0fae7ccfc44c136c8f5f5601557adb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/audio_codec.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytorch_lightning as pl -from omegaconf import OmegaConf - -from nemo.collections.tts.models import AudioCodecModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf/audio_codec", config_name="audio_codec") -def main(cfg): - logging.info('\nConfig Params:\n%s', OmegaConf.to_yaml(cfg, resolve=True)) - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = AudioCodecModel(cfg=cfg.model, trainer=trainer) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/conf/aligner.yaml b/SoundScribe/SpeakerID/examples/tts/conf/aligner.yaml deleted file mode 100644 index e6328ee50ba8a0d4f626f32d110fe6edcb6c0134..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/aligner.yaml +++ /dev/null @@ -1,181 +0,0 @@ -# This config contains the default values for training Aligner model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: Aligner - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix" ] - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - symbols_embedding_dim: 384 - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 64 - num_workers: 4 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: 
${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 64 - num_workers: 1 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: -11.52 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - alignment_encoder: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_mel_channels: ${model.n_mel_channels} - n_text_channels: ${model.symbols_embedding_dim} - n_att_channels: ${model.n_mel_channels} - - optim: - name: adam - lr: 1e-3 - weight_decay: 1e-6 - - sched: - name: CosineAnnealing - min_lr: 5e-5 - warmup_ratio: 0.35 - -trainer: - devices: 1 - num_nodes: 1 - accelerator: gpu - strategy: ddp - precision: 32 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_forward_sum_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/audio_codec_24000.yaml b/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/audio_codec_24000.yaml deleted file mode 100644 index e34c0b613306b16598bdd2e3866d52798d380d15..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/audio_codec_24000.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# This config contains the default values for training 24khz audio codec model -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: EnCodec - -max_epochs: ??? -# Adjust batch size based on GPU memory -batch_size: 16 -# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. -# If null, then weighted sampling is disabled. -weighted_sampling_steps_per_epoch: null - -# Dataset metadata for each manifest -# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 -train_ds_meta: ??? -val_ds_meta: ??? - -log_ds_meta: ??? -log_dir: ??? - -# Modify these values based on your sample rate -sample_rate: 24000 -train_n_samples: 24000 -down_sample_rates: [2, 4, 5, 8] -up_sample_rates: [8, 5, 4, 2] -# The number of samples per encoded audio frame. Should be the product of the down_sample_rates. -# For example 2 * 4 * 5 * 8 = 320. 
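A quick sketch of the relationship stated in the comment above, assuming the encoded frame length is simply the product of the strided down-sampling factors (illustrative only, not part of the deleted config):

import math

# Values mirroring the config above: strided down-sampling by 2, 4, 5, 8.
down_sample_rates = [2, 4, 5, 8]
samples_per_frame = math.prod(down_sample_rates)  # 2 * 4 * 5 * 8 = 320

# At a 24 kHz sample rate this gives 24000 / 320 = 75 encoded frames per second.
frames_per_second = 24000 // samples_per_frame
print(samples_per_frame, frames_per_second)  # 320 75
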
-samples_per_frame: 320 - -model: - - max_epochs: ${max_epochs} - steps_per_epoch: ${weighted_sampling_steps_per_epoch} - - sample_rate: ${sample_rate} - samples_per_frame: ${samples_per_frame} - - mel_loss_l1_scale: 15.0 - mel_loss_l2_scale: 0.0 - stft_loss_scale: 15.0 - time_domain_loss_scale: 0.0 - - # Probability of updating the discriminator during each training step - # For example, update the discriminator 2/3 times (2 updates for every 3 batches) - disc_updates_per_period: 2 - disc_update_period: 3 - - # All resolutions for reconstruction loss, ordered [num_fft, hop_length, window_length] - loss_resolutions: [ - [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048] - ] - mel_loss_dims: [5, 10, 20, 40, 80, 160, 320] - mel_loss_log_guard: 1.0 - stft_loss_log_guard: 1.0 - - train_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - n_samples: ${train_n_samples} - min_duration: 1.01 - max_duration: null - dataset_meta: ${train_ds_meta} - - dataloader_params: - batch_size: ${batch_size} - drop_last: true - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss - dataset_meta: ${val_ds_meta} - - dataloader_params: - batch_size: 8 - num_workers: 2 - - # Configures how audio samples are generated and saved during training. - # Remove this section to disable logging. - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50, 100, 150, 200] - epoch_frequency: 100 - log_tensorboard: false - log_wandb: true - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator - log_audio: true - log_encoding: true - log_dequantized: true - - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 15.0 # Only log the first 15 seconds of generated audio. - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 4 - num_workers: 2 - - audio_encoder: - _target_: nemo.collections.tts.modules.encodec_modules.SEANetEncoder - down_sample_rates: ${down_sample_rates} - - audio_decoder: - _target_: nemo.collections.tts.modules.encodec_modules.SEANetDecoder - up_sample_rates: ${up_sample_rates} - - vector_quantizer: - _target_: nemo.collections.tts.modules.encodec_modules.ResidualVectorQuantizer - num_codebooks: 8 - - discriminator: - _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT - resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] - - # The original EnCodec uses hinged loss, but squared-GAN loss is more stable - # and reduces the need to tune the loss weights or use a gradient balancer. 
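The hinge-loss-versus-squared-loss comment above refers to the least-squares (squared) GAN objective; a minimal PyTorch sketch of what such generator and discriminator losses typically look like is given below. This is an illustrative approximation only, not the implementation behind the GeneratorSquaredLoss and DiscriminatorSquaredLoss classes referenced in the config.

import torch

def discriminator_squared_loss(disc_real: torch.Tensor, disc_fake: torch.Tensor) -> torch.Tensor:
    # Least-squares GAN objective for the discriminator:
    # push scores on real audio toward 1 and scores on generated audio toward 0.
    return torch.mean((disc_real - 1.0) ** 2) + torch.mean(disc_fake ** 2)

def generator_squared_loss(disc_fake: torch.Tensor) -> torch.Tensor:
    # The generator is rewarded when the discriminator scores its output as real (close to 1);
    # the quadratic penalty gives smooth, non-saturating gradients compared to a hinge objective,
    # which is why less tuning of loss weights (or a gradient balancer) is needed.
    return torch.mean((disc_fake - 1.0) ** 2)
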
- generator_loss: - _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss - - discriminator_loss: - _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss - - optim: - _target_: torch.optim.Adam - lr: 3e-4 - betas: [0.5, 0.9] - - sched: - name: ExponentialLR - gamma: 0.998 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 32 # Vector quantization only works with 32-bit precision. - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - create_wandb_logger: false - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/encodec_24000.yaml b/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/encodec_24000.yaml deleted file mode 100644 index 05b2cb5871afb92192ea490a41cea3a89bdaa53f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/audio_codec/encodec_24000.yaml +++ /dev/null @@ -1,170 +0,0 @@ -# This config contains the default values for training 24khz EnCodec model -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: EnCodec - -max_epochs: ??? -# Adjust batch size based on GPU memory -batch_size: 16 -# When doing weighted sampling with multiple manifests, this defines how many training steps are in an epoch. -# If null, then weighted sampling is disabled. -weighted_sampling_steps_per_epoch: null - -# Dataset metadata for each manifest -# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/tts/data/vocoder_dataset.py#L39-L41 -train_ds_meta: ??? -val_ds_meta: ??? - -log_ds_meta: ??? -log_dir: ??? - -# Modify these values based on your sample rate -sample_rate: 24000 -train_n_samples: 24000 -down_sample_rates: [2, 4, 5, 8] -up_sample_rates: [8, 5, 4, 2] -# The number of samples per encoded audio frame. Should be the product of the down_sample_rates. -# For example 2 * 4 * 5 * 8 = 320. 
-samples_per_frame: 320 - -model: - - max_epochs: ${max_epochs} - steps_per_epoch: ${weighted_sampling_steps_per_epoch} - - sample_rate: ${sample_rate} - samples_per_frame: ${samples_per_frame} - - mel_loss_l1_scale: 1.0 - mel_loss_l2_scale: 1.0 - stft_loss_scale: 0.0 - time_domain_loss_scale: 0.1 - - # Probability of updating the discriminator during each training step - # For example, update the discriminator 2/3 times (2 updates for every 3 batches) - disc_updates_per_period: 2 - disc_update_period: 3 - - # All resolutions for reconstruction loss, ordered [num_fft, hop_length, window_length] - loss_resolutions: [ - [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048] - ] - mel_loss_dims: [64, 64, 64, 64, 64, 64, 64] - mel_loss_log_guard: 1E-5 - stft_loss_log_guard: 1.0 - - train_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - n_samples: ${train_n_samples} - min_duration: 1.01 - max_duration: null - dataset_meta: ${train_ds_meta} - - dataloader_params: - batch_size: ${batch_size} - drop_last: true - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 10.0 # Only use the first 10 seconds of audio for computing validation loss - dataset_meta: ${val_ds_meta} - - dataloader_params: - batch_size: 8 - num_workers: 2 - - # Configures how audio samples are generated and saved during training. - # Remove this section to disable logging. - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50, 100, 150, 200] - epoch_frequency: 100 - log_tensorboard: false - log_wandb: true - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator - log_audio: true - log_encoding: true - log_dequantized: true - - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 15.0 # Only log the first 15 seconds of generated audio. - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 4 - num_workers: 2 - - audio_encoder: - _target_: nemo.collections.tts.modules.encodec_modules.SEANetEncoder - down_sample_rates: ${down_sample_rates} - - audio_decoder: - _target_: nemo.collections.tts.modules.encodec_modules.SEANetDecoder - up_sample_rates: ${up_sample_rates} - - vector_quantizer: - _target_: nemo.collections.tts.modules.encodec_modules.ResidualVectorQuantizer - num_codebooks: 8 - - discriminator: - _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT - resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]] - - # The original EnCodec uses hinged loss, but squared-GAN loss is more stable - # and reduces the need to tune the loss weights or use a gradient balancer. 
- generator_loss: - _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss - - discriminator_loss: - _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss - - optim: - _target_: torch.optim.Adam - lr: 3e-4 - betas: [0.5, 0.9] - - sched: - name: ExponentialLR - gamma: 0.999 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 32 # Vector quantization only works with 32-bit precision. - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - create_wandb_logger: false - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml b/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml deleted file mode 100644 index 8f2acfefbfe4b9485fcbf27cf21d0a4740476b48..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml +++ /dev/null @@ -1,242 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 22KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1 -pitch_std: ??? # e.g. 
37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1 - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: null -window: hann - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: de - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer - punct: true - apostrophe: true - pad_with_space: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [ 0.9, 0.999 ] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # specify all GPUs regardless of its availability - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1500 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_mix.yaml b/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_mix.yaml deleted file mode 100644 index 3ac4593c3ab79382f0143782a8cad5cea2d44cb9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_22050_mix.yaml +++ /dev/null @@ -1,257 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 22KHz sampling -# rate. 
If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1 -pitch_std: ??? # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1 - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: null -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/de/de_nv230119.dict" -heteronyms_path: "scripts/tts_dataset_files/de/de_nv230119.heteronyms" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: de - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - locale: 'de-DE' - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - locale: 'de-DE' - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - grapheme_case: mixed - grapheme_prefix: '#' - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [ 0.9, 0.999 ] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # specify all GPUs regardless of its availability - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1500 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - 
logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml b/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml deleted file mode 100644 index 22cde5c674e2fbe0b4f83f1bc526dd2230525f02..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml +++ /dev/null @@ -1,242 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German -pitch_std: ??? # e.g. 87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 119 # change to the number of speakers in your dataset. Make sure it is at least the largest speaker_id + 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: de - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer - punct: true - apostrophe: true - pad_with_space: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [ 0.9, 0.999 ] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # specify all GPUs regardless of its availability - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1500 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - 
logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml b/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml deleted file mode 100644 index 6b8d6de19de4554d181fd162a76fb8dbbdb67db4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml +++ /dev/null @@ -1,237 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 151.9578857421875 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German -pitch_std: ??? # e.g. 87.1997680664063 for top-5 speakers in https://github.com/iisys-hof/HUI-Audio-Corpus-German - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 119 # change to the number of speakers in your dataset. Make sure it is at least the largest speaker_id + 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: de - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanPhonemesTokenizer - punct: true - apostrophe: true - pad_with_space: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: false - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: false - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [ 0.9, 0.999 ] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # specify all GPUs regardless of its availability - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1500 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - 
exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100.yaml deleted file mode 100644 index 4cf2fb8c1ef7eaca217565e20c8e2ee8b7ec7b30..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100.yaml +++ /dev/null @@ -1,223 +0,0 @@ -# This config contains the default values for training grapheme Spanish FastPitch model with aligner using -# 44.1KHz sampling rate. If you want to train model on other dataset, you can change config values according -# to your dataset. Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: ["align_prior_matrix", "pitch"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/ -pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/ - -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.SpanishCharsTokenizer - pad_with_space: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - 
min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_steps: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml b/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml deleted file mode 100644 index 5ecefb38a8c9e5af77b5e3bea02fda62d50adc59..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa.yaml +++ /dev/null @@ -1,236 +0,0 @@ -# This config contains the default values for training IPA Spanish FastPitch model with aligner using -# 44.1KHz sampling rate. If you want to train model on other dataset, you can change config values according -# to your dataset. Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? 
-validation_datasets: ??? -sup_data_path: ??? -sup_data_types: ["align_prior_matrix", "pitch"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 178.86880493164062 for 174 speakers in https://research.google/pubs/pub49150/ -pitch_std: ??? # e.g. 60.64979553222656 for 174 speakers in https://research.google/pubs/pub49150/ - -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -phoneme_dict_path: ??? - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - locale: es-ES - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - locale: es-ES - phoneme_dict: ${phoneme_dict_path} - phoneme_probability: 0.5 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - 
pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_steps: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml b/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml deleted file mode 100644 index c55af512df20f72491c21317f476994fa8bb88f0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml +++ /dev/null @@ -1,231 +0,0 @@ -# This config contains the default values for training multi-speaker IPA Spanish FastPitch model with aligner using -# 44.1KHz sampling rate. If you want to train model on other dataset, you can change config values according -# to your dataset. Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: ["align_prior_matrix", "pitch", "speaker_id"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# See "scripts/tts_dataset_files/openslr_es/pitch_stats.json" for format example -pitch_stats_path: ??? - -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -phoneme_dict_path: ??? - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: ??? 
- max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - speaker_emb_condition_prosody: true - speaker_emb_condition_aligner: true - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - locale: es-ES - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - locale: es-ES - phoneme_dict: ${phoneme_dict_path} - phoneme_probability: 0.5 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_stats_path: ${pitch_stats_path} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: 15 - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_stats_path: ${pitch_stats_path} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - 
alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_steps: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_22050.yaml deleted file mode 100644 index 846d09edcfee98a79d8ef8629725f882d69fd63e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_22050.yaml +++ /dev/null @@ -1,286 +0,0 @@ -# This config contains the default values for training an English 22.05kHz FastPitch model. -# If you want to train a model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -max_epochs: ??? -batch_size: 32 -weighted_sampling_steps_per_epoch: null - -n_speakers: ??? -speaker_path: null -feature_stats_path: null - -train_ds_meta: ??? -val_ds_meta: ??? -log_ds_meta: ??? - -phoneme_dict_path: ??? -heteronyms_path: ??? - -log_dir: ??? -vocoder_type: ??? -vocoder_name: null -vocoder_checkpoint_path: null - -# The below feature config should match the feature.yaml config used during preprocessing. 
-sample_rate: 22050 -win_length: 1024 -hop_length: 256 - -mel_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - mel_dim: 80 - lowfreq: 0 - highfreq: null - -pitch_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - pitch_fmin: 60 - pitch_fmax: 640 - -energy_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} - -featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} - - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: ${n_speakers} - n_mel_channels: ${mel_feature.mel_dim} - min_token_duration: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - energy_embedding_kernel_size: 3 - speaker_emb_condition_prosody: true - speaker_emb_condition_aligner: true - use_log_energy: false - dur_loss_scale: 0.1 - pitch_loss_scale: 0.1 - energy_loss_scale: 0.1 - aligner_loss_scale: 0.1 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${mel_feature.mel_dim} - lowfreq: ${mel_feature.lowfreq} - highfreq: ${mel_feature.highfreq} - n_fft: ${win_length} - n_window_size: ${win_length} - window_size: false - n_window_stride: ${hop_length} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${sample_rate} - window: hann - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1.0 - mag_power: 1.0 - mel_norm: null - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - pitch_processor: - _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization - field: pitch - stats_path: ${feature_stats_path} - - energy_processor: - _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization - field: energy - stats_path: ${feature_stats_path} - - train_ds: - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - dataset_meta: ${train_ds_meta} - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - feature_processors: - pitch: ${model.pitch_processor} - energy: ${model.energy_processor} - min_duration: 0.1 - max_duration: 10.0 - - dataloader_params: - batch_size: ${batch_size} - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - dataset_meta: ${val_ds_meta} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - feature_processors: - pitch: ${model.pitch_processor} - energy: ${model.energy_processor} - - dataloader_params: - batch_size: ${batch_size} - num_workers: 2 - - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50] 
- epoch_frequency: 100 - log_tensorboard: false - log_wandb: false - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.FastPitchArtifactGenerator - log_spectrogram: true - log_alignment: true - audio_params: - _target_: nemo.collections.tts.parts.utils.callbacks.LogAudioParams - log_audio_gta: true - vocoder_type: ${vocoder_type} - vocoder_name: ${vocoder_name} - vocoder_checkpoint_path: ${vocoder_checkpoint_path} - - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - text_tokenizer: ${model.text_tokenizer} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - - feature_processors: - pitch: ${model.pitch_processor} - energy: ${model.energy_processor} - - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 8 - num_workers: 2 - - input_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 2 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - dist_type: cosine - temperature: 15.0 - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - energy_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - gradient_clip_val: 10.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_44100.yaml deleted file mode 100644 index da9e9a29b1e7d8d09cae876a49ba8d8bf0670b8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch/fastpitch_44100.yaml +++ /dev/null @@ -1,286 +0,0 @@ -# This config contains the default values for training an English 44.1kHz FastPitch model. 
-# If you want to train a model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -max_epochs: ??? -batch_size: 32 -weighted_sampling_steps_per_epoch: null - -n_speakers: ??? -speaker_path: null -feature_stats_path: null - -train_ds_meta: ??? -val_ds_meta: ??? -log_ds_meta: ??? - -phoneme_dict_path: ??? -heteronyms_path: ??? - -log_dir: ??? -vocoder_type: ??? -vocoder_name: null -vocoder_checkpoint_path: null - -# The below feature config should match the feature.yaml config used during preprocessing. -sample_rate: 44100 -win_length: 2048 -hop_length: 512 - -mel_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - mel_dim: 80 - lowfreq: 0 - highfreq: null - -pitch_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - pitch_fmin: 60 - pitch_fmax: 640 - -energy_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} - -featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} - - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: ${n_speakers} - n_mel_channels: ${mel_feature.mel_dim} - min_token_duration: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - energy_embedding_kernel_size: 3 - speaker_emb_condition_prosody: true - speaker_emb_condition_aligner: true - use_log_energy: false - dur_loss_scale: 0.1 - pitch_loss_scale: 0.1 - energy_loss_scale: 0.1 - aligner_loss_scale: 0.1 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${mel_feature.mel_dim} - lowfreq: ${mel_feature.lowfreq} - highfreq: ${mel_feature.highfreq} - n_fft: ${win_length} - n_window_size: ${win_length} - window_size: false - n_window_stride: ${hop_length} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${sample_rate} - window: hann - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1.0 - mag_power: 1.0 - mel_norm: null - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - pitch_processor: - _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization - field: pitch - stats_path: ${feature_stats_path} - - energy_processor: - _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization - field: energy - stats_path: ${feature_stats_path} - - train_ds: - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - dataset_meta: ${train_ds_meta} - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - feature_processors: - pitch: ${model.pitch_processor} - 
energy: ${model.energy_processor} - min_duration: 0.1 - max_duration: 10.0 - - dataloader_params: - batch_size: ${batch_size} - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - dataset_meta: ${val_ds_meta} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - feature_processors: - pitch: ${model.pitch_processor} - energy: ${model.energy_processor} - - dataloader_params: - batch_size: ${batch_size} - num_workers: 2 - - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50] - epoch_frequency: 100 - log_tensorboard: false - log_wandb: false - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.FastPitchArtifactGenerator - log_spectrogram: true - log_alignment: true - audio_params: - _target_: nemo.collections.tts.parts.utils.callbacks.LogAudioParams - log_audio_gta: true - vocoder_type: ${vocoder_type} - vocoder_name: ${vocoder_name} - vocoder_checkpoint_path: ${vocoder_checkpoint_path} - - dataset: - _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset - text_tokenizer: ${model.text_tokenizer} - sample_rate: ${sample_rate} - speaker_path: ${speaker_path} - align_prior_hop_length: ${hop_length} - featurizers: ${featurizers} - - feature_processors: - pitch: ${model.pitch_processor} - energy: ${model.energy_processor} - - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 8 - num_workers: 2 - - input_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 2 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - dist_type: cosine - temperature: 15.0 - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - energy_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - gradient_clip_val: 10.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: 
val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100.yaml deleted file mode 100644 index 13d631a4b6787abe5a7a33c8a123d141d8bb7bac..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100.yaml +++ /dev/null @@ -1,248 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech - -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - 
manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: null - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1500 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false - diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100_adapter.yaml deleted file mode 100644 index 3c41cf3e55e5952ec7368745a454ab3ebec9edb7..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ /dev/null @@ -1,310 +0,0 @@ -# This config contains the default values for training FastPitch speaker adaptation -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id", "reference_audio"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - unfreeze_aligner: false - unfreeze_duration_predictor: false - unfreeze_pitch_predictor: false - learn_alignment: true - bin_loss_warmup_epochs: 100 - - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.modules.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - - adapter: - # Config of the adapter training/eval script. - adapter_name: "adapter" # Name of the adapter, used by the script - adapter_module_name: "encoder+decoder+duration_predictor+pitch_predictor+aligner" # Name of the adapter module. Combine multiple modules with '+' between module names. - adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this. - - # Config of the adapter module itself - _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter - in_features: ${model.symbols_embedding_dim} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. - dim: 256 # The hidden dimension of the adapter, as chosen by user, but small values are preferred to reduce param count. 
- activation: swish - norm_position: 'pre' # Can be `pre` or `post` - dropout: 0.0 # float, dropout for the adapter - - # Adapter strategy config - adapter_strategy: - _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy - stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. - l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. - - # Optional global config available to all adapters at a global level. - # A global config is shared across every layer of the adapters, defining global properties rather - # than properties local to the adapter (as defined above). - # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*, - # and further global operations that can decide dynamically how to support the requested adapter. - global_cfg: - check_encoder_adapter: True # determines whether to check if encoder adapter modules is supported - check_decoder_adapter: True # determines whether to check if decoder adapter modules is supported - check_duration_predictor_adapter: True # determines whether to check if duration_predictor adapter modules is supported - check_pitch_predictor_adapter: True # determines whether to check if pitch_predictor adapter modules is supported - check_aligner_adapter: True # determines whether to check if aligner adapter modules is supported - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - 
preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - condition_types: [ "add" ] # options: [ "add", "concat" ] - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - speaker_encoder: - _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder - precomputed_embedding_dim: null - lookup_module: - _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable - n_speakers: ??? 
- embedding_dim: ${model.symbols_embedding_dim} - gst_module: - _target_: nemo.collections.tts.modules.submodules.GlobalStyleToken - gst_size: ${model.symbols_embedding_dim} - n_style_token: 10 - n_style_attn_head: 4 - reference_encoder: - _target_: nemo.collections.tts.modules.submodules.ReferenceEncoder - n_mels: ${model.n_mel_channels} - cnn_filters: [32, 32, 64, 64, 128, 128] - dropout: 0.2 - gru_hidden: ${model.symbols_embedding_dim} - kernel_size: 3 - stride: 2 - padding: 1 - bias: true - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa.yaml deleted file mode 100644 index 3e515ca45ae89c7d511878c24b08c1952ee55c4d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa.yaml +++ /dev/null @@ -1,247 +0,0 @@ -# This config contains the default values for training a FastPitch model with aligner. -# If you want to train a model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: null - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - 
pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 32 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa_adapter.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa_adapter.yaml deleted file mode 100644 index 3e7b7ec9b2aefe5f82f33b50b3aa212ca7be761b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_ipa_adapter.yaml +++ /dev/null @@ -1,324 +0,0 @@ -# This config contains the default values for training FastPitch speaker adaptation -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id", "reference_audio"] - - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - unfreeze_aligner: false - unfreeze_duration_predictor: false - unfreeze_pitch_predictor: false - unfreeze_energy_predictor: false - learn_alignment: true - bin_loss_warmup_epochs: 100 - - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - energy_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.modules.IPAG2P - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - adapter: - # Config of the adapter training/eval script. - adapter_name: "adapter" # Name of the adapter, used by the script - adapter_module_name: "encoder+decoder+duration_predictor+pitch_predictor+aligner" # Name of the adapter module. Combine multiple modules with '+' between module names. - adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this. - - # Config of the adapter module itself - _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter - in_features: ${model.symbols_embedding_dim} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. - dim: 256 # The hidden dimension of the adapter, as chosen by user, but small values are preferred to reduce param count. - activation: swish - norm_position: 'pre' # Can be `pre` or `post` - dropout: 0.0 # float, dropout for the adapter - - # Adapter strategy config - adapter_strategy: - _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy - stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. - l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. - - # Optional global config available to all adapters at a global level. - # A global config is shared across every layer of the adapters, defining global properties rather - # than properties local to the adapter (as defined above). - # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*, - # and further global operations that can decide dynamically how to support the requested adapter. 
- global_cfg: - check_encoder_adapter: True # determines whether to check if encoder adapter modules is supported - check_decoder_adapter: True # determines whether to check if decoder adapter modules is supported - check_duration_predictor_adapter: True # determines whether to check if duration_predictor adapter modules is supported - check_pitch_predictor_adapter: True # determines whether to check if pitch_predictor adapter modules is supported - check_aligner_adapter: True # determines whether to check if aligner adapter modules is supported - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - 
alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - condition_types: [ "add" ] # options: [ "add", "concat" ] - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - energy_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ] - - speaker_encoder: - _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder - precomputed_embedding_dim: null - lookup_module: - _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable - n_speakers: ??? - embedding_dim: ${model.symbols_embedding_dim} - gst_module: - _target_: nemo.collections.tts.modules.submodules.GlobalStyleToken - gst_size: ${model.symbols_embedding_dim} - n_style_token: 10 - n_style_attn_head: 4 - reference_encoder: - _target_: nemo.collections.tts.modules.submodules.ReferenceEncoder - n_mels: ${model.n_mel_channels} - cnn_filters: [32, 32, 64, 64, 128, 128] - dropout: 0.2 - gru_hidden: ${model.symbols_embedding_dim} - kernel_size: 3 - stride: 2 - padding: 1 - bias: true - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_v1.05.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_v1.05.yaml deleted file mode 100644 index 5d7c753e94145a92c7c87cfaeeaaa8e12fcb2232..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_align_v1.05.yaml +++ /dev/null @@ -1,248 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? 
-sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: null - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: 
nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_ssl.yaml b/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_ssl.yaml deleted file mode 100644 index 12a3404b4d13848171e940b4a6bf101c08f7f005..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/fastpitch_ssl.yaml +++ /dev/null @@ -1,184 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -ssl_model_ckpt_path: ??? -hifi_ckpt_path: ??? -sup_data_dir: null - -# LJSpeech stats (per frame) -# ignored if pitch_normalization: speaker_wise -pitch_mean: ??? #212.35873413085938 -pitch_std: ??? 
#68.52806091308594 - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - - -ssl_content_emb_type: "embedding_and_probs" -speaker_stats_pitch_fp: null -pitch_normalization: speaker_wise -use_unique_tokens: true -speaker_conditioning_type: per_sample -segment_speaker_embedding: true -ssl_downsampling_factor: 4 # How many mel-spectrogram frames map to one content embedding in the SSL model - -model: - ssl_model_ckpt_path: ${ssl_model_ckpt_path} - ssl_downsampling_factor: ${ssl_downsampling_factor} - use_encoder: true - use_duration_predictor: ${use_unique_tokens} - pitch_conditioning: true - pitch_loss_scale: 1.0 - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - n_datasets: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - content_emb_indim: 174 - speaker_emb_indim: 256 - content_emb_outdim: 192 - speaker_emb_outdim: 192 - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.FastPitchSSLDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - ssl_content_emb_type: ${ssl_content_emb_type} - pitch_conditioning: true - pitch_normalization: ${pitch_normalization} - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - speaker_stats_pitch_fp: ${speaker_stats_pitch_fp} - min_duration: 0.5 - max_duration: 16.0 - pad_multiple: 1024 - speaker_conditioning_type: ${speaker_conditioning_type} - sup_data_dir: ${sup_data_dir} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 2 - num_workers: 8 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.FastPitchSSLDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - ssl_content_emb_type: ${ssl_content_emb_type} - pitch_conditioning: true - pitch_normalization: ${pitch_normalization} - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - speaker_stats_pitch_fp: ${speaker_stats_pitch_fp} - min_duration: 0.5 - max_duration: 16.0 - pad_multiple: 1024 - speaker_conditioning_type: ${speaker_conditioning_type} - sup_data_dir: ${sup_data_dir} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 2 - num_workers: 0 - pin_memory: true - - # both encoder and decoder have same architecture, FFTransformerDecoder - encoder: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - 
filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - _target_: torch.optim.AdamW - lr: 0.0002 - betas: [0.8, 0.99] - -trainer: - num_nodes: 1 - devices: -1 - accelerator: gpu - strategy: ddp - precision: 32 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: v_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_22050.yaml deleted file mode 100644 index 8071eb7933bb54f5728049ae5abdc52b3832eb43..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_22050.yaml +++ /dev/null @@ -1,28 +0,0 @@ -sample_rate: 22050 -win_length: 1024 -hop_length: 256 - -mel_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - mel_dim: 80 - lowfreq: 0 - highfreq: null - -pitch_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - pitch_fmin: 60 - pitch_fmax: 640 - -energy_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} - -featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} diff --git a/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_44100.yaml deleted file mode 100644 index 0cfc27f4dab3789cea72d3559f1d188f06dabbc8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/feature/feature_44100.yaml +++ /dev/null @@ -1,28 +0,0 @@ -sample_rate: 44100 -win_length: 2048 -hop_length: 512 - -mel_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - mel_dim: 80 - lowfreq: 0 - highfreq: null - -pitch_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} - pitch_fmin: 60 - pitch_fmax: 640 - -energy_feature: - _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} - -featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan.yaml deleted file mode 100644 index 0f8be80a097323d8a5463d0d91162c6fe3174295..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# This config contains the default values for training HiFi-GAN model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: "HifiGan" - -train_dataset: ??? -validation_datasets: ??? 
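The HiFi-GAN config above leaves `train_dataset` and `validation_datasets` as mandatory `???` entries. A minimal sketch of how such missing values are typically supplied with OmegaConf-style overrides before training (the manifest paths below are placeholders, not files from this repository):

```python
# Sketch: supplying the mandatory "???" fields of a HiFi-GAN config.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/tts/conf/hifigan/hifigan.yaml")

# "???" marks a missing value; OmegaConf raises an error if it is accessed unresolved.
assert OmegaConf.is_missing(cfg, "train_dataset")

overrides = OmegaConf.from_dotlist([
    "train_dataset=/data/manifests/train_manifest.json",      # placeholder path
    "validation_datasets=/data/manifests/val_manifest.json",  # placeholder path
])
cfg = OmegaConf.merge(cfg, overrides)
print(cfg.train_dataset)
```

In practice the same overrides are usually passed on the command line of the Hydra-based training script rather than merged by hand.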
- -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -train_n_segments: 8192 -train_max_duration: null -train_min_duration: 0.75 - -val_n_segments: 66048 -val_max_duration: null -val_min_duration: 3 - -defaults: - - model/generator: v1 - - model/train_ds: train_ds - - model/validation_ds: val_ds - -model: - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${n_mel_channels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${n_fft} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - pad_to: 0 - pad_value: -11.52 - sample_rate: ${sample_rate} - window: ${window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - use_grads: false - exact_pad: true - - optim: - _target_: torch.optim.AdamW - lr: 0.0002 - betas: [0.8, 0.99] - - sched: - name: CosineAnnealing - min_lr: 1e-5 - warmup_ratio: 0.02 - - max_steps: 2500000 - l1_loss_factor: 45 - denoise_strength: 0.0025 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 32 - max_steps: ${model.max_steps} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan_44100.yaml deleted file mode 100644 index 700bace947adcf06258a346e50b973a4453f7ebf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/hifigan_44100.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# This config contains the default values for training HiFi-GAN model on HiFi-TTS dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: "HifiGan" - -train_dataset: ??? -validation_datasets: ??? - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -train_n_segments: 16384 -train_max_duration: null # change to null to include longer audios. 
-train_min_duration: 0.75 - -val_n_segments: 131072 -val_max_duration: null -val_min_duration: 3 - -defaults: - - model/generator: v1_44100 - - model/train_ds: train_ds - - model/validation_ds: val_ds - -model: - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${n_mel_channels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${n_fft} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - pad_to: 0 - pad_value: -11.52 - sample_rate: ${sample_rate} - window: ${window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - use_grads: false - exact_pad: true - - optim: - _target_: torch.optim.AdamW - lr: 0.0002 - betas: [0.8, 0.99] - - sched: - name: CosineAnnealing - min_lr: 1e-5 - warmup_ratio: 0.02 - - max_steps: 2500000 - l1_loss_factor: 45 - denoise_strength: 0.0025 - -trainer: - num_nodes: 1 - devices: -1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 16 - max_steps: ${model.max_steps} - accumulate_grad_batches: 1 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1.yaml deleted file mode 100644 index 2698e15217d9d819c5bc169056835bbd6e425c08..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.hifigan_modules.Generator -resblock: 1 -upsample_rates: [8,8,2,2] -upsample_kernel_sizes: [16,16,4,4] -upsample_initial_channel: 512 -resblock_kernel_sizes: [3,7,11] -resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1_44100.yaml deleted file mode 100644 index 3bd7352ae570e035076f11e85d4a13f30643d7e9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v1_44100.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.hifigan_modules.Generator -resblock: 1 -upsample_rates: [8,8,4,2] -upsample_kernel_sizes: [16,16,4,4] -upsample_initial_channel: 512 -resblock_kernel_sizes: [3,7,11] -resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v2.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v2.yaml deleted file mode 100644 index 91e1719839616354705e2a9a9e93f1f8417dc3cd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v2.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.hifigan_modules.Generator -resblock: 1 -upsample_rates: [8,8,2,2] -upsample_kernel_sizes: [16,16,4,4] -upsample_initial_channel: 128 -resblock_kernel_sizes: [3,7,11] 
-resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v3.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v3.yaml deleted file mode 100644 index e3639c1370b3edba0e879f83bef0b8d9b9a3fd58..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/generator/v3.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.hifigan_modules.Generator -resblock: 2 -upsample_rates: [8,8,4] -upsample_kernel_sizes: [16,16,8] -upsample_initial_channel: 256 -resblock_kernel_sizes: [3,5,7] -resblock_dilation_sizes: [[1,2], [2,6], [3,12]] diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds.yaml deleted file mode 100644 index 46df72f35560594ec4fd8c16c38bb89c30a5e3fd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - n_segments: ${train_n_segments} - max_duration: ${train_max_duration} - min_duration: ${train_min_duration} -dataloader_params: - drop_last: false - shuffle: true - batch_size: 16 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml deleted file mode 100644 index afee3b432fb0442cedaa5d941cc049c263badc68..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - n_segments: ${train_n_segments} - max_duration: ${train_max_duration} - min_duration: ${train_min_duration} - load_precomputed_mel: true - hop_length: ${n_window_stride} -dataloader_params: - drop_last: false - shuffle: true - batch_size: 16 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds.yaml deleted file mode 100644 index e241f810afb70b502c3b1ee5727181ca347efddc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - n_segments: ${val_n_segments} - max_duration: ${val_max_duration} - min_duration: ${val_min_duration} -dataloader_params: - drop_last: false - shuffle: false - batch_size: 16 - num_workers: 1 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml deleted file mode 100644 index 6c5b79c9056bbdd76508fc86d04cd105a3314d73..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan/model/validation_ds/val_ds_finetune.yaml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - _target_: 
"nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - n_segments: ${val_n_segments} - max_duration: ${val_max_duration} - min_duration: ${val_min_duration} - load_precomputed_mel: true - hop_length: ${n_window_stride} -dataloader_params: - drop_last: false - shuffle: false - batch_size: 16 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_22050.yaml deleted file mode 100644 index 6086634adfd525a1a604008cdb47bc725cd30068..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_22050.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# This config contains the default values for training a 22.05kHz HiFi-GAN model. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: "HifiGan" - -max_epochs: ??? -batch_size: 16 -weighted_sampling_steps_per_epoch: null - -train_ds_meta: ??? -val_ds_meta: ??? -log_ds_meta: ??? - -log_dir: ??? - -mel_dim: 80 -lowfreq: 0 -highfreq: null - -# Change these values depending on your sampling rate. -sample_rate: 22050 -win_length: 1024 -hop_length: 256 -upsample_rates: [8, 8, 2, 2] -train_n_samples: 8192 -val_min_duration_seconds: 3.0 -val_n_samples: 66048 - -model: - - max_epochs: ${max_epochs} - steps_per_epoch: ${weighted_sampling_steps_per_epoch} - l1_loss_factor: 60 - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${mel_dim} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${win_length} - n_window_size: ${win_length} - n_window_stride: ${hop_length} - pad_to: 0 - pad_value: 0 - exact_pad: true - sample_rate: ${sample_rate} - window: hann - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1.0 - mag_power: 1.0 - mel_norm: null - use_grads: false - - generator: - _target_: nemo.collections.tts.modules.hifigan_modules.Generator - resblock: 1 - upsample_rates: ${upsample_rates} - upsample_kernel_sizes: [16, 16, 4, 4] - upsample_initial_channel: 512 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - - train_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - n_samples: ${train_n_samples} - min_duration: 0.4 - max_duration: null - dataset_meta: ${train_ds_meta} - - dataloader_params: - batch_size: ${batch_size} - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: ${val_n_samples} - min_duration: ${val_min_duration_seconds} - max_duration: null - dataset_meta: ${val_ds_meta} - - dataloader_params: - batch_size: ${batch_size} - num_workers: 2 - - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50] - epoch_frequency: 100 - log_tensorboard: false - log_wandb: false - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.VocoderArtifactGenerator - - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 
15.0 - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 4 - num_workers: 2 - - optim: - _target_: torch.optim.AdamW - lr: 2e-4 - betas: [0.8, 0.99] - weight_decay: 1e-6 - sched: - name: ExponentialLR - gamma: 0.999 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 16 - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - create_wandb_logger: false - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml deleted file mode 100644 index 5c5d1a743cabdbd691144c505fc77f7fe42f68e6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# This config contains the default values for training a 44.1kHz HiFi-GAN model. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: "HifiGan" - -max_epochs: ??? -batch_size: 16 -weighted_sampling_steps_per_epoch: null - -train_ds_meta: ??? -val_ds_meta: ??? -log_ds_meta: ??? - -log_dir: ??? - -mel_dim: 80 -lowfreq: 0 -highfreq: null - -# Change these values depending on your sampling rate. -sample_rate: 44100 -win_length: 2048 -hop_length: 512 -upsample_rates: [8, 8, 4, 2] -train_n_samples: 16384 -val_min_duration_seconds: 3.0 -val_n_samples: 131072 - -model: - - max_epochs: ${max_epochs} - steps_per_epoch: ${weighted_sampling_steps_per_epoch} - l1_loss_factor: 60 - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${mel_dim} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${win_length} - n_window_size: ${win_length} - n_window_stride: ${hop_length} - pad_to: 0 - pad_value: 0 - exact_pad: true - sample_rate: ${sample_rate} - window: hann - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1.0 - mag_power: 1.0 - mel_norm: null - use_grads: false - - generator: - _target_: nemo.collections.tts.modules.hifigan_modules.Generator - resblock: 1 - upsample_rates: ${upsample_rates} - upsample_kernel_sizes: [16, 16, 4, 4] - upsample_initial_channel: 512 - resblock_kernel_sizes: [3, 7, 11] - resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] - - train_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${sample_rate} - n_samples: ${train_n_samples} - min_duration: 0.4 - max_duration: null - dataset_meta: ${train_ds_meta} - - dataloader_params: - batch_size: ${batch_size} - num_workers: 4 - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: ${val_n_samples} - min_duration: ${val_min_duration_seconds} - max_duration: null - dataset_meta: ${val_ds_meta} - - dataloader_params: - batch_size: 
${batch_size} - num_workers: 2 - - log_config: - log_dir: ${log_dir} - log_epochs: [10, 50] - epoch_frequency: 100 - log_tensorboard: false - log_wandb: false - - generators: - - _target_: nemo.collections.tts.parts.utils.callbacks.VocoderArtifactGenerator - - dataset: - _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${sample_rate} - n_samples: null - min_duration: null - max_duration: null - trunc_duration: 15.0 - dataset_meta: ${log_ds_meta} - - dataloader_params: - batch_size: 4 - num_workers: 2 - - optim: - _target_: torch.optim.AdamW - lr: 2e-4 - betas: [0.8, 0.99] - weight_decay: 1e-6 - sched: - name: ExponentialLR - gamma: 0.999 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 16 - max_epochs: ${max_epochs} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - create_wandb_logger: false - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts-x.yaml b/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts-x.yaml deleted file mode 100644 index 736e0bca6c114fe466e91e1bd1c0b3a91829d519..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts-x.yaml +++ /dev/null @@ -1,249 +0,0 @@ -# This config contains the default values for training Mixer-TTS-X model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: MixerTTS-X - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "lm_tokens" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -lm_model: albert - -model: - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - lm_model: ${lm_model} - cond_on_lm_embeddings: true - - pitch_loss_scale: 0.1 - durs_loss_scale: 0.1 - mel_loss_scale: 1.0 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.tts.common.tokenizers.text_to_speech.EnglishCharsTokenizer - punct: true - apostrophe: true - pad_with_space: true - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.MixerTTSXDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - lm_model: ${model.lm_model} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 64 - num_workers: 4 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.MixerTTSXDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - lm_model: ${model.lm_model} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 64 - num_workers: 1 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: -11.52 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - self_attention_module: - _target_: nemo.collections.tts.modules.mixer_tts.SelfAttentionModule - n_text_channels: ${model.symbols_embedding_dim} - 
n_lm_tokens_channels: 100 # dummy value, real value is set in model constructor - - encoder: - _target_: nemo.collections.tts.modules.mixer_tts.MixerTTSModule - num_tokens: 100 # dummy value, real value is set in model constructor - padding_idx: 100 # dummy value, real value is set in model constructor - feature_dim: 384 - kernel_sizes: [11, 13, 15, 17, 19, 21] - num_layers: 6 - expansion_factor: 4 - dropout: 0.15 - - decoder: - _target_: nemo.collections.tts.modules.mixer_tts.MixerTTSModule - num_tokens: -1 - feature_dim: 384 - kernel_sizes: [15, 17, 19, 21, 23, 25, 27, 29, 31] - num_layers: 9 - expansion_factor: 4 - dropout: 0.15 - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.15 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.15 - n_layers: 2 - - pitch_emb: - _target_: torch.nn.Conv1d - in_channels: 1 - out_channels: ${model.symbols_embedding_dim} - kernel_size: 3 - padding: 1 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_mel_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts.yaml b/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts.yaml deleted file mode 100644 index b7b2aa9a1917d94b0cb4be8a13754eb1486dae17..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/mixer-tts.yaml +++ /dev/null @@ -1,247 +0,0 @@ -# This config contains the default values for training Mixer-TTS model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: Mixer-TTS - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - - pitch_loss_scale: 0.1 - durs_loss_scale: 0.1 - mel_loss_scale: 1.0 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 64 - num_workers: 4 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 64 - num_workers: 1 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: -11.52 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: 
${model.symbols_embedding_dim} - - encoder: - _target_: nemo.collections.tts.modules.mixer_tts.MixerTTSModule - num_tokens: 100 # dummy value, real value is set in model constructor - padding_idx: 100 # dummy value, real value is set in model constructor - feature_dim: 384 - kernel_sizes: [11, 13, 15, 17, 19, 21] - num_layers: 6 - expansion_factor: 4 - dropout: 0.15 - - decoder: - _target_: nemo.collections.tts.modules.mixer_tts.MixerTTSModule - num_tokens: -1 - feature_dim: 384 - kernel_sizes: [15, 17, 19, 21, 23, 25, 27, 29, 31] - num_layers: 9 - expansion_factor: 4 - dropout: 0.15 - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.15 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.15 - n_layers: 2 - - pitch_emb: - _target_: torch.nn.Conv1d - in_channels: 1 - out_channels: ${model.symbols_embedding_dim} - kernel_size: 3 - padding: 1 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_mel_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec.yaml b/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec.yaml deleted file mode 100644 index e21168d8399b6ab2ad4e1b152b8aee06fa8577bb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec.yaml +++ /dev/null @@ -1,270 +0,0 @@ -name: RadTTS -sample_rate: 22050 - -train_dataset: ??? -validation_datasets: ??? -ckpt_path: None -export_dir: ??? -sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# default values for sample_rate=22050 -n_mels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: "hann" - - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" -mapping_file_path: "" - -model: - target: nemo.collections.tts.models.RadTTSModel - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - n_mel_channels: ${n_mels} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - mapping_file: ${mapping_file_path} - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 8 - num_workers: 8 - pin_memory: false - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 8 - num_workers: 8 - pin_memory: false - - optim: - name: RAdam - lr: 0.0001 - betas: [0.9, 0.98] - weight_decay: 
0.000001 - - sched: - name: exp_decay - warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - trainerConfig: - sigma: 1 - iters_per_checkpoint: 3000 - seed: null - ignore_layers: [] - finetune_layers: [] - include_layers: [] - with_tensorboard: true - dur_loss_weight: 1 - ctc_loss_weight: 1 - mask_unvoiced_f0: false - log_step: 1 - binarization_start_iter: 6000 - kl_loss_start_iter: 18000 - loss_weights: - ctc_loss_weight: 0.1 - dur_loss_weight: 1.0 - f0_loss_weight: 1.0 - energy_loss_weight: 1.0 - vpred_loss_weight: 1.0 - unfreeze_modules: "all" - - load_from_checkpoint: False - init_from_ptl_ckpt: ${ckpt_path} - modelConfig: - _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" - n_speakers: 1 - n_speaker_dim: 16 - n_text: 384 #185 - n_text_dim: 512 - n_flows: 8 - n_conv_layers_per_step: 4 - n_mel_channels: 80 - n_hidden: 1024 - mel_encoder_n_hidden: 512 - dummy_speaker_embedding: false - n_early_size: 2 - n_early_every: 2 - n_group_size: 2 - affine_model: wavenet - include_modules: "decatnvpred" - scaling_fn: tanh - matrix_decomposition: LUS - learn_alignments: true - use_context_lstm: true - context_lstm_norm: spectral - context_lstm_w_f0_and_energy: true - text_encoder_lstm_norm: spectral - n_f0_dims: 1 - n_energy_avg_dims: 1 - use_first_order_features: false - unvoiced_bias_activation: "relu" - decoder_use_partial_padding: false - decoder_use_unvoiced_bias: true - ap_pred_log_f0: true - ap_use_unvoiced_bias: true - ap_use_voiced_embeddings: true - dur_model_config: null - f0_model_config: null - energy_model_config: null - v_model_config : - name : dap - hparams : - n_speaker_dim : 16 - take_log_of_input: false - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - -trainer: - devices: 8 - precision: 16 - max_epochs: 1000 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False - logger: False - gradient_clip_val: 1 - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - -exp_manager: - exp_dir: ${export_dir} - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val/loss_ctc - mode: min - filepath: ${export_dir} - filename: model_checkpoint diff --git a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec_ipa.yaml b/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec_ipa.yaml deleted file mode 100644 index 913eab4e2e17ccb0d22d053ebe6254f6d0a2709f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_dec_ipa.yaml +++ /dev/null @@ -1,273 +0,0 @@ -name: RadTTS -sample_rate: 22050 - -train_dataset: ??? -validation_datasets: ??? -ckpt_path: None -export_dir: ??? -sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] - - - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# default values for sample_rate=22050 -n_mels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: "hann" - - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" -mapping_file_path: "" - -model: - target: nemo.collections.tts.models.RadTTSModel - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - n_mel_channels: ${n_mels} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: true - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 8 - num_workers: 8 - pin_memory: false - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 8 - num_workers: 8 - 
pin_memory: false - - optim: - name: RAdam - lr: 0.0001 - betas: [0.9, 0.98] - weight_decay: 0.000001 - - sched: - name: exp_decay - warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - trainerConfig: - sigma: 1 - iters_per_checkpoint: 3000 - seed: null - ignore_layers: [] - finetune_layers: [] - include_layers: [] - with_tensorboard: true - dur_loss_weight: 1 - ctc_loss_weight: 1 - mask_unvoiced_f0: false - log_step: 1 - binarization_start_iter: 6000 - kl_loss_start_iter: 18000 - loss_weights: - ctc_loss_weight: 0.1 - dur_loss_weight: 1.0 - f0_loss_weight: 1.0 - energy_loss_weight: 1.0 - vpred_loss_weight: 1.0 - unfreeze_modules: "all" - - load_from_checkpoint: False - init_from_ptl_ckpt: ${ckpt_path} - modelConfig: - _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" - n_speakers: 1 - n_speaker_dim: 16 - n_text: 384 #185 - n_text_dim: 512 - n_flows: 8 - n_conv_layers_per_step: 4 - n_mel_channels: 80 - n_hidden: 1024 - mel_encoder_n_hidden: 512 - dummy_speaker_embedding: false - n_early_size: 2 - n_early_every: 2 - n_group_size: 2 - affine_model: wavenet - include_modules: "decatnvpred" - scaling_fn: tanh - matrix_decomposition: LUS - learn_alignments: true - use_context_lstm: true - context_lstm_norm: spectral - context_lstm_w_f0_and_energy: true - text_encoder_lstm_norm: spectral - n_f0_dims: 1 - n_energy_avg_dims: 1 - use_first_order_features: false - unvoiced_bias_activation: "relu" - decoder_use_partial_padding: false - decoder_use_unvoiced_bias: true - ap_pred_log_f0: true - ap_use_unvoiced_bias: true - ap_use_voiced_embeddings: true - dur_model_config: null - f0_model_config: null - energy_model_config: null - v_model_config : - name : dap - hparams : - n_speaker_dim : 16 - take_log_of_input: false - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - -trainer: - devices: 8 - precision: 16 - max_epochs: 1000 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False - logger: False - gradient_clip_val: 1 - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - -exp_manager: - exp_dir: ${export_dir} - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val/loss_ctc - mode: min - filepath: ${export_dir} - filename: model_checkpoint diff --git a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred.yaml b/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred.yaml deleted file mode 100644 index 1b4aa02034cdc62d3e47e4b1dcf0dde21e4edadb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred.yaml +++ /dev/null @@ -1,332 +0,0 @@ -name: RadTTS -sample_rate: 22050 - -train_dataset: ??? -validation_datasets: ??? -ckpt_path: ??? -export_dir: ??? -sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# default values for sample_rate=22050 -n_mels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: "hann" - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" -mapping_file_path: "" - -model: - target: nemo.collections.tts.models.RadTTSModel - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - n_mel_channels: ${n_mels} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - mapping_file: ${mapping_file_path} - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 8 - pin_memory: True - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: True - - optim: - name: RAdam - lr: 0.001 - betas: [0.9, 0.98] - weight_decay: 0.000001 - 
- sched: - name: exp_decay - warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - trainerConfig: - sigma: 1 - iters_per_checkpoint: 3000 - seed: null - ignore_layers: [] - finetune_layers: [] - include_layers: [] - with_tensorboard: true - dur_loss_weight: 1 - ctc_loss_weight: 1 - mask_unvoiced_f0: false - log_step: 1 - binarization_start_iter: 1000000 - kl_loss_start_iter: 1000000 - loss_weights: - ctc_loss_weight: 0.1 - dur_loss_weight: 1.0 - f0_loss_weight: 1.0 - energy_loss_weight: 1.0 - vpred_loss_weight: 1.0 - unfreeze_modules: "durf0energyvpred" - - load_from_checkpoint: True - init_from_ptl_ckpt: ${ckpt_path} - modelConfig: - _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" - n_speakers: 1 - n_speaker_dim: 16 - n_text: 384 #185 - n_text_dim: 512 - n_flows: 8 - n_conv_layers_per_step: 4 - n_mel_channels: 80 - n_hidden: 1024 - mel_encoder_n_hidden: 512 - n_components: 0 - mean_scale: 0 - fixed_gaussian: true - dummy_speaker_embedding: false - use_positional_embedding: false - n_early_size: 2 - n_early_every: 2 - n_group_size: 2 - use_feature_gating: false - affine_model: wavenet - include_modules: "decatnunvbiasdpmvpredapm" - what_to_train: decatnunvbias - scaling_fn: tanh - reduction_norm: "" - matrix_decomposition: LUS - learn_alignments: true - use_query_proj: true - align_query_enc_type: 3xconv - lstm_applicable_steps: [] - use_context_lstm: true - context_lstm_norm: spectral - context_lstm_w_f0_and_energy: true - text_encoder_lstm_norm: spectral - use_text_conditional_priors: false - zero_out_context: false - n_aug_dims: 6 - n_f0_dims: 1 - n_energy_avg_dims: 1 - use_first_order_features: false - unvoiced_bias_activation: "relu" - decoder_use_partial_padding: false - decoder_use_unvoiced_bias: true - ap_pred_log_f0: true - ap_use_unvoiced_bias: true - ap_use_voiced_embeddings: true - p_dropout: 0.1 - noise_to_unvoiced_in_f0: 0 - noise_to_pvoiced: 0 - dur_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: true - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.1 - f0_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: false - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 11 - p_dropout: 0.5 - - energy_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: false - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - v_model_config : - name: dap - hparams: - n_speaker_dim: 16 - take_log_of_input: false - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - -trainer: - devices: 8 - precision: 16 - max_epochs: 1000 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False - logger: False - gradient_clip_val: 1 - flush_logs_every_n_steps: 1000 - log_every_n_steps: 100 - check_val_every_n_epoch: 2 - -exp_manager: - exp_dir: ${export_dir} - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: 
val/loss_energy - mode: min - filepath: ${export_dir} - filename: model_checkpoint diff --git a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred_ipa.yaml b/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred_ipa.yaml deleted file mode 100644 index d7a27307bed719ded040b12cba4c5af9c0268562..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/rad-tts_feature_pred_ipa.yaml +++ /dev/null @@ -1,339 +0,0 @@ -name: RadTTS -sample_rate: 22050 - -train_dataset: ??? -validation_datasets: ??? -ckpt_path: ??? -export_dir: ??? -sup_data_path: ??? -sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"] - - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech - -# default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# default values for sample_rate=22050 -n_mels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: "hann" - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" -mapping_file_path: "" - -model: - target: nemo.collections.tts.models.RadTTSModel - bin_loss_start_ratio: 0.2 - bin_loss_warmup_epochs: 100 - - symbols_embedding_dim: 384 - n_mel_channels: ${n_mels} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: true - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 8 - pin_memory: True - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: 
${sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${n_fft} - win_length: ${n_window_size} - hop_length: ${n_window_stride} - window: ${window} - n_mels: ${n_mels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - text_tokenizer: - _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer" - punct: True - stresses: True - chars: True - space: ' ' - silence: null - apostrophe: True - sep: '|' - add_blank_at: null - pad_with_space: True - g2p: - _target_: "nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p" - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: True - - optim: - name: RAdam - lr: 0.001 - betas: [0.9, 0.98] - weight_decay: 0.000001 - - sched: - name: exp_decay - warmup_steps: 40000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - trainerConfig: - sigma: 1 - iters_per_checkpoint: 3000 - seed: null - ignore_layers: [] - finetune_layers: [] - include_layers: [] - with_tensorboard: true - dur_loss_weight: 1 - ctc_loss_weight: 1 - mask_unvoiced_f0: false - log_step: 1 - binarization_start_iter: 1000000 - kl_loss_start_iter: 1000000 - loss_weights: - ctc_loss_weight: 0.1 - dur_loss_weight: 1.0 - f0_loss_weight: 1.0 - energy_loss_weight: 1.0 - vpred_loss_weight: 1.0 - unfreeze_modules: "durf0energyvpred" - - load_from_checkpoint: True - init_from_ptl_ckpt: ${ckpt_path} - modelConfig: - _target_: "nemo.collections.tts.modules.radtts.RadTTSModule" - n_speakers: 1 - n_speaker_dim: 16 - n_text: 384 #185 - n_text_dim: 512 - n_flows: 8 - n_conv_layers_per_step: 4 - n_mel_channels: 80 - n_hidden: 1024 - mel_encoder_n_hidden: 512 - n_components: 0 - mean_scale: 0 - fixed_gaussian: true - dummy_speaker_embedding: false - use_positional_embedding: false - n_early_size: 2 - n_early_every: 2 - n_group_size: 2 - use_feature_gating: false - affine_model: wavenet - include_modules: "decatnunvbiasdpmvpredapm" - what_to_train: decatnunvbias - scaling_fn: tanh - reduction_norm: "" - matrix_decomposition: LUS - learn_alignments: true - use_query_proj: true - align_query_enc_type: 3xconv - lstm_applicable_steps: [] - use_context_lstm: true - context_lstm_norm: spectral - context_lstm_w_f0_and_energy: true - text_encoder_lstm_norm: spectral - use_text_conditional_priors: false - zero_out_context: false - n_aug_dims: 6 - n_f0_dims: 1 - n_energy_avg_dims: 1 - use_first_order_features: false - unvoiced_bias_activation: "relu" - decoder_use_partial_padding: false - decoder_use_unvoiced_bias: true - ap_pred_log_f0: true - ap_use_unvoiced_bias: true - ap_use_voiced_embeddings: true - p_dropout: 0.1 - noise_to_unvoiced_in_f0: 0 - noise_to_pvoiced: 0 - dur_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: true - use_transformer: true - arch_hparams: - out_dim: 1 - n_layers: 3 - n_head: 1 - d_head: 64 - d_inner: 1024 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0 - in_dim: 48 - f0_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: false - arch_hparams: - out_dim: 1 - 
n_layers: 2 - n_channels: 256 - kernel_size: 11 - p_dropout: 0.5 - - energy_model_config: - name: dap - hparams: - n_speaker_dim: 16 - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - take_log_of_input: false - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - v_model_config : - name: dap - hparams: - n_speaker_dim: 16 - take_log_of_input: false - bottleneck_hparams: - in_dim: 512 - reduction_factor: 16 - norm: weightnorm - non_linearity: relu - arch_hparams: - out_dim: 1 - n_layers: 2 - n_channels: 256 - kernel_size: 3 - p_dropout: 0.5 - -trainer: - devices: 8 - precision: 16 - max_epochs: 1000 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False - logger: False - gradient_clip_val: 1 - log_every_n_steps: 100 - check_val_every_n_epoch: 2 - -exp_manager: - exp_dir: ${export_dir} - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val/loss_energy - mode: min - filepath: ${export_dir} - filename: model_checkpoint diff --git a/SoundScribe/SpeakerID/examples/tts/conf/spectrogram-enhancer.yaml b/SoundScribe/SpeakerID/examples/tts/conf/spectrogram-enhancer.yaml deleted file mode 100644 index ada3bc0ecd0e62ae91d4ee3df17f65ea026b5d25..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/spectrogram-enhancer.yaml +++ /dev/null @@ -1,87 +0,0 @@ -name: "spectrogram-enhancer" - -model: - n_bands: 80 - latent_dim: 192 - style_depth: 4 - network_capacity: 16 - mixed_prob: 0.9 - fmap_max: 192 - start_from_zero: true # might give better results at downstream tasks - - generator: - _target_: "nemo.collections.tts.modules.spectrogram_enhancer.Generator" - n_bands: ${model.n_bands} - latent_dim: ${model.latent_dim} - network_capacity: ${model.network_capacity} - style_depth: ${model.style_depth} - fmap_max: ${model.fmap_max} - - discriminator: - _target_: "nemo.collections.tts.modules.spectrogram_enhancer.Discriminator" - n_bands: ${model.n_bands} - network_capacity: ${model.network_capacity} - fmap_max: ${model.fmap_max} - - consistency_loss_weight: 10.0 # somewhere in [1., 100.], less for clean datasets, higher for noisier - gradient_penalty_loss_weight: 10.0 # read stylegan papers before changing - gradient_penalty_loss_every_n_steps: 4 - - # Spectrogram values range, calculated over your dataset with matching STFT parameters. - # Needed for treating spectrograms as images with pixel values around [0, 1]. - # For LibriTTS, you can try [-13.18, 4.78] - spectrogram_min_value: ??? - spectrogram_max_value: ??? - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.PairedRealFakeSpectrogramsDataset" - manifest_filepath: ??? 
- dataloader_params: - drop_last: true - shuffle: true - batch_size: 8 - num_workers: 2 - - generator_opt: - _target_: torch.optim.Adam - lr: 2e-4 - betas: [0.5, 0.9] - - discriminator_opt: - _target_: torch.optim.Adam - lr: 2e-4 - betas: [0.5, 0.9] - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 32 - max_epochs: 4 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - log_every_n_steps: 1000 - # we don't really need validation - check_val_every_n_epoch: null - limit_val_batches: 0.0 - benchmark: false - # provided by exp_manager - enable_checkpointing: False - logger: false - -exp_manager: - exp_dir: "" - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - # no good stopping rule, keep every checkpoint - # tune n_epochs for size of your dataset to avoid wasting space - checkpoint_callback_params: - every_n_epochs: 1 - save_on_train_epoch_end: true - save_top_k: -1 - monitor: "g_loss" - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/ssl_tts_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/ssl_tts_22050.yaml deleted file mode 100644 index 2a10b404a2cf8abc452e4479544d35dc1bef057a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/ssl_tts_22050.yaml +++ /dev/null @@ -1,191 +0,0 @@ -# This config contains the default values for self-supervised pre-training of a Conformer ASR model, large size (~120M). - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. -# One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. -# -# +-------------+---------+---------+----------+------------+-----+ -# | Model | d_model | n_heads | n_layers | time_masks | lr | -# +=============+=========+========+===========+============+=====+ -# | Small (13M)| 176 | 4 | 16 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Medium (30M)| 256 | 4 | 18 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Large (121M)| 512 | 8 | 18 | 10 | 2.0 | -# +---------------------------------------------------------------+ -# -# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 -# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. -# With weight_decay=0.0, learning rate may need to get reduced to 2.0. - -name: "Conformer-SSL" -init_from_pretrained_model: "ssl_en_conformer_large" - -model: - sample_rate: 22050 - combined_loss: true - pitch_augment: true - augment_sim_alpha: 1.0 - stop_gradient: false - augment_ctc: true - aug_loss_type: "cosine" - pad_multiple: 1 - train_ds: - manifest_speaker_verification_fp: ??? - manifest_content_fp: ??? - sample_rate: ${model.sample_rate} - batch_size_content: 8 # you may increase batch_size if your memory allows - batch_size_sv: 20 - shuffle: true - num_workers_sv: 4 - num_workers_content: 6 - pin_memory: false - max_duration_content: 16.7 - min_duration_content: 8.0 - segment_max_duration: 2 - sup_data_path: ??? 
- pitch_augment: ${model.pitch_augment} - cache_pitch_augment: true - pad_multiple: ${model.pad_multiple} - - validation_ds: - manifest_speaker_verification_fp: ??? - manifest_content_fp: ??? - sample_rate: ${model.sample_rate} - batch_size_content: 4 # you may increase batch_size if your memory allows - batch_size_sv: 8 - shuffle: false - num_workers_sv: 0 - num_workers_content: 0 - pin_memory: true - use_start_end_token: false - max_duration_content: 16.7 - min_duration_content: 8.0 - segment_max_duration: 2 - sup_data_path: ??? - pitch_augment: ${model.pitch_augment} - cache_pitch_augment: true - pad_multiple: ${model.pad_multiple} - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "per_feature" - window_size: null - window_stride: null - n_window_size: 1024 - n_window_stride: 256 - window: "hann" - features: 80 - n_fft: 1024 - log: true - frame_splicing: 1 - dither: 0.00001 - pad_to: 16 - pad_value: 0.0 - - spec_augment: - _target_: nemo.collections.asr.modules.MaskedPatchAugmentation - freq_masks: 3 - freq_width: 20 - patch_size: 48 - mask_patches: 0.5 - - downstream_heads: - task_names: ['speaker_verification', 'content'] - speaker_embed_size: 256 - num_speakers: 5994 - content_embed_size: 128 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 18 - d_model: 512 - - # Sub-sampling params - subsampling: striding # vggnet or striding, vggnet may give better results but needs more memory - subsampling_factor: 4 # must be power of 2 - subsampling_conv_channels: -1 # -1 sets it to d_model - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [-1, -1] # -1 means unlimited context - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 31 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - decoder_out: 128 - - - optim_backbone: - _target_: torch.optim.Adam - lr: 5e-5 - sched: - min_lr: 1e-6 - warmup_steps: 2000 - - optim_downstream: - _target_: torch.optim.Adam - lr: 1e-4 - sched: - min_lr: 1e-6 - warmup_steps: 1000 - - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: 1000 - max_steps: 500000 # computed at runtime if not set - val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations - accelerator: auto - strategy: ddp - accumulate_grad_batches: 1 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. - log_every_n_steps: 10 # Interval of logging. 
- num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_loss" - mode: "min" - save_top_k: 5 - - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - # you need to set these two to True to continue the training - resume_if_exists: false - resume_ignore_no_checkpoint: false - - # You may use this section to create a W&B logger - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/SoundScribe/SpeakerID/examples/tts/conf/tacotron2.yaml b/SoundScribe/SpeakerID/examples/tts/conf/tacotron2.yaml deleted file mode 100644 index 6daf62899af639ee1f60ac098ddb4a832b427924..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/tacotron2.yaml +++ /dev/null @@ -1,195 +0,0 @@ -# This config contains the default values for training Tacotron2 model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: Tacotron2 - -train_dataset: ??? -validation_datasets: ??? 
-sup_data_path: null -sup_data_types: null - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - pitch_fmin: 65.40639132514966 - pitch_fmax: 2093.004522404789 - - sample_rate: 22050 - n_mel_channels: 80 - n_window_size: 1024 - n_window_stride: 256 - n_fft: 1024 - lowfreq: 0 - highfreq: 8000 - window: hann - pad_value: -11.52 - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - dataloader_params: - drop_last: false - shuffle: true - batch_size: 48 - num_workers: 4 - pin_memory: true - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - dataloader_params: - drop_last: false - shuffle: false - batch_size: 24 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${model.n_mel_channels} - highfreq: ${model.highfreq} - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - lowfreq: ${model.lowfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - n_window_stride: ${model.n_window_stride} - pad_to: 16 - pad_value: ${model.pad_value} - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - stft_conv: false - nb_augmentation_prob : 0 - mag_power: 1.0 - exact_pad: true - use_grads: false - - encoder: - _target_: nemo.collections.tts.modules.tacotron2.Encoder - encoder_kernel_size: 5 - encoder_n_convolutions: 3 - encoder_embedding_dim: 512 - - decoder: - _target_: nemo.collections.tts.modules.tacotron2.Decoder - decoder_rnn_dim: 1024 - encoder_embedding_dim: ${model.encoder.encoder_embedding_dim} - gate_threshold: 0.5 - max_decoder_steps: 1000 - n_frames_per_step: 1 # currently only 1 is supported - n_mel_channels: ${model.n_mel_channels} - p_attention_dropout: 0.1 - 
p_decoder_dropout: 0.1 - prenet_dim: 256 - prenet_p_dropout: 0.5 - # Attention parameters - attention_dim: 128 - attention_rnn_dim: 1024 - # AttentionLocation Layer parameters - attention_location_kernel_size: 31 - attention_location_n_filters: 32 - early_stopping: true - - postnet: - _target_: nemo.collections.tts.modules.tacotron2.Postnet - n_mel_channels: ${model.n_mel_channels} - p_dropout: 0.5 - postnet_embedding_dim: 512 - postnet_kernel_size: 5 - postnet_n_convolutions: 5 - - optim: - name: adam - lr: 1e-3 - weight_decay: 1e-6 - - # scheduler setup - sched: - name: CosineAnnealing - min_lr: 1e-5 - -trainer: - devices: 1 # number of gpus - max_epochs: ??? - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - gradient_clip_val: 1.0 - log_every_n_steps: 60 - check_val_every_n_epoch: 2 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - mode: min diff --git a/SoundScribe/SpeakerID/examples/tts/conf/tacotron2_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/tacotron2_44100.yaml deleted file mode 100644 index 3965bfd09f10b51097614d43ac0b4ddd9fe0d5d4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/tacotron2_44100.yaml +++ /dev/null @@ -1,180 +0,0 @@ -# TODO(Oktai15): update this config in 1.8.0 version - -name: Tacotron2 -sample_rate: 44100 -# , , will be added by the tacotron2.py script -labels: -- ' ' -- '!' -- '"' -- '''' -- ( -- ) -- ',' -- '-' -- . -- ':' -- ; -- '?' -- a -- b -- c -- d -- e -- f -- g -- h -- i -- j -- k -- l -- m -- 'n' -- o -- p -- q -- r -- s -- t -- u -- v -- w -- x -- 'y' -- z -n_fft: 2048 -n_mels: 80 -fmax: null -n_stride: 512 -pad_value: -11.52 -train_dataset: ??? -validation_datasets: ??? 
- -model: - labels: ${labels} - train_ds: - dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" - manifest_filepath: ${train_dataset} - max_duration: null - min_duration: 0.1 - trim: false - int_values: false - normalize: true - sample_rate: ${sample_rate} - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in Tacotron2 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 48 - num_workers: 4 - pin_memory: true - - - validation_ds: - dataset: - _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset" - manifest_filepath: ${validation_datasets} - max_duration: null - min_duration: 0.1 - int_values: false - normalize: true - sample_rate: ${sample_rate} - trim: false - # bos_id: 66 - # eos_id: 67 - # pad_id: 68 These parameters are added automatically in Tacotron2 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 48 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - dither: 0.0 - nfilt: ${n_mels} - frame_splicing: 1 - highfreq: ${fmax} - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - lowfreq: 0 - mag_power: 1.0 - n_fft: ${n_fft} - n_window_size: 2048 - n_window_stride: ${n_stride} - normalize: null - pad_to: 16 - pad_value: ${pad_value} - preemph: null - sample_rate: ${sample_rate} - window: hann - - encoder: - _target_: nemo.collections.tts.modules.tacotron2.Encoder - encoder_kernel_size: 5 - encoder_n_convolutions: 3 - encoder_embedding_dim: 512 - - decoder: - _target_: nemo.collections.tts.modules.tacotron2.Decoder - decoder_rnn_dim: 1024 - encoder_embedding_dim: ${model.encoder.encoder_embedding_dim} - gate_threshold: 0.5 - max_decoder_steps: 1000 - n_frames_per_step: 1 # currently only 1 is supported - n_mel_channels: ${n_mels} - p_attention_dropout: 0.1 - p_decoder_dropout: 0.1 - prenet_dim: 256 - prenet_p_dropout: 0.5 - # Attention parameters - attention_dim: 128 - attention_rnn_dim: 1024 - # AttentionLocation Layer parameters - attention_location_kernel_size: 31 - attention_location_n_filters: 32 - early_stopping: true - - postnet: - _target_: nemo.collections.tts.modules.tacotron2.Postnet - n_mel_channels: ${n_mels} - p_dropout: 0.5 - postnet_embedding_dim: 512 - postnet_kernel_size: 5 - postnet_n_convolutions: 5 - - optim: - name: adam - lr: 1e-3 - weight_decay: 1e-6 - - # scheduler setup - sched: - name: CosineAnnealing - min_lr: 1e-5 - - -trainer: - devices: 1 # number of gpus - max_epochs: ??? 
- num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - gradient_clip_val: 1.0 - log_every_n_steps: 60 - check_val_every_n_epoch: 2 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True diff --git a/SoundScribe/SpeakerID/examples/tts/conf/text/normalizer_en.yaml b/SoundScribe/SpeakerID/examples/tts/conf/text/normalizer_en.yaml deleted file mode 100644 index aef142544a78a1e5e683e0235df0ec63a19f41df..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/text/normalizer_en.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: nemo_text_processing.text_normalization.normalize.Normalizer -lang: en -input_case: cased \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/tts/conf/trim/energy.yaml b/SoundScribe/SpeakerID/examples/tts/conf/trim/energy.yaml deleted file mode 100644 index 475b62112130d79206f2129d4e07b45a7f19d674..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/trim/energy.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.parts.preprocessing.audio_trimming.EnergyAudioTrimmer - -db_threshold: 50.0 -speech_frame_threshold: 3 -trim_win_length: 4096 -trim_hop_length: 1024 -pad_seconds: 0.2 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/tts/conf/trim/vad.yaml b/SoundScribe/SpeakerID/examples/tts/conf/trim/vad.yaml deleted file mode 100644 index 38795d5e0a6913b71f452089c0153751d29666ad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/trim/vad.yaml +++ /dev/null @@ -1,10 +0,0 @@ -_target_: nemo.collections.tts.parts.preprocessing.audio_trimming.VadAudioTrimmer - -model_name: "vad_multilingual_marblenet" -vad_sample_rate: 16000 -vad_threshold: 0.5 -device: "cpu" -speech_frame_threshold: 3 -trim_win_length: 4096 -trim_hop_length: 1024 -pad_seconds: 0.2 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c16.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c16.yaml deleted file mode 100644 index 7c15f2cbf076ed03d15d1117ea36cb0ff9c94d56..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c16.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.univnet_modules.Generator -noise_dim: 64 -channel_size: 16 -dilations: [1, 3, 9, 27] -strides: [8, 8, 4] -lrelu_slope: 0.2 -kpnet_conv_size: 3 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c32.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c32.yaml deleted file mode 100644 index 820f8fbb082d774a5d34c8a5af06154ddee58770..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/generator/c32.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: nemo.collections.tts.modules.univnet_modules.Generator -noise_dim: 64 -channel_size: 32 -dilations: [1, 3, 9, 27] -strides: [8, 8, 4] -lrelu_slope: 0.2 -kpnet_conv_size: 3 \ No newline at end of file diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds.yaml deleted file mode 100644 index 8f52e29a0f3bef033afd0ba82f25e67107c075f8..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - n_segments: ${train_n_segments} - max_duration: ${train_max_duration} - min_duration: ${train_min_duration} -dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds_finetune.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds_finetune.yaml deleted file mode 100644 index 5185bdb6317d5317d7d7f577a3e9affe392b1797..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/train_ds/train_ds_finetune.yaml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - n_segments: ${train_n_segments} - max_duration: ${train_max_duration} - min_duration: ${train_min_duration} - load_precomputed_mel: true - hop_length: ${n_window_stride} -dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds.yaml deleted file mode 100644 index e241f810afb70b502c3b1ee5727181ca347efddc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - n_segments: ${val_n_segments} - max_duration: ${val_max_duration} - min_duration: ${val_min_duration} -dataloader_params: - drop_last: false - shuffle: false - batch_size: 16 - num_workers: 1 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds_finetune.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds_finetune.yaml deleted file mode 100644 index 6c5b79c9056bbdd76508fc86d04cd105a3314d73..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/model/validation_ds/val_ds_finetune.yaml +++ /dev/null @@ -1,15 +0,0 @@ -dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - n_segments: ${val_n_segments} - max_duration: ${val_max_duration} - min_duration: ${val_min_duration} - load_precomputed_mel: true - hop_length: ${n_window_stride} -dataloader_params: - drop_last: false - shuffle: false - batch_size: 16 - num_workers: 4 - pin_memory: true diff --git a/SoundScribe/SpeakerID/examples/tts/conf/univnet/univnet.yaml b/SoundScribe/SpeakerID/examples/tts/conf/univnet/univnet.yaml deleted file mode 100644 index 2f6b2f0a26ba7899e95f809aaee01338fc305d9b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/univnet/univnet.yaml +++ /dev/null @@ -1,105 +0,0 @@ -# This config contains the default values for training UnivNet model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. 
-# Most dataset-specific arguments are in the head of the config file, see below. - -name: "UnivNet" - -train_dataset: ??? -validation_datasets: ??? - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -train_n_segments: 8192 -train_max_duration: null -train_min_duration: 0.75 - -val_n_segments: 66048 -val_max_duration: null -val_min_duration: 3 - -defaults: - - model/generator: c32 - - model/train_ds: train_ds - - model/validation_ds: val_ds - -model: - discriminator: - mpd: - periods: [2,3,5,7,11] - kernel_size: 5 - stride: 3 - use_spectral_norm: false - lrelu_slope: 0.2 - mrd: - resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]] # (filter_length, hop_length, win_length) - use_spectral_norm: false - lrelu_slope: 0.2 - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${n_mel_channels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${n_fft} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - pad_to: 0 - pad_value: -11.52 - sample_rate: ${sample_rate} - window: ${window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - use_grads: false - exact_pad: true - - optim: - _target_: torch.optim.AdamW - lr: 0.0001 - betas: [0.5, 0.9] - - max_steps: 1000000 - stft_lamb: 2.5 - denoise_strength: 0.0025 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp_find_unused_parameters_true - precision: 32 - max_steps: ${model.max_steps} - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 10 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - mode: min - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/vits.yaml b/SoundScribe/SpeakerID/examples/tts/conf/vits.yaml deleted file mode 100644 index c38229dd528bd84b77955a43c1ad4913ff11adeb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/vits.yaml +++ /dev/null @@ -1,213 +0,0 @@ -# This config contains the default values for training VITS model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -# TODO: remove unnecessary arguments, refactoring - -name: VITS - -train_dataset: ??? -validation_datasets: ??? 
-sup_data_path: null -sup_data_types: null - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: null -window: hann - -model: - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - mel_fmin: 0.0 - mel_fmax: null - - n_speakers: 0 - segment_size: 8192 - c_mel: 45 - c_kl: 1. - use_spectral_norm: false - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - num_workers: 8 - pin_memory: false - - batch_sampler: - batch_size: 32 - boundaries: [32,300,400,500,600,700,800,900,1000] - num_replicas: ${trainer.devices} - shuffle: true - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 16 - num_workers: 4 - pin_memory: false - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${model.n_mel_channels} - highfreq: ${model.highfreq} - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - lowfreq: ${model.lowfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - n_window_stride: ${model.n_window_stride} - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - 
frame_splicing: 1 - stft_conv: false - nb_augmentation_prob : 0 - mag_power: 1.0 - exact_pad: true - use_grads: true - - synthesizer: - _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn - inter_channels: 192 - hidden_channels: 192 - filter_channels: 768 - n_heads: 2 - n_layers: 6 - kernel_size: 3 - p_dropout: 0.1 - resblock: "1" - resblock_kernel_sizes: [3,7,11] - resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] - upsample_rates: [8,8,2,2] - upsample_initial_channel: 512 - upsample_kernel_sizes: [16,16,4,4] - n_speakers: ${model.n_speakers} - gin_channels: 256 # for multi-speaker - - optim: - _target_: torch.optim.AdamW - lr: 2e-4 - betas: [0.9, 0.99] - eps: 1e-9 - - sched: - name: ExponentialLR - lr_decay: 0.999875 - -trainer: - num_nodes: 1 - devices: 2 - accelerator: gpu - strategy: ddp - precision: 32 - # amp_backend: 'apex' - # amp_level: 'O2' - # benchmark: true - max_epochs: -1 - accumulate_grad_batches: 1 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 50 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: ??? - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: loss_gen_all - mode: min - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/vits_44100.yaml b/SoundScribe/SpeakerID/examples/tts/conf/vits_44100.yaml deleted file mode 100644 index bcadb255eb7fdd96ac80967141873f5618490a52..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/vits_44100.yaml +++ /dev/null @@ -1,209 +0,0 @@ -# This config contains the default values for training VITS model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: VITS - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [speaker_id] - -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: null -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - n_speakers: 13000 - segment_size: 16384 - c_mel: 45 - c_kl: 1. 
- use_spectral_norm: false - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer - punct: true - apostrophe: true - pad_with_space: false - g2p: - _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.8 - # Relies on the heteronyms list for anything that needs to be disambiguated - ignore_ambiguous_words: false - use_chars: true - use_stresses: true - - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - num_workers: 8 - pin_memory: false - - batch_sampler: - batch_size: 32 - boundaries: [32,300,400,500,600,700,800,900,1000] - num_replicas: ${trainer.devices} - shuffle: true - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.TTSDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: False - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 4 - pin_memory: false - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${model.n_mel_channels} - highfreq: ${model.highfreq} - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - lowfreq: ${model.lowfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - n_window_stride: ${model.n_window_stride} - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - stft_conv: false - nb_augmentation_prob : 0 - mag_power: 1.0 - exact_pad: true - use_grads: true - - synthesizer: - _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn - inter_channels: 192 - hidden_channels: 192 - filter_channels: 768 - n_heads: 2 - n_layers: 6 - kernel_size: 3 - p_dropout: 0.1 - resblock: "1" - resblock_kernel_sizes: [3,7,11] - resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] - upsample_rates: [8,8,4,2] - upsample_initial_channel: 512 - upsample_kernel_sizes: 
[16,16,4,4] - n_speakers: ${model.n_speakers} - gin_channels: 256 # for multi-speaker - - optim: - _target_: torch.optim.AdamW - lr: 2e-4 - betas: [0.9, 0.99] - eps: 1e-9 - - sched: - name: CosineAnnealing - max_steps: 1000000 - min_lr: 1e-5 - -trainer: - num_nodes: 1 - devices: 2 - accelerator: gpu - strategy: ddp - precision: 32 - # amp_backend: 'apex' - # amp_level: 'O2' - # benchmark: true - max_epochs: -1 - accumulate_grad_batches: 1 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 50 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: ??? - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: loss_gen_all - mode: min - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/waveglow.yaml b/SoundScribe/SpeakerID/examples/tts/conf/waveglow.yaml deleted file mode 100644 index 1d4d4bf8be044dc454b7112862c9e018a7fbdf46..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/waveglow.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# This config contains the default values for training WaveGlow model on LJSpeech dataset. -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: "WaveGlow" - -train_dataset: ??? -validation_datasets: ??? - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: 8000 -window: hann - -model: - sigma: 1.0 - train_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${train_dataset} - sample_rate: ${sample_rate} - max_duration: null - min_duration: 0.1 - n_segments: 16000 - dataloader_params: - drop_last: false - shuffle: true - batch_size: 12 - num_workers: 4 - pin_memory: true - - validation_ds: - dataset: - _target_: "nemo.collections.tts.data.dataset.VocoderDataset" - manifest_filepath: ${validation_datasets} - sample_rate: ${sample_rate} - max_duration: null - min_duration: 0.1 - dataloader_params: - drop_last: false - shuffle: false - batch_size: 8 - num_workers: 4 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${n_mel_channels} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - n_fft: ${n_fft} - # Changing these parameters are not recommended, because WaveGlow is currently hardcoded to these values - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - pad_to: 16 - pad_value: -11.52 - sample_rate: ${sample_rate} - window: ${window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: clamp - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - waveglow: - _target_: nemo.collections.tts.modules.waveglow.WaveGlowModule - n_early_every: 4 - n_early_size: 2 - n_flows: 12 - n_group: 8 - n_mel_channels: ${n_mel_channels} - n_wn_channels: 256 - n_wn_layers: 8 - wn_kernel_size: 3 - - optim: - name: adam - lr: 1e-4 - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: ??? 
- accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 25 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null - entity: null - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_22050.yaml deleted file mode 100644 index 21a84f3169c9de43a36cb2ee29390bddcd3cd186..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_22050.yaml +++ /dev/null @@ -1,259 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 22KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch" ] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 1986.977294921875 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset. -pitch_std: ??? # e.g. 64.6528930664063 for SFbilingual dataset. - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: null -window: hann - -# There are four candidates of `phoneme_dict_path` provided for Chinese as shown below, -# 1) 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt", -# 2) IPA converted from 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt", -# 3) 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt", -# 4) (default) IPA converted from 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" -# Suggest to choose IPA symbol set converted from 36-final Pinyin because better audio quality were observed. 
-phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 1 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: zh - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p - phoneme_dict: ${phoneme_dict_path} - word_segmenter: jieba # Only jieba is supported now. - phoneme_prefix: "" - phoneme_case: lower - tone_prefix: "#" - ascii_letter_prefix: "" - ascii_letter_case: upper - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 2 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 5000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_multispeaker_22050.yaml b/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_multispeaker_22050.yaml deleted file mode 100644 index 55c918c28b72c895e85f883d66b670ffbd723ce6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/conf/zh/fastpitch_align_multispeaker_22050.yaml +++ /dev/null @@ -1,261 +0,0 @@ -# This config contains the default values for training FastPitch model with aligner using 22KHz sampling -# rate. If you want to train model on other dataset, you can change config values according to your dataset. 
-# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 1986.977294921875 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset. -pitch_std: ??? # e.g. 64.6528930664063 for SFbilingual dataset. - -# Default values for dataset with sample_rate=22050 -sample_rate: 22050 -n_mel_channels: 80 -n_window_size: 1024 -n_window_stride: 256 -n_fft: 1024 -lowfreq: 0 -highfreq: null -window: hann - -# There are four candidates of `phoneme_dict_path` provided for Chinese as shown below, -# 1) 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt", -# 2) IPA converted from 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt", -# 3) 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt", -# 4) (default) IPA converted from 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" -# Suggest to choose IPA symbol set converted from 36-final Pinyin because better audio quality were observed. -phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - n_speakers: 175 - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - speaker_emb_condition_prosody: true - speaker_emb_condition_aligner: true - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: zh - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer - punct: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p - phoneme_dict: ${phoneme_dict_path} - word_segmenter: jieba # Only jieba is supported now. - phoneme_prefix: "" - phoneme_case: lower - tone_prefix: "#" - ascii_letter_prefix: "" - ascii_letter_case: upper - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null # change to null to include longer audios. 
- min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null # change to null to include longer audios. - min_duration: 0.1 - ignore_file: null - trim: true - trim_top_db: 50 - trim_frame_length: ${model.n_window_size} - trim_hop_length: ${model.n_window_stride} - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 2 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - - alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: -1 # number of gpus - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 5000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by 
exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 5 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/SoundScribe/SpeakerID/examples/tts/fastpitch.py b/SoundScribe/SpeakerID/examples/tts/fastpitch.py deleted file mode 100644 index a8e6ecdc902d997a79bb7c6f84310599ddedf9c4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/fastpitch.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import FastPitchModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="fastpitch_align_v1.05") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = FastPitchModel(cfg=cfg.model, trainer=trainer) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune.py b/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune.py deleted file mode 100644 index 64b5e8b90625371fffb05367237177ae1fab556a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import FastPitchModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="fastpitch_align_44100") -def main(cfg): - if hasattr(cfg.model.optim, 'sched'): - logging.warning("You are using an optimizer scheduler while finetuning. 
Are you sure this is intended?") - if cfg.model.optim.lr > 1e-3 or cfg.model.optim.lr < 1e-5: - logging.warning("The recommended learning rate for finetuning is 2e-4") - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = FastPitchModel(cfg=cfg.model, trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune_adapters.py b/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune_adapters.py deleted file mode 100644 index 1361d63fb4cfc29c8b305097abcf60c9f1da269a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/fastpitch_finetune_adapters.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from dataclasses import is_dataclass - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf, open_dict - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import FastPitchModel -from nemo.core import adapter_mixins -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -def update_model_config_to_support_adapter(config) -> DictConfig: - with open_dict(config): - enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_) - if enc_adapter_metadata is not None: - config.input_fft._target_ = enc_adapter_metadata.adapter_class_path - - dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_) - if dec_adapter_metadata is not None: - config.output_fft._target_ = dec_adapter_metadata.adapter_class_path - - pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_) - if pitch_predictor_adapter_metadata is not None: - config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path - - duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_) - if duration_predictor_adapter_metadata is not None: - config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path - - aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_) - if aligner_adapter_metadata is not None: - config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path - - return config - - -def add_global_adapter_cfg(model, global_adapter_cfg): - # Convert to DictConfig from dict or Dataclass - if is_dataclass(global_adapter_cfg): - global_adapter_cfg = OmegaConf.structured(global_adapter_cfg) - - if not 
isinstance(global_adapter_cfg, DictConfig): - global_adapter_cfg = DictConfig(global_adapter_cfg) - - # Update the model.cfg with information about the new adapter global cfg - with open_dict(global_adapter_cfg), open_dict(model.cfg): - if 'adapters' not in model.cfg: - model.cfg.adapters = OmegaConf.create({}) - - # Add the global config for adapters to the model's internal config - model.cfg.adapters[model.adapter_global_cfg_key] = global_adapter_cfg - - # Update all adapter modules (that already exist) with this global adapter config - model.update_adapter_cfg(model.cfg.adapters) - - -@hydra_runner(config_path="conf", config_name="fastpitch_align_44100_adapter") -def main(cfg): - if hasattr(cfg.model.optim, 'sched'): - logging.warning("You are using an optimizer scheduler while finetuning. Are you sure this is intended?") - if cfg.model.optim.lr > 1e-3 or cfg.model.optim.lr < 1e-5: - logging.warning("The recommended learning rate for finetuning is 2e-4") - - trainer = pl.Trainer(**cfg.trainer) - exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - # Initialize FastPitchModel - model = FastPitchModel(cfg=update_model_config_to_support_adapter(cfg.model), trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - - # Extract adapter parameters - with open_dict(cfg.model.adapter): - # Extract the name of the adapter (must be given for training) - adapter_name = cfg.model.adapter.pop("adapter_name", "adapter") - # Extract the name of the modules where adapters need to be added (must be given for training) - adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None) - # Name of the adapter checkpoint which will be saved after training - adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None) - - # augment adapter name with module name, if not provided by user - if adapter_module_name is not None and ':' not in adapter_name: - adapter_name = f'{adapter_module_name}:{adapter_name}' - - # Extract the global adapter config, if provided - adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None) - - # Freeze model - model.freeze() - - # Setup adapters - if adapter_global_cfg is not None: - add_global_adapter_cfg(model, adapter_global_cfg) - - if cfg.model.get("unfreeze_aligner", False): - for name, param in model.fastpitch.aligner.named_parameters(): - param.requires_grad = True - - if cfg.model.get("unfreeze_duration_predictor", False): - for name, param in model.fastpitch.duration_predictor.named_parameters(): - param.requires_grad = True - - if cfg.model.get("unfreeze_pitch_predictor", False): - for name, param in model.fastpitch.pitch_predictor.named_parameters(): - param.requires_grad = True - - # Add adapters - model.add_adapter(name=adapter_name, cfg=cfg.model.adapter) - assert model.is_adapter_available() - # enable adapters - model.set_enabled_adapters(enabled=False) - model.set_enabled_adapters(adapter_name, enabled=True) - - # Set model to training mode. 
- model = model.train() - # Then, Unfreeze just the adapter weights that were enabled above (no part of model) - model.unfreeze_enabled_adapters() - # summarize the model - model.summarize() - - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - # Save the adapter state dict after training has completed - if adapter_state_dict_name is not None: - state_path = exp_log_dir if exp_log_dir is not None else os.getcwd() - ckpt_path = os.path.join(state_path, "checkpoints") - if os.path.exists(ckpt_path): - state_path = ckpt_path - - # Save the adapter modules in a seperate file - model.save_adapters(os.path.join(state_path, adapter_state_dict_name)) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/fastpitch_ssl.py b/SoundScribe/SpeakerID/examples/tts/fastpitch_ssl.py deleted file mode 100644 index 1101ac1eeaf709e7f5d6092b048e59a197332050..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/fastpitch_ssl.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import fastpitch_ssl, hifigan -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="fastpitch_ssl") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - vocoder = hifigan.HifiGanModel.load_from_checkpoint(cfg.hifi_ckpt_path).cpu() - vocoder.eval() - model = fastpitch_ssl.FastPitchModel_SSL(cfg=cfg.model, trainer=trainer, vocoder=vocoder) - if cfg.get("finetune", False): - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/README.md b/SoundScribe/SpeakerID/examples/tts/g2p/README.md deleted file mode 100644 index 2862bd15182c9f489f60947cae7cc36fbacb8d85..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/README.md +++ /dev/null @@ -1,2 +0,0 @@ -This directory contains grapheme-to-phoneme (G2P) related work. 
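Each of the example scripts in this diff (fastpitch.py, fastpitch_finetune.py, fastpitch_ssl.py, and the G2P trainers further below) follows the same launch pattern: build a `pl.Trainer` from `cfg.trainer`, attach `exp_manager`, construct the model from `cfg.model`, and call `trainer.fit()`. The following is a minimal sketch of that pattern, shown without the `hydra_runner` decorator and assuming hypothetical local paths for the config file and the `???` manifest fields; it is not part of the deleted files.

```python
import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.tts.models import FastPitchModel
from nemo.utils.exp_manager import exp_manager

# Load one of the configs from this diff (path assumed relative to examples/tts).
cfg = OmegaConf.load("conf/zh/fastpitch_align_22050.yaml")

# Fill the ??? placeholders before instantiating anything (paths are assumptions).
cfg.train_dataset = "train_manifest.json"
cfg.validation_datasets = "val_manifest.json"
cfg.sup_data_path = "sup_data"
cfg.pitch_mean = 221.4948272705078  # example values quoted in the config comments
cfg.pitch_std = 64.6528930664063

# Same wiring as the deleted training scripts: trainer, experiment manager, model, fit.
trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
model = FastPitchModel(cfg=cfg.model, trainer=trainer)
trainer.fit(model)
```

In practice the deleted scripts supply these values as Hydra command-line overrides (e.g. `train_dataset=... pitch_mean=...`) rather than editing the config in code.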
- diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_conformer_ctc.yaml b/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_conformer_ctc.yaml deleted file mode 100644 index 098e1fdc20157147906cc655f7ea87924e94007e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_conformer_ctc.yaml +++ /dev/null @@ -1,145 +0,0 @@ -name: G2P-Conformer-CTC - -# Dataset info -train_manifest: ??? -validation_manifest: ??? -test_manifest: null -do_training: True -do_testing: False -pretrained_model: null # path to .nemo file or model name from list_available_models() - -model: - model_name: conformer_bpe - max_source_len: 512 - - tokenizer_grapheme: - dataset: - _target_: nemo.collections.common.tokenizers.char_tokenizer.CharTokenizer - unk_token: "҂" # in the data, T5 unk_token is still - vocab_file: null # will be filled during training - do_lower: true # whether to lower case graphemes - add_punctuation: true # whether to add punctuation symbols - - embedding: - d_model: 300 - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.embedding.d_model} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 16 - d_model: 176 - - # Sub-sampling params - subsampling: null # vggnet or striding, vggnet may give better results but needs more memory - subsampling_factor: 1 # must be power of 2 - subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 4 # may need to be lower for smaller d_models - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - att_context_size: [ -1, -1 ] # -1 means unlimited context - xscaling: true # scales up the input embeddings by sqrt(d_model) - untie_biases: true # unties the biases of the TransformerXL layers - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 31 - conv_norm_type: 'batch_norm' # batch_norm or layer_norm - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null # will be filled during training based on encoder model dim - num_classes: -1 - vocabulary: null # will be filled during training - - tokenizer: - dir: null # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - train_ds: - manifest_filepath: ${train_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.ctc.CTCG2PBPEDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 4 - - validation_ds: - manifest_filepath: ${validation_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.ctc.CTCG2PBPEDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in 
manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 4 - - test_ds: - manifest_filepath: ${test_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.ctc.CTCG2PBPEDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 0 - - optim: - name: adamw - lr: 2.0 - # optimizer arguments - betas: [ 0.9, 0.98 ] - # less necessity for weight_decay as we already have large augmentations with SpecAug - # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used - # weight decay of 0.0 with lr of 2.0 also works fine - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing - d_model: ${model.encoder.d_model} - # scheduler config override - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: 1 # number of gpus - max_epochs: 5 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 1 - monitor: "val_per" - mode: "min" - save_best_model: true diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_heteronym_classification.yaml b/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_heteronym_classification.yaml deleted file mode 100644 index 31f39d80f789507bfeb0a511b1bc58bcbc9f94e7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_heteronym_classification.yaml +++ /dev/null @@ -1,104 +0,0 @@ -name: HeteronymClassification - -# Dataset info -train_manifest: ??? -validation_manifest: ??? -test_manifest: null -do_training: True -do_testing: False -pretrained_model: null # path to .nemo file or model name from list_available_models() - -model: - wordids: ??? 
# path to wordids in WikiHomograph dataset format - max_seq_length: 256 # the maximum length BERT supports is 512 - label_ids: null # will be filled during training - class_labels: - class_labels_file: null # will be generated during training and saved in .nemo file - - language_model: - pretrained_model_name: bert-base-uncased # currently supports BERT or Distill BERT - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - head: - num_fc_layers: 2 - fc_dropout: 0.5 - activation: 'relu' - use_transformer_init: True - - train_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification.HeteronymClassificationDataset" - manifest: ${train_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 0 - - validation_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification.HeteronymClassificationDataset" - manifest: ${validation_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 0 - - test_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification.HeteronymClassificationDataset" - manifest: ${test_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 #64 - num_workers: 0 - - optim: - name: adamw - lr: 5e-5 - weight_decay: 0.01 - # scheduler setup - sched: - name: WarmupAnnealing - # pytorch lightning args - reduce_on_plateau: false - # scheduler config override - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -trainer: - devices: 1 # number of gpus - max_epochs: 10 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 1 - monitor: "val_loss" - mode: "min" - save_best_model: true diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_t5.yaml b/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_t5.yaml deleted file mode 100644 index 49082f05df14cc6a32a16b65ee2252c689d6ab3e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/conf/g2p_t5.yaml +++ /dev/null @@ -1,92 +0,0 @@ -name: T5G2P - -# Dataset info -train_manifest: ??? -validation_manifest: ??? 
-test_manifest: null -do_training: True -do_testing: False -pretrained_model: null # path to .nemo file or model name from list_available_models() - -model: - model_name: "google/byt5-small" # One of: google/byt5-small/base/large/xl or t5-small/base/large/3b/11b - max_source_len: 256 - max_target_len: 512 - do_lower: false - - train_ds: - manifest_filepath: ${train_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.t5.T5G2PDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: true - batch_size: 20 - num_workers: 4 - - validation_ds: - manifest_filepath: ${validation_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.t5.T5G2PDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 20 - num_workers: 4 - - test_ds: - manifest_filepath: ${test_manifest} - dataset: - _target_: "nemo.collections.tts.g2p.data.t5.T5G2PDataset" - phoneme_field: "text" # name of the field in manifest_filepath for ground truth phonemes - grapheme_field: "text_graphemes" # name of the field in manifest_filepath for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 20 - num_workers: 4 - - optim: - name: adamw - lr: 2e-4 - weight_decay: 0.01 - # scheduler setup - sched: - name: WarmupAnnealing - - # pytorch lightning args - monitor: val_token_precision - reduce_on_plateau: false - - # scheduler config override - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -trainer: - devices: 1 # number of gpus - max_epochs: 5 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 1 - monitor: "val_per" - mode: "min" - save_best_model: true - diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/conf/heteronym_classification_zh.yaml b/SoundScribe/SpeakerID/examples/tts/g2p/conf/heteronym_classification_zh.yaml deleted file mode 100644 index f0ca98ecc97f356f36d1e02003b67ae67eb874b0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/conf/heteronym_classification_zh.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: HeteronymClassification - -# Dataset info -# Chinese Polyphones with Pinyin (CPP) -# https://github.com/kakaobrain/g2pM#the-cpp-dataset -train_manifest: ??? -validation_manifest: ??? -test_manifest: ??? -do_training: True -do_testing: False -pretrained_model: null # path to .nemo file or model name from list_available_models() - -model: - wordids: ??? # path to CPP wordids in WikiHomograph dataset format e.g. 
./cpp_manifest/wordid.tsv - max_seq_length: 256 # the maximum length BERT supports is 512 - label_ids: null # will be filled during training - class_labels: - class_labels_file: null # will be generated during training and saved in .nemo file - - language_model: - pretrained_model_name: bert-base-chinese # https://huggingface.co/bert-base-chinese/tree/main - lm_checkpoint: null - config_file: null # json file, precedence over config - config: null - - tokenizer: - tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece - vocab_file: null # path to vocab file - tokenizer_model: null # only used if tokenizer is sentencepiece - special_tokens: null - - head: - num_fc_layers: 2 - fc_dropout: 0.5 - activation: 'relu' - use_transformer_init: True - - train_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification_data.HeteronymClassificationDataset" - manifest: ${train_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 0 - - validation_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification_data.HeteronymClassificationDataset" - manifest: ${validation_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 0 - - test_ds: - dataset: - _target_: "nemo.collections.tts.g2p.data.heteronym_classification_data.HeteronymClassificationDataset" - manifest: ${test_manifest} - grapheme_field: "text_graphemes" # name of the field in manifest for input grapheme text - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 #64 - num_workers: 0 - - optim: - name: adamw - lr: 5e-5 - weight_decay: 0.01 - # scheduler setup - sched: - name: WarmupAnnealing - # pytorch lightning args - reduce_on_plateau: false - # scheduler config override - warmup_steps: null - warmup_ratio: 0.1 - last_epoch: -1 - -trainer: - devices: 1 # number of gpus - max_epochs: 10 - num_nodes: 1 - accelerator: gpu - strategy: ddp - accumulate_grad_batches: 1 - enable_checkpointing: False # Provided by exp_manager - logger: False # Provided by exp_manager - log_every_n_steps: 200 - check_val_every_n_epoch: 1 - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: True - create_checkpoint_callback: True - checkpoint_callback_params: - save_top_k: 1 - monitor: "val_loss" - mode: "min" - save_best_model: true diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_inference.py b/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_inference.py deleted file mode 100644 index 61262c41a3409bcee4b3be62e7cd21d6d3c31035..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_inference.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -from dataclasses import dataclass, is_dataclass -from typing import Optional - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf - -from nemo.collections.tts.g2p.models.heteronym_classification import HeteronymClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - -""" -This script runs inference with HeteronymClassificationModel -If the input manifest contains target "word_id", evaluation will be also performed. - -To prepare dataset, see NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py - -Inference form manifest: - -python g2p_heteronym_classification_inference.py \ - manifest="" \ - pretrained_model="" \ - output_manifest="" \ - wordid_to_phonemes_file="" - -Interactive inference: - -python g2p_heteronym_classification_inference.py \ - pretrained_model="" \ - wordid_to_phonemes_file="" # Optional - -""" - - -@dataclass -class TranscriptionConfig: - # Required configs - pretrained_model: str # Path to a .nemo file or Name of a pretrained model - - # path to .json manifest inference, if not provided, interactive mode will be enabled - manifest: Optional[str] = None # Path to .json manifest - output_manifest: Optional[ - str - ] = "predictions.json" # Path to .json manifest to save prediction, will be saved in "pred_text" field - grapheme_field: str = "text_graphemes" # name of the field in .json manifest for input grapheme text - - # mapping from wordid predicted by the model to phonemes, e.g., - # "../../../scripts/tts_dataset_files/wordid_to_ipa-0.7b_nv22.10.tsv" - wordid_to_phonemes_file: Optional[str] = None - - # if "word_id" targets are present in the manifest, evaluation will be performed and errors will be saved in errors_file - errors_file: Optional[str] = None # path to a file to save prediction errors - batch_size: int = 32 - num_workers: int = 0 - - -@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - if not cfg.pretrained_model: - raise ValueError( - 'To run evaluation and inference script a pre-trained model or .nemo file must be provided.' 
- f'Choose from {HeteronymClassificationModel.list_available_models()} or "pretrained_model"="your_model.nemo"' - ) - - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - - # setup GPU - if torch.cuda.is_available(): - device = [0] # use 0th CUDA device - accelerator = 'gpu' - else: - device = 1 - accelerator = 'cpu' - - map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') - trainer = pl.Trainer(devices=device, accelerator=accelerator, logger=False, enable_checkpointing=False) - - if os.path.exists(cfg.pretrained_model): - model = HeteronymClassificationModel.restore_from(cfg.pretrained_model, map_location=map_location) - elif cfg.pretrained_model in HeteronymClassificationModel.get_available_model_names(): - model = HeteronymClassificationModel.from_pretrained(cfg.pretrained_model, map_location=map_location) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {HeteronymClassificationModel.list_available_models()}' - ) - model.set_trainer(trainer) - model = model.eval() - - logging.info(f'Config Params: {model._cfg}') - - if cfg.manifest is not None: - if not os.path.exists(cfg.manifest): - raise ValueError(f"{cfg.manifest} not found.") - with torch.no_grad(): - model.disambiguate_manifest( - manifest=cfg.manifest, - output_manifest=cfg.output_manifest, - grapheme_field=cfg.grapheme_field, - batch_size=cfg.batch_size, - num_workers=cfg.num_workers, - ) - - # save predictions to a file - if cfg.errors_file is None: - cfg.errors_file = cfg.output_manifest.replace(".json", "_errors.txt") - - save_errors = True - correct = 0 - total = 0 - with open(cfg.output_manifest, "r", encoding="utf-8") as f_preds, open( - cfg.errors_file, "w", encoding="utf-8" - ) as f_errors: - for line in f_preds: - line = json.loads(line) - predictions = line["pred_wordid"] - # run evaluation if target word_id is available in the input manifest - if "word_id" in line: - targets = line["word_id"] - if isinstance(targets, str): - targets = [targets] - for idx, target_ in enumerate(targets): - total += 1 - if idx >= len(predictions) or target_ != predictions[idx]: - f_errors.write(f"INPUT: {line[cfg.grapheme_field]}\n") - f_errors.write(f"PRED : {predictions[idx]} -- GT: {target_}\n") - f_errors.write("===========================\n") - else: - correct += 1 - else: - save_errors = False - if save_errors: - logging.info(f"Accuracy: {round(correct / total * 100, 2)}% ({total - correct} errors out of {total})") - logging.info(f"Errors saved at {cfg.errors_file}") - else: - logging.info("No 'word_id' values found, skipping evaluation.") - if os.path.exists(cfg.errors_file): - os.remove(cfg.errors_file) - else: - print('Entering interactive mode.') - done = False - while not done: - print('Type "STOP" to exit.') - test_input = input('Input a test input:') - if test_input == "STOP": - done = True - if not done: - with torch.no_grad(): - _, sentences = model.disambiguate( - sentences=[test_input], - batch_size=1, - num_workers=cfg.num_workers, - wordid_to_phonemes_file=cfg.wordid_to_phonemes_file, - ) - print(sentences[0]) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py b/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py deleted file mode 100644 index 
6138656185016b38703015d6c96b877a1a1f0fbf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_heteronym_classification_train_and_evaluate.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytorch_lightning as pl -import torch - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.g2p.models.heteronym_classification import HeteronymClassificationModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - -""" -This script runs training and evaluation of HeteronymClassificationModel - -To prepare dataset, see NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py - -To run training: -python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=" \ - validation_manifest=" \ - model.wordids="" \ - do_training=True - -To run training and testing (once the training is complete): -python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=" \ - validation_manifest=" \ - model.test_ds.dataset.manifest=" \ - model.wordids="" \ - do_training=True \ - do_testing=True - -To run testing: -python g2p_heteronym_classification_train_and_evaluate.py \ - do_training=False \ - do_testing=True \ - model.test_ds.dataset.manifest=" \ - pretrained_model= - - -See https://github.com/google-research-datasets/WikipediaHomographData/blob/master/data/wordids.tsv for wordids file -format example - -See https://github.com/NVIDIA/NeMo/blob/main/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py -on how to convert WikiHomograph data for HeteronymClassificationModel training/evaluation -""" - - -@hydra_runner(config_path="conf", config_name="g2p_heteronym_classification.yaml") -def main(cfg): - # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True - # when there are unused parameters like in this model - if cfg.trainer.strategy == 'ddp': - cfg.trainer.strategy = "ddp_find_unused_parameters_true" - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - model = None - if cfg.do_training: - model = HeteronymClassificationModel(cfg=cfg.model, trainer=trainer) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - logging.info("Training is complete") - - if cfg.do_testing: - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - # setup GPU - if torch.cuda.is_available(): - device = [0] # use 0th CUDA device - accelerator = 'gpu' - else: - device = 1 - accelerator = 'cpu' - - map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') - trainer = 
pl.Trainer(devices=device, accelerator=accelerator, logger=False, enable_checkpointing=False) - - if model is None: - if os.path.exists(cfg.pretrained_model): - # restore model from .nemo file path - model = HeteronymClassificationModel.restore_from(restore_path=cfg.pretrained_model) - elif cfg.pretrained_model in HeteronymClassificationModel.get_available_model_names(): - # restore model by name - model = HeteronymClassificationModel.from_pretrained(cfg.pretrained_model, map_location=map_location) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {HeteronymClassificationModel.list_available_models()}' - ) - - if hasattr(cfg.model, "test_ds") and cfg.model.test_ds.dataset.manifest is not None: - model.setup_test_data(cfg.model.test_ds) - trainer.test(model) - else: - logging.info("test_ds not found, skipping evaluation") - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_inference.py b/SoundScribe/SpeakerID/examples/tts/g2p/g2p_inference.py deleted file mode 100644 index e7bffa8886534ed00c6151cb1074fc93ea8e3a0c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_inference.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from dataclasses import dataclass, is_dataclass -from typing import Optional - -import pytorch_lightning as pl -import torch -from omegaconf import OmegaConf -from utils import get_metrics - -from nemo.collections.tts.models.base import G2PModel -from nemo.core.config import hydra_runner -from nemo.utils import logging - -""" -python g2p_inference.py \ - pretrained_model=" \ - manifest_filepath="" \ - output_file="" \ - batch_size=32 \ - num_workers=4 \ - pred_field=pred_text -""" - - -@dataclass -class TranscriptionConfig: - # Required configs - pretrained_model: str # Path to a .nemo file or Name of a pretrained model - manifest_filepath: str # Path to .json manifest file - phoneme_field: Optional[ - str - ] = None # name of the field in manifest_filepath for ground truth phonemes, default during training "text" - grapheme_field: Optional[str] = "text_graphemes" # name of the field in manifest_filepath for input grapheme text - - # General configs - output_file: Optional[ - str - ] = None # Path to .json manifest file to save predictions, will be saved in "target_field" - pred_field: Optional[str] = "pred_text" # name of the field in the output_file to save predictions - batch_size: int = 32 # Batch size to use for inference - num_workers: int = 0 # Number of workers to use for DataLoader during inference - - # Config for heteronyms correction - pretrained_heteronyms_model: Optional[ - str - ] = None # Path to a .nemo file or a Name of a pretrained model to disambiguate heteronyms (Optional) - - -@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) -def main(cfg: TranscriptionConfig) -> TranscriptionConfig: - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - if not cfg.pretrained_model: - raise ValueError( - 'To run evaluation and inference script a pre-trained model or .nemo file must be provided.' 
- f'Choose from {G2PModel.list_available_models()} or "pretrained_model"="your_model.nemo"' - ) - - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - - # setup GPU - if torch.cuda.is_available(): - device = [0] # use 0th CUDA device - accelerator = 'gpu' - else: - device = 1 - accelerator = 'cpu' - - map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') - trainer = pl.Trainer(devices=device, accelerator=accelerator, logger=False, enable_checkpointing=False) - - if os.path.exists(cfg.pretrained_model): - model = G2PModel.restore_from(cfg.pretrained_model, map_location=map_location) - elif cfg.pretrained_model in G2PModel.get_available_model_names(): - model = G2PModel.from_pretrained(cfg.pretrained_model, map_location=map_location) - else: - raise ValueError( - f'Provide path to the pre-trained .nemo checkpoint or choose from {G2PModel.list_available_models()}' - ) - model._cfg.max_source_len = 512 - model.set_trainer(trainer) - model = model.eval() - - if cfg.output_file is None: - cfg.output_file = cfg.manifest_filepath.replace(".json", "_phonemes.json") - - with torch.no_grad(): - model.convert_graphemes_to_phonemes( - manifest_filepath=cfg.manifest_filepath, - output_manifest_filepath=cfg.output_file, - grapheme_field=cfg.grapheme_field, - batch_size=cfg.batch_size, - num_workers=cfg.num_workers, - pred_field=cfg.pred_field, - ) - print(f"IPA predictions saved in {cfg.output_file}") - - if cfg.phoneme_field is not None: - get_metrics(cfg.output_file, phoneme_field=cfg.phoneme_field, grapheme_field=cfg.grapheme_field) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_train_and_evaluate.py b/SoundScribe/SpeakerID/examples/tts/g2p/g2p_train_and_evaluate.py deleted file mode 100644 index ff7b2b0675ea91f5a900548839dba096e035906e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/g2p_train_and_evaluate.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytorch_lightning as pl -import torch -from utils import get_model - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models.base import G2PModel -from nemo.core.config import hydra_runner -from nemo.utils import logging, model_utils -from nemo.utils.exp_manager import exp_manager - -""" -This script supports training of G2PModels -(for T5G2PModel use g2p_t5.yaml, for CTCG2PModel use either g2p_conformer.yaml or g2p_t5_ctc.yaml) - -# Training T5G2PModel and evaluation at the end of training: - python examples/text_processing/g2p/g2p_train_and_evaluate.py \ - # (Optional: --config-path= --config-name=) \ - model.train_ds.manifest_filepath="" \ - model.validation_ds.manifest_filepath="" \ - model.test_ds.manifest_filepath="" \ - trainer.devices=1 \ - do_training=True \ - do_testing=True - - Example of the config file: NeMo/examples/tts/g2p/conf/g2p_t5.yaml - -# Training Conformer-G2P Model and evaluation at the end of training: - python examples/text_processing/g2p/g2p_train_and_evaluate.py \ - # (Optional: --config-path= --config-name=) \ - model.train_ds.manifest_filepath="" \ - model.validation_ds.manifest_filepath="" \ - model.test_ds.manifest_filepath="" \ - model.tokenizer.dir= \ - trainer.devices=1 \ - do_training=True \ - do_testing=True - - Example of the config file: NeMo/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml - -# Run evaluation of the pretrained model: - python examples/text_processing/g2p/g2p_train_and_evaluate.py \ - # (Optional: --config-path= --config-name=) \ - pretrained_model="" \ - model.test_ds.manifest_filepath="" \ - trainer.devices=1 \ - do_training=False \ - do_testing=True -""" - - -@hydra_runner(config_path="conf", config_name="g2p_t5") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - - g2p_model = None - if cfg.do_training: - g2p_model = get_model(cfg, trainer) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(g2p_model) - - if cfg.do_testing: - logging.info( - 'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and \ - no DDP to obtain accurate results' - ) - # setup GPU - if torch.cuda.is_available(): - device = [0] # use 0th CUDA device - accelerator = 'gpu' - else: - device = 1 - accelerator = 'cpu' - - map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu') - trainer = pl.Trainer(devices=device, accelerator=accelerator, logger=False, enable_checkpointing=False) - - if g2p_model is None: - if os.path.exists(cfg.pretrained_model): - # restore g2p_model from .nemo file path - model_cfg = G2PModel.restore_from(restore_path=cfg.pretrained_model, return_config=True) - classpath = model_cfg.target # original class path - imported_class = model_utils.import_class_by_path(classpath) - logging.info(f"Restoring g2p_model : {imported_class.__name__}") - g2p_model = imported_class.restore_from(restore_path=cfg.pretrained_model, map_location=map_location) - model_name = os.path.splitext(os.path.basename(cfg.pretrained_model))[0] - logging.info(f"Restored {model_name} g2p_model from {cfg.pretrained_model}.") - elif cfg.pretrained_model in G2PModel.get_available_model_names(): - # restore g2p_model by name - g2p_model = G2PModel.from_pretrained(cfg.pretrained_model, map_location=map_location) - else: - raise ValueError( - f'Provide path 
to the pre-trained .nemo checkpoint or choose from {G2PModel.list_available_models()}' - ) - - if hasattr(cfg.model, "test_ds") and cfg.model.test_ds.manifest_filepath is not None: - g2p_model.setup_multiple_test_data(cfg.model.test_ds) - if g2p_model.prepare_test(trainer): - trainer.test(g2p_model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/g2p/utils.py b/SoundScribe/SpeakerID/examples/tts/g2p/utils.py deleted file mode 100644 index d1870ddaba43de2780282b53a6808fefe44acbd8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/g2p/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json - -from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.tts.g2p.models.ctc import CTCG2PModel -from nemo.collections.tts.g2p.models.t5 import T5G2PModel -from nemo.utils import logging - - -def get_model(cfg, trainer): - """ - Get model instance - - Args: - cfg: model's config file - trainer: trainer - Return: - G2PModel instance - """ - if "CTC" in cfg.name: - model = CTCG2PModel(cfg=cfg.model, trainer=trainer) - elif cfg.name == "T5G2P": - model = T5G2PModel(cfg=cfg.model, trainer=trainer) - else: - raise ValueError(f"{cfg.name} is not supported. Choose from [G2P-Conformer-CTC, T5G2P]") - return model - - -def get_metrics(manifest: str, pred_field="pred_text", phoneme_field="text", grapheme_field="text_graphemes"): - """ - Calculates WER and PER metrics (for duplicated grapheme entries with multiple reference values, - the best matching prediction will be used for evaluation.) 
- - Args: - manifest: Path to .json manifest file - pred_field: name of the field in the output_file to save predictions - phoneme_field: name of the field in manifest_filepath for ground truth phonemes - grapheme_field: name of the field in manifest_filepath for input grapheme text - - Returns: WER and PER values - """ - all_preds = [] - all_references = [] - all_graphemes = {} - with open(manifest, "r") as f: - for i, line in enumerate(f): - line = json.loads(line) - all_preds.append(line[pred_field]) - all_references.append(line[phoneme_field]) - - if line[grapheme_field] not in all_graphemes: - all_graphemes[line[grapheme_field]] = [] - all_graphemes[line[grapheme_field]].append(i) - - # collect all examples with multiple phoneme options and same grapheme form, choose the one with min PER - all_graphemes = {k: v for k, v in all_graphemes.items() if len(v) > 1} - lines_to_drop = [] - for phon_amb_indices in all_graphemes.values(): - refs, preds = [], [] - for phon_amb_indices_ in phon_amb_indices: - refs.append(all_references[phon_amb_indices_]) - preds.append(all_preds[phon_amb_indices_]) - pers = [] - for ref_, pred_ in zip(refs, preds): - pers.append(word_error_rate(hypotheses=[pred_], references=[ref_], use_cer=True)) - - min_idx = pers.index(min(pers)) - - phon_amb_indices.pop(min_idx) - lines_to_drop.extend(phon_amb_indices) - - # drop duplicated examples, only keep with min PER - all_preds = [x for i, x in enumerate(all_preds) if i not in lines_to_drop] - all_references = [x for i, x in enumerate(all_references) if i not in lines_to_drop] - - wer = word_error_rate(hypotheses=all_preds, references=all_references) - per = word_error_rate(hypotheses=all_preds, references=all_references, use_cer=True) - - logging.info(f"{manifest}: PER: {per * 100:.2f}%, WER: {wer * 100:.2f}%, lines: {len(all_references)}") - return wer, per diff --git a/SoundScribe/SpeakerID/examples/tts/hifigan.py b/SoundScribe/SpeakerID/examples/tts/hifigan.py deleted file mode 100644 index 5c3406a2f24cb9cef3ed74056afc2ab572b0ed97..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/hifigan.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
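For the metrics helper above, `word_error_rate(..., use_cer=True)` gives a symbol-level error rate (PER), and when the same grapheme entry appears several times with different reference pronunciations only the best-scoring pair is kept before the corpus-level WER/PER is computed. A small self-contained sketch of that selection step, assuming only the `word_error_rate` import already used above; the toy pronunciations are illustrative:

```python
from nemo.collections.asr.metrics.wer import word_error_rate

# Two reference pronunciations for the same grapheme entry, one prediction each.
refs = ["R IY1 D", "R EH1 D"]   # e.g. "read" (present vs. past tense)
preds = ["R IY1 D", "R IY1 D"]

# Score each (reference, prediction) pair with PER (use_cer=True -> per-symbol rate).
pers = [
    word_error_rate(hypotheses=[p], references=[r], use_cer=True)
    for r, p in zip(refs, preds)
]

# Keep only the best-matching pair; the remaining duplicates are dropped,
# mirroring the deduplication done in get_metrics above.
best = pers.index(min(pers))
print(f"kept reference: {refs[best]!r}, PER: {pers[best]:.2f}")
```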
- -import pytorch_lightning as pl - -from nemo.collections.tts.models import HifiGanModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf/hifigan", config_name="hifigan") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = HifiGanModel(cfg=cfg.model, trainer=trainer) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/hifigan_finetune.py b/SoundScribe/SpeakerID/examples/tts/hifigan_finetune.py deleted file mode 100644 index f0e2513404fdf7c63f575bc6ea9c991b08c22af6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/hifigan_finetune.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.tts.models import HifiGanModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = HifiGanModel(cfg=cfg.model, trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/mixer_tts.py b/SoundScribe/SpeakerID/examples/tts/mixer_tts.py deleted file mode 100644 index 61a188f53969b008f7a793e0ee7f50d528d8f17d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/mixer_tts.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
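The two HiFi-GAN scripts above differ only in the call to `maybe_init_from_pretrained_checkpoint`, which lets fine-tuning start from weights named in the top-level config. A hedged usage sketch in the same docstring style the G2P scripts use; the `+init_from_nemo_model` key, the checkpoint path, and the trainer overrides are assumptions to verify against the shipped `conf/hifigan` YAMLs and the fine-tuning docs:

```python
"""
Illustrative fine-tuning launch (verify the override names against
conf/hifigan/hifigan_44100.yaml before use):

    python examples/tts/hifigan_finetune.py \
        trainer.devices=1 \
        trainer.max_epochs=100 \
        +init_from_nemo_model=<path/to/pretrained_hifigan.nemo>
"""
```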
- -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import MixerTTSModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path='conf', config_name='mixer-tts') -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get('exp_manager', None)) - model = MixerTTSModel(cfg=cfg.model, trainer=trainer) - trainer.callbacks.extend([pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()]) # noqa - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/radtts.py b/SoundScribe/SpeakerID/examples/tts/radtts.py deleted file mode 100644 index 7dbdaedced032b64dc5cdb013aa6575f4dff3c46..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/radtts.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models.radtts import RadTTSModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -def freeze(model): - for p in model.parameters(): - p.requires_grad = False - - -def unfreeze(model): - for p in model.parameters(): - p.requires_grad = True - - -def prepare_model_weights(model, unfreeze_modules): - if unfreeze_modules != 'all': - model.freeze() # freeze everything - logging.info("module freezed, about to unfreeze modules to be trained") - if 'dur' in unfreeze_modules and hasattr(model.model, 'dur_pred_layer'): - logging.info("Training duration prediction") - unfreeze(model.model.dur_pred_layer) - if 'f0' in unfreeze_modules and hasattr(model.model, 'f0_pred_module'): - logging.info("Training F0 prediction") - unfreeze(model.model.f0_pred_module) - if 'energy' in unfreeze_modules and hasattr(model.model, 'energy_pred_module'): - logging.info("Training energy prediction") - unfreeze(model.model.energy_pred_module) - if 'vpred' in unfreeze_modules and hasattr(model.model, 'v_pred_module'): - logging.info("Training voiced prediction") - unfreeze(model.model.v_pred_module) - if hasattr(model, 'v_embeddings'): - logging.info("Training voiced embeddings") - unfreeze(model.model.v_embeddings) - if 'unvbias' in unfreeze_modules and hasattr(model.model, 'unvoiced_bias_module'): - logging.info("Training unvoiced bias") - unfreeze(model.model.unvoiced_bias_module) - else: - logging.info("Training everything") - - -@hydra_runner(config_path="conf", config_name="rad-tts_dec") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get('exp_manager', None)) - model = RadTTSModel(cfg=cfg.model, trainer=trainer).cuda() - if cfg.model.load_from_checkpoint: - 
model.maybe_init_from_pretrained_checkpoint(cfg=cfg.model) - prepare_model_weights(model, cfg.model.trainerConfig.unfreeze_modules) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model.cuda()) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/examples/tts/spectrogram_enhancer.py b/SoundScribe/SpeakerID/examples/tts/spectrogram_enhancer.py deleted file mode 100644 index 336729236d74a49805b20159de00e39a6ad64cad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/spectrogram_enhancer.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.tts.models import SpectrogramEnhancerModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="spectrogram-enhancer") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg=cfg.get("exp_manager", None)) - model = SpectrogramEnhancerModel(cfg=cfg.model, trainer=trainer) - lr_logger = pl.callbacks.LearningRateMonitor() - trainer.callbacks.extend([lr_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/ssl_tts.py b/SoundScribe/SpeakerID/examples/tts/ssl_tts.py deleted file mode 100644 index a96dccb930ab076280459e00dcaf45cf801dbf2a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/ssl_tts.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
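The RADTTS script above fine-tunes selectively: `prepare_model_weights` freezes the whole model and then re-enables gradients only for the sub-modules listed in `cfg.model.trainerConfig.unfreeze_modules`. The mechanism is plain PyTorch `requires_grad` toggling; a minimal self-contained sketch, with made-up module names rather than the real RADTTS attributes:

```python
import torch.nn as nn


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(8, 8)   # stays frozen
        self.dur_head = nn.Linear(8, 1)  # will be trained

    def forward(self, x):
        return self.dur_head(self.encoder(x))


def set_requires_grad(module: nn.Module, flag: bool) -> None:
    for p in module.parameters():
        p.requires_grad = flag


model = TinyModel()
set_requires_grad(model, False)          # freeze everything
set_requires_grad(model.dur_head, True)  # unfreeze only the chosen sub-module

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable)  # ['dur_head.weight', 'dur_head.bias']
```

The optimizer then only updates the unfrozen parameters, which is what makes the per-module `unfreeze_modules` list in the script effective.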
- -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import ssl_tts -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="ssl_tts_22050") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = ssl_tts.SSLDisentangler(cfg=cfg.model, trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/tacotron2.py b/SoundScribe/SpeakerID/examples/tts/tacotron2.py deleted file mode 100644 index a5446c35f775e05fc3c8cdb924b9ef2a29864886..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/tacotron2.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import Tacotron2Model -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -# hydra_runner is a thin NeMo wrapper around Hydra -# It looks for a config named tacotron2.yaml inside the conf folder -# Hydra parses the yaml and returns it as a Omegaconf DictConfig -@hydra_runner(config_path="conf", config_name="tacotron2") -def main(cfg): - # Define the Lightning trainer - trainer = pl.Trainer(**cfg.trainer) - # exp_manager is a NeMo construct that helps with logging and checkpointing - exp_manager(trainer, cfg.get("exp_manager", None)) - # Define the Tacotron 2 model, this will construct the model as well as - # define the training and validation dataloaders - model = Tacotron2Model(cfg=cfg.model, trainer=trainer) - # Let's add a few more callbacks - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - # Call lightning trainer's fit() to train the model - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/tacotron2_finetune.py b/SoundScribe/SpeakerID/examples/tts/tacotron2_finetune.py deleted file mode 100644 index a0531f1f2801b870b9c4ecfd46aa4c448473d404..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/tacotron2_finetune.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import Tacotron2Model -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -# hydra_runner is a thin NeMo wrapper around Hydra -# It looks for a config named tacotron2.yaml inside the conf folder -# Hydra parses the yaml and returns it as a Omegaconf DictConfig -@hydra_runner(config_path="conf", config_name="tacotron2_44100") -def main(cfg): - # Define the Lightning trainer - trainer = pl.Trainer(**cfg.trainer) - # exp_manager is a NeMo construct that helps with logging and checkpointing - exp_manager(trainer, cfg.get("exp_manager", None)) - # Define the Tacotron 2 model, this will construct the model as well as - # define the training and validation dataloaders - model = Tacotron2Model(cfg=cfg.model, trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - # Let's add a few more callbacks - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - # Call lightning trainer's fit() to train the model - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/test_tts_infer.py b/SoundScribe/SpeakerID/examples/tts/test_tts_infer.py deleted file mode 100644 index 3b707ff2b318d90f211a8ed804f394c32211c9a2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/test_tts_infer.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
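As the comments in the Tacotron 2 scripts note, `hydra_runner` parses the YAML under `conf/` and passes it to `main` as an OmegaConf `DictConfig`, which the scripts then unpack into the Lightning `Trainer` and the model. A small sketch of how that object behaves, using a made-up YAML fragment rather than the real tacotron2.yaml:

```python
from omegaconf import OmegaConf

# A made-up fragment standing in for conf/tacotron2.yaml.
yaml_text = """
trainer:
  devices: 1
  max_epochs: 10
model:
  sample_rate: 22050
"""

cfg = OmegaConf.create(yaml_text)

# Attribute-style access, exactly how the scripts consume the config:
print(cfg.trainer.devices)            # 1
print(OmegaConf.to_yaml(cfg.model))   # sample_rate: 22050

# Command-line overrides such as trainer.devices=2 are merged by Hydra before
# main(cfg) is called; the result is still a plain DictConfig like this one.
```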
- -""" -This script is used as a CI test and shows how to chain TTS and ASR models -""" - -from argparse import ArgumentParser -from math import ceil -from pathlib import Path - -import librosa -import soundfile -import torch - -from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.models import EncDecCTCModel -from nemo.collections.common.parts.preprocessing import parsers -from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder -from nemo.utils import logging - -LIST_OF_TEST_STRINGS = [ - "Hey, this is a test of the speech synthesis system.", - "roupell received the announcement with a cheerful countenance.", - "with thirteen dollars, eighty-seven cents when considerably greater resources were available to him.", - "Two other witnesses were able to offer partial descriptions of a man they saw in the southeast corner window.", - "'just to steady their legs a little' in other words, to add his weight to that of the hanging bodies.", - "The discussion above has already set forth examples of his expression of hatred for the United States.", - "At two:thirty-eight p.m., Eastern Standard Time, Lyndon Baines Johnson took the oath of office as the thirty-sixth President of the United States.", - "or, quote, other high government officials in the nature of a complaint coupled with an expressed or implied determination to use a means.", - "As for my return entrance visa please consider it separately. End quote.", - "it appears that Marina Oswald also complained that her husband was not able to provide more material things for her.", - "appeared in The Dallas Times Herald on November fifteen, nineteen sixty-three.", - "The only exit from the office in the direction Oswald was moving was through the door to the front stairway.", -] - - -def main(): - parser = ArgumentParser() - parser.add_argument( - "--asr_model", - type=str, - default="QuartzNet15x5Base-En", - choices=[x.pretrained_model_name for x in EncDecCTCModel.list_available_models()], - ) - parser.add_argument( - "--tts_model_spec", - type=str, - default="tts_en_tacotron2", - choices=[x.pretrained_model_name for x in SpectrogramGenerator.list_available_models()], - ) - parser.add_argument( - "--tts_model_vocoder", - type=str, - default="tts_en_waveglow_88m", - choices=[x.pretrained_model_name for x in Vocoder.list_available_models()], - ) - parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test") - parser.add_argument("--trim", action="store_true") - parser.add_argument("--debug", action="store_true") - args = parser.parse_args() - torch.set_grad_enabled(False) - - if args.debug: - logging.set_verbosity(logging.DEBUG) - - logging.info(f"Using NGC cloud ASR model {args.asr_model}") - asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model) - logging.info(f"Using NGC cloud TTS Spectrogram Generator model {args.tts_model_spec}") - tts_model_spec = SpectrogramGenerator.from_pretrained(model_name=args.tts_model_spec) - logging.info(f"Using NGC cloud TTS Vocoder model {args.tts_model_vocoder}") - tts_model_vocoder = Vocoder.from_pretrained(model_name=args.tts_model_vocoder) - models = [asr_model, tts_model_spec, tts_model_vocoder] - - if torch.cuda.is_available(): - for i, m in enumerate(models): - models[i] = m.cuda() - for m in models: - m.eval() - - asr_model, tts_model_spec, tts_model_vocoder = models - - parser = parsers.make_parser( - labels=asr_model.decoder.vocabulary, name="en", unk_id=-1, blank_id=-1, do_normalize=True, - ) - labels_map = 
dict([(i, asr_model.decoder.vocabulary[i]) for i in range(len(asr_model.decoder.vocabulary))]) - - tts_input = [] - asr_references = [] - longest_tts_input = 0 - for test_str in LIST_OF_TEST_STRINGS: - tts_parsed_input = tts_model_spec.parse(test_str) - if len(tts_parsed_input[0]) > longest_tts_input: - longest_tts_input = len(tts_parsed_input[0]) - tts_input.append(tts_parsed_input.squeeze()) - - asr_parsed = parser(test_str) - asr_parsed = ''.join([labels_map[c] for c in asr_parsed]) - asr_references.append(asr_parsed) - - # Pad TTS Inputs - for i, text in enumerate(tts_input): - pad = (0, longest_tts_input - len(text)) - tts_input[i] = torch.nn.functional.pad(text, pad, value=68) - - logging.debug(tts_input) - - # Do TTS - tts_input = torch.stack(tts_input) - if torch.cuda.is_available(): - tts_input = tts_input.cuda() - specs = tts_model_spec.generate_spectrogram(tokens=tts_input) - audio = [] - step = ceil(len(specs) / 4) - for i in range(4): - audio.append(tts_model_vocoder.convert_spectrogram_to_audio(spec=specs[i * step : i * step + step])) - - audio = [item for sublist in audio for item in sublist] - audio_file_paths = [] - # Save audio - logging.debug(f"args.trim: {args.trim}") - for i, aud in enumerate(audio): - aud = aud.cpu().numpy() - if args.trim: - aud = librosa.effects.trim(aud, top_db=40)[0] - soundfile.write(f"{i}.wav", aud, samplerate=22050) - audio_file_paths.append(str(Path(f"{i}.wav"))) - - # Do ASR - hypotheses = asr_model.transcribe(audio_file_paths) - for i, _ in enumerate(hypotheses): - logging.debug(f"{i}") - logging.debug(f"ref:'{asr_references[i]}'") - logging.debug(f"hyp:'{hypotheses[i]}'") - wer_value = word_error_rate(hypotheses=hypotheses, references=asr_references) - if wer_value > args.wer_tolerance: - raise ValueError(f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}") - logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}') - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/univnet.py b/SoundScribe/SpeakerID/examples/tts/univnet.py deleted file mode 100644 index 91aafa66184227bbf04d5712bef7fb994091a3ad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/univnet.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
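One detail of the CI script above worth a second look: the vocoder is run over the generated spectrograms in four slices of size `ceil(N / 4)`, and the per-slice outputs are flattened back into a single list before the audio is written out. A tiny sketch of just that slicing arithmetic, with plain integers standing in for spectrogram tensors:

```python
from math import ceil

specs = list(range(12))       # stand-in for 12 generated spectrograms
step = ceil(len(specs) / 4)   # slice size: 3

audio_chunks = []
for i in range(4):
    chunk = specs[i * step : i * step + step]  # last slice may be shorter or empty
    audio_chunks.append(chunk)                 # the vocoder call would go here

# Flatten the per-slice outputs back into one list, as the script does.
audio = [item for sublist in audio_chunks for item in sublist]
assert audio == specs
```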
- -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import UnivNetModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf/univnet", config_name="univnet") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = UnivNetModel(cfg=cfg.model, trainer=trainer) - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/vits.py b/SoundScribe/SpeakerID/examples/tts/vits.py deleted file mode 100644 index 75e0d827018ac8619b202ebea99d83a7d9e0af72..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/vits.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytorch_lightning as pl - -from nemo.collections.tts.models.vits import VitsModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="vits") -def main(cfg): - trainer = pl.Trainer(use_distributed_sampler=False, **cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = VitsModel(cfg=cfg.model, trainer=trainer) - - trainer.callbacks.extend([pl.callbacks.LearningRateMonitor()]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/examples/tts/waveglow.py b/SoundScribe/SpeakerID/examples/tts/waveglow.py deleted file mode 100644 index 66b13491abd4ec84d3d6d87813ec44219a4287d9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/examples/tts/waveglow.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytorch_lightning as pl - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import WaveGlowModel -from nemo.core.config import hydra_runner -from nemo.utils.exp_manager import exp_manager - - -@hydra_runner(config_path="conf", config_name="waveglow") -def main(cfg): - trainer = pl.Trainer(**cfg.trainer) - exp_manager(trainer, cfg.get("exp_manager", None)) - model = WaveGlowModel(cfg=cfg.model, trainer=trainer) - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([epoch_time_logger]) - trainer.fit(model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/SoundScribe/SpeakerID/external/get_collections.py b/SoundScribe/SpeakerID/external/get_collections.py deleted file mode 100644 index d546ccb057099bb0783a9b8050be5e2e841ee059..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/external/get_collections.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Script responsible for generation of a JSON file with list of NeMo collections. """ - -import argparse -import importlib -import json -import os - -import nemo -from nemo.utils import logging - - -def process_collection(id, col): - """ Helper function processing the collection. - - Args: - id: (short) name of the collection. - col: a collection (python module). - """ - return { - "id": id, - "name": col.__name__, - "description": col.__description__, - "version": col.__version__, - "author": col.__author__, - } - - -def main(): - """ Main function generating a JSON file with list of NeMo collections. """ - # Parse filename. - parser = argparse.ArgumentParser() - parser.add_argument('--filename', help='Name of the output JSON file', type=str, default="collections.json") - args = parser.parse_args() - - # Get collections directory. - colletions_dir = os.path.dirname(nemo.collections.__file__) - logging.info('Analysing collections in `{}`'.format(colletions_dir)) - - # Generate list of NeMo collections - from the list of collection subfolders. - collections = {} - for sub_dir in os.listdir(colletions_dir): - # Skip cache. - if sub_dir == "__pycache__": - continue - # Check if it is a directory. - if os.path.isdir(os.path.join(colletions_dir, sub_dir)): - collections[sub_dir] = "nemo.collections." + sub_dir - - output_list = [] - # Iterate over all collections. - for key, val in collections.items(): - # Try to get module specification. - module_spec = importlib.util.find_spec(val) - if module_spec is None: - logging.warning(" * Failed to process `{}`".format(val)) - else: - try: - # Import the module from the module specification. - module = importlib.util.module_from_spec(module_spec) - module_spec.loader.exec_module(module) - # Add to list. 
- output_list.append(process_collection(key, module)) - logging.info(" * Processed `{}`".format(val)) - except AttributeError: - logging.warning(" * Failed to process `{}`".format(val)) - - # Export to JSON. - with open(args.filename, 'w', encoding='utf-8') as outfile: - json.dump(output_list, outfile) - - logging.info('Finshed the analysis, results exported to `{}`.'.format(args.filename)) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/external/get_modules.py b/SoundScribe/SpeakerID/external/get_modules.py deleted file mode 100644 index c080be9e8e5acf4620d4d2fd4f5dd31a7d8303d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/external/get_modules.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Script responsible for generation of a JSON file containing list of modules of a given collection. """ - -import argparse -import importlib -import inspect -import json -import os - -import nemo -from nemo.utils import logging - - -def process_member(name, obj, module_list): - """ Helper function processing the passed object and, if ok, adding a record to the module list. - - Args: - name: name of the member - obj: member (class/function etc.) - module_list: list of modules that (probably) will be expanded. - """ - # It is not a class - skip it. - if not inspect.isclass(obj): - return - - # Check inheritance - we know that all our datasets/modules/losses inherit from Serialization, - # Btw. Serialization is also required by this script. - if not issubclass(obj, nemo.core.Serialization): - return - - logging.info(" * Processing `{}`".format(str(obj))) - - module_list.append( - { - "name": name, - "cls": str(obj), - # Temporary solution: mockup arguments. - "arguments": [ - "jasper", - "activation", - "feat_in", - "normalization_mode", - "residual_mode", - "norm_groups", - "conv_mask", - "frame_splicing", - "init_mode", - ], - # Temporary solution: mockup input types. - "input_types": { - "audio_signal": "axes: (batch, dimension, time); elements_type: MelSpectrogramType", - "length": "axes: (batch,); elements_type: LengthType", - }, - # Temporary solution: mockup output types. - "output_types": { - "encoder_output": "axes: (batch, dimension, time); elements_type: AcousticEncodedRepresentation" - }, - } - ) - - -def main(): - """ Main function analysing the indicated NeMo collection and generating a JSON file with module descriptions. """ - # Parse filename. - parser = argparse.ArgumentParser() - parser.add_argument('--collection', help='ID of the collection', type=str) - parser.add_argument('--filename', help='Name of the output JSON file', type=str, default="modules.json") - args = parser.parse_args() - - # Get collections directory. 
- colletions_dir = os.path.dirname(nemo.collections.__file__) - logging.info('Analysing collections in `{}`'.format(colletions_dir)) - - # Generate list of NeMo collections - from the list of collection subfolders. - collections = {} - for sub_dir in os.listdir(colletions_dir): - # Skip cache. - if sub_dir == "__pycache__": - continue - # Check if it is a directory. - if os.path.isdir(os.path.join(colletions_dir, sub_dir)): - collections[sub_dir] = "nemo.collections." + sub_dir - - # Check the collection. - if args.collection not in collections.keys(): - logging.error("Coudn't process the incidated `{}` collection".format(args.collection)) - logging.info( - "Please select one of the existing collections using `--collection [{}]`".format("|".join(collections)) - ) - exit(-1) - - # Load the collection specification. - collection_spec = importlib.util.find_spec(collections[args.collection]) - if collection_spec is None: - logging.error("Failed to load the `{}` collection".format(val)) - - # Import the module from the module specification. - collection = importlib.util.module_from_spec(collection_spec) - collection_spec.loader.exec_module(collection) - - module_list = [] - # Iterate over the packages in the indicated collection. - logging.info("Analysing the `{}` collection".format(args.collection)) - - try: # Datasets in dataset folder - logging.info("Analysing the 'data' package") - for name, obj in inspect.getmembers(collection.data): - process_member(name, obj, module_list) - except AttributeError as e: - logging.info(" * No datasets found") - - try: # Datasets in dataset folder - logging.info("Analysing the 'datasets' package") - for name, obj in inspect.getmembers(collection.datasets): - process_member(name, obj, module_list) - except AttributeError as e: - logging.info(" * No datasets found") - - try: # Modules - logging.info("Analysing the 'modules' package") - for name, obj in inspect.getmembers(collection.modules): - process_member(name, obj, module_list) - except AttributeError as e: - logging.info(" * No modules found") - - try: # Losses - logging.info("Analysing the 'losses' package") - for name, obj in inspect.getmembers(collection.losses): - process_member(name, obj, module_list) - except AttributeError as e: - logging.info(" * No losses found") - - # Add prefix - only for default name. - filename = args.filename if args.filename != "modules.json" else args.collection + "_" + args.filename - # Export to JSON. - with open(filename, 'w', encoding='utf-8') as outfile: - json.dump(module_list, outfile) - - logging.info( - 'Finished analysis of the `{}` collection, results exported to `{}`.'.format(args.collection, filename) - ) - - -if __name__ == '__main__': - main() diff --git a/SoundScribe/SpeakerID/nemo/README.md b/SoundScribe/SpeakerID/nemo/README.md deleted file mode 100644 index 2db456547ab9ba648063be0711247a98aecde594..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/README.md +++ /dev/null @@ -1,9 +0,0 @@ -NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built around **neural modules**, conceptual blocks of neural networks that take *typed* inputs and produce *typed* outputs. - -**NeMo Core** provides common APIs all modules and models have to implement. 
- -**NeMo Collections** - -* ASR - collection of modules and models for building speech recognition networks -* TTS - collection of modules and models for building speech synthesis networks -* NLP - collection of modules and models for building NLP networks diff --git a/SoundScribe/SpeakerID/nemo/__init__.py b/SoundScribe/SpeakerID/nemo/__init__.py deleted file mode 100644 index 5b9fedbd1cc691ff5b16a78420965131787f37dd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.package_info import ( - __contact_emails__, - __contact_names__, - __description__, - __download_url__, - __homepage__, - __keywords__, - __license__, - __package_name__, - __repository_url__, - __shortversion__, - __version__, -) diff --git a/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 1974b5657ff70dc27c6c03ee9d1bcfb092b35552..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-39.pyc b/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index d9085cbf936470485f385b70b9a1af41d6058e26..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/__pycache__/constants.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/__pycache__/constants.cpython-310.pyc deleted file mode 100644 index 2bd51e5632325a7cd4c711e4cc02683183239d30..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/__pycache__/constants.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/__pycache__/package_info.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/__pycache__/package_info.cpython-310.pyc deleted file mode 100644 index 847a6933358386665d619cde8c76080357371166..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/__pycache__/package_info.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/__init__.py b/SoundScribe/SpeakerID/nemo/collections/__init__.py deleted file mode 100644 index 9e3250071955216f6abc505e6181fb59931baa8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 72e141369543bf2892ba2c7c6a72fd3b59b6771c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-39.pyc b/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 84d7b9393ef6be34f34a2b044551f3b42a070df3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/__init__.py deleted file mode 100644 index cd14a4395006b23564424aa7d2f90a1069d5894f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr import data, losses, models, modules -from nemo.package_info import __version__ - -# Set collection version equal to NeMo version. -__version = __version__ - -# Authorship. -__author__ = "NVIDIA Corporation" - -# Set collection name. 
-__description__ = "Automatic Speech Recognition collection" diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 9a20a27611b0c856d6b2bbebeafa06a99f6091c2..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-39.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index bae432761d34427bb4b0b235defb0a09534a2cbb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/__init__.py deleted file mode 100644 index 9e3250071955216f6abc505e6181fb59931baa8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index b0d1a5e07d35050fd08af3b96806303629edce09..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio.cpython-310.pyc deleted file mode 100644 index fb30bc737a0ab6407988b9630bedae2b4bcc6f05..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio_dataset.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio_dataset.cpython-310.pyc deleted file mode 100644 index dff92adf5c53dd7fcc0482e56d71191674215cf9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_audio_dataset.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_diar_label.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_diar_label.cpython-310.pyc deleted file mode 100644 index f6bd5af223daa6f70c70a11a452ee1dd73095d7b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_diar_label.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label.cpython-310.pyc deleted file mode 100644 index c952482ec1c3a89b637a9a1e18c961486ed0d740..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label_dataset.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label_dataset.cpython-310.pyc deleted file mode 100644 index 5f287d0215a3cbd71b4f9e66d124a21dceffb63a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_label_dataset.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text.cpython-310.pyc deleted file mode 100644 index b57d9b93f5ea13f6412ac5d85ae9070112c4ea72..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dali.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dali.cpython-310.pyc deleted file mode 100644 index 00330d96f0b566e8a8cd432c08841c4f642325f7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dali.cpython-310.pyc and /dev/null 
differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dataset.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dataset.cpython-310.pyc deleted file mode 100644 index ef86c2d2ab670b53cbc176559193a2ce3dc95e97..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/audio_to_text_dataset.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label.cpython-310.pyc deleted file mode 100644 index 15154777c905baefa196f7b6befc405cb85ce797..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label_dataset.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label_dataset.cpython-310.pyc deleted file mode 100644 index 0c6232788192f39fbf8f61babd54dabb4e436f21..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/data/__pycache__/feature_to_label_dataset.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio.py deleted file mode 100644 index a3c6dd0cc1b3f74b935b0e47490ffca4c9a68fb7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio.py +++ /dev/null @@ -1,1136 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc -import math -import random -from collections import OrderedDict, namedtuple -from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Tuple, Type, Union - -import librosa -import numpy as np -import torch - -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.common.parts.preprocessing import collections -from nemo.collections.common.parts.utils import flatten -from nemo.core.classes import Dataset -from nemo.core.neural_types import AudioSignal, EncodedRepresentation, LengthsType, NeuralType -from nemo.utils import logging - -__all__ = [ - 'AudioToTargetDataset', - 'AudioToTargetWithReferenceDataset', - 'AudioToTargetWithEmbeddingDataset', -] - - -def _audio_collate_fn(batch: List[dict]) -> Tuple[torch.Tensor]: - """Collate a batch of items returned by __getitem__. - Examples for each signal are zero padded to the same length - (batch_length), which is determined by the longest example. - Lengths of the original signals are returned in the output. - - Args: - batch: List of dictionaries. 
Each element of the list - has the following format - ``` - { - 'signal_0': 1D or 2D tensor, - 'signal_1': 1D or 2D tensor, - ... - 'signal_N': 1D or 2D tensor, - } - ``` - 1D tensors have shape (num_samples,) and 2D tensors - have shape (num_channels, num_samples) - - Returns: - A tuple containing signal tensor and signal length tensor (in samples) - for each signal. - The output has the following format: - ``` - (signal_0, signal_0_length, signal_1, signal_1_length, ..., signal_N, signal_N_length) - ``` - Note that the output format is obtained by interleaving signals and their length. - """ - signals = batch[0].keys() - - batched = tuple() - - for signal in signals: - signal_length = [b[signal].shape[-1] for b in batch] - # Batch length is determined by the longest signal in the batch - batch_length = max(signal_length) - b_signal = [] - for s_len, b in zip(signal_length, batch): - # check if padding is necessary - if s_len < batch_length: - if b[signal].ndim == 1: - # single-channel signal - pad = (0, batch_length - s_len) - elif b[signal].ndim == 2: - # multi-channel signal - pad = (0, batch_length - s_len, 0, 0) - else: - raise RuntimeError( - f'Signal {signal} has unsuported dimensions {signal.shape}. Currently, only 1D and 2D arrays are supported.' - ) - b[signal] = torch.nn.functional.pad(b[signal], pad) - # append the current padded signal - b_signal.append(b[signal]) - # (signal_batched, signal_length) - batched += (torch.stack(b_signal), torch.tensor(signal_length, dtype=torch.int32)) - - # Currently, outputs are expected to be in a tuple, where each element must correspond - # to the output type in the OrderedDict returned by output_types. - # - # Therefore, we return batched signals by interleaving signals and their length: - # (signal_0, signal_0_length, signal_1, signal_1_length, ...) - return batched - - -@dataclass -class SignalSetup: - signals: List[str] # signal names - duration: Optional[Union[float, list]] = None # duration for each signal - channel_selectors: Optional[List[ChannelSelectorType]] = None # channel selector for loading each signal - - -class ASRAudioProcessor: - """Class that processes an example from Audio collection and returns - a dictionary with prepared signals. - - For example, the output dictionary may be the following - ``` - { - 'input_signal': input_signal_tensor, - 'target_signal': target_signal_tensor, - 'reference_signal': reference_signal_tensor, - 'embedding_vector': embedding_vector - } - ``` - Keys in the output dictionary are ordered with synchronous signals given first, - followed by asynchronous signals and embedding. - - Args: - sample_rate: sample rate used for all audio signals - random_offset: If `True`, offset will be randomized when loading a subsegment - from a file. - """ - - def __init__( - self, sample_rate: float, random_offset: bool, - ): - self.sample_rate = sample_rate - self.random_offset = random_offset - - self.sync_setup = None - self.async_setup = None - self.embedding_setup = None - - @property - def sample_rate(self) -> float: - return self._sample_rate - - @sample_rate.setter - def sample_rate(self, value: float): - if value <= 0: - raise ValueError(f'Sample rate must be positive, received {value}') - - self._sample_rate = value - - @property - def random_offset(self) -> bool: - return self._random_offset - - @random_offset.setter - def random_offset(self, value: bool): - self._random_offset = value - - @property - def sync_setup(self) -> SignalSetup: - """Return the current setup for synchronous signals. 
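The zero-padding and (signal, length) interleaving documented for `_audio_collate_fn` above can be seen in isolation with a short sketch. This is a simplified stand-in, not the removed helper itself: it only handles single-channel (1D) signals, and the `input_signal`/`target_signal` keys are hypothetical.

```
import torch

def collate_signals(batch):
    # Zero-pad each signal to the longest example in the batch and return
    # interleaved (stacked_signal, lengths) pairs, one pair per signal key.
    batched = tuple()
    for signal in batch[0].keys():
        lengths = [item[signal].shape[-1] for item in batch]
        max_len = max(lengths)
        padded = [
            torch.nn.functional.pad(item[signal], (0, max_len - item[signal].shape[-1]))
            for item in batch
        ]
        batched += (torch.stack(padded), torch.tensor(lengths, dtype=torch.int32))
    return batched

batch = [
    {'input_signal': torch.randn(16000), 'target_signal': torch.randn(16000)},
    {'input_signal': torch.randn(12000), 'target_signal': torch.randn(12000)},
]
input_sig, input_len, target_sig, target_len = collate_signals(batch)
print(input_sig.shape, input_len.tolist())  # torch.Size([2, 16000]) [16000, 12000]
```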
- - Returns: - A dataclass containing the list of signals, their - duration and channel selectors. - """ - return self._sync_setup - - @sync_setup.setter - def sync_setup(self, value: Optional[SignalSetup]): - """Setup signals to be loaded synchronously. - - Args: - value: An instance of SignalSetup with the following fields - - signals: list of signals (keys of example.audio_signals) which will be loaded - synchronously with the same start time and duration. - - duration: Duration for each signal to be loaded. - If duration is set to None, the whole file will be loaded. - - channel_selectors: A list of channel selector for each signal. If channel selector - is None, all channels in the audio file will be loaded. - """ - if value is None or isinstance(value, SignalSetup): - self._sync_setup = value - else: - raise ValueError(f'Unexpected type {type(value)} for value {value}.') - - @property - def async_setup(self) -> SignalSetup: - """Return the current setup for asynchronous signals. - - Returns: - A dataclass containing the list of signals, their - duration and channel selectors. - """ - return self._async_setup - - @async_setup.setter - def async_setup(self, value: Optional[SignalSetup]): - """Setup signals to be loaded asynchronously. - - Args: - Args: - value: An instance of SignalSetup with the following fields - - signals: list of signals (keys of example.audio_signals) which will be loaded - asynchronously with signals possibly having different start and duration - - duration: Duration for each signal to be loaded. - If duration is set to None, the whole file will be loaded. - - channel_selectors: A list of channel selector for each signal. If channel selector - is None, all channels in the audio file will be loaded. - """ - if value is None or isinstance(value, SignalSetup): - self._async_setup = value - else: - raise ValueError(f'Unexpected type {type(value)} for value {value}.') - - @property - def embedding_setup(self) -> SignalSetup: - """Setup signals corresponding to an embedding vector. - """ - return self._embedding_setup - - @embedding_setup.setter - def embedding_setup(self, value: SignalSetup): - """Setup signals corresponding to an embedding vector. - - Args: - value: An instance of SignalSetup with the following fields - - signals: list of signals (keys of example.audio_signals) which will be loaded - as embedding vectors. - """ - if value is None or isinstance(value, SignalSetup): - self._embedding_setup = value - else: - raise ValueError(f'Unexpected type {type(value)} for value {value}.') - - def process(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: - """Process an example from a collection of audio examples. - - Args: - example: an example from Audio collection. - - Returns: - An ordered dictionary of signals and their tensors. - For example, the output dictionary may be the following - ``` - { - 'input_signal': input_signal_tensor, - 'target_signal': target_signal_tensor, - 'reference_signal': reference_signal_tensor, - 'embedding_vector': embedding_vector - } - ``` - Keys in the output dictionary are ordered with synchronous signals given first, - followed by asynchronous signals and embedding. - """ - audio = self.load_audio(example=example) - audio = self.process_audio(audio=audio) - return audio - - def load_audio(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: - """Given an example, load audio from `example.audio_files` and prepare - the output dictionary. 
- - Args: - example: An example from an audio collection - - Returns: - An ordered dictionary of signals and their tensors. - For example, the output dictionary may be the following - ``` - { - 'input_signal': input_signal_tensor, - 'target_signal': target_signal_tensor, - 'reference_signal': reference_signal_tensor, - 'embedding_vector': embedding_vector - } - ``` - Keys in the output dictionary are ordered with synchronous signals given first, - followed by asynchronous signals and embedding. - """ - output = OrderedDict() - - if self.sync_setup is not None: - # Load all signals with the same start and duration - sync_signals = self.load_sync_signals(example) - output.update(sync_signals) - - if self.async_setup is not None: - # Load each signal independently - async_signals = self.load_async_signals(example) - output.update(async_signals) - - # Load embedding vector - if self.embedding_setup is not None: - embedding = self.load_embedding(example) - output.update(embedding) - - if not output: - raise RuntimeError('Output dictionary is empty. Please use `_setup` methods to setup signals to be loaded') - - return output - - def process_audio(self, audio: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - """Process audio signals available in the input dictionary. - - Args: - audio: A dictionary containing loaded signals `signal: tensor` - - Returns: - An ordered dictionary of signals and their tensors. - """ - # Currently, not doing any processing of the loaded signals. - return audio - - def load_sync_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: - """Load signals with the same start and duration. - - Args: - example: an example from audio collection - - Returns: - An ordered dictionary of signals and their tensors. - """ - output = OrderedDict() - sync_audio_files = [example.audio_files[s] for s in self.sync_setup.signals] - - sync_samples = self.get_samples_synchronized( - audio_files=sync_audio_files, - channel_selectors=self.sync_setup.channel_selectors, - sample_rate=self.sample_rate, - duration=self.sync_setup.duration, - fixed_offset=example.offset, - random_offset=self.random_offset, - ) - - for signal, samples in zip(self.sync_setup.signals, sync_samples): - output[signal] = torch.tensor(samples) - - return output - - def load_async_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: - """Load each async signal independently, no constraints on starting - from the same time. - - Args: - example: an example from audio collection - - Returns: - An ordered dictionary of signals and their tensors. - """ - output = OrderedDict() - for idx, signal in enumerate(self.async_setup.signals): - samples = self.get_samples( - audio_file=example.audio_files[signal], - sample_rate=self.sample_rate, - duration=self.async_setup.duration[idx], - channel_selector=self.async_setup.channel_selectors[idx], - fixed_offset=example.offset, - random_offset=self.random_offset, - ) - output[signal] = torch.tensor(samples) - return output - - @classmethod - def get_samples( - cls, - audio_file: str, - sample_rate: int, - duration: Optional[float] = None, - channel_selector: ChannelSelectorType = None, - fixed_offset: float = 0, - random_offset: bool = False, - ) -> np.ndarray: - """Get samples from an audio file. - For a single-channel signal, the output is shape (num_samples,). - For a multi-channel signal, the output is shape (num_samples, num_channels). 
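As a reading aid for the `sync_setup`, `async_setup` and `embedding_setup` properties above, a minimal configuration sketch follows. The import refers to the module removed in this diff, and the signal names and durations are illustrative only; `processor.process(example)` would then return an `OrderedDict` of tensors keyed by these names.

```
from nemo.collections.asr.data.audio_to_audio import ASRAudioProcessor, SignalSetup

processor = ASRAudioProcessor(sample_rate=16000, random_offset=False)

# Input and target are loaded synchronously: same offset and duration, all channels.
processor.sync_setup = SignalSetup(
    signals=['input_signal', 'target_signal'],
    duration=4.0,
    channel_selectors=[None, None],
)

# A reference signal is loaded independently, with its own duration.
processor.async_setup = SignalSetup(
    signals=['reference_signal'],
    duration=[2.0],
    channel_selectors=[None],
)

# An embedding vector is loaded as-is from a .npy file.
processor.embedding_setup = SignalSetup(signals=['embedding_vector'])
```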
- - Args: - audio_file: path to an audio file - sample_rate: desired sample rate for output samples - duration: Optional desired duration of output samples. - If `None`, the complete file will be loaded. - If set, a segment of `duration` seconds will be loaded. - channel_selector: Optional channel selector, for selecting a subset of channels. - fixed_offset: Optional fixed offset when loading samples. - random_offset: If `True`, offset will be randomized when loading a short segment - from a file. The value is randomized between fixed_offset and - max_offset (set depending on the duration and fixed_offset). - - Returns: - Numpy array with samples from audio file. - The array has shape (num_samples,) for a single-channel signal - or (num_channels, num_samples) for a multi-channel signal. - """ - output = cls.get_samples_synchronized( - audio_files=[audio_file], - sample_rate=sample_rate, - duration=duration, - channel_selectors=[channel_selector], - fixed_offset=fixed_offset, - random_offset=random_offset, - ) - - return output[0] - - @classmethod - def get_samples_synchronized( - cls, - audio_files: List[str], - sample_rate: int, - duration: Optional[float] = None, - channel_selectors: Optional[List[ChannelSelectorType]] = None, - fixed_offset: float = 0, - random_offset: bool = False, - ) -> List[np.ndarray]: - """Get samples from multiple files with the same start and end point. - - Args: - audio_files: list of paths to audio files - sample_rate: desired sample rate for output samples - duration: Optional desired duration of output samples. - If `None`, the complete files will be loaded. - If set, a segment of `duration` seconds will be loaded from - all files. Segment is synchronized across files, so that - start and end points are the same. - channel_selectors: Optional channel selector for each signal, for selecting - a subset of channels. - fixed_offset: Optional fixed offset when loading samples. - random_offset: If `True`, offset will be randomized when loading a short segment - from a file. The value is randomized between fixed_offset and - max_offset (set depending on the duration and fixed_offset). - - Returns: - List with the same size as `audio_files` but containing numpy arrays - with samples from each audio file. - Each array has shape (num_samples,) or (num_channels, num_samples), for single- - or multi-channel signal, respectively. - For example, if `audio_files = [path/to/file_1.wav, path/to/file_2.wav]`, - the output will be a list `output = [samples_file_1, samples_file_2]`. - """ - if channel_selectors is None: - channel_selectors = [None] * len(audio_files) - - if duration is None: - # Load complete files starting from a fixed offset - offset = fixed_offset # fixed offset - num_samples = None # no constrain on the number of samples - - else: - # Fixed duration of the output - audio_durations = cls.get_duration(audio_files) - min_audio_duration = min(audio_durations) - available_duration = min_audio_duration - fixed_offset - - if available_duration <= 0: - raise ValueError(f'Fixed offset {fixed_offset}s is larger than shortest file {min_duration}s.') - - if duration + fixed_offset > min_audio_duration: - # The shortest file is shorter than the requested duration - logging.debug( - f'Shortest file ({min_audio_duration}s) is less than the desired duration {duration}s + fixed offset {fixed_offset}s. Returned signals will be shortened to {available_duration} seconds.' 
- ) - offset = fixed_offset - duration = available_duration - elif random_offset: - # Randomize offset based on the shortest file - max_offset = min_audio_duration - duration - offset = random.uniform(fixed_offset, max_offset) - else: - # Fixed offset - offset = fixed_offset - - # Fixed number of samples - num_samples = math.floor(duration * sample_rate) - - output = [] - - # Prepare segments - for idx, audio_file in enumerate(audio_files): - segment_samples = cls.get_samples_from_file( - audio_file=audio_file, - sample_rate=sample_rate, - offset=offset, - num_samples=num_samples, - channel_selector=channel_selectors[idx], - ) - output.append(segment_samples) - - return output - - @classmethod - def get_samples_from_file( - cls, - audio_file: Union[str, List[str]], - sample_rate: int, - offset: float, - num_samples: Optional[int] = None, - channel_selector: Optional[ChannelSelectorType] = None, - ) -> np.ndarray: - """Get samples from a single or multiple files. - If loading samples from multiple files, they will - be concatenated along the channel dimension. - - Args: - audio_file: path or a list of paths. - sample_rate: sample rate of the loaded samples - offset: fixed offset in seconds - num_samples: Optional, number of samples to load. - If `None`, all available samples will be loaded. - channel_selector: Select a subset of available channels. - - Returns: - An array with shape (samples,) or (channels, samples) - """ - if isinstance(audio_file, str): - # Load samples from a single file - segment_samples = cls.get_segment_from_file( - audio_file=audio_file, - sample_rate=sample_rate, - offset=offset, - num_samples=num_samples, - channel_selector=channel_selector, - ) - elif isinstance(audio_file, list): - # Load samples from multiple files and form a multi-channel signal - segment_samples = [] - for a_file in audio_file: - a_file_samples = cls.get_segment_from_file( - audio_file=a_file, - sample_rate=sample_rate, - offset=offset, - num_samples=num_samples, - channel_selector=channel_selector, - ) - segment_samples.append(a_file_samples) - segment_samples = cls.list_to_multichannel(segment_samples) - elif audio_file is None: - # Support for inference, when the target signal is `None` - segment_samples = [] - else: - raise RuntimeError(f'Unexpected audio_file type {type(audio_file)}') - return segment_samples - - @staticmethod - def get_segment_from_file( - audio_file: str, - sample_rate: int, - offset: float, - num_samples: Optional[int] = None, - channel_selector: Optional[ChannelSelectorType] = None, - ) -> np.ndarray: - """Get a segment of samples from a single audio file. - - Args: - audio_file: path to an audio file - sample_rate: sample rate of the loaded samples - offset: fixed offset in seconds - num_samples: Optional, number of samples to load. - If `None`, all available samples will be loaded. - channel_selector: Select a subset of available channels. 
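The offset handling described above for `get_samples_synchronized` (clamp the requested duration to the shortest file, optionally randomize the start point) is easier to follow in isolation. The sketch below mirrors that logic with made-up durations; note that the original error message refers to an undefined `min_duration`, where the shortest-file duration appears to be intended.

```
import math
import random

def choose_segment(audio_durations, duration, fixed_offset=0.0, random_offset=False, sample_rate=16000):
    # Pick a common (offset, num_samples) pair for a set of files, as described above.
    if duration is None:
        return fixed_offset, None  # load complete files starting at the fixed offset

    shortest = min(audio_durations)
    available = shortest - fixed_offset
    if available <= 0:
        raise ValueError(f'Fixed offset {fixed_offset}s is larger than the shortest file ({shortest}s).')

    if duration + fixed_offset > shortest:
        # Requested segment does not fit: shorten it to what is available.
        offset, duration = fixed_offset, available
    elif random_offset:
        # Randomize the start point while keeping the segment inside the shortest file.
        offset = random.uniform(fixed_offset, shortest - duration)
    else:
        offset = fixed_offset

    return offset, math.floor(duration * sample_rate)

print(choose_segment([10.0, 7.5], duration=4.0, fixed_offset=1.0, random_offset=True))
```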
- - Returns: - An array with shape (samples,) or (channels, samples) - """ - if num_samples is None: - segment = AudioSegment.from_file( - audio_file=audio_file, target_sr=sample_rate, offset=offset, channel_selector=channel_selector, - ) - - else: - segment = AudioSegment.segment_from_file( - audio_file=audio_file, - target_sr=sample_rate, - n_segments=num_samples, - offset=offset, - channel_selector=channel_selector, - ) - - if segment.samples.ndim == 1: - # Single-channel signal - return segment.samples - elif segment.samples.ndim == 2: - # Use multi-channel format as (channels, samples) - return segment.samples.T - else: - raise RuntimeError(f'Unexpected samples shape: {segment.samples.shape}') - - @staticmethod - def list_to_multichannel(signal: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Convert a list of signals into a multi-channel signal by concatenating - the elements of the list along the channel dimension. - - If input is not a list, it is returned unmodified. - - Args: - signal: list of arrays - - Returns: - Numpy array obtained by concatenating the elements of the list - along the channel dimension (axis=0). - """ - if not isinstance(signal, list): - # Nothing to do there - return signal - elif len(signal) == 0: - # Nothing to do, return as is - return signal - elif len(signal) == 1: - # Nothing to concatenate, return the original format - return signal[0] - - # If multiple signals are provided in a list, we concatenate them along the channel dimension - if signal[0].ndim == 1: - # Single-channel individual files - mc_signal = np.stack(signal, axis=0) - elif signal[0].ndim == 2: - # Multi-channel individual files - mc_signal = np.concatenate(signal, axis=0) - else: - raise RuntimeError(f'Unexpected target with {signal[0].ndim} dimensions.') - - return mc_signal - - @staticmethod - def get_duration(audio_files: List[str]) -> List[float]: - """Get duration for each audio file in `audio_files`. - - Args: - audio_files: list of paths to audio files - - Returns: - List of durations in seconds. - """ - duration = [librosa.get_duration(path=f) for f in flatten(audio_files)] - return duration - - def load_embedding(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]: - """Given an example, load embedding from `example.audio_files[embedding]` - and return it in a dictionary. - - Args: - example: An example from audio collection - - Returns: - An dictionary of embedding keys and their tensors. - """ - output = OrderedDict() - for idx, signal in enumerate(self.embedding_setup.signals): - embedding_file = example.audio_files[signal] - embedding = self.load_embedding_vector(embedding_file) - output[signal] = torch.tensor(embedding) - return output - - @staticmethod - def load_embedding_vector(filepath: str) -> np.ndarray: - """Load an embedding vector from a file. - - Args: - filepath: path to a file storing a vector. - Currently, it is assumed the file is a npy file. - - Returns: - Array loaded from filepath. - """ - if filepath.endswith('.npy'): - with open(filepath, 'rb') as f: - embedding = np.load(f) - else: - raise RuntimeError(f'Unknown embedding file format in file: {filepath}') - - return embedding - - -class BaseAudioDataset(Dataset): - """Base class of audio datasets, providing common functionality - for other audio datasets. - - Args: - collection: Collection of audio examples prepared from manifest files. - audio_processor: Used to process every example from the collection. - A callable with `process` method. 
For reference, - please check ASRAudioProcessor. - """ - - @property - @abc.abstractmethod - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - - def __init__(self, collection: collections.Audio, audio_processor: Callable, output_type: Type[namedtuple]): - """Instantiates an audio dataset. - """ - super().__init__() - - self.collection = collection - self.audio_processor = audio_processor - self.output_type = output_type - - def num_channels(self, signal_key) -> int: - """Returns the number of channels for a particular signal in - items prepared by this dictionary. - - More specifically, this will get the tensor from the first - item in the dataset, check if it's a one- or two-dimensional - tensor, and return the number of channels based on the size - of the first axis (shape[0]). - - NOTE: - This assumes that all examples have the same number of channels. - - Args: - signal_key: string, used to select a signal from the dictionary - output by __getitem__ - - Returns: - Number of channels for the selected signal. - """ - # Assumption: whole dataset has the same number of channels - item = self.__getitem__(0) - - if item[signal_key].ndim == 1: - return 1 - elif item[signal_key].ndim == 2: - return item[signal_key].shape[0] - else: - raise RuntimeError( - f'Unexpected number of dimension for signal {signal_key} with shape {item[signal_key].shape}' - ) - - def __getitem__(self, index: int) -> Dict[str, torch.Tensor]: - """Return a single example from the dataset. - - Args: - index: integer index of an example in the collection - - Returns: - Dictionary providing mapping from signal to its tensor. - For example: - ``` - { - 'input_signal': input_signal_tensor, - 'target_signal': target_signal_tensor, - } - ``` - """ - example = self.collection[index] - output = self.audio_processor.process(example=example) - - return output - - def __len__(self) -> int: - """Return the number of examples in the dataset. - """ - return len(self.collection) - - def _collate_fn(self, batch) -> Tuple[torch.Tensor]: - """Collate items in a batch. - """ - return self.output_type(*_audio_collate_fn(batch)) - - -AudioToTargetExample = namedtuple( - typename='AudioToTargetExample', field_names='input_signal input_length target_signal target_length' -) - - -class AudioToTargetDataset(BaseAudioDataset): - """A dataset for audio-to-audio tasks where the goal is to use - an input signal to recover the corresponding target signal. - - Each line of the manifest file is expected to have the following format - ``` - { - 'input_key': 'path/to/input.wav', - 'target_key': 'path/to/path_to_target.wav', - 'duration': duration_of_input, - } - ``` - - Additionally, multiple audio files may be provided for each key in the manifest, for example, - ``` - { - 'input_key': 'path/to/input.wav', - 'target_key': ['path/to/path_to_target_ch0.wav', 'path/to/path_to_target_ch1.wav'], - 'duration': duration_of_input, - } - ``` - - Keys for input and target signals can be configured in the constructor (`input_key` and `target_key`). - - Args: - manifest_filepath: Path to manifest file in a format described above. - sample_rate: Sample rate for loaded audio signals. - input_key: Key pointing to input audio files in the manifest - target_key: Key pointing to target audio files in manifest - audio_duration: Optional duration of each item returned by __getitem__. - If `None`, complete audio will be loaded. 
- If set, a random subsegment will be loaded synchronously from - target and audio, i.e., with the same start and end point. - random_offset: If `True`, offset will be randomized when loading a subsegment - from a file. - max_duration: If audio exceeds this length, do not include in dataset. - min_duration: If audio is less than this length, do not include in dataset. - max_utts: Limit number of utterances. - input_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - target_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - """ - - def __init__( - self, - manifest_filepath: str, - sample_rate: int, - input_key: str, - target_key: str, - audio_duration: Optional[float] = None, - random_offset: bool = False, - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: Optional[int] = None, - input_channel_selector: Optional[int] = None, - target_channel_selector: Optional[int] = None, - ): - audio_to_manifest_key = { - 'input_signal': input_key, - 'target_signal': target_key, - } - - collection = collections.AudioCollection( - manifest_files=manifest_filepath, - audio_to_manifest_key=audio_to_manifest_key, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - ) - - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) - audio_processor.sync_setup = SignalSetup( - signals=['input_signal', 'target_signal'], - duration=audio_duration, - channel_selectors=[input_channel_selector, target_channel_selector], - ) - - super().__init__(collection=collection, audio_processor=audio_processor, output_type=AudioToTargetExample) - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - - Returns: - Ordered dictionary in the following form: - ``` - { - 'input_signal': batched single- or multi-channel format, - 'input_length': batched original length of each input signal - 'target_signal': batched single- or multi-channel format, - 'target_length': batched original length of each target signal - } - ``` - """ - sc_audio_type = NeuralType(('B', 'T'), AudioSignal()) - mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal()) - - return OrderedDict( - input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type, - input_length=NeuralType(('B',), LengthsType()), - target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type, - target_length=NeuralType(('B',), LengthsType()), - ) - - -AudioToTargetWithReferenceExample = namedtuple( - typename='AudioToTargetWithReferenceExample', - field_names='input_signal input_length target_signal target_length reference_signal reference_length', -) - - -class AudioToTargetWithReferenceDataset(BaseAudioDataset): - """A dataset for audio-to-audio tasks where the goal is to use - an input signal to recover the corresponding target signal and an - additional reference signal is available. 
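For reviewers, a minimal sketch of how `AudioToTargetDataset` above is driven: a JSON-lines manifest whose key names are passed to the constructor. The manifest keys and WAV paths are hypothetical, and the referenced audio files must exist for the dataset to actually load anything.

```
import json
from nemo.collections.asr.data.audio_to_audio import AudioToTargetDataset

# One JSON object per line; the key names are arbitrary and handed to the constructor.
with open('train_manifest.json', 'w') as f:
    for i in range(2):
        f.write(json.dumps({
            'input_filepath': f'noisy_{i}.wav',    # hypothetical paths
            'target_filepath': f'clean_{i}.wav',
            'duration': 4.0,
        }) + '\n')

dataset = AudioToTargetDataset(
    manifest_filepath='train_manifest.json',
    sample_rate=16000,
    input_key='input_filepath',
    target_key='target_filepath',
    audio_duration=2.0,   # random 2 s subsegment, cut synchronously from input and target
    random_offset=True,
)
item = dataset[0]         # {'input_signal': tensor, 'target_signal': tensor}
```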
- - This can be used, for example, when a reference signal is - available from - - enrollment utterance for the target signal - - echo reference from playback - - reference from another sensor that correlates with the target signal - - Each line of the manifest file is expected to have the following format - ``` - { - 'input_key': 'path/to/input.wav', - 'target_key': 'path/to/path_to_target.wav', - 'reference_key': 'path/to/path_to_reference.wav', - 'duration': duration_of_input, - } - ``` - - Keys for input, target and reference signals can be configured in the constructor. - - Args: - manifest_filepath: Path to manifest file in a format described above. - sample_rate: Sample rate for loaded audio signals. - input_key: Key pointing to input audio files in the manifest - target_key: Key pointing to target audio files in manifest - reference_key: Key pointing to reference audio files in manifest - audio_duration: Optional duration of each item returned by __getitem__. - If `None`, complete audio will be loaded. - If set, a random subsegment will be loaded synchronously from - target and audio, i.e., with the same start and end point. - random_offset: If `True`, offset will be randomized when loading a subsegment - from a file. - max_duration: If audio exceeds this length, do not include in dataset. - min_duration: If audio is less than this length, do not include in dataset. - max_utts: Limit number of utterances. - input_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - target_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - reference_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - reference_is_synchronized: If True, it is assumed that the reference signal is synchronized - with the input signal, so the same subsegment will be loaded as for - input and target. If False, reference signal will be loaded independently - from input and target. - reference_duration: Optional, can be used to set a fixed duration of the reference utterance. If `None`, - complete audio file will be loaded. 
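The `reference_is_synchronized` flag described above decides whether the reference is cut with the same start and end as input and target, or loaded on its own with `reference_duration`. A short sketch of both configurations, under the same assumptions (hypothetical manifest keys and paths) as the previous example:

```
from nemo.collections.asr.data.audio_to_audio import AudioToTargetWithReferenceDataset

common = dict(
    manifest_filepath='train_manifest.json',
    sample_rate=16000,
    input_key='input_filepath',
    target_key='target_filepath',
    reference_key='reference_filepath',
    audio_duration=2.0,
    random_offset=True,
)

# Reference cut with the same start/end as input and target (e.g. an echo reference).
ds_sync = AudioToTargetWithReferenceDataset(reference_is_synchronized=True, **common)

# Reference loaded independently with its own fixed duration (e.g. an enrollment utterance).
ds_async = AudioToTargetWithReferenceDataset(
    reference_is_synchronized=False, reference_duration=3.0, **common
)
```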
- """ - - def __init__( - self, - manifest_filepath: str, - sample_rate: int, - input_key: str, - target_key: str, - reference_key: str, - audio_duration: Optional[float] = None, - random_offset: bool = False, - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: Optional[int] = None, - input_channel_selector: Optional[int] = None, - target_channel_selector: Optional[int] = None, - reference_channel_selector: Optional[int] = None, - reference_is_synchronized: bool = True, - reference_duration: Optional[float] = None, - ): - audio_to_manifest_key = { - 'input_signal': input_key, - 'target_signal': target_key, - 'reference_signal': reference_key, - } - - collection = collections.AudioCollection( - manifest_files=manifest_filepath, - audio_to_manifest_key=audio_to_manifest_key, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - ) - - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) - - if reference_is_synchronized: - audio_processor.sync_setup = SignalSetup( - signals=['input_signal', 'target_signal', 'reference_signal'], - duration=audio_duration, - channel_selectors=[input_channel_selector, target_channel_selector, reference_channel_selector], - ) - else: - audio_processor.sync_setup = SignalSetup( - signals=['input_signal', 'target_signal'], - duration=audio_duration, - channel_selectors=[input_channel_selector, target_channel_selector], - ) - audio_processor.async_setup = SignalSetup( - signals=['reference_signal'], - duration=[reference_duration], - channel_selectors=[reference_channel_selector], - ) - - super().__init__( - collection=collection, audio_processor=audio_processor, output_type=AudioToTargetWithReferenceExample - ) - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - - Returns: - Ordered dictionary in the following form: - ``` - { - 'input_signal': batched single- or multi-channel format, - 'input_length': batched original length of each input signal - 'target_signal': batched single- or multi-channel format, - 'target_length': batched original length of each target signal - 'reference_signal': single- or multi-channel format, - 'reference_length': original length of each reference signal - } - ``` - """ - sc_audio_type = NeuralType(('B', 'T'), AudioSignal()) - mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal()) - - return OrderedDict( - input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type, - input_length=NeuralType(('B',), LengthsType()), - target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type, - target_length=NeuralType(('B',), LengthsType()), - reference_signal=sc_audio_type if self.num_channels('reference_signal') == 1 else mc_audio_type, - reference_length=NeuralType(('B',), LengthsType()), - ) - - -AudioToTargetWithEmbeddingExample = namedtuple( - typename='AudioToTargetWithEmbeddingExample', - field_names='input_signal input_length target_signal target_length embedding_vector embedding_length', -) - - -class AudioToTargetWithEmbeddingDataset(BaseAudioDataset): - """A dataset for audio-to-audio tasks where the goal is to use - an input signal to recover the corresponding target signal and an - additional embedding signal. It is assumed that the embedding - is in a form of a vector. 
- - Each line of the manifest file is expected to have the following format - ``` - { - input_key: 'path/to/input.wav', - target_key: 'path/to/path_to_target.wav', - embedding_key: 'path/to/path_to_reference.npy', - 'duration': duration_of_input, - } - ``` - - Keys for input, target and embedding signals can be configured in the constructor. - - Args: - manifest_filepath: Path to manifest file in a format described above. - sample_rate: Sample rate for loaded audio signals. - input_key: Key pointing to input audio files in the manifest - target_key: Key pointing to target audio files in manifest - embedding_key: Key pointing to embedding files in manifest - audio_duration: Optional duration of each item returned by __getitem__. - If `None`, complete audio will be loaded. - If set, a random subsegment will be loaded synchronously from - target and audio, i.e., with the same start and end point. - random_offset: If `True`, offset will be randomized when loading a subsegment - from a file. - max_duration: If audio exceeds this length, do not include in dataset. - min_duration: If audio is less than this length, do not include in dataset. - max_utts: Limit number of utterances. - input_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - target_channel_selector: Optional, select subset of channels from each input audio file. - If `None`, all channels will be loaded. - """ - - def __init__( - self, - manifest_filepath: str, - sample_rate: int, - input_key: str, - target_key: str, - embedding_key: str, - audio_duration: Optional[float] = None, - random_offset: bool = False, - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: Optional[int] = None, - input_channel_selector: Optional[int] = None, - target_channel_selector: Optional[int] = None, - ): - audio_to_manifest_key = { - 'input_signal': input_key, - 'target_signal': target_key, - 'embedding_vector': embedding_key, - } - - collection = collections.AudioCollection( - manifest_files=manifest_filepath, - audio_to_manifest_key=audio_to_manifest_key, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - ) - - audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,) - audio_processor.sync_setup = SignalSetup( - signals=['input_signal', 'target_signal'], - duration=audio_duration, - channel_selectors=[input_channel_selector, target_channel_selector], - ) - audio_processor.embedding_setup = SignalSetup(signals=['embedding_vector']) - - super().__init__( - collection=collection, audio_processor=audio_processor, output_type=AudioToTargetWithEmbeddingExample - ) - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- - Returns: - Ordered dictionary in the following form: - ``` - { - 'input_signal': batched single- or multi-channel format, - 'input_length': batched original length of each input signal - 'target_signal': batched single- or multi-channel format, - 'target_length': batched original length of each target signal - 'embedding_vector': batched embedded vector format, - 'embedding_length': batched original length of each embedding vector - } - ``` - """ - sc_audio_type = NeuralType(('B', 'T'), AudioSignal()) - mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal()) - - return OrderedDict( - input_signal=sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type, - input_length=NeuralType(('B',), LengthsType()), - target_signal=sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type, - target_length=NeuralType(('B',), LengthsType()), - embedding_vector=NeuralType(('B', 'D'), EncodedRepresentation()), - embedding_length=NeuralType(('B',), LengthsType()), - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio_dataset.py deleted file mode 100644 index b296d64b1f2a9ceb95fbcd615b2f0f9f44c16c9e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_audio_dataset.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.data import audio_to_audio - - -def get_audio_to_target_dataset(config: dict) -> audio_to_audio.AudioToTargetDataset: - """Instantiates an audio-to-audio dataset. - - Args: - config: Config of AudioToTargetDataset. - - Returns: - An instance of AudioToTargetDataset - """ - dataset = audio_to_audio.AudioToTargetDataset( - manifest_filepath=config['manifest_filepath'], - sample_rate=config['sample_rate'], - input_key=config['input_key'], - target_key=config['target_key'], - audio_duration=config.get('audio_duration', None), - random_offset=config.get('random_offset', False), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - input_channel_selector=config.get('input_channel_selector', None), - target_channel_selector=config.get('target_channel_selector', None), - ) - return dataset - - -def get_audio_to_target_with_reference_dataset(config: dict) -> audio_to_audio.AudioToTargetWithReferenceDataset: - """Instantiates an audio-to-audio dataset. - - Args: - config: Config of AudioToTargetWithReferenceDataset. 
- - Returns: - An instance of AudioToTargetWithReferenceDataset - """ - dataset = audio_to_audio.AudioToTargetWithReferenceDataset( - manifest_filepath=config['manifest_filepath'], - sample_rate=config['sample_rate'], - input_key=config['input_key'], - target_key=config['target_key'], - reference_key=config['reference_key'], - audio_duration=config.get('audio_duration', None), - random_offset=config.get('random_offset', False), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - input_channel_selector=config.get('input_channel_selector', None), - target_channel_selector=config.get('target_channel_selector', None), - reference_channel_selector=config.get('reference_channel_selector', None), - reference_is_synchronized=config.get('reference_is_synchronized', True), - reference_duration=config.get('reference_duration', None), - ) - return dataset - - -def get_audio_to_target_with_embedding_dataset(config: dict) -> audio_to_audio.AudioToTargetWithEmbeddingDataset: - """Instantiates an audio-to-audio dataset. - - Args: - config: Config of AudioToTargetWithEmbeddingDataset. - - Returns: - An instance of AudioToTargetWithEmbeddingDataset - """ - dataset = audio_to_audio.AudioToTargetWithEmbeddingDataset( - manifest_filepath=config['manifest_filepath'], - sample_rate=config['sample_rate'], - input_key=config['input_key'], - target_key=config['target_key'], - embedding_key=config['embedding_key'], - audio_duration=config.get('audio_duration', None), - random_offset=config.get('random_offset', False), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - input_channel_selector=config.get('input_channel_selector', None), - target_channel_selector=config.get('target_channel_selector', None), - ) - return dataset diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_ctm_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_ctm_dataset.py deleted file mode 100644 index 54503053ae367d2f84a9fdc055699371809dc88c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_ctm_dataset.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Any, List, Tuple - -from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter -from nemo.utils import logging - - -@dataclass -class FrameCtmUnit: - """A container class for one CTM unit with start and length countable in frames. 
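The factory helpers in `audio_to_audio_dataset.py` above accept a plain config mapping and forward it to the dataset constructors; optional entries fall back to defaults via `config.get`. A minimal sketch with a hypothetical manifest path and key names:

```
from nemo.collections.asr.data import audio_to_audio_dataset

config = {
    'manifest_filepath': 'train_manifest.json',  # hypothetical
    'sample_rate': 16000,
    'input_key': 'input_filepath',
    'target_key': 'target_filepath',
    'audio_duration': 2.0,     # optional entries are read with config.get(...)
    'random_offset': True,
}
dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config)
```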
- """ - - label: str - start_frame: int - length: int - probability: float - - def __repr__(self) -> str: - return f"{self.label}\t({self.probability:1.3f}): [{self.start_frame:6d}, {self.length:6d}]" - - @property - def end_frame(self): - return self.start_frame + self.length - - def to_ctm_str(self, time_per_frame: int) -> str: - """Represents the data as part of the CTM line. - - The CTM line format is - - This method prepares the last four entities.""" - return f"{self.start_frame * time_per_frame :.3f} {self.length * time_per_frame :.3f} {self.label} {self.probability :1.3f}" - - -class ASRCTMPredictionWriter(ASRPredictionWriter): - def __init__(self, dataset, output_file: str, output_ctm_dir: str, time_per_frame: float): - super().__init__(dataset, output_file) - self.output_ctm_dir = output_ctm_dir - self.time_per_frame = time_per_frame - os.makedirs(self.output_ctm_dir, exist_ok=True) - - def write_ctm(self, name, filepath, frameCtmUnits): - with open(filepath, "tw", encoding="utf-8") as f: - for unit in frameCtmUnits: - f.write(f"{name} 1 {unit.to_ctm_str(self.time_per_frame)}\n") - - def write_on_batch_end( - self, - trainer, - pl_module: 'LightningModule', - prediction: Tuple[int, List[FrameCtmUnit]], - batch_indices: List[int], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ): - for sample_id, units in prediction: - sample = self.dataset.get_manifest_sample(sample_id) - with_ctm = True - if len(units) == 0: - logging.warning( - f"""Do not producing CTM output for item `{sample.audio_file}`. - Check if text is empty or if duration is too short: `{sample.text_raw}`, {sample.duration}""" - ) - with_ctm = False - item = {} - item["audio_filepath"] = sample.audio_file - item["duration"] = sample.duration - item["text"] = sample.text_raw - if with_ctm: - utt_name = Path(sample.audio_file).stem - ctm_filepath = os.path.join(self.output_ctm_dir, utt_name) + ".ctm" - self.write_ctm(utt_name, ctm_filepath, units) - item["ctm_filepath"] = ctm_filepath - else: - item["ctm_filepath"] = "" - self.outf.write(json.dumps(item) + "\n") - self.samples_num += 1 - return diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_diar_label.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_diar_label.py deleted file mode 100644 index a1cb6d0f1bdcb0aba196f050b13dbcbb5a0a2e6e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_diar_label.py +++ /dev/null @@ -1,853 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
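To make the frame-to-seconds conversion in `FrameCtmUnit.to_ctm_str` above concrete, here is a small stand-alone sketch. It redefines a minimal stand-in rather than importing the removed module, and assumes a 10 ms frame shift (`time_per_frame=0.01`); the utterance name and values are made up.

```
from dataclasses import dataclass

@dataclass
class FrameCtmUnit:
    # Minimal stand-in mirroring the removed dataclass.
    label: str
    start_frame: int
    length: int
    probability: float

    def to_ctm_str(self, time_per_frame: float) -> str:
        # Last four fields of a CTM line: start time, duration, label, confidence.
        return (f"{self.start_frame * time_per_frame:.3f} "
                f"{self.length * time_per_frame:.3f} {self.label} {self.probability:1.3f}")

unit = FrameCtmUnit(label='hello', start_frame=123, length=45, probability=0.87)
print(f"utt0001 1 {unit.to_ctm_str(0.01)}")  # -> utt0001 1 1.230 0.450 hello 0.870
```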
- -import os -from collections import OrderedDict -from statistics import mode -from typing import Dict, Optional - -import torch - -from nemo.collections.asr.parts.utils.offline_clustering import get_argmin_mat -from nemo.collections.asr.parts.utils.speaker_utils import convert_rttm_line, prepare_split_data -from nemo.collections.common.parts.preprocessing.collections import DiarizationSpeechLabel -from nemo.core.classes import Dataset -from nemo.core.neural_types import AudioSignal, EncodedRepresentation, LengthsType, NeuralType, ProbsType - - -def get_scale_mapping_list(uniq_timestamps): - """ - Call get_argmin_mat function to find the index of the non-base-scale segment that is closest to the - given base-scale segment. For each scale and each segment, a base-scale segment is assigned. - - Args: - uniq_timestamps: (dict) - The dictionary containing embeddings, timestamps and multiscale weights. - If uniq_timestamps contains only one scale, single scale diarization is performed. - - Returns: - scale_mapping_argmat (torch.tensor): - - The element at the m-th row and the n-th column of the scale mapping matrix indicates the (m+1)-th scale - segment index which has the closest center distance with (n+1)-th segment in the base scale. - - - Example: - `scale_mapping_argmat[2][101] = 85` - - In the above example, the code snippet means that 86-th segment in the 3rd scale (python index is 2) is - mapped to the 102-th segment in the base scale. Thus, the longer segments bound to have more repeating - numbers since multiple base scale segments (since the base scale has the shortest length) fall into the - range of the longer segments. At the same time, each row contains N numbers of indices where N is number - of segments in the base-scale (i.e., the finest scale). - """ - timestamps_in_scales = [] - for key, val in uniq_timestamps['scale_dict'].items(): - timestamps_in_scales.append(torch.tensor(val['time_stamps'])) - session_scale_mapping_list = get_argmin_mat(timestamps_in_scales) - scale_mapping_argmat = [[] for _ in range(len(uniq_timestamps['scale_dict'].keys()))] - for scale_idx in range(len(session_scale_mapping_list)): - scale_mapping_argmat[scale_idx] = session_scale_mapping_list[scale_idx] - scale_mapping_argmat = torch.stack(scale_mapping_argmat) - return scale_mapping_argmat - - -def extract_seg_info_from_rttm(uniq_id, rttm_lines, mapping_dict=None, target_spks=None): - """ - Get RTTM lines containing speaker labels, start time and end time. target_spks contains two targeted - speaker indices for creating groundtruth label files. Only speakers in target_spks variable will be - included in the output lists. - - Args: - uniq_id (str): - Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file. - rttm_lines (list): - List containing RTTM lines in str format. - mapping_dict (dict): - Mapping between the estimated speakers and the speakers in the ground-truth annotation. - `mapping_dict` variable is only provided when the inference mode is running in sequence-eval mode. - Sequence eval mode uses the mapping between the estimated speakers and the speakers in ground-truth annotation. - Returns: - rttm_tup (tuple): - Tuple containing lists of start time, end time and speaker labels. 
- - """ - stt_list, end_list, speaker_list, pairwise_infer_spks = [], [], [], [] - if target_spks: - inv_map = {v: k for k, v in mapping_dict.items()} - for spk_idx in target_spks: - spk_str = f'speaker_{spk_idx}' - if spk_str in inv_map: - pairwise_infer_spks.append(inv_map[spk_str]) - for rttm_line in rttm_lines: - start, end, speaker = convert_rttm_line(rttm_line) - if target_spks is None or speaker in pairwise_infer_spks: - end_list.append(end) - stt_list.append(start) - speaker_list.append(speaker) - rttm_tup = (stt_list, end_list, speaker_list) - return rttm_tup - - -def assign_frame_level_spk_vector(rttm_timestamps, round_digits, frame_per_sec, target_spks, min_spks=2): - """ - Create a multi-dimensional vector sequence containing speaker timestamp information in RTTM. - The unit-length is the frame shift length of the acoustic feature. The feature-level annotations - `fr_level_target` will later be converted to base-segment level diarization label. - - Args: - rttm_timestamps (list): - List containing start and end time for each speaker segment label. - stt_list, end_list and speaker_list are contained. - frame_per_sec (int): - Number of feature frames per second. This quantity is determined by window_stride variable in preprocessing module. - target_spks (tuple): - Speaker indices that are generated from combinations. If there are only one or two speakers, - only a single target_spks variable is generated. - - Returns: - fr_level_target (torch.tensor): - Tensor containing label for each feature level frame. - """ - stt_list, end_list, speaker_list = rttm_timestamps - if len(speaker_list) == 0: - return None - else: - sorted_speakers = sorted(list(set(speaker_list))) - total_fr_len = int(max(end_list) * (10 ** round_digits)) - spk_num = max(len(sorted_speakers), min_spks) - speaker_mapping_dict = {rttm_key: x_int for x_int, rttm_key in enumerate(sorted_speakers)} - fr_level_target = torch.zeros(total_fr_len, spk_num) - - # If RTTM is not provided, then there is no speaker mapping dict in target_spks. - # Thus, return a zero-filled tensor as a placeholder. - for count, (stt, end, spk_rttm_key) in enumerate(zip(stt_list, end_list, speaker_list)): - stt, end = round(stt, round_digits), round(end, round_digits) - spk = speaker_mapping_dict[spk_rttm_key] - stt_fr, end_fr = int(round(stt, 2) * frame_per_sec), int(round(end, round_digits) * frame_per_sec) - fr_level_target[stt_fr:end_fr, spk] = 1 - return fr_level_target - - -class _AudioMSDDTrainDataset(Dataset): - """ - Dataset class that loads a json file containing paths to audio files, - RTTM files and number of speakers. This Dataset class is designed for - training or fine-tuning speaker embedding extractor and diarization decoder - at the same time. - - Example: - {"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_0.rttm} - ... - {"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_n.rttm} - - Args: - manifest_filepath (str): - Path to input manifest json files. - multiscale_args_dict (dict): - Dictionary containing the parameters for multiscale segmentation and clustering. - emb_dir (str): - Path to a temporary folder where segmentation information for embedding extraction is saved. - soft_label_thres (float): - Threshold that determines the label of each segment based on RTTM file information. - featurizer: - Featurizer instance for generating features from the raw waveform. 
- window_stride (float): - Window stride for acoustic feature. This value is used for calculating the numbers of feature-level frames. - emb_batch_size (int): - Number of embedding vectors that are trained with attached computational graphs. - pairwise_infer (bool): - This variable should be True if dataloader is created for an inference task. - random_flip (bool): - If True, the two labels and input signals are randomly flipped per every epoch while training. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - output_types = { - "features": NeuralType(('B', 'T'), AudioSignal()), - "feature_length": NeuralType(('B'), LengthsType()), - "ms_seg_timestamps": NeuralType(('B', 'C', 'T', 'D'), LengthsType()), - "ms_seg_counts": NeuralType(('B', 'C'), LengthsType()), - "clus_label_index": NeuralType(('B', 'T'), LengthsType()), - "scale_mapping": NeuralType(('B', 'C', 'T'), LengthsType()), - "targets": NeuralType(('B', 'T', 'C'), ProbsType()), - } - - return output_types - - def __init__( - self, - *, - manifest_filepath: str, - multiscale_args_dict: str, - emb_dir: str, - soft_label_thres: float, - featurizer, - window_stride, - emb_batch_size, - pairwise_infer: bool, - random_flip: bool = True, - global_rank: int = 0, - ): - super().__init__() - self.collection = DiarizationSpeechLabel( - manifests_files=manifest_filepath.split(','), - emb_dict=None, - clus_label_dict=None, - pairwise_infer=pairwise_infer, - ) - self.featurizer = featurizer - self.multiscale_args_dict = multiscale_args_dict - self.emb_dir = emb_dir - self.round_digits = 2 - self.decim = 10 ** self.round_digits - self.soft_label_thres = soft_label_thres - self.pairwise_infer = pairwise_infer - self.max_spks = 2 - self.frame_per_sec = int(1 / window_stride) - self.emb_batch_size = emb_batch_size - self.random_flip = random_flip - self.global_rank = global_rank - self.manifest_filepath = manifest_filepath - self.multiscale_timestamp_dict = prepare_split_data( - self.manifest_filepath, self.emb_dir, self.multiscale_args_dict, self.global_rank, - ) - - def __len__(self): - return len(self.collection) - - def assign_labels_to_longer_segs(self, uniq_id, base_scale_clus_label): - """ - Assign the generated speaker labels from the base scale (the finest scale) to the longer scales. - This process is needed to get the cluster labels for each scale. The cluster labels are needed to - calculate the cluster-average speaker embedding for each scale. - - Args: - uniq_id (str): - Unique sample ID for training. - base_scale_clus_label (torch.tensor): - Tensor variable containing the speaker labels for the base-scale segments. - - Returns: - per_scale_clus_label (torch.tensor): - Tensor variable containing the speaker labels for each segment in each scale. - Note that the total length of the speaker label sequence differs over scale since - each scale has a different number of segments for the same session. - - scale_mapping (torch.tensor): - Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the - multiscale embeddings to form an input matrix for the MSDD model. 
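The scale mapping used by `get_scale_mapping_list` and `assign_labels_to_longer_segs` above assigns every base-scale segment to the segment of each coarser scale whose center is closest. The sketch below reproduces that idea with plain PyTorch and made-up segment boundaries, rather than calling the removed `get_argmin_mat` helper:

```
import torch

def scale_mapping(scale_timestamps):
    # scale_timestamps: list of (num_segments, 2) start/end tensors, base (finest) scale last.
    # Returns a (num_scales, num_base_segments) matrix of closest-center segment indices.
    centers = [ts.float().mean(dim=1) for ts in scale_timestamps]
    base_centers = centers[-1]
    rows = []
    for c in centers:
        dist = (c.unsqueeze(1) - base_centers.unsqueeze(0)).abs()
        rows.append(dist.argmin(dim=0))
    return torch.stack(rows)

# Two scales over 4 s of audio: 1.0 s segments (coarse) and 0.5 s segments (base).
coarse = torch.tensor([[0.0, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0]])
base = torch.tensor([[0.5 * i, 0.5 * (i + 1)] for i in range(8)])
print(scale_mapping([coarse, base]))
# tensor([[0, 0, 1, 1, 2, 2, 3, 3],
#         [0, 1, 2, 3, 4, 5, 6, 7]])
```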
- """ - per_scale_clus_label = [] - self.scale_n = len(self.multiscale_timestamp_dict[uniq_id]['scale_dict']) - uniq_scale_mapping = get_scale_mapping_list(self.multiscale_timestamp_dict[uniq_id]) - for scale_index in range(self.scale_n): - new_clus_label = [] - scale_seq_len = len(self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_index]["time_stamps"]) - for seg_idx in range(scale_seq_len): - if seg_idx in uniq_scale_mapping[scale_index]: - seg_clus_label = mode(base_scale_clus_label[uniq_scale_mapping[scale_index] == seg_idx]) - else: - seg_clus_label = 0 if len(new_clus_label) == 0 else new_clus_label[-1] - new_clus_label.append(seg_clus_label) - per_scale_clus_label.extend(new_clus_label) - per_scale_clus_label = torch.tensor(per_scale_clus_label) - return per_scale_clus_label, uniq_scale_mapping - - def get_diar_target_labels(self, uniq_id, sample, fr_level_target): - """ - Convert frame-level diarization target variable into segment-level target variable. Since the granularity is reduced - from frame level (10ms) to segment level (100ms~500ms), we need a threshold value, `soft_label_thres`, which determines - the label of each segment based on the overlap between a segment range (start and end time) and the frame-level target variable. - - Args: - uniq_id (str): - Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file. - sample: - `DiarizationSpeechLabel` instance containing sample information such as audio filepath and RTTM filepath. - fr_level_target (torch.tensor): - Tensor containing label for each feature-level frame. - - Returns: - seg_target (torch.tensor): - Tensor containing binary speaker labels for base-scale segments. - base_clus_label (torch.tensor): - Representative speaker label for each segment. This variable only has one speaker label for each base-scale segment. - -1 means that there is no corresponding speaker in the target_spks tuple. - """ - seg_target_list, base_clus_label = [], [] - self.scale_n = len(self.multiscale_timestamp_dict[uniq_id]['scale_dict']) - subseg_time_stamp_list = self.multiscale_timestamp_dict[uniq_id]["scale_dict"][self.scale_n - 1]["time_stamps"] - for (seg_stt, seg_end) in subseg_time_stamp_list: - seg_stt_fr, seg_end_fr = int(seg_stt * self.frame_per_sec), int(seg_end * self.frame_per_sec) - soft_label_vec_sess = torch.sum(fr_level_target[seg_stt_fr:seg_end_fr, :], axis=0) / ( - seg_end_fr - seg_stt_fr - ) - label_int_sess = torch.argmax(soft_label_vec_sess) - soft_label_vec = soft_label_vec_sess.unsqueeze(0)[:, sample.target_spks].squeeze() - if label_int_sess in sample.target_spks and torch.sum(soft_label_vec_sess) > 0: - label_int = sample.target_spks.index(label_int_sess) - else: - label_int = -1 - label_vec = (soft_label_vec > self.soft_label_thres).float() - seg_target_list.append(label_vec.detach()) - base_clus_label.append(label_int) - seg_target = torch.stack(seg_target_list) - base_clus_label = torch.tensor(base_clus_label) - return seg_target, base_clus_label - - def parse_rttm_for_ms_targets(self, sample): - """ - Generate target tensor variable by extracting groundtruth diarization labels from an RTTM file. - This function converts (start, end, speaker_id) format into base-scale (the finest scale) segment level - diarization label in a matrix form. - - Example of seg_target: - [[0., 1.], [0., 1.], [1., 1.], [1., 0.], [1., 0.], ..., [0., 1.]] - - Args: - sample: - `DiarizationSpeechLabel` instance containing sample information such as audio filepath and RTTM filepath. 
- target_spks (tuple): - Speaker indices that are generated from combinations. If there are only one or two speakers, - only a single target_spks tuple is generated. - - Returns: - clus_label_index (torch.tensor): - Groundtruth clustering label (cluster index for each segment) from RTTM files for training purpose. - seg_target (torch.tensor): - Tensor variable containing hard-labels of speaker activity in each base-scale segment. - scale_mapping (torch.tensor): - Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the - multiscale embeddings to form an input matrix for the MSDD model. - - """ - rttm_lines = open(sample.rttm_file).readlines() - uniq_id = self.get_uniq_id_with_range(sample) - rttm_timestamps = extract_seg_info_from_rttm(uniq_id, rttm_lines) - fr_level_target = assign_frame_level_spk_vector( - rttm_timestamps, self.round_digits, self.frame_per_sec, target_spks=sample.target_spks - ) - seg_target, base_clus_label = self.get_diar_target_labels(uniq_id, sample, fr_level_target) - clus_label_index, scale_mapping = self.assign_labels_to_longer_segs(uniq_id, base_clus_label) - return clus_label_index, seg_target, scale_mapping - - def get_uniq_id_with_range(self, sample, deci=3): - """ - Generate unique training sample ID from unique file ID, offset and duration. The start-end time added - unique ID is required for identifying the sample since multiple short audio samples are generated from a single - audio file. The start time and end time of the audio stream uses millisecond units if `deci=3`. - - Args: - sample: - `DiarizationSpeechLabel` instance from collections. - - Returns: - uniq_id (str): - Unique sample ID which includes start and end time of the audio stream. - Example: abc1001_3122_6458 - - """ - bare_uniq_id = os.path.splitext(os.path.basename(sample.rttm_file))[0] - offset = str(int(round(sample.offset, deci) * pow(10, deci))) - endtime = str(int(round(sample.offset + sample.duration, deci) * pow(10, deci))) - uniq_id = f"{bare_uniq_id}_{offset}_{endtime}" - return uniq_id - - def get_ms_seg_timestamps(self, sample): - """ - Get start and end time of segments in each scale. - - Args: - sample: - `DiarizationSpeechLabel` instance from preprocessing.collections - Returns: - ms_seg_timestamps (torch.tensor): - Tensor containing Multiscale segment timestamps. - ms_seg_counts (torch.tensor): - Number of segments for each scale. This information is used for reshaping embedding batch - during forward propagation. 
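The ID construction in `get_uniq_id_with_range` above can be restated as a small standalone helper; the RTTM path and times below are made up, and with `deci=3` the start and end offsets are encoded in milliseconds.

```python
import os

def make_uniq_id(rttm_file: str, offset: float, duration: float, deci: int = 3) -> str:
    # Same recipe as get_uniq_id_with_range: "<file_id>_<start_ms>_<end_ms>".
    bare_uniq_id = os.path.splitext(os.path.basename(rttm_file))[0]
    start = int(round(offset, deci) * pow(10, deci))
    end = int(round(offset + duration, deci) * pow(10, deci))
    return f"{bare_uniq_id}_{start}_{end}"

print(make_uniq_id("/data/rttms/abc1001.rttm", offset=3.0, duration=3.5))
# -> abc1001_3000_6500
```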
- """ - uniq_id = self.get_uniq_id_with_range(sample) - ms_seg_timestamps_list = [] - max_seq_len = len(self.multiscale_timestamp_dict[uniq_id]["scale_dict"][self.scale_n - 1]["time_stamps"]) - ms_seg_counts = [0 for _ in range(self.scale_n)] - for scale_idx in range(self.scale_n): - scale_ts_list = [] - for k, (seg_stt, seg_end) in enumerate( - self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_idx]["time_stamps"] - ): - stt, end = ( - int((seg_stt - sample.offset) * self.frame_per_sec), - int((seg_end - sample.offset) * self.frame_per_sec), - ) - scale_ts_list.append(torch.tensor([stt, end]).detach()) - ms_seg_counts[scale_idx] = len( - self.multiscale_timestamp_dict[uniq_id]["scale_dict"][scale_idx]["time_stamps"] - ) - scale_ts = torch.stack(scale_ts_list) - scale_ts_padded = torch.cat([scale_ts, torch.zeros(max_seq_len - len(scale_ts_list), 2)], dim=0) - ms_seg_timestamps_list.append(scale_ts_padded.detach()) - ms_seg_timestamps = torch.stack(ms_seg_timestamps_list) - ms_seg_counts = torch.tensor(ms_seg_counts) - return ms_seg_timestamps, ms_seg_counts - - def __getitem__(self, index): - sample = self.collection[index] - if sample.offset is None: - sample.offset = 0 - clus_label_index, targets, scale_mapping = self.parse_rttm_for_ms_targets(sample) - features = self.featurizer.process(sample.audio_file, offset=sample.offset, duration=sample.duration) - feature_length = torch.tensor(features.shape[0]).long() - ms_seg_timestamps, ms_seg_counts = self.get_ms_seg_timestamps(sample) - if self.random_flip: - torch.manual_seed(index) - flip = torch.cat([torch.randperm(self.max_spks), torch.tensor(-1).unsqueeze(0)]) - clus_label_index, targets = flip[clus_label_index], targets[:, flip[: self.max_spks]] - return features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets - - -class _AudioMSDDInferDataset(Dataset): - """ - Dataset class that loads a json file containing paths to audio files, - RTTM files and number of speakers. This Dataset class is built for diarization inference and - evaluation. Speaker embedding sequences, segment timestamps, cluster-average speaker embeddings - are loaded from memory and fed into the dataloader. - - Example: - {"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_0.rttm} - ... - {"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_n.rttm} - - Args: - manifest_filepath (str): - Path to input manifest json files. - emb_dict (dict): - Dictionary containing cluster-average embeddings and speaker mapping information. - emb_seq (dict): - Dictionary containing multiscale speaker embedding sequence, scale mapping and corresponding segment timestamps. - clus_label_dict (dict): - Subsegment-level (from base-scale) speaker labels from clustering results. - soft_label_thres (float): - A threshold that determines the label of each segment based on RTTM file information. - featurizer: - Featurizer instance for generating features from raw waveform. - seq_eval_mode (bool): - If True, F1 score will be calculated for each speaker pair during inference mode. - window_stride (float): - Window stride for acoustic feature. This value is used for calculating the numbers of feature-level frames. - use_single_scale_clus (bool): - Use only one scale for clustering instead of using multiple scales of embeddings for clustering. - pairwise_infer (bool): - This variable should be True if dataloader is created for an inference task. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports.""" - output_types = OrderedDict( - { - "ms_emb_seq": NeuralType(('B', 'T', 'C', 'D'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - "ms_avg_embs": NeuralType(('B', 'C', 'D', 'C'), EncodedRepresentation()), - "targets": NeuralType(('B', 'T', 'C'), ProbsType()), - } - ) - return output_types - - def __init__( - self, - *, - manifest_filepath: str, - emb_dict: Dict, - emb_seq: Dict, - clus_label_dict: Dict, - soft_label_thres: float, - seq_eval_mode: bool, - window_stride: float, - use_single_scale_clus: bool, - pairwise_infer: bool, - ): - super().__init__() - self.collection = DiarizationSpeechLabel( - manifests_files=manifest_filepath.split(','), - emb_dict=emb_dict, - clus_label_dict=clus_label_dict, - seq_eval_mode=seq_eval_mode, - pairwise_infer=pairwise_infer, - ) - self.emb_dict = emb_dict - self.emb_seq = emb_seq - self.clus_label_dict = clus_label_dict - self.round_digits = 2 - self.decim = 10 ** self.round_digits - self.frame_per_sec = int(1 / window_stride) - self.soft_label_thres = soft_label_thres - self.pairwise_infer = pairwise_infer - self.max_spks = 2 - self.use_single_scale_clus = use_single_scale_clus - self.seq_eval_mode = seq_eval_mode - - def __len__(self): - return len(self.collection) - - def parse_rttm_multiscale(self, sample): - """ - Generate target tensor variable by extracting groundtruth diarization labels from an RTTM file. - This function is only used when ``self.seq_eval_mode=True`` and RTTM files are provided. This function converts - (start, end, speaker_id) format into base-scale (the finest scale) segment level diarization label in a matrix - form to create target matrix. - - Args: - sample: - DiarizationSpeechLabel instance containing sample information such as audio filepath and RTTM filepath. - target_spks (tuple): - Two Indices of targeted speakers for evaluation. - Example of target_spks: (2, 3) - Returns: - seg_target (torch.tensor): - Tensor variable containing hard-labels of speaker activity in each base-scale segment. - """ - if sample.rttm_file is None: - raise ValueError(f"RTTM file is not provided for this sample {sample}") - rttm_lines = open(sample.rttm_file).readlines() - uniq_id = os.path.splitext(os.path.basename(sample.rttm_file))[0] - mapping_dict = self.emb_dict[max(self.emb_dict.keys())][uniq_id]['mapping'] - rttm_timestamps = extract_seg_info_from_rttm(uniq_id, rttm_lines, mapping_dict, sample.target_spks) - fr_level_target = assign_frame_level_spk_vector( - rttm_timestamps, self.round_digits, self.frame_per_sec, sample.target_spks - ) - seg_target = self.get_diar_target_labels_from_fr_target(uniq_id, fr_level_target) - return seg_target - - def get_diar_target_labels_from_fr_target(self, uniq_id, fr_level_target): - """ - Generate base-scale level binary diarization label from frame-level target matrix. For the given frame-level - speaker target matrix fr_level_target, we count the number of frames that belong to each speaker and calculate - ratios for each speaker into the `soft_label_vec` variable. Finally, `soft_label_vec` variable is compared with `soft_label_thres` - to determine whether a label vector should contain 0 or 1 for each speaker bin. Note that seg_target variable has - dimension of (number of base-scale segments x 2) dimension. 
- - Example of seg_target: - [[0., 1.], [0., 1.], [1., 1.], [1., 0.], [1., 0.], ..., [0., 1.]] - - Args: - uniq_id (str): - Unique file ID that refers to an input audio file and corresponding RTTM (Annotation) file. - fr_level_target (torch.tensor): - frame-level binary speaker annotation (1: exist 0: non-exist) generated from RTTM file. - - Returns: - seg_target (torch.tensor): - Tensor variable containing binary hard-labels of speaker activity in each base-scale segment. - - """ - if fr_level_target is None: - return None - else: - seg_target_list = [] - for (seg_stt, seg_end, label_int) in self.clus_label_dict[uniq_id]: - seg_stt_fr, seg_end_fr = int(seg_stt * self.frame_per_sec), int(seg_end * self.frame_per_sec) - soft_label_vec = torch.sum(fr_level_target[seg_stt_fr:seg_end_fr, :], axis=0) / ( - seg_end_fr - seg_stt_fr - ) - label_vec = (soft_label_vec > self.soft_label_thres).int() - seg_target_list.append(label_vec) - seg_target = torch.stack(seg_target_list) - return seg_target - - def __getitem__(self, index): - sample = self.collection[index] - if sample.offset is None: - sample.offset = 0 - - uniq_id = os.path.splitext(os.path.basename(sample.audio_file))[0] - scale_n = len(self.emb_dict.keys()) - _avg_embs = torch.stack([self.emb_dict[scale_index][uniq_id]['avg_embs'] for scale_index in range(scale_n)]) - - if self.pairwise_infer: - avg_embs = _avg_embs[:, :, self.collection[index].target_spks] - else: - avg_embs = _avg_embs - - if avg_embs.shape[2] > self.max_spks: - raise ValueError( - f" avg_embs.shape[2] {avg_embs.shape[2]} should be less than or equal to self.max_num_speakers {self.max_spks}" - ) - - feats = [] - for scale_index in range(scale_n): - repeat_mat = self.emb_seq["session_scale_mapping"][uniq_id][scale_index] - feats.append(self.emb_seq[scale_index][uniq_id][repeat_mat, :]) - feats_out = torch.stack(feats).permute(1, 0, 2) - feats_len = feats_out.shape[0] - - if self.seq_eval_mode: - targets = self.parse_rttm_multiscale(sample) - else: - targets = torch.zeros(feats_len, 2).float() - - return feats_out, feats_len, targets, avg_embs - - -def _msdd_train_collate_fn(self, batch): - """ - Collate batch of variables that are needed for raw waveform to diarization label training. - The following variables are included in training/validation batch: - - Args: - batch (tuple): - Batch tuple containing the variables for the diarization training. - Returns: - features (torch.tensor): - Raw waveform samples (time series) loaded from the audio_filepath in the input manifest file. - feature lengths (time series sample length): - A list of lengths of the raw waveform samples. - ms_seg_timestamps (torch.tensor): - Matrix containing the start time and end time (timestamps) for each segment and each scale. - ms_seg_timestamps is needed for extracting acoustic features from raw waveforms. - ms_seg_counts (torch.tensor): - Matrix containing The number of segments for each scale. ms_seg_counts is necessary for reshaping - the input matrix for the MSDD model. - clus_label_index (torch.tensor): - Groundtruth Clustering label (cluster index for each segment) from RTTM files for training purpose. - clus_label_index is necessary for calculating cluster-average embedding. - scale_mapping (torch.tensor): - Matrix containing the segment indices of each scale. scale_mapping is necessary for reshaping the - multiscale embeddings to form an input matrix for the MSDD model. - targets (torch.tensor): - Groundtruth Speaker label for the given input embedding sequence. 
- """ - packed_batch = list(zip(*batch)) - features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets = packed_batch - features_list, feature_length_list = [], [] - ms_seg_timestamps_list, ms_seg_counts_list, scale_clus_label_list, scale_mapping_list, targets_list = ( - [], - [], - [], - [], - [], - ) - - max_raw_feat_len = max([x.shape[0] for x in features]) - max_target_len = max([x.shape[0] for x in targets]) - max_total_seg_len = max([x.shape[0] for x in clus_label_index]) - - for feat, feat_len, ms_seg_ts, ms_seg_ct, scale_clus, scl_map, tgt in batch: - seq_len = tgt.shape[0] - pad_feat = (0, max_raw_feat_len - feat_len) - pad_tgt = (0, 0, 0, max_target_len - seq_len) - pad_sm = (0, max_target_len - seq_len) - pad_ts = (0, 0, 0, max_target_len - seq_len) - pad_sc = (0, max_total_seg_len - scale_clus.shape[0]) - padded_feat = torch.nn.functional.pad(feat, pad_feat) - padded_tgt = torch.nn.functional.pad(tgt, pad_tgt) - padded_sm = torch.nn.functional.pad(scl_map, pad_sm) - padded_ms_seg_ts = torch.nn.functional.pad(ms_seg_ts, pad_ts) - padded_scale_clus = torch.nn.functional.pad(scale_clus, pad_sc) - - features_list.append(padded_feat) - feature_length_list.append(feat_len.clone().detach()) - ms_seg_timestamps_list.append(padded_ms_seg_ts) - ms_seg_counts_list.append(ms_seg_ct.clone().detach()) - scale_clus_label_list.append(padded_scale_clus) - scale_mapping_list.append(padded_sm) - targets_list.append(padded_tgt) - - features = torch.stack(features_list) - feature_length = torch.stack(feature_length_list) - ms_seg_timestamps = torch.stack(ms_seg_timestamps_list) - clus_label_index = torch.stack(scale_clus_label_list) - ms_seg_counts = torch.stack(ms_seg_counts_list) - scale_mapping = torch.stack(scale_mapping_list) - targets = torch.stack(targets_list) - return features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets - - -def _msdd_infer_collate_fn(self, batch): - """ - Collate batch of feats (speaker embeddings), feature lengths, target label sequences and cluster-average embeddings. - - Args: - batch (tuple): - Batch tuple containing feats, feats_len, targets and ms_avg_embs. - Returns: - feats (torch.tensor): - Collated speaker embedding with unified length. - feats_len (torch.tensor): - The actual length of each embedding sequence without zero padding. - targets (torch.tensor): - Groundtruth Speaker label for the given input embedding sequence. - ms_avg_embs (torch.tensor): - Cluster-average speaker embedding vectors. 
- """ - - packed_batch = list(zip(*batch)) - feats, feats_len, targets, ms_avg_embs = packed_batch - feats_list, flen_list, targets_list, ms_avg_embs_list = [], [], [], [] - max_audio_len = max(feats_len) - max_target_len = max([x.shape[0] for x in targets]) - - for feature, feat_len, target, ivector in batch: - flen_list.append(feat_len) - ms_avg_embs_list.append(ivector) - if feat_len < max_audio_len: - pad_a = (0, 0, 0, 0, 0, max_audio_len - feat_len) - pad_t = (0, 0, 0, max_target_len - target.shape[0]) - padded_feature = torch.nn.functional.pad(feature, pad_a) - padded_target = torch.nn.functional.pad(target, pad_t) - feats_list.append(padded_feature) - targets_list.append(padded_target) - else: - targets_list.append(target.clone().detach()) - feats_list.append(feature.clone().detach()) - - feats = torch.stack(feats_list) - feats_len = torch.tensor(flen_list) - targets = torch.stack(targets_list) - ms_avg_embs = torch.stack(ms_avg_embs_list) - return feats, feats_len, targets, ms_avg_embs - - -class AudioToSpeechMSDDTrainDataset(_AudioMSDDTrainDataset): - """ - Dataset class that loads a json file containing paths to audio files, - rttm files and number of speakers. This Dataset class is designed for - training or fine-tuning speaker embedding extractor and diarization decoder - at the same time. - - Example: - {"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_0.rttm} - ... - {"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_n.rttm} - - Args: - manifest_filepath (str): - Path to input manifest json files. - multiscale_args_dict (dict): - Dictionary containing the parameters for multiscale segmentation and clustering. - emb_dir (str): - Path to a temporary folder where segmentation information for embedding extraction is saved. - soft_label_thres (float): - A threshold that determines the label of each segment based on RTTM file information. - featurizer: - Featurizer instance for generating features from the raw waveform. - window_stride (float): - Window stride for acoustic feature. This value is used for calculating the numbers of feature-level frames. - emb_batch_size (int): - Number of embedding vectors that are trained with attached computational graphs. - pairwise_infer (bool): - This variable should be True if dataloader is created for an inference task. - """ - - def __init__( - self, - *, - manifest_filepath: str, - multiscale_args_dict: Dict, - emb_dir: str, - soft_label_thres: float, - featurizer, - window_stride, - emb_batch_size, - pairwise_infer: bool, - global_rank: int, - ): - super().__init__( - manifest_filepath=manifest_filepath, - multiscale_args_dict=multiscale_args_dict, - emb_dir=emb_dir, - soft_label_thres=soft_label_thres, - featurizer=featurizer, - window_stride=window_stride, - emb_batch_size=emb_batch_size, - pairwise_infer=pairwise_infer, - global_rank=global_rank, - ) - - def msdd_train_collate_fn(self, batch): - return _msdd_train_collate_fn(self, batch) - - -class AudioToSpeechMSDDInferDataset(_AudioMSDDInferDataset): - """ - Dataset class that loads a json file containing paths to audio files, - rttm files and number of speakers. The created labels are used for diarization inference. - - Example: - {"audio_filepath": "/path/to/audio_0.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_0.rttm} - ... 
- {"audio_filepath": "/path/to/audio_n.wav", "num_speakers": 2, - "rttm_filepath": "/path/to/diar_label_n.rttm} - - Args: - manifest_filepath (str): - Path to input manifest json files. - emb_dict (dict): - Dictionary containing cluster-average embeddings and speaker mapping information. - emb_seq (dict): - Dictionary containing multiscale speaker embedding sequence, scale mapping and corresponding segment timestamps. - clus_label_dict (dict): - Subsegment-level (from base-scale) speaker labels from clustering results. - soft_label_thres (float): - Threshold that determines speaker labels of segments depending on the overlap with groundtruth speaker timestamps. - featurizer: - Featurizer instance for generating features from raw waveform. - use_single_scale_clus (bool): - Use only one scale for clustering instead of using multiple scales of embeddings for clustering. - seq_eval_mode (bool): - If True, F1 score will be calculated for each speaker pair during inference mode. - window_stride (float): - Window stride for acoustic feature. This value is used for calculating the numbers of feature-level frames. - pairwise_infer (bool): - If True, this Dataset class operates in inference mode. In inference mode, a set of speakers in the input audio - is split into multiple pairs of speakers and speaker tuples (e.g. 3 speakers: [(0,1), (1,2), (0,2)]) and then - fed into the MSDD to merge the individual results. - """ - - def __init__( - self, - *, - manifest_filepath: str, - emb_dict: Dict, - emb_seq: Dict, - clus_label_dict: Dict, - soft_label_thres: float, - use_single_scale_clus: bool, - seq_eval_mode: bool, - window_stride: float, - pairwise_infer: bool, - ): - super().__init__( - manifest_filepath=manifest_filepath, - emb_dict=emb_dict, - emb_seq=emb_seq, - clus_label_dict=clus_label_dict, - soft_label_thres=soft_label_thres, - use_single_scale_clus=use_single_scale_clus, - window_stride=window_stride, - seq_eval_mode=seq_eval_mode, - pairwise_infer=pairwise_infer, - ) - - def msdd_infer_collate_fn(self, batch): - return _msdd_infer_collate_fn(self, batch) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label.py deleted file mode 100644 index f00f961b4c81f0639e09dd39734b8205b92fa385..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label.py +++ /dev/null @@ -1,1294 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import io -import os -from typing import Dict, List, Optional, Union - -import torch -import webdataset as wd - -from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths -from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer -from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats -from nemo.collections.common.parts.preprocessing import collections -from nemo.core.classes import Dataset, IterableDataset -from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType, RegressionValuesType -from nemo.utils import logging - -# List of valid file formats (prioritized by order of importance) -VALID_FILE_FORMATS = ';'.join(['wav', 'mp3', 'flac'] + [fmt.lower() for fmt in valid_sf_formats.keys()]) - - -def repeat_signal(signal: torch.Tensor, sig_len: int, required_length: int) -> torch.Tensor: - """repeat signal to make short signal to have required_length - Args: - signal (Tensor): input signal - sig_len (int): length of input signal - required_length (int): length of generated signal - Returns: - signal (Tensor): generated signal of required_length by repeating itself. - """ - sub: torch.Tensor = torch.tensor([]) - repeat = int(required_length // sig_len) - rem = int(required_length % sig_len) - sub: torch.Tensor = torch.tensor([]) - rep_sig: torch.Tensor = torch.cat(repeat * [signal]) - if rem > 0: - sub = signal[-rem:] - signal = torch.cat((rep_sig, sub)) - else: - signal = rep_sig - return signal - - -def normalize(signal): - """normalize signal - Args: - signal(FloatTensor): signal to be normalized. - """ - signal_minusmean = signal - signal.mean() - return signal_minusmean / signal_minusmean.abs().max() - - -def count_occurence(manifest_file_id): - """Count number of wav files in Dict manifest_file_id. Use for _TarredAudioToLabelDataset. - Args: - manifest_file_id (Dict): Dict of files and their corresponding id. {'A-sub0' : 1, ..., 'S-sub10':100} - Returns: - count (Dict): Dict of wav files {'A' : 2, ..., 'S':10} - """ - count = dict() - for i in manifest_file_id: - audio_filename = i.split("-sub")[0] - count[audio_filename] = count.get(audio_filename, 0) + 1 - return count - - -def _speech_collate_fn(batch, pad_id): - """collate batch of audio sig, audio len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). 
- """ - _, audio_lengths, _, tokens_lengths = zip(*batch) - max_audio_len = 0 - has_audio = audio_lengths[0] is not None - if has_audio: - max_audio_len = max(audio_lengths).item() - max_tokens_len = max(tokens_lengths).item() - - audio_signal, tokens = [], [] - for sig, sig_len, tokens_i, tokens_i_len in batch: - if has_audio: - sig_len = sig_len.item() - if sig_len < max_audio_len: - pad = (0, max_audio_len - sig_len) - sig = torch.nn.functional.pad(sig, pad) - audio_signal.append(sig) - tokens_i_len = tokens_i_len.item() - if tokens_i_len < max_tokens_len: - pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) - tokens.append(tokens_i) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.stack(audio_lengths) - else: - audio_signal, audio_lengths = None, None - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) - - return audio_signal, audio_lengths, tokens, tokens_lengths - - -def _fixed_seq_collate_fn(self, batch): - """collate batch of audio sig, audio len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). - """ - _, audio_lengths, _, tokens_lengths = zip(*batch) - - has_audio = audio_lengths[0] is not None - fixed_length = int(max(audio_lengths)) - - audio_signal, tokens, new_audio_lengths = [], [], [] - for sig, sig_len, tokens_i, _ in batch: - if has_audio: - sig_len = sig_len.item() - chunck_len = sig_len - fixed_length - - if chunck_len < 0: - repeat = fixed_length // sig_len - rem = fixed_length % sig_len - sub = sig[-rem:] if rem > 0 else torch.tensor([]) - rep_sig = torch.cat(repeat * [sig]) - sig = torch.cat((rep_sig, sub)) - new_audio_lengths.append(torch.tensor(fixed_length)) - - audio_signal.append(sig) - - tokens.append(tokens_i) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.stack(new_audio_lengths) - else: - audio_signal, audio_lengths = None, None - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) - - return audio_signal, audio_lengths, tokens, tokens_lengths - - -def _vad_frame_seq_collate_fn(self, batch): - """collate batch of audio sig, audio len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). - batch size equals to 1. 
- """ - slice_length = int(self.featurizer.sample_rate * self.window_length_in_sec) - _, audio_lengths, _, tokens_lengths = zip(*batch) - slice_length = int(min(slice_length, max(audio_lengths))) - shift = int(self.featurizer.sample_rate * self.shift_length_in_sec) - has_audio = audio_lengths[0] is not None - - audio_signal, num_slices, tokens, audio_lengths = [], [], [], [] - - append_len_start = slice_length // 2 - append_len_end = slice_length - slice_length // 2 - for sig, sig_len, tokens_i, _ in batch: - if self.normalize_audio: - sig = normalize(sig) - start = torch.zeros(append_len_start) - end = torch.zeros(append_len_end) - sig = torch.cat((start, sig, end)) - sig_len += slice_length - - if has_audio: - slices = torch.div(sig_len - slice_length, shift, rounding_mode='trunc') - for slice_id in range(slices): - start_idx = slice_id * shift - end_idx = start_idx + slice_length - signal = sig[start_idx:end_idx] - audio_signal.append(signal) - - num_slices.append(slices) - tokens.extend([tokens_i] * slices) - audio_lengths.extend([slice_length] * slices) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.tensor(audio_lengths) - else: - audio_signal, audio_lengths = None, None - - tokens = torch.stack(tokens) - tokens_lengths = torch.tensor(num_slices) - return audio_signal, audio_lengths, tokens, tokens_lengths - - -class _AudioLabelDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to audio files, - labels, and durations and offsets(in seconds). Each new line is a - different sample. Example below: - and their target labels. JSON files should be of the following format:: - {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \ -target_label_0, "offset": offset_in_sec_0} - ... - {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \ -target_label_n, "offset": offset_in_sec_n} - Args: - manifest_filepath (Union[str, List[str]]): Dataset parameter. Path to JSON containing data. - labels (list): Dataset parameter. List of target classes that can be output by the speaker recognition model. - featurizer - min_duration (float): Dataset parameter. All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim (bool): Whether to use trim silence from beginning and end of audio signal using librosa.effects.trim(). - Defaults to False. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - - output_types = { - 'audio_signal': NeuralType( - ('B', 'T'), - AudioSignal(freq=self._sample_rate) - if self is not None and hasattr(self, '_sample_rate') - else AudioSignal(), - ), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - } - - if self.is_regression_task: - output_types.update( - { - 'targets': NeuralType(tuple('B'), RegressionValuesType()), - 'targets_length': NeuralType(tuple('B'), LengthsType()), - } - ) - else: - - output_types.update( - {'label': NeuralType(tuple('B'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),} - ) - - return output_types - - def __init__( - self, - *, - manifest_filepath: Union[str, List[str]], - labels: List[str], - featurizer, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim: bool = False, - is_regression_task: bool = False, - cal_labels_occurrence: Optional[bool] = False, - ): - super().__init__() - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(',') - cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True) - self.collection = collections.ASRSpeechLabel( - manifests_files=manifest_filepath, - min_duration=min_duration, - max_duration=max_duration, - is_regression_task=is_regression_task, - cal_labels_occurrence=cal_labels_occurrence, - ) - - self.featurizer = featurizer - self.trim = trim - self.is_regression_task = is_regression_task - - if not is_regression_task: - self.labels = labels if labels else self.collection.uniq_labels - self.num_classes = len(self.labels) if self.labels is not None else 1 - self.label2id, self.id2label = {}, {} - self.id2occurrence, self.labels_occurrence = {}, [] - - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - if cal_labels_occurrence: - self.id2occurrence[label_id] = self.collection.labels_occurrence[label] - - if cal_labels_occurrence: - self.labels_occurrence = [self.id2occurrence[k] for k in sorted(self.id2occurrence)] - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - - else: - self.labels = [] - self.num_classes = 1 - - def __len__(self): - return len(self.collection) - - def __getitem__(self, index): - sample = self.collection[index] - - offset = sample.offset - - if offset is None: - offset = 0 - - features = self.featurizer.process(sample.audio_file, offset=offset, duration=sample.duration, trim=self.trim) - f, fl = features, torch.tensor(features.shape[0]).long() - - if not self.is_regression_task: - t = torch.tensor(self.label2id[sample.label]).long() - else: - t = torch.tensor(sample.label).float() - - tl = torch.tensor(1).long() # For compatibility with collate_fn used later - - return f, fl, t, tl - - -# Ported from https://github.com/NVIDIA/OpenSeq2Seq/blob/master/open_seq2seq/data/speech2text/speech_commands.py -class AudioToClassificationLabelDataset(_AudioLabelDataset): - """ - Dataset that loads tensors via a json file containing paths to audio - files, command class, and durations (in seconds). Each new line is a - different sample. Example below: - {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \ - target_label_0, "offset": offset_in_sec_0} - ... - {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \ - target_label_n, "offset": offset_in_sec_n} - Args: - manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. 
Can - be comma-separated paths. - labels (Optional[list]): String containing all the possible labels to map to - if None then automatically picks from ASRSpeechLabel collection. - featurizer: Initialized featurizer class that converts paths of - audio to feature tensors - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include - in dataset - trim: Boolean flag whether to trim the audio - """ - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=0) - - -class AudioToSpeechLabelDataset(_AudioLabelDataset): - """ - Dataset that loads tensors via a json file containing paths to audio - files, command class, and durations (in seconds). Each new line is a - different sample. Example below: - {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \ - target_label_0, "offset": offset_in_sec_0} - ... - {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \ - target_label_n, "offset": offset_in_sec_n} - Args: - manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can - be comma-separated paths. - labels (Optional[list]): String containing all the possible labels to map to - if None then automatically picks from ASRSpeechLabel collection. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - window_length_in_sec (float): length of window/slice (in seconds) - Use this for speaker recognition and VAD tasks. - shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch - Use this for VAD task during inference. - normalize_audio (bool): Whether to normalize audio signal. - Defaults to False. - is_regression_task (bool): Whether the dataset is for a regression task instead of classification. - Defaults to False. - cal_labels_occurrence (bool): Whether to calculate occurrence of labels - Defaults to False. 
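For the VAD-style collation that these `window_length_in_sec` / `shift_length_in_sec` arguments feed (see `_vad_frame_seq_collate_fn` above), the number of frame-level slices per utterance follows directly from the pad-and-shift arithmetic; the 0.63 s window and 0.01 s shift below are hypothetical settings.

```python
sample_rate = 16000
window_length_in_sec, shift_length_in_sec = 0.63, 0.01   # hypothetical VAD settings

slice_length = int(sample_rate * window_length_in_sec)   # 10080 samples per slice
shift = int(sample_rate * shift_length_in_sec)           # 160 samples between slice starts

sig_len = 16000                          # a 1-second utterance
sig_len += slice_length                  # the collate fn zero-pads half a window on each side
num_slices = (sig_len - slice_length) // shift
print(num_slices)                        # 100 -> one VAD frame every 10 ms
```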
- """ - - def __init__( - self, - *, - manifest_filepath: Union[str, List[str]], - labels: List[str], - featurizer, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim: bool = False, - window_length_in_sec: Optional[float] = 8, - shift_length_in_sec: Optional[float] = 1, - normalize_audio: bool = False, - is_regression_task: bool = False, - cal_labels_occurrence: Optional[bool] = False, - ): - self.window_length_in_sec = window_length_in_sec - self.shift_length_in_sec = shift_length_in_sec - self.normalize_audio = normalize_audio - - logging.debug("Window/slice length considered for collate func is {}".format(self.window_length_in_sec)) - logging.debug("Shift length considered for collate func is {}".format(self.shift_length_in_sec)) - - super().__init__( - manifest_filepath=manifest_filepath, - labels=labels, - featurizer=featurizer, - min_duration=min_duration, - max_duration=max_duration, - trim=trim, - is_regression_task=is_regression_task, - cal_labels_occurrence=cal_labels_occurrence, - ) - - def fixed_seq_collate_fn(self, batch): - return _fixed_seq_collate_fn(self, batch) - - def vad_frame_seq_collate_fn(self, batch): - return _vad_frame_seq_collate_fn(self, batch) - - -class _TarredAudioLabelDataset(IterableDataset): - """ - A similar Dataset to the AudioLabelDataSet, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the label and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference. - This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements. - Supported opening braces - { <=> (, [, < and the special tag _OP_. - Supported closing braces - } <=> ), ], > and the special tag _CL_. - For SLURM based tasks, we suggest the use of the special tags for ease of use. - - See the documentation for more information about accepted data and input formats. - - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioLabelDataSet; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - labels (list): Dataset parameter. 
- List of target classes that can be output by the speaker recognition model. - featurizer - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim(bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - window_length_in_sec (float): length of slice/window (in seconds) # Pass this only for speaker recognition and VAD task - shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference. - normalize_audio (bool): Whether to normalize audio signal. Defaults to False. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on the value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - is_regression_task (bool): Whether it is a regression task. Defualts to False. 
- """ - - def __init__( - self, - *, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: Union[str, List[str]], - labels: List[str], - featurizer, - shuffle_n: int = 0, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim: bool = False, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - is_regression_task: bool = False, - ): - cache_datastore_manifests(manifest_filepaths=manifest_filepath) - self.collection = collections.ASRSpeechLabel( - manifests_files=manifest_filepath, - min_duration=min_duration, - max_duration=max_duration, - index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID - ) - - self.file_occurence = count_occurence(self.collection.mapping) - - self.featurizer = featurizer - self.trim = trim - - self.labels = labels if labels else self.collection.uniq_labels - self.num_classes = len(self.labels) - - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - - audio_tar_filepaths = expand_sharded_filepaths( - sharded_filepaths=audio_tar_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - # Put together WebDataset - self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None) - - if shuffle_n > 0: - self._dataset = self._dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - self._dataset = ( - self._dataset.rename(audio=VALID_FILE_FORMATS, key='__key__') - .to_tuple('audio', 'key') - .pipe(self._filter) - .map(f=self._build_sample) - ) - - def _filter(self, iterator): - """This function is used to remove samples that have been filtered out by ASRSpeechLabel already. - Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample - that was filtered out (e.g. for duration). - Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard, - which may make your code hang as one process will finish before the other. - """ - - class TarredAudioFilter: - def __init__(self, collection, file_occurence): - self.iterator = iterator - self.collection = collection - self.file_occurence = file_occurence - self._iterable = self._internal_generator() - - def __iter__(self): - self._iterable = self._internal_generator() - return self - - def __next__(self): - try: - values = next(self._iterable) - except StopIteration: - # reset generator - self._iterable = self._internal_generator() - values = next(self._iterable) - - return values - - def _internal_generator(self): - """ - WebDataset requires an Iterator, but we require an iterable that yields 1-or-more - values per value inside self.iterator. - - Therefore wrap the iterator with a generator function that will yield 1-or-more - values per sample in the iterator. 
- """ - for _, tup in enumerate(self.iterator): - audio_bytes, audio_filename = tup - - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - if audio_filename in self.file_occurence: - for j in range(0, self.file_occurence[file_id]): - if j == 0: - audio_filename = file_id - else: - audio_filename = file_id + "-sub" + str(j) - yield audio_bytes, audio_filename - - return TarredAudioFilter(self.collection, self.file_occurence) - - def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. - """ - audio_bytes, audio_filename = tup - # Grab manifest entry from self.collection - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - - manifest_idx = self.collection.mapping[file_id] - manifest_entry = self.collection[manifest_idx] - - offset = manifest_entry.offset - if offset is None: - offset = 0 - - # Convert audio bytes to IO stream for processing (for SoundFile to read) - audio_filestream = io.BytesIO(audio_bytes) - features = self.featurizer.process( - audio_filestream, offset=offset, duration=manifest_entry.duration, trim=self.trim, - ) - - audio_filestream.close() - - # Audio features - f, fl = features, torch.tensor(features.shape[0]).long() - - t = self.label2id[manifest_entry.label] - tl = 1 # For compatibility with collate_fn used later - - return f, fl, torch.tensor(t).long(), torch.tensor(tl).long() - - def __iter__(self): - return self._dataset.__iter__() - - def __len__(self): - return len(self.collection) - - -class TarredAudioToClassificationLabelDataset(_TarredAudioLabelDataset): - """ - A similar Dataset to the AudioToClassificationLabelDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToClassificationLabelDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - labels (list): Dataset parameter. - List of target classes that can be output by the speaker recognition model. 
- featurizer - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim(bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - is_regression_task (bool): Whether it is a regression task. Defualts to False. - """ - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=0) - - -class TarredAudioToSpeechLabelDataset(_TarredAudioLabelDataset): - """ - A similar Dataset to the AudioToSpeechLabelDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. 
We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - labels (list): Dataset parameter. - List of target classes that can be output by the speaker recognition model. - featurizer - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim(bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task - shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference. - normalize_audio (bool): Whether to normalize audio signal. Defaults to False. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
- """ - - def __init__( - self, - *, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: Union[str, List[str]], - labels: List[str], - featurizer, - shuffle_n: int = 0, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim: bool = False, - window_length_in_sec: Optional[float] = 8, - shift_length_in_sec: Optional[float] = 1, - normalize_audio: bool = False, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - ): - logging.info("Window/slice length considered for collate func is {}".format(window_length_in_sec)) - logging.info("Shift length considered for collate func is {}".format(shift_length_in_sec)) - self.window_length_in_sec = window_length_in_sec - self.shift_length_in_sec = shift_length_in_sec - self.normalize_audio = normalize_audio - - super().__init__( - audio_tar_filepaths=audio_tar_filepaths, - manifest_filepath=manifest_filepath, - labels=labels, - featurizer=featurizer, - shuffle_n=shuffle_n, - min_duration=min_duration, - max_duration=max_duration, - trim=trim, - shard_strategy=shard_strategy, - global_rank=global_rank, - world_size=world_size, - ) - - def fixed_seq_collate_fn(self, batch): - return _fixed_seq_collate_fn(self, batch) - - def sliced_seq_collate_fn(self, batch): - raise NotImplementedError - - def vad_frame_seq_collate_fn(self, batch): - return _vad_frame_seq_collate_fn(self, batch) - - -class AudioToMultiLabelDataset(Dataset): - """ - Dataset that loads a json file containing paths to audio files, durations (in seconds), and a sequence of labels. - Each new line is a different sample. Example below: - {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": \ - "0 1 1 0 1", "offset": offset_in_sec_0} - ... - {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": \ - "0 1 0 0 1", "offset": offset_in_sec_n} - Args: - manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can - be comma-separated paths. - labels (Optional[list]): String containing all the possible labels to map to - if None then automatically picks from ASRSpeechLabel collection. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - window_length_in_sec (float): length of window/slice (in seconds) - Use this for speaker recognition and VAD tasks. - shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch - Use this for VAD task during inference. - normalize_audio (bool): Whether to normalize audio signal. - Defaults to False. - is_regression_task (bool): Whether the dataset is for a regression task instead of classification. - Defaults to False. - cal_labels_occurrence (bool): Whether to calculate occurrence of labels - Defaults to False. - delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None. - normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - - output_types = { - 'audio_signal': NeuralType( - ('B', 'T'), - AudioSignal(freq=self._sample_rate) - if self is not None and hasattr(self, '_sample_rate') - else AudioSignal(), - ), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - } - - if self.is_regression_task: - output_types.update( - { - 'targets': NeuralType(tuple('B, T'), RegressionValuesType()), - 'targets_length': NeuralType(tuple('B'), LengthsType()), - } - ) - else: - output_types.update( - {'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),} - ) - - return output_types - - def __init__( - self, - *, - manifest_filepath: Union[str, List[str]], - sample_rate: int, - labels: Optional[List[str]] = None, - int_values: bool = False, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim_silence: bool = False, - is_regression_task: bool = False, - cal_labels_occurrence: Optional[bool] = False, - delimiter: Optional[str] = None, - normalize_audio_db: Optional[float] = None, - ): - super().__init__() - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(',') - - self.delimiter = delimiter - self.normalize_audio_db = normalize_audio_db - - self.collection = collections.ASRSpeechLabel( - manifests_files=manifest_filepath, - min_duration=min_duration, - max_duration=max_duration, - is_regression_task=is_regression_task, - cal_labels_occurrence=cal_labels_occurrence, - delimiter=delimiter, - ) - - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) - self.trim = trim_silence - self.is_regression_task = is_regression_task - self.id2occurrence = {} - self.labels_occurrence = None - - if not is_regression_task: - self.labels = labels if labels else self._get_label_set() - self.num_classes = len(self.labels) if self.labels is not None else 1 - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - if cal_labels_occurrence: - self.id2occurrence[label_id] = self.collection.labels_occurrence[label] - self.labels_occurrence.append(self.id2occurrence[label_id]) - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - else: - self.labels = [] - self.num_classes = 1 - - def _get_label_set(self): - labels = [] - for sample in self.collection: - label_str = sample.label - if label_str: - label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split() - labels.extend(label_str_list) - return sorted(set(labels)) - - def _label_str_to_tensor(self, label_str: str): - labels = label_str.split(self.delimiter) if self.delimiter else label_str.split() - - if self.is_regression_task: - labels = [float(s) for s in labels] - labels = torch.tensor(labels).float() - else: - labels = [self.label2id[s] for s in labels] - labels = torch.tensor(labels).long() - return labels - - def __len__(self): - return len(self.collection) - - def __getitem__(self, index): - sample = self.collection[index] - - offset = sample.offset - - if offset is None: - offset = 0 - - features = self.featurizer.process( - sample.audio_file, - offset=offset, - duration=sample.duration, - trim=self.trim, - 
normalize_db=self.normalize_audio_db, - ) - - f, fl = features, torch.tensor(features.size(0)).long() - - t = self._label_str_to_tensor(sample.label) - - tl = torch.tensor(t.size(0)).long() - - return f, fl, t, tl - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=0) - - -class TarredAudioToMultiLabelDataset(IterableDataset): - """ - A similar Dataset to the AudioToMultiLabelDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - labels (list): Dataset parameter. - List of target classes that can be output by the speaker recognition model. - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim(bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task - shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference. - normalize_audio (bool): Whether to normalize audio signal. Defaults to False. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. 
- - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None. - normalize_audio_db (Optional[float]): normalize audio signal to a target db, default to None. - """ - - def __init__( - self, - *, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: Union[str, List[str]], - sample_rate: int, - labels: Optional[List[str]] = None, - shuffle_n: int = 0, - int_values: bool = False, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - min_duration: Optional[float] = 0.1, - max_duration: Optional[float] = None, - trim_silence: bool = False, - is_regression_task: bool = False, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - delimiter: Optional[str] = None, - normalize_audio_db: Optional[float] = None, - ): - super().__init__() - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(',') - - self.trim = trim_silence - self.is_regression_task = is_regression_task - self.delimiter = delimiter - self.normalize_audio_db = normalize_audio_db - - self.collection = collections.ASRSpeechLabel( - manifests_files=manifest_filepath, - min_duration=min_duration, - max_duration=max_duration, - is_regression_task=is_regression_task, - index_by_file_id=True, - ) - self.file_occurence = count_occurence(self.collection.mapping) - - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) - - if not is_regression_task: - self.labels = labels if labels else self._get_label_set() - self.num_classes = len(self.labels) if self.labels is not None else 1 - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - else: - self.labels = [] - self.num_classes = 1 - - audio_tar_filepaths = expand_sharded_filepaths( - sharded_filepaths=audio_tar_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - # Put together WebDataset - self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None) - - if shuffle_n > 0: - self._dataset = self._dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - 
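Under the `scatter` strategy referenced in the docstring, each rank keeps a contiguous, equally sized slice of the shard list, which is why an uneven shard count can leave some shards unused. A minimal standalone sketch of that partitioning, using only the standard library (shard names are placeholders, not real files):

```python
from typing import List

def scatter_shards(shards: List[str], world_size: int, global_rank: int) -> List[str]:
    """Give each rank a contiguous slice of len(shards) // world_size shards.

    Remainder shards are silently dropped, which is why an uneven shard count
    only triggers a warning in the dataset code above.
    """
    per_rank = len(shards) // world_size
    begin = per_rank * global_rank
    return shards[begin:begin + per_rank]

shards = [f"audio_{i}.tar" for i in range(10)]
print(scatter_shards(shards, world_size=4, global_rank=1))
# ['audio_2.tar', 'audio_3.tar']; shards 8 and 9 are never assigned to any rank
```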
self._dataset = ( - self._dataset.rename(audio=VALID_FILE_FORMATS, key='__key__') - .to_tuple('audio', 'key') - .pipe(self._filter) - .map(f=self._build_sample) - ) - - def _get_label_set(self): - labels = [] - for sample in self.collection: - label_str = sample.label - if label_str: - label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split() - labels.extend(label_str_list) - return sorted(set(labels)) - - def _label_str_to_tensor(self, label_str: str): - labels = label_str.split(self.delimiter) if self.delimiter else label_str.split() - - if self.is_regression_task: - labels = [float(s) for s in labels] - labels = torch.tensor(labels).float() - else: - labels = [self.label2id[s] for s in labels] - labels = torch.tensor(labels).long() - return labels - - def _filter(self, iterator): - """This function is used to remove samples that have been filtered out by ASRSpeechLabel already. - Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample - that was filtered out (e.g. for duration). - Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard, - which may make your code hang as one process will finish before the other. - """ - - class TarredAudioFilter: - def __init__(self, collection, file_occurence): - self.iterator = iterator - self.collection = collection - self.file_occurence = file_occurence - self._iterable = self._internal_generator() - - def __iter__(self): - self._iterable = self._internal_generator() - return self - - def __next__(self): - try: - values = next(self._iterable) - except StopIteration: - # reset generator - self._iterable = self._internal_generator() - values = next(self._iterable) - - return values - - def _internal_generator(self): - """ - WebDataset requires an Iterator, but we require an iterable that yields 1-or-more - values per value inside self.iterator. - - Therefore wrap the iterator with a generator function that will yield 1-or-more - values per sample in the iterator. - """ - for _, tup in enumerate(self.iterator): - audio_bytes, audio_filename = tup - - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - if audio_filename in self.file_occurence: - for j in range(0, self.file_occurence[file_id]): - if j == 0: - audio_filename = file_id - else: - audio_filename = file_id + "-sub" + str(j) - yield audio_bytes, audio_filename - - return TarredAudioFilter(self.collection, self.file_occurence) - - def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. 
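The `_filter` wrapper shown above drops tar members that the manifest filtered out and, for audio files that occur more than once in the manifest, re-yields the payload under keys `<file_id>`, `<file_id>-sub1`, `<file_id>-sub2`, and so on. A minimal standalone sketch of that generator pattern, assuming plain `(bytes, filename)` tuples and a precomputed occurrence map rather than the WebDataset pipeline itself:

```python
import os
from typing import Dict, Iterable, Iterator, Tuple

def filter_and_expand(
    stream: Iterable[Tuple[bytes, str]], file_occurrence: Dict[str, int]
) -> Iterator[Tuple[bytes, str]]:
    """Yield only tar members present in the manifest, once per occurrence."""
    for audio_bytes, audio_filename in stream:
        file_id, _ = os.path.splitext(os.path.basename(audio_filename))
        if file_id not in file_occurrence:
            continue  # sample was filtered out of the manifest (e.g. by duration)
        for j in range(file_occurrence[file_id]):
            key = file_id if j == 0 else f"{file_id}-sub{j}"
            yield audio_bytes, key

stream = [(b"...", "shard0/utt_a.wav"), (b"...", "shard0/utt_b.wav")]
occurrence = {"utt_a": 2}  # utt_b was dropped by the manifest filters
print([key for _, key in filter_and_expand(stream, occurrence)])
# ['utt_a', 'utt_a-sub1']
```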
- """ - audio_bytes, audio_filename = tup - # Grab manifest entry from self.collection - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - - manifest_idx = self.collection.mapping[file_id] - manifest_entry = self.collection[manifest_idx] - - offset = manifest_entry.offset - if offset is None: - offset = 0 - - # Convert audio bytes to IO stream for processing (for SoundFile to read) - audio_filestream = io.BytesIO(audio_bytes) - features = self.featurizer.process( - audio_filestream, - offset=offset, - duration=manifest_entry.duration, - trim=self.trim, - normalize_db=self.normalize_audio_db, - ) - - audio_filestream.close() - - # Audio features - f, fl = features, torch.tensor(features.shape[0]).long() - - t = self._label_str_to_tensor(manifest_entry.label) - - tl = torch.tensor(t.size(0)).long() - - return f, fl, t, tl - - def __iter__(self): - return self._dataset.__iter__() - - def __len__(self): - return len(self.collection) - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=0) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label_dataset.py deleted file mode 100644 index dcead6df94b8c898038c2ffc744ad478bb4e74fc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_label_dataset.py +++ /dev/null @@ -1,304 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import copy - -from omegaconf import DictConfig - -from nemo.collections.asr.data import audio_to_label -from nemo.collections.asr.data.audio_to_text_dataset import convert_to_config_list, get_chain_dataset -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.data.dataset import ConcatDataset - - -def get_classification_label_dataset(featurizer, config: dict) -> audio_to_label.AudioToClassificationLabelDataset: - """ - Instantiates a Classification AudioLabelDataset. - - Args: - config: Config of the AudioToClassificationLabelDataset. - - Returns: - An instance of AudioToClassificationLabelDataset. - """ - dataset = audio_to_label.AudioToClassificationLabelDataset( - manifest_filepath=config['manifest_filepath'], - labels=config['labels'], - featurizer=featurizer, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - is_regression_task=config.get('is_regression_task', False), - cal_labels_occurrence=config.get('cal_labels_occurrence', False), - ) - return dataset - - -def get_speech_label_dataset(featurizer, config: dict) -> audio_to_label.AudioToSpeechLabelDataset: - """ - Instantiates a Speech Label (e.g. VAD, speaker recognition) AudioLabelDataset. - - Args: - config: Config of the AudioToSpeechLabelDataSet. - - Returns: - An instance of AudioToSpeechLabelDataset. 
- """ - dataset = audio_to_label.AudioToSpeechLabelDataset( - manifest_filepath=config['manifest_filepath'], - labels=config['labels'], - featurizer=featurizer, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - window_length_in_sec=config.get('window_length_in_sec', 0.31), - shift_length_in_sec=config.get('shift_length_in_sec', 0.01), - normalize_audio=config.get('normalize_audio', False), - cal_labels_occurrence=config.get('cal_labels_occurrence', False), - ) - return dataset - - -def get_tarred_classification_label_dataset( - featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int -) -> audio_to_label.TarredAudioToClassificationLabelDataset: - """ - Instantiates a Classification TarredAudioLabelDataset. - - Args: - config: Config of the TarredAudioToClassificationLabelDataset. - shuffle_n: How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - - Returns: - An instance of TarredAudioToClassificationLabelDataset. - """ - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths) - manifest_filepaths = convert_to_config_list(manifest_filepaths) - - bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets - if bucketing_weights: - for idx, weight in enumerate(bucketing_weights): - if not isinstance(weight, int) or weight <= 0: - raise ValueError(f"bucket weights must be positive integers") - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets." 
- ) - - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - if len(tarred_audio_filepath) == 1: - tarred_audio_filepath = tarred_audio_filepath[0] - dataset = audio_to_label.TarredAudioToClassificationLabelDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - labels=config['labels'], - featurizer=featurizer, - shuffle_n=shuffle_n, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - global_rank=global_rank, - world_size=world_size, - is_regression_task=config.get('is_regression_task', False), - ) - - if bucketing_weights: - [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])] - else: - datasets.append(dataset) - - return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank) - - -def get_concat_tarred_speech_label_dataset( - featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int, -): - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - conf = copy.deepcopy(config) - conf['manifest_filepath'] = manifest_filepath - conf['tarred_audio_filepaths'] = tarred_audio_filepath - dataset = get_tarred_speech_label_dataset( - config=conf, featurizer=featurizer, shuffle_n=shuffle_n, global_rank=global_rank, world_size=world_size, - ) - datasets.append(dataset) - - dataset = ConcatDataset( - datasets, - sampling_technique=config.get('concat_sampling_technique', 'temperature'), - sampling_temperature=config.get('concat_sampling_temperature', 5), - sampling_probabilities=config.get('concat_sampling_probabilities', None), - global_rank=global_rank, - world_size=world_size, - shuffle=config['shuffle'], - ) - return dataset - - -def get_tarred_speech_label_dataset( - featurizer, config: dict, shuffle_n: int, global_rank: int, world_size: int, -) -> audio_to_label.TarredAudioToSpeechLabelDataset: - """ - InInstantiates a Speech Label (e.g. VAD, speaker recognition) TarredAudioLabelDataset. - - Args: - config: Config of the TarredAudioToSpeechLabelDataset. - shuffle_n: How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - - Returns: - An instance of TarredAudioToSpeechLabelDataset. - """ - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths) - manifest_filepaths = convert_to_config_list(manifest_filepaths) - - bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets - if bucketing_weights: - for idx, weight in enumerate(bucketing_weights): - if not isinstance(weight, int) or weight <= 0: - raise ValueError(f"bucket weights must be positive integers") - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets." 
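`get_concat_tarred_speech_label_dataset` above delegates the mixing of buckets to `ConcatDataset` with a temperature-based sampling technique. The sketch below does not reproduce NeMo's implementation; it only illustrates the idea behind temperature sampling, where the probability of drawing from a dataset is proportional to its size raised to `1 / T`, so a larger temperature flattens the mix toward uniform.

```python
import torch

def temperature_probs(sizes, temperature: float) -> torch.Tensor:
    """p_i proportional to n_i ** (1 / T); T=1 is size-proportional, large T is near uniform."""
    weights = torch.tensor(sizes, dtype=torch.float) ** (1.0 / temperature)
    return weights / weights.sum()

sizes = [100_000, 10_000, 1_000]
print(temperature_probs(sizes, temperature=1.0))  # heavily favors the largest bucket
print(temperature_probs(sizes, temperature=5.0))  # much closer to uniform

# Drawing which dataset the next sample comes from:
probs = temperature_probs(sizes, temperature=5.0)
dataset_idx = torch.multinomial(probs, num_samples=1).item()
```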
- ) - - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - if len(tarred_audio_filepath) == 1: - tarred_audio_filepath = tarred_audio_filepath[0] - dataset = audio_to_label.TarredAudioToSpeechLabelDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - labels=config['labels'], - featurizer=featurizer, - shuffle_n=shuffle_n, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - window_length_in_sec=config.get('window_length_in_sec', 8), - shift_length_in_sec=config.get('shift_length_in_sec', 0.075), - normalize_audio=config.get('normalize_audio', False), - shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - global_rank=global_rank, - world_size=world_size, - ) - - if bucketing_weights: - [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])] - else: - datasets.append(dataset) - - return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank) - - -def get_audio_multi_label_dataset(cfg: DictConfig) -> audio_to_label.AudioToMultiLabelDataset: - if "augmentor" in cfg: - augmentor = process_augmentations(cfg.augmentor) - else: - augmentor = None - - dataset = audio_to_label.AudioToMultiLabelDataset( - manifest_filepath=cfg.get("manifest_filepath"), - sample_rate=cfg.get("sample_rate"), - labels=cfg.get("labels", None), - int_values=cfg.get("int_values", False), - augmentor=augmentor, - min_duration=cfg.get("min_duration", None), - max_duration=cfg.get("max_duration", None), - trim_silence=cfg.get("trim_silence", False), - is_regression_task=cfg.get("is_regression_task", False), - cal_labels_occurrence=cfg.get("cal_labels_occurrence", False), - delimiter=cfg.get("delimiter", None), - normalize_audio_db=cfg.get("normalize_audio_db", None), - ) - return dataset - - -def get_tarred_audio_multi_label_dataset( - cfg: DictConfig, shuffle_n: int, global_rank: int, world_size: int -) -> audio_to_label.TarredAudioToMultiLabelDataset: - - if "augmentor" in cfg: - augmentor = process_augmentations(cfg.augmentor) - else: - augmentor = None - - tarred_audio_filepaths = cfg['tarred_audio_filepaths'] - manifest_filepaths = cfg['manifest_filepath'] - datasets = [] - tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths) - manifest_filepaths = convert_to_config_list(manifest_filepaths) - - bucketing_weights = cfg.get('bucketing_weights', None) # For upsampling buckets - if bucketing_weights: - for idx, weight in enumerate(bucketing_weights): - if not isinstance(weight, int) or weight <= 0: - raise ValueError(f"bucket weights must be positive integers") - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets." 
- ) - - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - if len(tarred_audio_filepath) == 1: - tarred_audio_filepath = tarred_audio_filepath[0] - - dataset = audio_to_label.TarredAudioToMultiLabelDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - sample_rate=cfg["sample_rate"], - labels=cfg['labels'], - shuffle_n=shuffle_n, - int_values=cfg.get("int_values", False), - augmentor=augmentor, - min_duration=cfg.get('min_duration', None), - max_duration=cfg.get('max_duration', None), - trim_silence=cfg.get('trim_silence', False), - is_regression_task=cfg.get('is_regression_task', False), - delimiter=cfg.get("delimiter", None), - shard_strategy=cfg.get('tarred_shard_strategy', 'scatter'), - global_rank=global_rank, - world_size=world_size, - normalize_audio_db=cfg.get("normalize_audio_db", None), - ) - - if bucketing_weights: - [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])] - else: - datasets.append(dataset) - - return get_chain_dataset(datasets=datasets, ds_config=cfg, rank=global_rank) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text.py deleted file mode 100644 index 58cd3630e3222100e1bec0586a8a71b937d802ba..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text.py +++ /dev/null @@ -1,1366 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import io -import json -import math -import multiprocessing -import os -from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union - -import braceexpand -import numpy as np -import torch -import webdataset as wd -from torch.utils.data import ChainDataset -from tqdm import tqdm - -from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.common import tokenizers -from nemo.collections.common.parts.preprocessing import collections, parsers -from nemo.core.classes import Dataset, IterableDataset -from nemo.core.neural_types import * -from nemo.utils import logging -from nemo.utils.data_utils import ( - DataStoreObject, - datastore_object_get, - datastore_path_to_webdataset_url, - is_datastore_cache_shared, - is_datastore_path, - is_tarred_path, -) -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = [ - 'AudioToCharDataset', - 'AudioToBPEDataset', - 'TarredAudioToCharDataset', - 'TarredAudioToBPEDataset', -] - - -def _speech_collate_fn(batch, pad_id): - """collate batch of audio sig, audio len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. 
This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). - """ - packed_batch = list(zip(*batch)) - if len(packed_batch) == 5: - _, audio_lengths, _, tokens_lengths, sample_ids = packed_batch - elif len(packed_batch) == 4: - sample_ids = None - _, audio_lengths, _, tokens_lengths = packed_batch - else: - raise ValueError("Expects 4 or 5 tensors in the batch!") - max_audio_len = 0 - has_audio = audio_lengths[0] is not None - if has_audio: - max_audio_len = max(audio_lengths).item() - max_tokens_len = max(tokens_lengths).item() - - audio_signal, tokens = [], [] - for b in batch: - if len(b) == 5: - sig, sig_len, tokens_i, tokens_i_len, _ = b - else: - sig, sig_len, tokens_i, tokens_i_len = b - if has_audio: - sig_len = sig_len.item() - if sig_len < max_audio_len: - pad = (0, max_audio_len - sig_len) - sig = torch.nn.functional.pad(sig, pad) - audio_signal.append(sig) - tokens_i_len = tokens_i_len.item() - if tokens_i_len < max_tokens_len: - pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) - tokens.append(tokens_i) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.stack(audio_lengths) - else: - audio_signal, audio_lengths = None, None - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) - if sample_ids is None: - return audio_signal, audio_lengths, tokens, tokens_lengths - else: - sample_ids = torch.tensor(sample_ids, dtype=torch.int32) - return audio_signal, audio_lengths, tokens, tokens_lengths, sample_ids - - -class ASRManifestProcessor: - """ - Class that processes a manifest json file containing paths to audio files, transcripts, and durations (in seconds). - Each new line is a different sample. Example below: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - Args: - manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. - parser: Str for a language specific preprocessor or a callable. - max_duration: If audio exceeds this length, do not include in dataset. - min_duration: If audio is less than this length, do not include in dataset. - max_utts: Limit number of utterances. - bos_id: Id of beginning of sequence symbol to append if not None. - eos_id: Id of end of sequence symbol to append if not None. - pad_id: Id of pad symbol. Defaults to 0. 
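The collate function above pads every signal to the longest signal in the batch and every token sequence to the longest transcript, then stacks them. A minimal sketch of that padding step for the 4-tuple case, assuming mono 1-D signals and ignoring the optional `sample_ids` path:

```python
import torch

def pad_and_stack(batch, pad_id: int = 0):
    """batch: list of (signal, signal_len, tokens, tokens_len) tuples."""
    max_audio = max(int(l) for _, l, _, _ in batch)
    max_tokens = max(int(l) for _, _, _, l in batch)

    signals, sig_lens, token_seqs, tok_lens = [], [], [], []
    for sig, sig_len, toks, tok_len in batch:
        signals.append(torch.nn.functional.pad(sig, (0, max_audio - int(sig_len))))
        sig_lens.append(torch.as_tensor(sig_len))
        token_seqs.append(torch.nn.functional.pad(toks, (0, max_tokens - int(tok_len)), value=pad_id))
        tok_lens.append(torch.as_tensor(tok_len))

    return (torch.stack(signals), torch.stack(sig_lens),
            torch.stack(token_seqs), torch.stack(tok_lens))

batch = [
    (torch.randn(16000), torch.tensor(16000), torch.tensor([5, 2, 9]), torch.tensor(3)),
    (torch.randn(12000), torch.tensor(12000), torch.tensor([7]), torch.tensor(1)),
]
audio, audio_len, tokens, token_len = pad_and_stack(batch)
print(audio.shape, tokens.shape)  # torch.Size([2, 16000]) torch.Size([2, 3])
```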
- """ - - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: int = 0, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - index_by_file_id: bool = False, - ): - self.parser = parser - - self.collection = collections.ASRAudioText( - manifests_files=manifest_filepath, - parser=parser, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - index_by_file_id=index_by_file_id, - ) - - self.eos_id = eos_id - self.bos_id = bos_id - self.pad_id = pad_id - - def process_text_by_id(self, index: int) -> Tuple[List[int], int]: - sample = self.collection[index] - return self.process_text_by_sample(sample) - - def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]: - manifest_idx = self.collection.mapping[file_id][0] - sample = self.collection[manifest_idx] - return self.process_text_by_sample(sample) - - def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -> Tuple[List[int], int]: - t, tl = sample.text_tokens, len(sample.text_tokens) - - if self.bos_id is not None: - t = [self.bos_id] + t - tl += 1 - if self.eos_id is not None: - t = t + [self.eos_id] - tl += 1 - - return t, tl - - -def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: int, global_rank: int): - valid_shard_strategies = ['scatter', 'replicate'] - if shard_strategy not in valid_shard_strategies: - raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}") - - if isinstance(sharded_filepaths, str): - # Replace '(' and '[' with '{' - brace_keys_open = ['(', '[', '<', '_OP_'] - for bkey in brace_keys_open: - if bkey in sharded_filepaths: - sharded_filepaths = sharded_filepaths.replace(bkey, "{") - - # Replace ')' and ']' with '}' - brace_keys_close = [')', ']', '>', '_CL_'] - for bkey in brace_keys_close: - if bkey in sharded_filepaths: - sharded_filepaths = sharded_filepaths.replace(bkey, "}") - - if isinstance(sharded_filepaths, str): - # Brace expand, set escape=False for Windows compatibility - sharded_filepaths = list(braceexpand.braceexpand(sharded_filepaths, escape=False)) - - # Expand store paths into WebDataset URLs - sharded_filepaths = [ - datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p - for p in sharded_filepaths - ] - - # Check for distributed and partition shards accordingly - if world_size > 1: - if shard_strategy == 'scatter': - logging.info("All tarred dataset shards will be scattered evenly across all nodes.") - - if len(sharded_filepaths) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(sharded_filepaths)}) is not divisible " - f"by number of distributed workers ({world_size})." - ) - - begin_idx = (len(sharded_filepaths) // world_size) * global_rank - end_idx = begin_idx + len(sharded_filepaths) // world_size - sharded_filepaths = sharded_filepaths[begin_idx:end_idx] - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - - elif shard_strategy == 'replicate': - logging.info("All tarred dataset shards will be replicated across all nodes.") - else: - raise ValueError(f"Invalid shard strategy ! 
Allowed values are : {valid_shard_strategies}") - - return sharded_filepaths - - -def cache_datastore_manifests( - manifest_filepaths: Union[str, List[str]], - cache_audio: bool = False, - shared_cache: Optional[bool] = None, - num_workers: Optional[int] = None, - max_num_workers: int = 20, -): - """Cache manifests and audio from an object store. - It is assumed that remote manifests are using relative paths. - - Args: - manifest_filepaths: list of paths to manifest files (list of strings or a string with `,` as separator) - cache_audio: If True, audio from manifest will also be cached - shared_cache: Optional, True if cache is shared across all nodes - num_workers: Optional, number of workers to be used for download - max_num_workers: max number of workers to be used for download, used when setting num_workers automatically - """ - if isinstance(manifest_filepaths, str): - manifest_filepaths = manifest_filepaths.split(',') - - num_datastore_manifests = sum([is_datastore_path(f) for f in manifest_filepaths]) - - if num_datastore_manifests > 0: - # Local utility function - def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): - """Cache manifests and audio data from object store. - """ - # Determine the number of workers to use - if num_workers is None: - num_workers = os.cpu_count() - 1 - num_workers = min(num_workers, max_num_workers) - - # Process each manifest file - for manifest_file in manifest_filepaths: - # If manifest is on a data store, then cache it. - # Otherwise, nothing to do. - if is_datastore_path(manifest_file): - logging.info('Cache manifest file: %s', manifest_file) - cached_manifest_file = DataStoreObject(manifest_file).get() - logging.info('Cached at: %s', str(cached_manifest_file)) - - if cache_audio: - # Each audio file from manifest will be cached. 
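The shard path expansion just shown first normalizes the alternative brace tags (`(`, `[`, `<`, `_OP_` and their closing counterparts) to `{` and `}`, then brace-expands the result. A minimal sketch of that step with the `braceexpand` package, assuming a shard pattern written with the SLURM-friendly `_OP_`/`_CL_` tags; the helper name is illustrative only.

```python
from typing import List

import braceexpand

def expand_tar_pattern(pattern: str) -> List[str]:
    """Normalize alternative brace tags, then brace-expand the shard pattern."""
    for opener in ('(', '[', '<', '_OP_'):
        pattern = pattern.replace(opener, '{')
    for closer in (')', ']', '>', '_CL_'):
        pattern = pattern.replace(closer, '}')
    return list(braceexpand.braceexpand(pattern, escape=False))

print(expand_tar_pattern("audio__OP_1..4_CL_.tar"))
# ['audio_1.tar', 'audio_2.tar', 'audio_3.tar', 'audio_4.tar']
```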
- logging.info('Cache audio from manifest file: %s', manifest_file) - # Assumes that manifest is using relative paths - manifest_dir = os.path.dirname(manifest_file) - # Prepare all store objects - audio_objects = [] - with open(cached_manifest_file, 'r') as f: - for line in f: - item = json.loads(line) - store_path = os.path.join(manifest_dir, item['audio_filepath']) - audio_objects.append(DataStoreObject(store_path=store_path)) - - if num_workers is not None and num_workers > 1: - logging.debug('Using multiprocessing with num_workers: %d.', num_workers) - with multiprocessing.Pool(processes=num_workers) as p: - result = list( - tqdm(p.imap(datastore_object_get, audio_objects), total=len(audio_objects)) - ) - else: - logging.debug('Using a single process.') - result = [] - for audio_object in tqdm(audio_objects): - result.append(audio_object.get() is not None) - - if not all(result): - raise RuntimeError('Some files not downloaded successfully') - logging.info('Caching complete') - - else: - # Nothing to do here - logging.debug('Manifest is not on a data store: %s', manifest_file) - - if torch.distributed.is_available() and torch.distributed.is_initialized(): - logging.debug('Distributed environment is available and initialized.') - - # Handle distributed environment - if shared_cache is None: - shared_cache = is_datastore_cache_shared() - - if shared_cache: - logging.debug('Cache is shared among nodes, cache data on global rank zero.') - is_rank_zero = is_global_rank_zero() - else: - logging.debug('Cache is not shared among nodes, cache data on local rank zero.') - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - is_rank_zero = local_rank == 0 - - if is_rank_zero: - logging.info('Cache data from %s rank 0', 'global' if shared_cache else 'local') - cache_data( - manifest_filepaths=manifest_filepaths, - cache_audio=cache_audio, - num_workers=num_workers, - max_num_workers=max_num_workers, - ) - logging.debug('Reached barrier') - torch.distributed.barrier() - - elif is_global_rank_zero(): - # Handle non-distributed environment, e.g., if running on a single GPU - logging.warning( - 'Torch distributed is not initialized and caching may be prone to data race conditions. ' - 'Now caching data from global rank 0. If there are other ranks and they pass this ' - 'before rank 0, errors might result.' - ) - cache_data( - manifest_filepaths=manifest_filepaths, - cache_audio=cache_audio, - num_workers=num_workers, - max_num_workers=max_num_workers, - ) - else: - raise RuntimeError( - 'Torch distributed is not initialized and caching on nodes other than global rank zero is disabled ' - 'to avoid race condition between different ranks. To ensure distributed environment is ' - 'initialized, please update data config to use `defer_setup = True`.' - ) - - -"""Optionally expand / shard the list of manifests - This is made to use the same notation as the sharded audio files - - Args: - manifest_filepaths: list of manifest files (the sharded notation) - shard_strategy: scatter or replicate (scatter by default) - shard_manifests: bool, if False, no sharding / manifest filepath expansion will be attempted - global_rank: int, the rank of this worker - world_size: int, total number of workers -""" - - -def shard_manifests_if_needed( - manifest_filepaths: Union[str, List[str]], - shard_strategy: str, - shard_manifests: bool, - global_rank: int, - world_size: int, -): - if shard_manifests: - if not torch.distributed.is_available(): - logging.warning("Not running in torch.distributed mode. 
Manifest sharding not available") - return manifest_filepaths - - if not torch.distributed.is_initialized(): - logging.warning( - 'Manifest sharding was requested but torch.distributed is not initialized ' - 'Did you intend to set the defer_setup flag?' - ) - return manifest_filepaths - - manifest_filepaths = expand_sharded_filepaths( - sharded_filepaths=manifest_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - - return manifest_filepaths - - -class _AudioTextDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). - Each new line is a different sample. Example below: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - Args: - manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. - parser: Str for a language specific preprocessor or a callable. - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded - audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include in dataset - max_utts: Limit number of utterances - trim: whether or not to trim silence. Defaults to False - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - pad_id: Id of pad symbol. Defaults to 0 - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
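The `cache_data` helper above fans per-file downloads out over a `multiprocessing.Pool` and wraps the iterator in `tqdm` for progress reporting. A minimal sketch of that fan-out pattern, with a placeholder `fetch` function standing in for `DataStoreObject.get`:

```python
import multiprocessing

from tqdm import tqdm

def fetch(path: str) -> bool:
    """Placeholder for DataStoreObject.get(); pretend every download succeeds."""
    return True

def cache_all(paths, num_workers: int = 4) -> None:
    if num_workers > 1:
        with multiprocessing.Pool(processes=num_workers) as pool:
            results = list(tqdm(pool.imap(fetch, paths), total=len(paths)))
    else:
        results = [fetch(p) for p in tqdm(paths)]
    if not all(results):
        raise RuntimeError("Some files were not downloaded successfully")

if __name__ == "__main__":
    cache_all([f"bucket/audio_{i}.wav" for i in range(100)], num_workers=4)
```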
- """ - return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - sample_rate: int, - int_values: bool = False, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if type(manifest_filepath) == str: - manifest_filepath = manifest_filepath.split(",") - - # If necessary, cache manifests and audio from object store - cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True) - - self.manifest_processor = ASRManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - ) - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) - self.trim = trim - self.return_sample_id = return_sample_id - self.channel_selector = channel_selector - - def get_manifest_sample(self, sample_id): - return self.manifest_processor.collection[sample_id] - - def __getitem__(self, index): - sample = self.manifest_processor.collection[index] - offset = sample.offset - - if offset is None: - offset = 0 - - features = self.featurizer.process( - sample.audio_file, - offset=offset, - duration=sample.duration, - trim=self.trim, - orig_sr=sample.orig_sr, - channel_selector=self.channel_selector, - ) - f, fl = features, torch.tensor(features.shape[0]).long() - - t, tl = self.manifest_processor.process_text_by_sample(sample=sample) - - if self.return_sample_id: - output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index - else: - output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long() - - return output - - def __len__(self): - return len(self.manifest_processor.collection) - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id) - - -class AudioToCharDataset(_AudioTextDataset): - """ - Dataset that loads tensors via a json file containing paths to audio - files, transcripts, and durations (in seconds). Each new line is a - different sample. Example below: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": - "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath: Path to manifest json as described above. Can - be comma-separated paths. - labels: String containing all the possible characters to map to - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. 
- augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include - in dataset - max_utts: Limit number of utterances - blank_index: blank character index, default = -1 - unk_index: unk_character index, default = -1 - normalize: whether to normalize transcript text (default): True - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - labels: Union[str, List[str]], - sample_rate: int, - int_values: bool = False, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: int = 0, - blank_index: int = -1, - unk_index: int = -1, - normalize: bool = True, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - parser: Union[str, Callable] = 'en', - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - self.labels = labels - - parser = parsers.make_parser( - labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize - ) - - super().__init__( - manifest_filepath=manifest_filepath, - parser=parser, - sample_rate=sample_rate, - int_values=int_values, - augmentor=augmentor, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) - - -class AudioToBPEDataset(_AudioTextDataset): - """ - Dataset that loads tensors via a json file containing paths to audio - files, transcripts, and durations (in seconds). Each new line is a - different sample. Example below: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": - "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - In practice, the dataset and manifest used for character encoding and byte pair encoding - are exactly the same. The only difference lies in how the dataset tokenizes the text in - the manifest. - - Args: - manifest_filepath: Path to manifest json as described above. Can - be comma-separated paths. - tokenizer: A subclass of the Tokenizer wrapper found in the common collection, - nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of - all available tokenizers. 
- sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include - in dataset - max_utts: Limit number of utterances - trim: Whether to trim silence segments - use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] - tokens to beginning and ending of speech respectively. - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - sample_rate: int, - int_values: bool = False, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - trim: bool = False, - use_start_end_token: bool = True, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): - self.is_aggregate = True - else: - self.is_aggregate = False - self._tokenizer = tokenizer - - def __call__(self, *args): - if isinstance(args[0], List) and self.is_aggregate: - t = [] - for span in args[0]: - t.extend(self._tokenizer.text_to_ids(span['str'], span['lang'])) - return t - - t = self._tokenizer.text_to_ids(*args) - return t - - super().__init__( - manifest_filepath=manifest_filepath, - parser=TokenizerWrapper(tokenizer), - sample_rate=sample_rate, - int_values=int_values, - augmentor=augmentor, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - trim=trim, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) - - -class _TarredAudioToTextDataset(IterableDataset): - """ - A similar Dataset to the AudioToCharDataset/AudioToBPEDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset/AudioToBPEDataset), - as well as the path(s) to the tarball(s) containing the wav files. 
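The `TokenizerWrapper` defined above dispatches differently for aggregate tokenizers, whose input is a list of `{'str': ..., 'lang': ...}` spans rather than a single string. A minimal sketch of that dispatch with a dummy character-level tokenizer standing in for a real `TokenizerSpec`; both class names here are illustrative.

```python
from typing import List, Optional, Union

class DummyCharTokenizer:
    """Stand-in tokenizer: maps each character to its ordinal value."""

    def text_to_ids(self, text: str, lang: Optional[str] = None) -> List[int]:
        return [ord(c) for c in text]

class TokenizerWrapper:
    def __init__(self, tokenizer, is_aggregate: bool = False):
        self._tokenizer = tokenizer
        self.is_aggregate = is_aggregate

    def __call__(self, text: Union[str, List[dict]]) -> List[int]:
        if self.is_aggregate and isinstance(text, list):
            ids: List[int] = []
            for span in text:  # each span carries its own language tag
                ids.extend(self._tokenizer.text_to_ids(span['str'], span['lang']))
            return ids
        return self._tokenizer.text_to_ids(text)

wrapper = TokenizerWrapper(DummyCharTokenizer(), is_aggregate=True)
print(wrapper([{'str': 'hi', 'lang': 'en'}, {'str': 'ja', 'lang': 'de'}]))
# [104, 105, 106, 97]
```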
Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference. - This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements. - Supported opening braces - { <=> (, [, < and the special tag _OP_. - Supported closing braces - } <=> ), ], > and the special tag _CL_. - For SLURM based tasks, we suggest the use of the special tags for ease of use. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple workers the number of shards should be divisible by world_size to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToCharDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - parser (callable): A callable which is used to pre-process the text output. - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - blank_index (int): Blank character index, defaults to -1. - unk_index (int): Unknown character index, defaults to -1. - normalize (bool): Dataset parameter. - Whether to use automatic text cleaning. - It is highly recommended to manually clean text for best results. - Defaults to True. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - bos_id (id): Dataset parameter. - Beginning of string symbol id used for seq2seq models. - Defaults to None. - eos_id (id): Dataset parameter. - End of string symbol id used for seq2seq models. - Defaults to None. - pad_id (id): Token used to pad when collating samples in batches. - If this is None, pads using 0s. - Defaults to None. 
- shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - return_sample_id (bool): whether to return the sample_id as a part of each sample - """ - - def __init__( - self, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: str, - parser: Callable, - sample_rate: int, - int_values: bool = False, - augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None, - shuffle_n: int = 0, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - shard_strategy: str = "scatter", - shard_manifests: bool = False, - global_rank: int = 0, - world_size: int = 0, - return_sample_id: bool = False, - ): - self.shard_manifests = shard_manifests - - # Shard manifests if necessary and possible and then expand the paths - manifest_filepath = shard_manifests_if_needed( - shard_manifests=shard_manifests, - shard_strategy=shard_strategy, - manifest_filepaths=manifest_filepath, - world_size=world_size, - global_rank=global_rank, - ) - - # If necessary, cache manifests from object store - cache_datastore_manifests(manifest_filepaths=manifest_filepath) - - self.manifest_processor = ASRManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=0, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID - ) - - self.len = self._compute_len() - - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) - self.trim = trim - self.eos_id = eos_id - self.bos_id = bos_id - self.pad_id = pad_id - self.return_sample_id = return_sample_id - - audio_tar_filepaths = expand_sharded_filepaths( - sharded_filepaths=audio_tar_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - - # Put together WebDataset - self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None) - - if shuffle_n > 
0: - self._dataset = self._dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - self._dataset = ( - self._dataset.rename(audio='wav;ogg;flac', key='__key__') - .to_tuple('audio', 'key') - .pipe(self._filter) - .pipe(self._loop_offsets) - .map(f=self._build_sample) - ) - - def _filter(self, iterator): - """This function is used to remove samples that have been filtered out by ASRAudioText already. - Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample - that was filtered out (e.g. for duration). - Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard, - which may make your code hang as one process will finish before the other. - """ - - class TarredAudioFilter: - def __init__(self, collection): - self.iterator = iterator - self.collection = collection - - def __iter__(self): - return self - - def __next__(self): - while True: - audio_bytes, audio_filename = next(self.iterator) - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - if file_id in self.collection.mapping: - return audio_bytes, audio_filename - - return TarredAudioFilter(self.manifest_processor.collection) - - def _loop_offsets(self, iterator): - """This function is used to iterate through utterances with different offsets for each file. - """ - - class TarredAudioLoopOffsets: - def __init__(self, collection): - self.iterator = iterator - self.collection = collection - self.current_fn = None - self.current_bytes = None - self.offset_id = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.current_fn is None: - self.current_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - offset_list = self.collection.mapping[self.current_fn] - if len(offset_list) == self.offset_id + 1: - self.current_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - self.offset_id += 1 - - return self.current_bytes, self.current_fn, self.offset_id - - return TarredAudioLoopOffsets(self.manifest_processor.collection) - - def _collate_fn(self, batch): - return _speech_collate_fn(batch, self.pad_id) - - def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. 
- """ - audio_bytes, audio_filename, offset_id = tup - - # Grab manifest entry from self.manifest_preprocessor.collection - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - manifest_idx = self.manifest_processor.collection.mapping[file_id][offset_id] - manifest_entry = self.manifest_processor.collection[manifest_idx] - - offset = manifest_entry.offset - if offset is None: - offset = 0 - - # Convert audio bytes to IO stream for processing (for SoundFile to read) - audio_filestream = io.BytesIO(audio_bytes) - features = self.featurizer.process( - audio_filestream, - offset=offset, - duration=manifest_entry.duration, - trim=self.trim, - orig_sr=manifest_entry.orig_sr, - ) - audio_filestream.close() - - # Audio features - f, fl = features, torch.tensor(features.shape[0]).long() - - # Text features - t, tl = manifest_entry.text_tokens, len(manifest_entry.text_tokens) - - self.manifest_processor.process_text_by_sample(sample=manifest_entry) - - if self.bos_id is not None: - t = [self.bos_id] + t - tl += 1 - if self.eos_id is not None: - t = t + [self.eos_id] - tl += 1 - - if self.return_sample_id: - return f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), manifest_idx - else: - return f, fl, torch.tensor(t).long(), torch.tensor(tl).long() - - def get_manifest_sample(self, sample_id): - return self.manifest_processor.collection[sample_id] - - def __iter__(self): - return self._dataset.__iter__() - - def _compute_len(self): - if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized(): - my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda() - torch.distributed.all_reduce(my_len) - my_len = my_len.int() - logging.info(f'Sharded manifests: Total length: {my_len}') - else: - my_len = len(self.manifest_processor.collection) - - return my_len - - def __len__(self): - return self.len - - -class TarredAudioToCharDataset(_TarredAudioToTextDataset): - """ - A similar Dataset to the AudioToCharDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple workers the number of shards should be divisible by world_size to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToCharDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. 
- - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - labels (list): List of characters that can be output by the ASR model. - For Jasper, this is the 28 character set {a-z '}. The CTC blank - symbol is automatically added later for models using ctc. - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - blank_index (int): Blank character index, defaults to -1. - unk_index (int): Unknown character index, defaults to -1. - normalize (bool): Dataset parameter. - Whether to use automatic text cleaning. - It is highly recommended to manually clean text for best results. - Defaults to True. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - bos_id (id): Dataset parameter. - Beginning of string symbol id used for seq2seq models. - Defaults to None. - eos_id (id): Dataset parameter. - End of string symbol id used for seq2seq models. - Defaults to None. - pad_id (id): Token used to pad when collating samples in batches. - If this is None, pads using 0s. - Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
- return_sample_id (bool): whether to return the sample_id as a part of each sample - """ - - def __init__( - self, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: str, - labels: List[str], - sample_rate: int, - int_values: bool = False, - augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None, - shuffle_n: int = 0, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - blank_index: int = -1, - unk_index: int = -1, - normalize: bool = True, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - parser: Optional[str] = 'en', - pad_id: int = 0, - shard_strategy: str = "scatter", - shard_manifests: bool = False, - global_rank: int = 0, - world_size: int = 0, - return_sample_id: bool = False, - ): - self.labels = labels - - parser = parsers.make_parser( - labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize - ) - - super().__init__( - audio_tar_filepaths=audio_tar_filepaths, - manifest_filepath=manifest_filepath, - parser=parser, - sample_rate=sample_rate, - int_values=int_values, - augmentor=augmentor, - shuffle_n=shuffle_n, - min_duration=min_duration, - max_duration=max_duration, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - shard_strategy=shard_strategy, - shard_manifests=shard_manifests, - global_rank=global_rank, - world_size=world_size, - return_sample_id=return_sample_id, - ) - - -class TarredAudioToBPEDataset(_TarredAudioToTextDataset): - """ - A similar Dataset to the AudioToBPEDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToBPEDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple workers the number of shards should be divisible by world_size to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - tokenizer (TokenizerSpec): Either a Word Piece Encoding tokenizer (BERT), - or a Sentence Piece Encoding tokenizer (BPE). The CTC blank - symbol is automatically added later for models using ctc. 
- sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] - tokens to beginning and ending of speech respectively. - pad_id (id): Token used to pad when collating samples in batches. - If this is None, pads using 0s. - Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
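A construction sketch for the BPE variant follows. The SentencePiece tokenizer import path and its `model_path` argument are assumptions, and all file paths are placeholders; any `TokenizerSpec` implementation could be substituted.

```python
# A minimal sketch, assuming a pre-trained SentencePiece model exists at the given path.
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.collections.asr.data.audio_to_text import TarredAudioToBPEDataset

tokenizer = SentencePieceTokenizer(model_path="/models/tokenizer.model")  # assumed path

dataset = TarredAudioToBPEDataset(
    audio_tar_filepaths=["/data/audio_1.tar", "/data/audio_2.tar"],  # explicit list, no brace expansion
    manifest_filepath="/data/tarred_manifest.json",
    tokenizer=tokenizer,
    sample_rate=16000,
    shuffle_n=128,
    use_start_end_token=True,    # BOS/EOS ids are taken from the tokenizer when it defines them
    shard_strategy="replicate",  # every rank samples every shard (see the warning above)
    global_rank=0,
    world_size=1,
)
```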
- return_sample_id (bool): whether to return the sample_id as a part of each sample - """ - - def __init__( - self, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - sample_rate: int, - int_values: bool = False, - augmentor: Optional['nemo.collections.asr.parts.perturb.AudioAugmentor'] = None, - shuffle_n: int = 0, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - trim: bool = False, - use_start_end_token: bool = True, - shard_strategy: str = "scatter", - shard_manifests: bool = False, - global_rank: int = 0, - world_size: int = 0, - return_sample_id: bool = False, - ): - if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): - self.is_aggregate = True - else: - self.is_aggregate = False - self._tokenizer = tokenizer - - def __call__(self, *args): - if isinstance(args[0], List) and self.is_aggregate: - t = [] - for span in args[0]: - t.extend(self._tokenizer.text_to_ids(span['str'], span['lang'])) - return t - - t = self._tokenizer.text_to_ids(*args) - return t - - super().__init__( - audio_tar_filepaths=audio_tar_filepaths, - manifest_filepath=manifest_filepath, - parser=TokenizerWrapper(tokenizer), - sample_rate=sample_rate, - int_values=int_values, - augmentor=augmentor, - shuffle_n=shuffle_n, - min_duration=min_duration, - max_duration=max_duration, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - shard_strategy=shard_strategy, - shard_manifests=shard_manifests, - global_rank=global_rank, - world_size=world_size, - return_sample_id=return_sample_id, - ) - - -class BucketingDataset(IterableDataset): - """ - A Dataset which wraps another IterableDataset and adopts it for bucketing - Args: - dataset (IterableDataset): The IterableDataset to get wrapped - bucketing_batch_size (int): Number of samples to build a batch - """ - - def __init__( - self, dataset: IterableDataset, bucketing_batch_size: int, - ): - self.wrapped_dataset = dataset - self.bucketing_batch_size = bucketing_batch_size - super().__init__() - - def _collate_fn(self, batch): - return _speech_collate_fn(batch[0], self.wrapped_dataset.pad_id) - - def __iter__(self): - return BucketingIterator( - wrapped_ds=self.wrapped_dataset._dataset, bucketing_batch_size=self.bucketing_batch_size - ).__iter__() - - def __len__(self): - return int(math.ceil(len(self.wrapped_dataset) / float(self.bucketing_batch_size))) - - -class BucketingIterator: - def __init__(self, wrapped_ds, bucketing_batch_size): - self.wrapped_ds = wrapped_ds - self.wrapped_iter = None - self.bucketing_batch_size = bucketing_batch_size - - def __iter__(self): - self.wrapped_iter = iter(self.wrapped_ds) - return self - - def __next__(self): - batches = [] - for idx in range(self.bucketing_batch_size): - try: - sample = next(self.wrapped_iter) - except StopIteration: - break - batches.append(sample) - if len(batches) == 0: - raise StopIteration - return batches - - -class RandomizedChainDataset(ChainDataset): - def __init__(self, datasets: Iterable[Dataset], rnd_seed=0) -> 
None: - super(RandomizedChainDataset, self).__init__(list(datasets)) - self.rnd_gen = np.random.RandomState(rnd_seed) - - def __iter__(self): - shuffled_order = self.rnd_gen.permutation(len(self.datasets)) - for dataset_idx in shuffled_order: - d = self.datasets[dataset_idx] - assert isinstance(d, IterableDataset), "ChainDataset only supports IterableDataset" - for idx, x in enumerate(d): - yield x - # in case d is an infinite dataset, we want to break the loop - # so that the other datasets get a chance to yield too - if idx >= len(d) - 1: - break diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dali.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dali.py deleted file mode 100644 index 77bd71129cc2b31debbb4400acecab99b50aa264..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dali.py +++ /dev/null @@ -1,772 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import operator -import os.path -import time -from collections.abc import Iterator -from typing import Callable, List, Optional, Union - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_sharded_filepaths -from nemo.collections.common.parts.preprocessing import parsers -from nemo.utils import logging, model_utils - -try: - import nvidia.dali as dali - from nvidia.dali.pipeline import Pipeline - from nvidia.dali.plugin.pytorch import DALIGenericIterator as DALIPytorchIterator - from nvidia.dali.plugin.pytorch import LastBatchPolicy as LastBatchPolicy - - HAVE_DALI = True -except (ImportError, ModuleNotFoundError): - HAVE_DALI = False - -__all__ = [ - 'AudioToCharDALIDataset', - 'AudioToBPEDALIDataset', -] - -""" -Below minimum version is required to access the "read_idxs" argument in -dali.fn.readers.nemo_asr -""" -__DALI_MINIMUM_VERSION__ = "1.11" - -DALI_INSTALLATION_MESSAGE = ( - "Could not import `nvidia.dali`.\n" - "Please install DALI by following the steps provided here - \n" - "https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html" -) - - -def is_dali_supported(min_version: str, verbose: bool = False) -> bool: - """ - Checks if DALI in installed, and version is >= min_verion. - - Args: - min_version: A semver str that is the minimum requirement. - verbose: Whether to log the installation instructions if DALI is not found. - - Returns: - bool - whether DALI could be imported or not. 
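A small guard sketch using the helper above: check whether a new-enough DALI is installed before choosing a DALI-backed data layer, and otherwise fall back to the plain WebDataset-based datasets (fallback construction omitted here).

```python
# Availability check before opting into the DALI pipeline.
from nemo.collections.asr.data.audio_to_text_dali import __DALI_MINIMUM_VERSION__, is_dali_supported

if is_dali_supported(__DALI_MINIMUM_VERSION__, verbose=True):
    use_dali = True   # safe to build AudioToCharDALIDataset / AudioToBPEDALIDataset
else:
    use_dali = False  # fall back to the (Tarred)AudioToText datasets defined earlier
```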
- """ - module_available, _ = model_utils.check_lib_version( - 'nvidia.dali', checked_version=min_version, operator=operator.ge - ) - - # If DALI is not installed - if module_available is None: - if verbose: - logging.info(DALI_INSTALLATION_MESSAGE) - - return False - - return module_available - - -class DALIOutputs(object): - def __init__(self, out_dict): - self._has_processed_signal = 'processed_signal' in out_dict and 'processed_signal_len' in out_dict - if not self._has_processed_signal: - assert 'audio' in out_dict and 'audio_len' in out_dict - assert 'transcript' in out_dict and 'transcript_len' in out_dict - if self._has_processed_signal: - self._outs = ( - out_dict['processed_signal'], - out_dict['processed_signal_len'].reshape(-1), - out_dict['transcript'], - out_dict['transcript_len'].reshape(-1), - ) - else: - self._outs = ( - out_dict['audio'], - out_dict['audio_len'].reshape(-1), - out_dict['transcript'], - out_dict['transcript_len'].reshape(-1), - ) - - @property - def has_processed_signal(self): - return self._has_processed_signal - - def __getitem__(self, key): - return self._outs[key] - - def __len__(self): - return len(self._outs) - - -class _AudioTextDALIDataset(Iterator): - """ - NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line containing a sample descriptor in JSON, - including audio files, transcripts, and durations (in seconds). - Here's an example: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths. - device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'. - batch_size (int): Number of samples in a batch. - parser (str, callable): A str for an inbuilt parser, or a callable with signature f(str) -> List[int]. - sample_rate (int): Sample rate to resample loaded audio to. - num_threads (int): Number of CPU processing threads to be created by the DALI pipeline. - max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files. - min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files. - bos_id (int): Id of beginning of sequence symbol to append if not None - eos_id (int): Id of end of sequence symbol to append if not None - pad_id (int): Id used to pad the input. Defaults to 0 if not provided. - trim (bool): If True, it will extract the nonsilent region of the loaded audio signal. - shuffle (bool): If set to True, the dataset will shuffled after loading. - drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size. - If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. - device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 1. - preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor. 
- return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet). - """ - - def __init__( - self, - manifest_filepath: str, - device: str, - batch_size: int, - parser: Union[str, Callable], - audio_tar_filepaths: Optional[Union[str, List[str]]] = None, - audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None, - sample_rate: int = 16000, - num_threads: int = 4, - max_duration: float = 0.0, - min_duration: float = 0.0, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - trim: bool = False, - shuffle: bool = False, - drop_last: bool = False, - shard_strategy: str = "scatter", - device_id: int = 0, - global_rank: int = 0, - world_size: int = 1, - preprocessor_cfg: DictConfig = None, - return_sample_id: bool = False, - ): - self.drop_last = drop_last # used by lr_scheduler - if return_sample_id: - raise ValueError( - "Currently DALI data layers don't support returning the sample_id and return_sample_id can not be enabled." - ) - self.return_sample_id = return_sample_id - - if not HAVE_DALI: - raise ModuleNotFoundError( - f"{self} requires NVIDIA DALI to be installed. " - f"See: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#id1" - ) - - if device not in ('cpu', 'gpu'): - raise ValueError( - f"{self} received an unexpected device argument {device}. Supported values are: 'cpu', 'gpu'" - ) - - device_id = device_id if device == 'gpu' else None - - self.batch_size = batch_size # Used by NeMo - - self.device = device - self.device_id = device_id - - if world_size > 1: - self.shard_id = global_rank - self.num_shards = world_size - else: - self.shard_id = None - self.num_shards = None - - self.eos_id = eos_id - self.bos_id = bos_id - self.sample_rate = sample_rate - - self.pipe = Pipeline( - batch_size=batch_size, - num_threads=num_threads, - device_id=self.device_id, - exec_async=True, - exec_pipelined=True, - ) - - has_preprocessor = preprocessor_cfg is not None - if has_preprocessor: - if preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor": - feature_type = "mel_spectrogram" - elif preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMFCCPreprocessor": - feature_type = "mfcc" - else: - raise ValueError( - f"{self} received an unexpected preprocessor configuration: {preprocessor_cfg._target_}." - f" Supported preprocessors are: AudioToMelSpectrogramPreprocessor, AudioToMFCCPreprocessor" - ) - - # Default values taken from AudioToMelSpectrogramPreprocessor - params = preprocessor_cfg - self.dither = params['dither'] if 'dither' in params else 0.0 - self.preemph = params['preemph'] if 'preemph' in params else 0.97 - self.window_size_sec = params['window_size'] if 'window_size' in params else 0.02 - self.window_stride_sec = params['window_stride'] if 'window_stride' in params else 0.01 - self.sample_rate = params['sample_rate'] if 'sample_rate' in params else sample_rate - self.window_size = int(self.window_size_sec * self.sample_rate) - self.window_stride = int(self.window_stride_sec * self.sample_rate) - - normalize = params['normalize'] if 'normalize' in params else 'per_feature' - if normalize == 'per_feature': # Each freq channel independently - self.normalization_axes = (1,) - elif normalize == 'all_features': - self.normalization_axes = (0, 1) - else: - raise ValueError( - f"{self} received {normalize} for the normalize parameter." - f" It must be either 'per_feature' or 'all_features'." 
- ) - - self.window = None - window_name = params['window'] if 'window' in params else 'hann' - torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'none': None, - } - - if window_name == 'ones': - window_tensor = torch.ones(self.window_size) - else: - try: - window_fn = torch_windows.get(window_name, None) - except: - raise ValueError( - f"{self} received '{window_name}' for the window parameter." - f" It must be one of: ('hann', 'ones', 'hamming', 'blackman', 'bartlett', None)." - f" None is equivalent to 'hann'." - ) - window_tensor = window_fn(self.window_size, periodic=False) if window_fn else None - self.window = window_tensor.numpy().tolist() if window_tensor is not None else None - - self.n_fft = params['n_fft'] if 'n_fft' in params else 2 ** math.ceil(math.log2(self.window_size)) - self.n_mels = params['n_mels'] if 'n_mels' in params else 64 - self.n_mfcc = params['n_mfcc'] if 'n_mfcc' in params else 64 - - features = params['features'] if 'features' in params else 0 - if features > 0: - if feature_type == 'mel_spectrogram': - self.n_mels = features - elif feature_type == 'mfcc': - self.n_mfcc = features - - # TODO Implement frame splicing - if 'frame_splicing' in params: - assert params['frame_splicing'] == 1, "Frame splicing is not implemented" - - self.freq_low = params['lowfreq'] if 'lowfreq' in params else 0.0 - self.freq_high = params['highfreq'] if 'highfreq' in params else self.sample_rate / 2.0 - self.log_features = params['log'] if 'log' in params else True - - # We want to avoid taking the log of zero - # There are two options: either adding or clamping to a small value - - self.log_zero_guard_type = params['log_zero_guard_type'] if 'log_zero_guard_type' in params else 'add' - if self.log_zero_guard_type not in ["add", "clamp"]: - raise ValueError( - f"{self} received {self.log_zero_guard_type} for the " - f"log_zero_guard_type parameter. It must be either 'add' or " - f"'clamp'." - ) - - self.log_zero_guard_value = ( - params['log_zero_guard_value'] if 'log_zero_guard_value' in params else 2 ** -24 - ) - if isinstance(self.log_zero_guard_value, str): - if self.log_zero_guard_value == "tiny": - self.log_zero_guard_value = torch.finfo(torch.float32).tiny - elif self.log_zero_guard_value == "eps": - self.log_zero_guard_value = torch.finfo(torch.float32).eps - else: - raise ValueError( - f"{self} received {self.log_zero_guard_value} for the log_zero_guard_type parameter." - f"It must be either a number, 'tiny', or 'eps'" - ) - - self.mag_power = params['mag_power'] if 'mag_power' in params else 2 - if self.mag_power != 1.0 and self.mag_power != 2.0: - raise ValueError( - f"{self} received {self.mag_power} for the mag_power parameter." f" It must be either 1.0 or 2.0." 
- ) - - self.pad_to = max(params['pad_to'], 1) if 'pad_to' in params else 16 - self.pad_value = params['pad_value'] if 'pad_value' in params else 0.0 - - with self.pipe: - if audio_tar_filepaths is None and audio_tar_index_filepaths is None: - audio, indices = dali.fn.readers.nemo_asr( - name="Reader", - manifest_filepaths=manifest_filepath.split(','), - dtype=dali.types.FLOAT, - downmix=True, - sample_rate=float(self.sample_rate), - min_duration=min_duration, - max_duration=max_duration, - read_sample_rate=False, - read_text=False, - read_idxs=True, - random_shuffle=shuffle, - shard_id=self.shard_id, - num_shards=self.num_shards, - pad_last_batch=True, - ) - - self.is_tarred_dataset = False - - elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None: - audio_tar_filepaths = expand_sharded_filepaths( - audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank - ) - audio_tar_index_filepaths = expand_sharded_filepaths( - audio_tar_index_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - - if len(audio_tar_filepaths) != len(audio_tar_index_filepaths) and len(audio_tar_index_filepaths) != 0: - raise ValueError( - f"Number of filepaths provided for `audio_tar_filepaths` must match " - f"`audio_tar_index_filepaths`. Got {len(audio_tar_filepaths)} audio_tar_filepaths and " - f"{len(audio_tar_index_filepaths)} audio_tar_index_filepaths." - ) - - tar_file = dali.fn.readers.webdataset( - paths=audio_tar_filepaths, - index_paths=audio_tar_index_filepaths, - name="Reader", - ext=["wav"], - missing_component_behavior="error", - random_shuffle=shuffle, - shard_id=self.shard_id, - num_shards=self.num_shards, - pad_last_batch=True, - ) - audio, _ = dali.fn.decoders.audio( - tar_file, dtype=dali.types.FLOAT, downmix=True, sample_rate=float(self.sample_rate), - ) - indices = dali.fn.get_property(tar_file, key="source_info") - indices = dali.fn.pad(indices) - - self.is_tarred_dataset = True - - else: - raise RuntimeError( - "When using DALI datasets, either `audio_tar_filepaths` " - "and `audio_tar_index_filepaths` should either both be None (sequential dataset)" - "or provided (tarred dataset)." 
- ) - - # Extract nonsilent region, if necessary - if trim: - # Need to extract non-silent region before moving to the GPU - roi_start, roi_len = dali.fn.nonsilent_region(audio, cutoff_db=-60) - audio = audio.gpu() if self.device == 'gpu' else audio - audio = dali.fn.slice( - audio, roi_start, roi_len, normalized_anchor=False, normalized_shape=False, axes=[0] - ) - else: - audio = audio.gpu() if self.device == 'gpu' else audio - - if not has_preprocessor: - # No preprocessing, the output is the audio signal - audio_len = dali.fn.shapes(dali.fn.reshape(audio, shape=[-1])) - audio = dali.fn.pad(audio) - self.pipe.set_outputs(audio, audio_len, indices) - else: - # Additive gaussian noise (dither) - if self.dither > 0.0: - gaussian_noise = dali.fn.random.normal(audio) - audio = audio + self.dither * gaussian_noise - - # Preemphasis filter - if self.preemph > 0.0: - audio = dali.fn.preemphasis_filter(audio, preemph_coeff=self.preemph, border='zero') - - # Power spectrogram - spec = dali.fn.spectrogram( - audio, - nfft=self.n_fft, - window_length=self.window_size, - window_step=self.window_stride, - window_fn=self.window, - ) - - if feature_type == 'mel_spectrogram' or feature_type == 'mfcc': - # Spectrogram to Mel Spectrogram - spec = dali.fn.mel_filter_bank( - spec, - sample_rate=self.sample_rate, - nfilter=self.n_mels, - normalize=True, - freq_low=self.freq_low, - freq_high=self.freq_high, - ) - # Mel Spectrogram to MFCC - if feature_type == 'mfcc': - spec = dali.fn.mfcc(spec, n_mfcc=self.n_mfcc) - - # Logarithm - if self.log_zero_guard_type == 'add': - spec = spec + self.log_zero_guard_value - - spec = dali.fn.to_decibels( - spec, multiplier=math.log(10), reference=1.0, cutoff_db=math.log(self.log_zero_guard_value) - ) - - # Normalization - spec = dali.fn.normalize(spec, axes=self.normalization_axes, epsilon=1e-5 ** 2, ddof=1) - - # Extracting the length of the spectrogram - spec_len = dali.fn.slice(dali.fn.shapes(spec), 1, 1, axes=(0,)) - - # Pads feature dimension to be a multiple of `pad_to` and the temporal dimension to be as big as the largest sample (shape -1) - spec = dali.fn.pad(spec, fill_value=self.pad_value, axes=(0, 1), align=(self.pad_to, 1), shape=(1, -1)) - self.pipe.set_outputs(spec, spec_len, indices) - - x = time.time() - # Building DALI pipeline - self.pipe.build() - y = time.time() - - logging.info(f"Time for pipe.build() : {(y - x)} seconds") - - if has_preprocessor: - output_names = ['processed_signal', 'processed_signal_len', 'manifest_indices'] - else: - output_names = ['audio', 'audio_len', 'manifest_indices'] - - x = time.time() - last_batch_policy = LastBatchPolicy.DROP if drop_last else LastBatchPolicy.PARTIAL - self._iter = DALIPytorchIterator( - [self.pipe], - output_map=output_names, - reader_name="Reader", - last_batch_policy=last_batch_policy, - dynamic_shape=True, - auto_reset=True, - ) - y = time.time() - logging.info(f"Time for DALIPytorchIterator to initialize : {(y - x)} seconds") - - # TODO come up with a better solution - class DummyDataset: - def __init__(self, parent): - self.parent = parent - - def __len__(self): - return self.parent.size - - self.dataset = DummyDataset(self) # Used by NeMo - - x = time.time() - self.manifest_processor = ASRManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=0, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - index_by_file_id=self.is_tarred_dataset, - ) - y = time.time() - logging.info(f"Time to build nemo manifest 
processor - {(y - x)} seconds") - - def reset(self): - self._iter.reset() - - def __iter__(self): - return self - - def next(self): - return self.__next__() - - @property - def size(self): - return self._iter.size - - def __len__(self): - return len(self._iter) - - def __next__(self): - outputs = self._iter.next() - assert len(outputs) == 1 - dali_out = outputs[0] - manifest_indices = dali_out['manifest_indices'].numpy() - - out = {} - out_names = ['processed_signal', 'processed_signal_len', 'audio', 'audio_len'] - for out_name in out_names: - if out_name in dali_out: - out[out_name] = dali_out[out_name].detach().clone() - - text_tokens = [] - text_tokens_len = [] - max_len = 0 - batch_size = manifest_indices.shape[0] - for i, manifest_index in enumerate(manifest_indices): - - if not self.is_tarred_dataset: - # Loose-file dataset. Index is integer based. - manifest_index = manifest_index[0] - text, text_length = self.manifest_processor.process_text_by_id(manifest_index) - else: - # Tarred-file dataset. Index is filename based. - resolved_manifest_indices = manifest_index.tobytes().decode().split(":") - resolved_manifest_index = resolved_manifest_indices[2] # we require just the filename segment - resolved_manifest_index = os.path.splitext(resolved_manifest_index)[0] # we dont need file extension - text, text_length = self.manifest_processor.process_text_by_file_id(resolved_manifest_index) - - text_tokens_len.append(text_length) - text_tokens.append(text) - if text_length > max_len: - max_len = text_length - - transcript_out = torch.full([batch_size, max_len], fill_value=self.manifest_processor.pad_id, dtype=torch.long) - for i, n in enumerate(text_tokens_len): - transcript_out[i, :n] = torch.tensor(text_tokens[i], dtype=torch.long) - transcript_len_out = torch.tensor(text_tokens_len, dtype=torch.long) - - out['transcript'] = transcript_out - out['transcript_len'] = transcript_len_out - return DALIOutputs(out) - - -class AudioToCharDALIDataset(_AudioTextDALIDataset): - """ - Character based NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line containing a - sample descriptor in JSON, including audio files, transcripts, and durations (in seconds). - Here's an example: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths. - device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'. - batch_size (int): Number of samples in a batch. - labels (List[str]): String containing all the possible characters to map to. - sample_rate (int): Sample rate to resample loaded audio to. - num_threads (int): Number of CPU processing threads to be created by the DALI pipeline. - max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files. - min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files. 
- blank_index (int): blank character index, default = -1 - unk_index (int): unk_character index, default = -1 - normalize (bool): whether to normalize transcript text (default): True - bos_id (int): Id of beginning of sequence symbol to append if not None - eos_id (int): Id of end of sequence symbol to append if not None - pad_id (int): Id used to pad the input. Defaults to 0 if not provided. - trim (bool): If True, it will extract the nonsilent region of the loaded audio signal. - shuffle (bool): If set to True, the dataset will shuffled after loading. - drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size. - If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. - parser (str, callable): A str for an inbuilt parser, or a callable with signature f(str) -> List[int]. - device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 1. - preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor. - return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet). - """ - - def __init__( - self, - manifest_filepath: str, - device: str, - batch_size: int, - labels: Union[str, List[str]], - sample_rate: int = 16000, - audio_tar_filepaths: Optional[Union[str, List[str]]] = None, - audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None, - num_threads: int = 4, - max_duration: float = 0.0, - min_duration: float = 0.0, - blank_index: int = -1, - unk_index: int = -1, - normalize: bool = True, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - trim: bool = False, - shuffle: bool = False, - drop_last: bool = False, - parser: Union[str, Callable] = 'en', - shard_strategy: str = "scatter", - device_id: int = 0, - global_rank: int = 0, - world_size: int = 1, - preprocessor_cfg: DictConfig = None, - return_sample_id: bool = False, - ): - self.labels = labels - - parser = parsers.make_parser( - labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize - ) - - super().__init__( - manifest_filepath=manifest_filepath, - device=device, - batch_size=batch_size, - audio_tar_filepaths=audio_tar_filepaths, - audio_tar_index_filepaths=audio_tar_index_filepaths, - sample_rate=sample_rate, - num_threads=num_threads, - max_duration=max_duration, - min_duration=min_duration, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - trim=trim, - shuffle=shuffle, - drop_last=drop_last, - parser=parser, - shard_strategy=shard_strategy, - device_id=device_id, - global_rank=global_rank, - world_size=world_size, - preprocessor_cfg=preprocessor_cfg, - return_sample_id=return_sample_id, - ) - - -class AudioToBPEDALIDataset(_AudioTextDALIDataset): - """ - Subword based NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line containing a - sample descriptor in JSON, including audio files, transcripts, and durations (in seconds). - Here's an example: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} - ... 
- {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths. - tokenizer (TokenizerSpec): A TokenizerSpec implementation that wraps a tokenization implementation. - device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'. - batch_size (int): Number of samples in a batch. - sample_rate (int): Sample rate to resample loaded audio to. - num_threads (int): Number of CPU processing threads to be created by the DALI pipeline. - max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files. - min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files. - bos_id (int): Id of beginning of sequence symbol to append if not None. Injected from the tokenizer. - eos_id (int): Id of end of sequence symbol to append if not None. Injected from the tokenizer. - pad_id (int): Id used to pad the input. Defaults to 0 if not provided. Injected from the tokenizer. - trim (bool): If True, it will extract the nonsilent region of the loaded audio signal. - shuffle (bool): If set to True, the dataset will shuffled after loading. - drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when the shard size is not divisible by the batch size. - If set to False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. - - device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 1. - preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor. - use_start_end_token (bool): Boolean which dictates whether to add [BOS] and [EOS] tokens to beginning and - ending of speech respectively. - return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet). 
- """ - - def __init__( - self, - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - device: str, - batch_size: int, - sample_rate: int = 16000, - audio_tar_filepaths: Optional[Union[str, List[str]]] = None, - audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None, - num_threads: int = 4, - max_duration: float = 0.0, - min_duration: float = 0.0, - trim: bool = False, - shuffle: bool = False, - drop_last: bool = False, - shard_strategy: str = "scatter", - device_id: int = 0, - global_rank: int = 0, - world_size: int = 1, - preprocessor_cfg: DictConfig = None, - use_start_end_token: bool = True, - return_sample_id: bool = False, - ): - - if use_start_end_token and hasattr(tokenizer, 'bos_token'): - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, 'eos_token'): - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, 'pad_token'): - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - self._tokenizer = tokenizer - - def __call__(self, text): - t = self._tokenizer.text_to_ids(text) - return t - - super().__init__( - manifest_filepath=manifest_filepath, - device=device, - batch_size=batch_size, - sample_rate=sample_rate, - audio_tar_filepaths=audio_tar_filepaths, - audio_tar_index_filepaths=audio_tar_index_filepaths, - num_threads=num_threads, - max_duration=max_duration, - min_duration=min_duration, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - trim=trim, - shuffle=shuffle, - drop_last=drop_last, - parser=TokenizerWrapper(tokenizer), - shard_strategy=shard_strategy, - device_id=device_id, - global_rank=global_rank, - world_size=world_size, - preprocessor_cfg=preprocessor_cfg, - return_sample_id=return_sample_id, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dataset.py deleted file mode 100644 index 3234b617cc9c73baf4a96cb098256ed1e610c810..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/audio_to_text_dataset.py +++ /dev/null @@ -1,950 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import random -from math import isclose -from typing import Any, List, Optional, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from omegaconf.listconfig import ListConfig -from pytorch_lightning.callbacks import BasePredictionWriter -from torch.utils.data import ChainDataset - -from nemo.collections.asr.data import audio_to_text, audio_to_text_dali -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset -from nemo.utils import logging - - -def inject_dataloader_value_from_model_config(model_cfg: dict, dataloader_cfg: DictConfig, key: str): - """ - Extracts the label set provided at the top level of the model, and propagates it to the dataloader - config. - - Args: - model_cfg: A DictConfig representing the model's config. - dataloader_cfg: A DictConfig representing the individual data loader - key: A str value representing a key in the model_cfg whose value will be propagated to the - dataloader config. - """ - if key not in model_cfg: - logging.info( - f"Model level config does not contain `{key}`, please explicitly provide `{key}` to the dataloaders." - ) - return - - if not isinstance(dataloader_cfg, DictConfig): - dataloader_cfg = DictConfig(dataloader_cfg) - - # If key exists in the data loader config (either set explicitly or as a placeholder (via None)) - if key in dataloader_cfg: - # Dataloader `labels` is provided and is non-null - if dataloader_cfg[key] is not None and model_cfg[key] != dataloader_cfg[key]: - # Model level `labels` dont match Dataloader level `labels` - logging.warning( - f'`{key}` is explicitly provided to the data loader, and is different from ' - f'the `{key}` provided at the model level config.\n' - f'If this is incorrect, please set the dataloader\'s `{key}` to None.' - ) - - else: - # Dataloader `key` is None or values match - # Propagate from model level `key` (even if they match) - with open_dict(dataloader_cfg): - dataloader_cfg[key] = model_cfg[key] - - else: - # If key key doesnt even exist in dataloader_cfg, inject it explicitly - with open_dict(dataloader_cfg): - dataloader_cfg[key] = model_cfg[key] - - -def get_concat_char_dataset( - config: dict, global_rank: int, world_size: int, augmentor: Optional['AudioAugmentor'] = None -) -> ConcatDataset: - """ - Instantiates an instance of ConcatDataset containing one or more intances of - Character Encoding based AudioToCharDataset. - - Args: - config: Config of the AudioToCharDataset. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of ConcatDataset containing one or more instances of AudioToCharDataset. 
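A small sketch of the label-propagation helper defined above, using toy configs. The values are illustrative; the point is that a key missing (or None) in the dataloader config is filled in from the model-level config.

```python
# Propagate a model-level key into a dataloader config.
from omegaconf import DictConfig

from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config

model_cfg = {"labels": [" ", "a", "b", "c"], "sample_rate": 16000}
dataloader_cfg = DictConfig({"manifest_filepath": "/data/manifest.json", "sample_rate": 16000})

inject_dataloader_value_from_model_config(model_cfg, dataloader_cfg, key="labels")
print(dataloader_cfg.labels)  # copied from the model-level config
```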
- """ - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - manifest_filepaths = config['manifest_filepath'] - datasets = [] - - # needed to support validation Concat Datasets that arrive here as - # [[dataset1,dataset2]] otherwise ModelPT would interfere - if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str): - logging.info(f"removing an extra nesting level from {manifest_filepaths}") - manifest_filepaths = config['manifest_filepath'][0] - - for manifest_filepath in manifest_filepaths: - conf = copy.deepcopy(config) - conf['manifest_filepath'] = manifest_filepath - - dataset = get_char_dataset(config=conf, augmentor=augmentor) - datasets.append(dataset) - - dataset = ConcatDataset( - datasets, - sampling_technique=config.get('concat_sampling_technique', 'temperature'), - sampling_temperature=config.get('concat_sampling_temperature', 5), - sampling_scale=config.get('concat_sampling_scale', 1), - sampling_probabilities=config.get('concat_sampling_probabilities', None), - shuffle=config.get('concat_shuffle', True), - seed=config.get('concat_sampling_seed', None), - global_rank=global_rank, - world_size=world_size, - ) - return dataset - - -def get_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = None) -> audio_to_text.AudioToCharDataset: - """ - Instantiates a Character Encoding based AudioToCharDataset. - - Args: - config: Config of the AudioToCharDataset. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of AudioToCharDataset. - """ - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - dataset = audio_to_text.AudioToCharDataset( - manifest_filepath=config['manifest_filepath'], - labels=config.get('labels', None), - sample_rate=config['sample_rate'], - int_values=config.get('int_values', False), - augmentor=augmentor, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - blank_index=config.get('blank_index', -1), - unk_index=config.get('unk_index', -1), - normalize=config.get('normalize_transcripts', False), - trim=config.get('trim_silence', False), - parser=config.get('parser', 'en'), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset - - -def get_concat_bpe_dataset( - config: dict, - tokenizer: 'TokenizerSpec', - global_rank: int, - world_size: int, - augmentor: Optional['AudioAugmentor'] = None, -) -> ConcatDataset: - """ - Instantiates a ContactDataset based on several Byte Pair Encoding / Word Piece Encoding based AudioToBPEDatasets. - - Args: - config: Config of the AudioToBPEDataset. - tokenizer: An instance of a TokenizerSpec object. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of ConcatDataset containing several instances of AudioToBPEDataset. 
- """ - manifest_filepaths = config['manifest_filepath'] - datasets = [] - - # needed to support validation Concat Datasets that arrive here as - # [[dataset1,dataset2]] otherwise ModelPT would interfere - if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str): - logging.info(f"removing an extra nesting level from {manifest_filepaths}") - manifest_filepaths = config['manifest_filepath'][0] - - for manifest_filepath in manifest_filepaths: - conf = copy.deepcopy(config) - conf['manifest_filepath'] = manifest_filepath - dataset = get_bpe_dataset(config=conf, tokenizer=tokenizer, augmentor=augmentor) - datasets.append(dataset) - - dataset = ConcatDataset( - datasets, - sampling_technique=config.get('concat_sampling_technique', 'temperature'), - sampling_temperature=config.get('concat_sampling_temperature', 5), - sampling_scale=config.get('concat_sampling_scale', 1), - sampling_probabilities=config.get('concat_sampling_probabilities', None), - shuffle=config.get('concat_shuffle', True), - seed=config.get('concat_sampling_seed', None), - global_rank=global_rank, - world_size=world_size, - ) - return dataset - - -def get_bpe_dataset( - config: dict, tokenizer: 'TokenizerSpec', augmentor: Optional['AudioAugmentor'] = None -) -> audio_to_text.AudioToBPEDataset: - """ - Instantiates a Byte Pair Encoding / Word Piece Encoding based AudioToBPEDataset. - - Args: - config: Config of the AudioToBPEDataset. - tokenizer: An instance of a TokenizerSpec object. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of AudioToBPEDataset. - """ - dataset = audio_to_text.AudioToBPEDataset( - manifest_filepath=config['manifest_filepath'], - tokenizer=tokenizer, - sample_rate=config['sample_rate'], - int_values=config.get('int_values', False), - augmentor=augmentor, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - trim=config.get('trim_silence', False), - use_start_end_token=config.get('use_start_end_token', True), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset - - -def get_concat_tarred_dataset( - config: dict, - shuffle_n: int, - global_rank: int, - world_size: int, - tokenizer: Optional['TokenizerSpec'] = None, - augmentor: Optional['AudioAugmentor'] = None, -) -> ConcatDataset: - """ - Instantiates a ConcatDataset containing multiple Word Piece/BPE Encoding based TarredAudioToBPEDataset or a char based TarredAudioToCharDataset. - - Args: - config: Config of the TarredAudioToBPEDataset or TarredAudioToCharDataset. - shuffle_n: How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - tokenizer: An instance of a TokenizerSpec object if BPE dataset is needed. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - Passsing None would return a char-based dataset. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of ConcatDataset containing one or more TarredAudioToBPEDatasets or TarredAudioToCharDatasets. 
- """ - - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - conf = copy.deepcopy(config) - conf['manifest_filepath'] = manifest_filepath - conf['tarred_audio_filepaths'] = tarred_audio_filepath - dataset = get_tarred_dataset( - config=conf, - tokenizer=tokenizer, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=augmentor, - ) - datasets.append(dataset) - - dataset = ConcatDataset( - datasets, - sampling_technique=config.get('concat_sampling_technique', 'temperature'), - sampling_temperature=config.get('concat_sampling_temperature', 5), - sampling_scale=config.get('concat_sampling_scale', 1), - sampling_probabilities=config.get('concat_sampling_probabilities', None), - shuffle=config.get('concat_shuffle', True), - seed=config.get('concat_sampling_seed', None), - global_rank=global_rank, - world_size=world_size, - ) - return dataset - - -def get_tarred_dataset( - config: dict, - shuffle_n: int, - global_rank: int, - world_size: int, - tokenizer: Optional['TokenizerSpec'] = None, - augmentor: Optional['AudioAugmentor'] = None, -) -> Union[audio_to_text.TarredAudioToBPEDataset, audio_to_text.TarredAudioToCharDataset]: - """ - Instantiates a Word Piece/BPE Encoding based TarredAudioToBPEDataset or a char based TarredAudioToCharDataset. - - Args: - config: Config of the TarredAudioToBPEDataset or TarredAudioToCharDataset. - shuffle_n: How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - tokenizer: An instance of a TokenizerSpec object if BPE dataset is needed. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - Passsing None would return a char-based dataset. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of TarredAudioToBPEDataset or TarredAudioToCharDataset. - """ - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths) - manifest_filepaths = convert_to_config_list(manifest_filepaths) - - bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets - if bucketing_weights: - for idx, weight in enumerate(bucketing_weights): - if not isinstance(weight, int) or weight <= 0: - raise ValueError(f"bucket weights must be positive integers") - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets." 
- ) - - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - if 'max_utts' in config: - raise ValueError('"max_utts" parameter is not supported for tarred datasets') - - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - if len(tarred_audio_filepath) == 1: - tarred_audio_filepath = tarred_audio_filepath[0] - if len(manifest_filepath) == 1: - manifest_filepath = manifest_filepath[0] - - if tokenizer is None: - dataset = audio_to_text.TarredAudioToCharDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - labels=config.get('labels', None), - sample_rate=config['sample_rate'], - int_values=config.get('int_values', False), - augmentor=augmentor, - shuffle_n=shuffle_n, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - blank_index=config.get('blank_index', -1), - unk_index=config.get('unk_index', -1), - normalize=config.get('normalize_transcripts', False), - trim=config.get('trim_silence', False), - parser=config.get('parser', 'en'), - shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - shard_manifests=config.get('shard_manifests', False), - global_rank=global_rank, - world_size=world_size, - return_sample_id=config.get('return_sample_id', False), - ) - else: - dataset = audio_to_text.TarredAudioToBPEDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - tokenizer=tokenizer, - sample_rate=config['sample_rate'], - int_values=config.get('int_values', False), - augmentor=augmentor, - shuffle_n=shuffle_n, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - use_start_end_token=config.get('use_start_end_token', True), - shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - shard_manifests=config.get('shard_manifests', False), - global_rank=global_rank, - world_size=world_size, - return_sample_id=config.get('return_sample_id', False), - ) - if bucketing_weights: - [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])] - else: - datasets.append(dataset) - - return get_chain_dataset(datasets=datasets, ds_config=config, rank=global_rank) - - -def get_code_switched_dataset( - config: dict, - shuffle_n: int, - global_rank: int, - world_size: int, - tokenizer: Optional['TokenizerSpec'] = None, - augmentor: Optional['AudioAugmentor'] = None, -) -> CodeSwitchedDataset: - - if 'manifest_filepath' not in config: - raise ValueError("`manifest_filepath` must be provided in the dataset config if `is_code_switched=True`") - if 'code_switched' not in config: - raise ValueError("`code_switched` param group must be in the dataset config if `is_code_switched=True`") - - manifest_filepaths = config['manifest_filepath'] - tarred_audio_filepaths = config.get('tarred_audio_filepaths', None) - - cs_config = OmegaConf.to_container(config['code_switched']) - - # needed to support validation Datasets that arrive here as - # [[dataset1,dataset2]] otherwise ModelPT would interfere - if len(manifest_filepaths) == 1 and not isinstance(manifest_filepaths[0], str): - manifest_filepaths = config['manifest_filepath'][0] - if tarred_audio_filepaths is None: - tarred_audio_filepaths = [None] * len(manifest_filepaths) - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths 
(length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of items." - ) - - datasets = [] - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - conf = copy.deepcopy(config) - conf['manifest_filepath'] = manifest_filepath - with open_dict(conf): - conf['tarred_audio_filepaths'] = tarred_audio_filepath - if tarred_audio_filepath is None or len(tarred_audio_filepath) == 0: - if tokenizer is None: - dataset = get_char_dataset(config=conf, augmentor=None) - else: - dataset = get_bpe_dataset(config=conf, tokenizer=tokenizer, augmentor=None) - else: - dataset = get_tarred_dataset( - config=conf, - tokenizer=tokenizer, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=None, - ) - datasets.append(dataset) - - config = OmegaConf.to_container(config) - - dataset = CodeSwitchedDataset( - datasets, - shuffle=cs_config.get('shuffle', True), - min_duration=cs_config.get('min_duration', 4), - max_duration=cs_config.get('max_duration', 20), - min_monolingual=cs_config.get('min_monolingual', 0.3), - lang_probs=cs_config.get('probs', None), - db_norm=cs_config.get('db_norm', -25.0), - pause_start=cs_config.get('pause_start', 0), - pause_join=cs_config.get('pause_join', 0), - pause_end=cs_config.get('pause_end', 0), - sampling_scales=cs_config.get('sampling_scales', None), - seed=cs_config.get('seed', None), - global_rank=global_rank, - world_size=world_size, - pure_random=cs_config.get('pure_random', False), - force_monochannel=cs_config.get('force_monochannel', True), - infinity_mode=cs_config.get('infinity_mode', False), - sample_rate=config['sample_rate'], - augmentor=augmentor, - ) - - return dataset - - -def get_dali_char_dataset( - config: dict, - shuffle: bool, - device_id: int, - global_rank: int, - world_size: int, - preprocessor_cfg: Optional[DictConfig] = None, -) -> audio_to_text_dali.AudioToCharDALIDataset: - """ - Instantiates a Character Encoding based AudioToCharDALIDataset. - - Args: - config: Config of the AudioToCharDALIDataset. - shuffle: Bool flag whether to shuffle the dataset. - device_id: Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - preprocessor_cfg: Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor. - - Returns: - An instance of AudioToCharDALIDataset. 
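-
-    Example:
-        A minimal, illustrative sketch (the manifest path, label set and batch size are
-        placeholder values, not defaults of this module):
-
-        >>> config = {
-        ...     'manifest_filepath': '/data/train_manifest.json',
-        ...     'labels': [' ', 'a', 'b', 'c'],
-        ...     'sample_rate': 16000,
-        ...     'batch_size': 32,
-        ... }
-        >>> dataset = get_dali_char_dataset(
-        ...     config=config, shuffle=True, device_id=0, global_rank=0, world_size=1
-        ... )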
-    """
-    device = 'gpu' if torch.cuda.is_available() else 'cpu'
-    dataset = audio_to_text_dali.AudioToCharDALIDataset(
-        manifest_filepath=config['manifest_filepath'],
-        device=device,
-        batch_size=config['batch_size'],
-        labels=config['labels'],
-        sample_rate=config['sample_rate'],
-        audio_tar_filepaths=config.get('tarred_audio_filepaths', None),
-        audio_tar_index_filepaths=config.get('tarred_audio_index_filepaths', None),
-        max_duration=config.get('max_duration', None),
-        min_duration=config.get('min_duration', None),
-        blank_index=config.get('blank_index', -1),
-        unk_index=config.get('unk_index', -1),
-        normalize=config.get('normalize_transcripts', False),
-        trim=config.get('trim_silence', False),
-        parser=config.get('parser', 'en'),
-        shuffle=shuffle,
-        shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
-        device_id=device_id,
-        global_rank=global_rank,
-        world_size=world_size,
-        preprocessor_cfg=preprocessor_cfg,
-        return_sample_id=config.get('return_sample_id', False),
-    )
-    return dataset
-
-
-def get_dali_bpe_dataset(
-    config: dict,
-    tokenizer,
-    shuffle: bool,
-    device_id: int,
-    global_rank: int,
-    world_size: int,
-    preprocessor_cfg: Optional[DictConfig] = None,
-) -> audio_to_text_dali.AudioToBPEDALIDataset:
-    """
-    Instantiates a Subword Encoding based AudioToBPEDALIDataset.
-
-    Args:
-        config: Config of the AudioToBPEDALIDataset.
-        tokenizer: An implementation of NeMo TokenizerSpec.
-        shuffle: Bool flag whether to shuffle the dataset.
-        device_id: Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
-        global_rank: Global rank of this device.
-        world_size: Global world size in the training method.
-        preprocessor_cfg: Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
-
-    Returns:
-        An instance of AudioToBPEDALIDataset.
-    """
-    device = 'gpu' if torch.cuda.is_available() else 'cpu'
-    dataset = audio_to_text_dali.AudioToBPEDALIDataset(
-        manifest_filepath=config['manifest_filepath'],
-        tokenizer=tokenizer,
-        device=device,
-        batch_size=config['batch_size'],
-        sample_rate=config['sample_rate'],
-        audio_tar_filepaths=config.get('tarred_audio_filepaths', None),
-        audio_tar_index_filepaths=config.get('tarred_audio_index_filepaths', None),
-        max_duration=config.get('max_duration', None),
-        min_duration=config.get('min_duration', None),
-        trim=config.get('trim_silence', False),
-        use_start_end_token=config.get('use_start_end_token', True),
-        shuffle=shuffle,
-        shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
-        device_id=device_id,
-        global_rank=global_rank,
-        world_size=world_size,
-        preprocessor_cfg=preprocessor_cfg,
-        return_sample_id=config.get('return_sample_id', False),
-    )
-    return dataset
-
-
-def get_audio_to_text_char_dataset_from_config(
-    config, local_rank: int, global_rank: int, world_size: int, preprocessor_cfg: Optional[DictConfig] = None
-):
-    """
-    Construct Audio-To-Text Char dataset from a config.
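-    Depending on the flags in the config (``use_dali``, ``is_code_switched``, ``is_tarred``,
-    ``is_concat``), this dispatches to one of the dataset factories above.
-
-    Example:
-        A minimal sketch for the plain (non-tarred, non-DALI) path; the manifest path and
-        label set are placeholders:
-
-        >>> config = {
-        ...     'manifest_filepath': '/data/train_manifest.json',
-        ...     'labels': [' ', 'a', 'b', 'c'],
-        ...     'sample_rate': 16000,
-        ...     'batch_size': 32,
-        ...     'shuffle': True,
-        ... }
-        >>> dataset = get_audio_to_text_char_dataset_from_config(
-        ...     config=config, local_rank=0, global_rank=0, world_size=1
-        ... )
-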
- Args: - config: dataset config - local_rank: model local rank - global_rank: model global rand - world_size: world size - preprocessor_cfg: preprocessor config, for DALI dataset - - Returns: - constructed dataset or None if dataset config is invalid or nothing to load - """ - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size) - else: - augmentor = None - - is_concat = config.get('is_concat', False) - if is_concat: - if 'concat_sampling_technique' in config and config['concat_sampling_technique'] is None: - logging.warning( - f"Concat dataset requires `concat_sampling_technique` but it was not provided. Config: {config}" - ) - return None - if config['concat_sampling_technique'] == 'random': - if not 'concat_sampling_probabilities' in config: - logging.warning(f"Concat dataset requires `concat_sampling_probabilities` list. Config: {config}") - return None - else: - if not isclose(sum(config['concat_sampling_probabilities']), 1, abs_tol=1e-6): - logging.warning(f"`concat_sampling_probabilities` need to sum to 1. Config: {config}") - return None - - shuffle = config['shuffle'] - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if config.get('use_dali', False): - device_id = local_rank if device == 'gpu' else None - dataset = get_dali_char_dataset( - config=config, - shuffle=shuffle, - device_id=device_id, - global_rank=global_rank, - world_size=world_size, - preprocessor_cfg=preprocessor_cfg, - ) - return dataset - - # Instantiate a code-switched dataset if config is present - if config.get('is_code_switched', False): - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if not ('code_switched' in config and config['code_switched'] is not None): - logging.warning( - f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}" - ) - return None - if ( - ('probs' in config['code_switched']) - and (config['code_switched']['probs'] is not None) - and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6)) - ): - logging.warning(f"`.code_switched.probs` need to sum to 1. Config: {config['code_switched']}") - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = get_code_switched_dataset( - config=config, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - tokenizer=None, - augmentor=augmentor, - ) - # Instantiate tarred dataset loader or normal dataset loader - elif config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. 
Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - if is_concat: - dataset = get_concat_tarred_dataset( - config=config, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=augmentor, - ) - else: - dataset = get_tarred_dataset( - config=config, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=augmentor, - ) - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if is_concat: - dataset = get_concat_char_dataset( - config=config, global_rank=global_rank, world_size=world_size, augmentor=augmentor - ) - else: - dataset = get_char_dataset(config=config, augmentor=augmentor) - return dataset - - -def get_audio_to_text_bpe_dataset_from_config( - config, - local_rank: int, - global_rank: int, - world_size: int, - tokenizer, - preprocessor_cfg: Optional[DictConfig] = None, -): - """ - Construct Audio-To-Text BPE dataset from a config. - Args: - config: BPE dataset config - local_rank: model local rank - global_rank: model global rand - world_size: world size - tokenizer: BPE tokenizer - preprocessor_cfg: preprocessor config, for DALI BPE dataset - - Returns: - constructed dataset or None if dataset config is invalid or nothing to load - """ - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size) - else: - augmentor = None - - is_concat = config.get('is_concat', False) - if is_concat: - if 'concat_sampling_technique' in config and config['concat_sampling_technique'] is None: - logging.warning( - f"Concat dataset requires `concat_sampling_technique` but it was not provided. Config: {config}" - ) - return None - - if config['concat_sampling_technique'] == 'random': - if not 'concat_sampling_probabilities' in config: - logging.warning(f"Concat dataset requires `concat_sampling_probabilities` list. Config: {config}") - return None - else: - if not isclose(sum(config['concat_sampling_probabilities']), 1, abs_tol=1e-6): - logging.warning(f"`concat_sampling_probabilities` need to sum to 1. Config: {config}") - return None - - shuffle = config['shuffle'] - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if config.get('use_dali', False): - device_id = local_rank if device == 'gpu' else None - dataset = get_dali_bpe_dataset( - config=config, - tokenizer=tokenizer, - shuffle=shuffle, - device_id=device_id, - global_rank=global_rank, - world_size=world_size, - preprocessor_cfg=preprocessor_cfg, - ) - return dataset - - # Instantiate a code-switched dataset if config is present - if config.get('is_code_switched', False): - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if not ('code_switched' in config and config['code_switched'] is not None): - logging.warning( - f"Code switched dataset requires `*_ds.code_switched.*` dict but it was not provided. Config: {config}" - ) - return None - if ( - ('probs' in config['code_switched']) - and (config['code_switched']['probs'] is not None) - and (not isclose(sum(config['code_switched']['probs']), 1, abs_tol=1e-6)) - ): - logging.warning(f"`.code_switched.probs` need to sum to 1. 
Config: {config['code_switched']}") - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = get_code_switched_dataset( - config=config, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - tokenizer=tokenizer, - augmentor=augmentor, - ) - # Instantiate tarred dataset loader or normal dataset loader - elif config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - if is_concat: - dataset = get_concat_tarred_dataset( - config=config, - tokenizer=tokenizer, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=augmentor, - ) - else: - dataset = get_tarred_dataset( - config=config, - tokenizer=tokenizer, - shuffle_n=shuffle_n, - global_rank=global_rank, - world_size=world_size, - augmentor=augmentor, - ) - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if is_concat: - dataset = get_concat_bpe_dataset( - config=config, - global_rank=global_rank, - world_size=world_size, - tokenizer=tokenizer, - augmentor=augmentor, - ) - else: - dataset = get_bpe_dataset(config=config, tokenizer=tokenizer, augmentor=augmentor) - return dataset - - -class ASRPredictionWriter(BasePredictionWriter): - def __init__(self, dataset, output_file: str): - super().__init__(write_interval="batch") - self.outf = open(output_file, 'w', encoding='utf-8') - self.dataset = dataset - self.samples_num = 0 - - def write_on_batch_end( - self, - trainer, - pl_module: 'LightningModule', - prediction: Any, - batch_indices: List[int], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ): - for sample_id, transcribed_text in prediction: - item = {} - sample = self.dataset.get_manifest_sample(sample_id) - item["audio_filepath"] = sample.audio_file - item["offset"] = sample.offset - item["duration"] = sample.duration - item["text"] = sample.text_raw - item["pred_text"] = transcribed_text - self.outf.write(json.dumps(item) + "\n") - self.samples_num += 1 - return - - def close_output_file(self): - self.outf.close() - return self.samples_num - - -def convert_to_config_list(initial_list): - if type(initial_list) is str: - initial_list = initial_list.split(",") - if initial_list is None or initial_list == []: - raise ValueError("manifest_filepaths and tarred_audio_filepaths must not be empty.") - if not isinstance(initial_list, ListConfig): - initial_list = ListConfig([initial_list]) - - for list_idx, list_val in enumerate(initial_list): - if type(list_val) != type(initial_list[0]): - raise ValueError( - "manifest_filepaths and tarred_audio_filepaths need to be a list of lists for bucketing or just a list of strings" - ) - if type(initial_list[0]) is not ListConfig: - initial_list = ListConfig([initial_list]) - return initial_list - - -def get_chain_dataset(datasets, ds_config, rank=0): - if len(datasets) > 1: - if ds_config.get('bucketing_batch_size', None) is not None: - bucketing_batch_sizes = calc_bucketing_batch_sizes(ds_config, len(datasets)) - logging.info( - f"Batch bucketing is 
enabled for {len(datasets)} buckets with adaptive batch sizes of {bucketing_batch_sizes}!" - ) - for idx, dataset in enumerate(datasets): - datasets[idx] = audio_to_text.BucketingDataset( - dataset=dataset, bucketing_batch_size=bucketing_batch_sizes[idx] - ) - else: - logging.info( - f"Batch bucketing is enabled for {len(datasets)} buckets with fixed batch size of {ds_config['batch_size']}!" - ) - - if len(datasets) == 1: - return datasets[0] - bucketing_strategy = ds_config.get('bucketing_strategy', 'synced_randomized') - if bucketing_strategy == 'fixed_order': - return ChainDataset(datasets) - elif bucketing_strategy == 'synced_randomized': - return audio_to_text.RandomizedChainDataset(datasets=datasets, rnd_seed=0) - elif bucketing_strategy == 'fully_randomized': - return audio_to_text.RandomizedChainDataset(datasets=datasets, rnd_seed=random.randint(0, 30000) + rank) - else: - raise ValueError( - f'bucketing_strategy={bucketing_strategy} is not supported! Supported strategies are [fixed_order, fully_randomized, synced_randomized].' - ) - - -def calc_bucketing_batch_sizes(ds_config, datasets_len): - bucketing_batch_size = ds_config['bucketing_batch_size'] - bucketing_weights = ds_config.get('bucketing_weights', None) # To adjust for upsampled buckets - - bucketing_batch_sizes = [] - - if ds_config['batch_size'] != 1: - raise ValueError( - f"batch_size should be set to one when bucketing_batch_size is set and adaptive bucketing is enabled (batch_size={ds_config['batch_size']}!" - ) - if type(bucketing_batch_size) == int: # linear scaling - if bucketing_weights: # Want same batchsize for the same duplicated bucket - for idx, weight in enumerate(bucketing_weights): - scale_factor = datasets_len - idx - [bucketing_batch_sizes.append(scale_factor * bucketing_batch_size) for _ in range(weight)] - else: - for idx in range(datasets_len): - scale_factor = datasets_len - idx - bucketing_batch_sizes.append(scale_factor * bucketing_batch_size) - elif isinstance(bucketing_batch_size, ListConfig) or isinstance( - bucketing_batch_size, list - ): # assigned bucket sizes - if bucketing_weights: # Want same batchsize for same duplicated bucket - for idx, weight in enumerate(bucketing_weights): - [bucketing_batch_sizes.append(bucketing_batch_size[idx]) for _ in range(weight)] - else: - bucketing_batch_sizes = bucketing_batch_size - else: - raise ValueError( - f"bucketing_batch_size should be an integer or a list (bucketing_batch_size={bucketing_batch_size})!" - ) - - if len(bucketing_batch_sizes) != datasets_len: - raise ValueError( - f"batch_size should have the same length as the number of buckets ({len(bucketing_batch_sizes)}!={datasets_len}) " - ) - return bucketing_batch_sizes diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/data_simulation.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/data_simulation.py deleted file mode 100644 index 5bbdcdfb560578da0cd72e4b2be7a7b40d5d2952..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/data_simulation.py +++ /dev/null @@ -1,4037 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import concurrent -import itertools -import multiprocessing -import os -import random -import warnings -from typing import Dict, Iterable, List, Optional, Tuple, Union - -import h5py -import librosa -import matplotlib.pyplot as plt -import numpy as np -import soundfile as sf -import torch -from numpy.random import default_rng -from omegaconf import DictConfig, OmegaConf -from scipy.signal import convolve -from scipy.signal.windows import cosine, hamming, hann -from scipy.spatial.transform import Rotation -from tqdm import tqdm - -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.collections.asr.parts.utils.audio_utils import db2mag, generate_approximate_noise_field, mag2db, pow2db, rms -from nemo.collections.asr.parts.utils.data_simulation_utils import ( - DataAnnotator, - SpeechSampler, - build_speaker_samples_map, - get_background_noise, - get_cleaned_base_path, - get_random_offset_index, - get_speaker_ids, - get_speaker_samples, - get_split_points_in_alignments, - load_speaker_sample, - normalize_audio, - per_speaker_normalize, - perturb_audio, - read_audio_from_buffer, - read_noise_manifest, -) -from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest -from nemo.collections.asr.parts.utils.speaker_utils import get_overlap_range, is_overlap, merge_float_intervals -from nemo.utils import logging - -try: - import pyroomacoustics as pra - from pyroomacoustics.directivities import CardioidFamily, DirectionVector, DirectivityPattern - - PRA = True -except ImportError: - PRA = False -try: - from gpuRIR import att2t_SabineEstimator, beta_SabineEstimation, simulateRIR, t2n - - GPURIR = True -except ImportError: - GPURIR = False - - -class MultiSpeakerSimulator(object): - """ - Multispeaker Audio Session Simulator - Simulates multispeaker audio sessions using single-speaker audio files and - corresponding word alignments. - - Change Log: - v1.0: Dec 2022 - - First working verison, supports multispeaker simulation with overlaps, silence and RIR - v1.0.1: Feb 2023 - - Multi-GPU support for speed up - - Faster random sampling routine - - Fixed sentence duration bug - - Silence and overlap length sampling algorithms are updated to guarantee `mean_silence` approximation - v1.0.2: March 2023 - - Added support for segment-level gain perturbation and session-level white-noise perturbation - - Modified speaker sampling mechanism to include as many speakers as possible in each data-generation run - - Added chunking mechanism to avoid freezing in multiprocessing processes - - v1.1.0 March 2023 - - Faster audio-file loading with maximum audio duration parameter - - Re-organized MultiSpeakerSimulator class and moved util functions to util files. - v1.1.1 March 2023 - - Changed `silence_mean` to use exactly the same sampling equation as `overlap_mean`. - - - Args: - cfg: OmegaConf configuration loaded from yaml file. 
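-
-    Example:
-        A minimal construction sketch (``data_simulator.yaml`` is a placeholder for a config
-        file that follows the parameter layout described below):
-
-        >>> from omegaconf import OmegaConf
-        >>> cfg = OmegaConf.load('data_simulator.yaml')
-        >>> simulator = MultiSpeakerSimulator(cfg)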
- - Parameters: - manifest_filepath (str): Manifest file with paths to single speaker audio files - sr (int): Sampling rate of the input audio files from the manifest - random_seed (int): Seed to random number generator - - session_config: - num_speakers (int): Number of unique speakers per multispeaker audio session - num_sessions (int): Number of sessions to simulate - session_length (int): Length of each simulated multispeaker audio session (seconds). Short sessions - (e.g. ~240 seconds) tend to fall short of the expected overlap-ratio and silence-ratio. - - session_params: - max_audio_read_sec (int): The maximum audio length in second when loading an audio file. - The bigger the number, the slower the reading speed. Should be greater than 2.5 second. - sentence_length_params (list): k,p values for a negative_binomial distribution which is sampled to get the - sentence length (in number of words) - dominance_var (float): Variance in speaker dominance (where each speaker's dominance is sampled from a normal - distribution centered on 1/`num_speakers`, and then the dominance values are together - normalized to 1) - min_dominance (float): Minimum percentage of speaking time per speaker (note that this can cause the dominance of - the other speakers to be slightly reduced) - turn_prob (float): Probability of switching speakers after each utterance - - mean_silence (float): Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1). - mean_silence_var (float): Variance for mean silence in all audio sessions. - This value should be 0 <= mean_silence_var < mean_silence * (1 - mean_silence). - per_silence_var (float): Variance for each silence in an audio session, set large values (e.g., 20) for de-correlation. - per_silence_min (float): Minimum duration for each silence, default to 0. - per_silence_max (float): Maximum duration for each silence, default to -1 for no maximum. - mean_overlap (float): Mean proportion of overlap in the overall non-silence duration. Should be in range [0, 1) and - recommend [0, 0.15] range for accurate results. - mean_overlap_var (float): Variance for mean overlap in all audio sessions. - This value should be 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap). 
- per_overlap_var (float): Variance for per overlap in each session, set large values to de-correlate silence lengths - with the latest speech segment lengths - per_overlap_min (float): Minimum per overlap duration in seconds - per_overlap_max (float): Maximum per overlap duration in seconds, set -1 for no maximum - start_window (bool): Whether to window the start of sentences to smooth the audio signal (and remove silence at - the start of the clip) - window_type (str): Type of windowing used when segmenting utterances ("hamming", "hann", "cosine") - window_size (float): Length of window at the start or the end of segmented utterance (seconds) - start_buffer (float): Buffer of silence before the start of the sentence (to avoid cutting off speech or starting - abruptly) - split_buffer (float): Split RTTM labels if greater than twice this amount of silence (to avoid long gaps between - utterances as being labelled as speech) - release_buffer (float): Buffer before window at end of sentence (to avoid cutting off speech or ending abruptly) - normalize (bool): Normalize speaker volumes - normalization_type (str): Normalizing speakers ("equal" - same volume per speaker, "var" - variable volume per - speaker) - normalization_var (str): Variance in speaker volume (sample from standard deviation centered at 1) - min_volume (float): Minimum speaker volume (only used when variable normalization is used) - max_volume (float): Maximum speaker volume (only used when variable normalization is used) - end_buffer (float): Buffer at the end of the session to leave blank - - outputs: - output_dir (str): Output directory for audio sessions and corresponding label files - output_filename (str): Output filename for the wav and RTTM files - overwrite_output (bool): If true, delete the output directory if it exists - output_precision (int): Number of decimal places in output files - - background_noise: - add_bg (bool): Add ambient background noise if true - background_manifest (str): Path to background noise manifest file - snr (int): SNR for background noise (using average speaker power), set `snr_min` and `snr_max` values to enable random SNR - snr_min (int): Min random SNR for background noise (using average speaker power), set `null` to use fixed SNR - snr_max (int): Max random SNR for background noise (using average speaker power), set `null` to use fixed SNR - - segment_augmentor: - add_seg_aug (bool): Set True to enable augmentation on each speech segment (Default: False) - segmentor: - gain: - prob (float): Probability range (uniform distribution) gain augmentation for individual segment - min_gain_dbfs (float): minimum gain in terms of dB - max_gain_dbfs (float): maximum gain in terms of dB - - session_augmentor: - add_sess_aug: (bool) set True to enable audio augmentation on the whole session (Default: False) - segmentor: - white_noise: - prob (float): Probability of adding white noise (Default: 1.0) - min_level (float): minimum gain in terms of dB - max_level (float): maximum gain in terms of dB - - speaker_enforcement: - enforce_num_speakers (bool): Enforce that all requested speakers are present in the output wav file - enforce_time (list): Percentage of the way through the audio session that enforcement mode is triggered (sampled - between time 1 and 2) - - segment_manifest: (parameters for regenerating the segment manifest file) - window (float): Window length for segmentation - shift (float): Shift length for segmentation - step_count (int): Number of the unit segments you want to create per 
utterance - deci (int): Rounding decimals for segment manifest file - """ - - def __init__(self, cfg): - self._params = cfg - self.annotator = DataAnnotator(cfg) - self.sampler = SpeechSampler(cfg) - # internal params - self._manifest = read_manifest(self._params.data_simulator.manifest_filepath) - self._speaker_samples = build_speaker_samples_map(self._manifest) - self._noise_samples = [] - self._sentence = None - self._text = "" - self._words = [] - self._alignments = [] - # minimum number of alignments for a manifest to be considered valid - self._min_alignment_count = 2 - self._merged_speech_intervals = [] - # keep track of furthest sample per speaker to avoid overlapping same speaker - self._furthest_sample = [0 for n in range(self._params.data_simulator.session_config.num_speakers)] - # use to ensure overlap percentage is correct - self._missing_overlap = 0 - # creating manifests during online data simulation - self.base_manifest_filepath = None - self.segment_manifest_filepath = None - self._max_audio_read_sec = self._params.data_simulator.session_params.max_audio_read_sec - self._turn_prob_min = self._params.data_simulator.session_params.get("turn_prob_min", 0.5) - # variable speaker volume - self._volume = None - self._speaker_ids = None - self._device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - self._audio_read_buffer_dict = {} - self.add_missing_overlap = self._params.data_simulator.session_params.get("add_missing_overlap", False) - - if ( - self._params.data_simulator.segment_augmentor.get("augmentor", None) - and self._params.data_simulator.segment_augmentor.add_seg_aug - ): - self.segment_augmentor = process_augmentations( - augmenter=self._params.data_simulator.segment_augmentor.augmentor - ) - else: - self.segment_augmentor = None - - if ( - self._params.data_simulator.session_augmentor.get("augmentor", None) - and self._params.data_simulator.session_augmentor.add_sess_aug - ): - self.session_augmentor = process_augmentations( - augmenter=self._params.data_simulator.session_augmentor.augmentor - ) - else: - self.session_augmentor = None - - # Error check the input arguments for simulation - self._check_args() - - # Initialize speaker permutations to maximize the number of speakers in the created dataset - self._permutated_speaker_inds = self._init_speaker_permutations( - num_sess=self._params.data_simulator.session_config.num_sessions, - num_speakers=self._params.data_simulator.session_config.num_speakers, - all_speaker_ids=self._speaker_samples.keys(), - random_seed=self._params.data_simulator.random_seed, - ) - - # Intialize multiprocessing related variables - self.num_workers = self._params.get("num_workers", 1) - self.multiprocessing_chunksize = self._params.data_simulator.get('multiprocessing_chunksize', 10000) - self.chunk_count = self._init_chunk_count() - - def _init_speaker_permutations(self, num_sess: int, num_speakers: int, all_speaker_ids: List, random_seed: int): - """ - Initialize the speaker permutations for the number of speakers in the session. - When generating the simulated sessions, we want to include as many speakers as possible. - This function generates a set of permutations that can be used to sweep all speakers in - the source dataset to make sure we maximize the total number of speakers included in - the simulated sessions. 
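-
-        For example, with 10 speaker IDs in the source data, ``num_sess=4`` and
-        ``num_speakers=3``, 12 speaker slots are needed: one full permutation of the 10
-        speaker indices is used first, followed by the first two indices of a second random
-        permutation, so every source speaker appears at least once before any index is reused.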
-
-        Args:
-            num_sess (int): Number of sessions to generate
-            num_speakers (int): Number of speakers in each session
-            all_speaker_ids (list): List of all speaker IDs
-
-        Returns:
-            permuted_inds (np.array):
-                Array of permuted speaker indices to use for each session
-                Dimensions: (num_sess, num_speakers)
-        """
-        np.random.seed(random_seed)
-        all_speaker_id_counts = len(list(all_speaker_ids))
-
-        # Calculate how many permutations are needed
-        perm_set_count = int(np.ceil(num_speakers * num_sess / all_speaker_id_counts))
-
-        target_count = num_speakers * num_sess
-        for count in range(perm_set_count):
-            if target_count < all_speaker_id_counts:
-                seq_len = target_count
-            else:
-                seq_len = all_speaker_id_counts
-            if seq_len <= 0:
-                raise ValueError(f"seq_len is {seq_len} at count {count} and should be greater than 0")
-
-            if count == 0:
-                permuted_inds = np.random.permutation(len(all_speaker_ids))[:seq_len]
-            else:
-                permuted_inds = np.hstack((permuted_inds, np.random.permutation(len(all_speaker_ids))[:seq_len]))
-            target_count -= seq_len
-
-        logging.info(f"Total {all_speaker_id_counts} speakers in the source dataset.")
-        logging.info(f"Initialized speaker permutations for {num_sess} sessions with {num_speakers} speakers each.")
-        return permuted_inds.reshape(num_sess, num_speakers)
-
-    def _init_chunk_count(self):
-        """
-        Initialize the chunk count for multi-processing to prevent overflow of job counts.
-        The multi-processing pipeline can freeze if there are more than approximately 10,000 jobs
-        in the pipeline at the same time.
-        """
-        return int(np.ceil(self._params.data_simulator.session_config.num_sessions / self.multiprocessing_chunksize))
-
-    def _check_args(self):
-        """
-        Checks YAML arguments to ensure they are within valid ranges.
-        """
-        if self._params.data_simulator.session_config.num_speakers < 1:
-            raise Exception("At least one speaker is required for making audio sessions (num_speakers < 1)")
-        if (
-            self._params.data_simulator.session_params.turn_prob < 0
-            or self._params.data_simulator.session_params.turn_prob > 1
-        ):
-            raise Exception("Turn probability is outside of [0,1]")
-        elif (
-            self._params.data_simulator.session_params.turn_prob < self._turn_prob_min
-            and self._params.data_simulator.speaker_enforcement.enforce_num_speakers
-        ):
-            logging.warning(
-                f"Turn probability is less than {self._turn_prob_min} while enforce_num_speakers=True, which may result in excessive session lengths. Forcing turn_prob to {self._turn_prob_min}."
-            )
-            self._params.data_simulator.session_params.turn_prob = self._turn_prob_min
-        if self._params.data_simulator.session_params.max_audio_read_sec < 2.5:
-            raise Exception("Max audio read time must be at least 2.5 seconds")
-
-        if self._params.data_simulator.session_params.sentence_length_params[0] <= 0:
-            raise Exception(
-                "k (number of successes until the experiment 
ends) in sentence_length_params must be a positive number"
-            )
-
-        if not (0 < self._params.data_simulator.session_params.sentence_length_params[1] <= 1):
-            raise Exception("p (success probability) in sentence_length_params must be in range (0,1]")
-
-        if (
-            self._params.data_simulator.session_params.mean_overlap < 0
-            or self._params.data_simulator.session_params.mean_overlap > 1
-        ):
-            raise Exception("Mean overlap is outside of [0,1]")
-        if (
-            self._params.data_simulator.session_params.mean_silence < 0
-            or self._params.data_simulator.session_params.mean_silence > 1
-        ):
-            raise Exception("Mean silence is outside of [0,1]")
-        if self._params.data_simulator.session_params.mean_silence_var < 0:
-            raise Exception("Mean silence variance must not be negative")
-        if (
-            self._params.data_simulator.session_params.mean_silence > 0
-            and self._params.data_simulator.session_params.mean_silence_var
-            >= self._params.data_simulator.session_params.mean_silence
-            * (1 - self._params.data_simulator.session_params.mean_silence)
-        ):
-            raise Exception("Mean silence variance should be lower than mean_silence * (1-mean_silence)")
-        if self._params.data_simulator.session_params.per_silence_var < 0:
-            raise Exception("Per silence variance must not be negative")
-
-        if self._params.data_simulator.session_params.mean_overlap_var < 0:
-            raise Exception("Mean overlap variance must not be negative")
-        if (
-            self._params.data_simulator.session_params.mean_overlap > 0
-            and self._params.data_simulator.session_params.mean_overlap_var
-            >= self._params.data_simulator.session_params.mean_overlap
-            * (1 - self._params.data_simulator.session_params.mean_overlap)
-        ):
-            raise Exception("Mean overlap variance should be lower than mean_overlap * (1-mean_overlap)")
-        if self._params.data_simulator.session_params.per_overlap_var < 0:
-            raise Exception("Per overlap variance must not be negative")
-
-        if (
-            self._params.data_simulator.session_params.min_dominance < 0
-            or self._params.data_simulator.session_params.min_dominance > 1
-        ):
-            raise Exception("Minimum dominance is outside of [0,1]")
-        if (
-            self._params.data_simulator.speaker_enforcement.enforce_time[0] < 0
-            or self._params.data_simulator.speaker_enforcement.enforce_time[0] > 1
-        ):
-            raise Exception("Speaker enforcement start is outside of [0,1]")
-        if (
-            self._params.data_simulator.speaker_enforcement.enforce_time[1] < 0
-            or self._params.data_simulator.speaker_enforcement.enforce_time[1] > 1
-        ):
-            raise Exception("Speaker enforcement end is outside of [0,1]")
-
-        if (
-            self._params.data_simulator.session_params.min_dominance
-            * self._params.data_simulator.session_config.num_speakers
-            > 1
-        ):
-            raise Exception("Number of speakers times minimum dominance is greater than 1")
-
-        if (
-            self._params.data_simulator.session_params.window_type not in ['hamming', 'hann', 'cosine']
-            and self._params.data_simulator.session_params.window_type is not None
-        ):
-            raise Exception("Incorrect window type provided")
-
-        if len(self._manifest) == 0:
-            raise Exception("Manifest file is empty. Check that the source path is correct.")
-
-    def clean_up(self):
-        """
-        Clear the system memory. Cached data for audio files and alignments is removed.
-        """
-        self._sentence = None
-        self._words = []
-        self._alignments = []
-        self._audio_read_buffer_dict = {}
-        torch.cuda.empty_cache()
-
-    def _get_speaker_dominance(self) -> List[float]:
-        """
-        Get the dominance value for each speaker, accounting for the dominance variance and
-        the minimum per-speaker dominance. 
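-
-        The returned values are cumulative: per-speaker shares of, e.g., [0.2, 0.3, 0.5] are
-        returned as [0.2, 0.5, 1.0], so the next speaker can be selected by comparing a single
-        uniform random value in [0, 1] against this list.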
- - Returns: - dominance (list): Per-speaker dominance - """ - dominance_mean = 1.0 / self._params.data_simulator.session_config.num_speakers - dominance = np.random.normal( - loc=dominance_mean, - scale=self._params.data_simulator.session_params.dominance_var, - size=self._params.data_simulator.session_config.num_speakers, - ) - dominance = np.clip(dominance, a_min=0, a_max=np.inf) - # normalize while maintaining minimum dominance - total = np.sum(dominance) - if total == 0: - for i in range(len(dominance)): - dominance[i] += self._params.data_simulator.session_params.min_dominance - # scale accounting for min_dominance which has to be added after - dominance = (dominance / total) * ( - 1 - - self._params.data_simulator.session_params.min_dominance - * self._params.data_simulator.session_config.num_speakers - ) - for i in range(len(dominance)): - dominance[i] += self._params.data_simulator.session_params.min_dominance - if ( - i > 0 - ): # dominance values are cumulative to make it easy to select the speaker using a random value in [0,1] - dominance[i] = dominance[i] + dominance[i - 1] - return dominance - - def _increase_speaker_dominance( - self, base_speaker_dominance: List[float], factor: int - ) -> Tuple[List[float], bool]: - """ - Increase speaker dominance for unrepresented speakers (used only in enforce mode). - Increases the dominance for these speakers by the input factor (and then re-normalizes the probabilities to 1). - - Args: - base_speaker_dominance (list): Dominance values for each speaker. - factor (int): Factor to increase dominance of unrepresented speakers by. - Returns: - dominance (list): Per-speaker dominance - enforce (bool): Whether to keep enforce mode turned on - """ - increase_percent = [] - for i in range(self._params.data_simulator.session_config.num_speakers): - if self._furthest_sample[i] == 0: - increase_percent.append(i) - # ramp up enforce counter until speaker is sampled, then reset once all speakers have spoken - if len(increase_percent) > 0: - # extract original per-speaker probabilities - dominance = np.copy(base_speaker_dominance) - for i in range(len(dominance) - 1, 0, -1): - dominance[i] = dominance[i] - dominance[i - 1] - # increase specified speakers by the desired factor - for i in increase_percent: - dominance[i] = dominance[i] * factor - # renormalize - dominance = dominance / np.sum(dominance) - for i in range(1, len(dominance)): - dominance[i] = dominance[i] + dominance[i - 1] - enforce = True - else: # no unrepresented speakers, so enforce mode can be turned off - dominance = base_speaker_dominance - enforce = False - return dominance, enforce - - def _set_speaker_volume(self): - """ - Set the volume for each speaker (either equal volume or variable speaker volume). 
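-
-        For example, with ``normalization_type='variable'``, ``normalization_var=0.2``,
-        ``min_volume=0.7`` and ``max_volume=1.3`` (illustrative values), each speaker's volume
-        is drawn from a normal distribution centered at 1.0 with standard deviation 0.2 and
-        then clipped to [0.7, 1.3].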
- """ - if self._params.data_simulator.session_params.normalization_type == 'equal': - self._volume = np.ones(self._params.data_simulator.session_config.num_speakers) - elif self._params.data_simulator.session_params.normalization_type == 'variable': - self._volume = np.random.normal( - loc=1.0, - scale=self._params.data_simulator.session_params.normalization_var, - size=self._params.data_simulator.session_config.num_speakers, - ) - self._volume = np.clip( - np.array(self._volume), - a_min=self._params.data_simulator.session_params.min_volume, - a_max=self._params.data_simulator.session_params.max_volume, - ).tolist() - - def _get_next_speaker(self, prev_speaker: int, dominance: List[float]) -> int: - """ - Get the next speaker (accounting for turn probability and dominance distribution). - - Args: - prev_speaker (int): Previous speaker turn. - dominance (list): Dominance values for each speaker. - Returns: - prev_speaker/speaker_turn (int): Speaker turn - """ - if self._params.data_simulator.session_config.num_speakers == 1: - prev_speaker = 0 if prev_speaker is None else prev_speaker - return prev_speaker - else: - if ( - np.random.uniform(0, 1) > self._params.data_simulator.session_params.turn_prob - and prev_speaker is not None - ): - return prev_speaker - else: - speaker_turn = prev_speaker - while speaker_turn == prev_speaker: # ensure another speaker goes next - rand = np.random.uniform(0, 1) - speaker_turn = 0 - while rand > dominance[speaker_turn]: - speaker_turn += 1 - return speaker_turn - - def _get_window(self, window_amount: int, start: bool = False): - """ - Get window curve to alleviate abrupt change of time-series signal when segmenting audio samples. - - Args: - window_amount (int): Window length (in terms of number of samples). - start (bool): If true, return the first half of the window. - - Returns: - window (tensor): Half window (either first half or second half) - """ - if self._params.data_simulator.session_params.window_type == 'hamming': - window = hamming(window_amount * 2) - elif self._params.data_simulator.session_params.window_type == 'hann': - window = hann(window_amount * 2) - elif self._params.data_simulator.session_params.window_type == 'cosine': - window = cosine(window_amount * 2) - else: - raise Exception("Incorrect window type provided") - - window = torch.from_numpy(window).to(self._device) - - # return the first half or second half of the window - if start: - return window[:window_amount] - else: - return window[window_amount:] - - def _get_start_buffer_and_window(self, first_alignment: int) -> Tuple[int, int]: - """ - Get the start cutoff and window length for smoothing the start of the sentence. - - Args: - first_alignment (int): Start of the first word (in terms of number of samples). 
- Returns: - start_cutoff (int): Amount into the audio clip to start - window_amount (int): Window length - """ - window_amount = int(self._params.data_simulator.session_params.window_size * self._params.data_simulator.sr) - start_buffer = int(self._params.data_simulator.session_params.start_buffer * self._params.data_simulator.sr) - - if first_alignment < start_buffer: - window_amount = 0 - start_cutoff = 0 - elif first_alignment < start_buffer + window_amount: - window_amount = first_alignment - start_buffer - start_cutoff = 0 - else: - start_cutoff = first_alignment - start_buffer - window_amount - - return start_cutoff, window_amount - - def _get_end_buffer_and_window( - self, current_sample_cursor: int, remaining_dur_samples: int, remaining_len_audio_file: int - ) -> Tuple[int, int]: - """ - Get the end buffer and window length for smoothing the end of the sentence. - - Args: - current_sample_cursor (int): Current location in the target file (in terms of number of samples). - remaining_dur_samples (int): Remaining duration in the target file (in terms of number of samples). - remaining_len_audio_file (int): Length remaining in audio file (in terms of number of samples). - Returns: - release_buffer (int): Amount after the end of the last alignment to include - window_amount (int): Window length - """ - window_amount = int(self._params.data_simulator.session_params.window_size * self._params.data_simulator.sr) - release_buffer = int( - self._params.data_simulator.session_params.release_buffer * self._params.data_simulator.sr - ) - - if current_sample_cursor + release_buffer > remaining_dur_samples: - release_buffer = remaining_dur_samples - current_sample_cursor - window_amount = 0 - elif current_sample_cursor + window_amount + release_buffer > remaining_dur_samples: - window_amount = remaining_dur_samples - current_sample_cursor - release_buffer - - if remaining_len_audio_file < release_buffer: - release_buffer = remaining_len_audio_file - window_amount = 0 - elif remaining_len_audio_file < release_buffer + window_amount: - window_amount = remaining_len_audio_file - release_buffer - - return release_buffer, window_amount - - def _check_missing_speakers(self, num_missing: int = 0): - """ - Check if any speakers were not included in the clip and display a warning. - - Args: - num_missing (int): Number of missing speakers. - """ - for k in range(len(self._furthest_sample)): - if self._furthest_sample[k] == 0: - num_missing += 1 - if num_missing != 0: - warnings.warn( - f"{self._params.data_simulator.session_config.num_speakers - num_missing}" - f"speakers were included in the clip instead of the requested amount of " - f"{self._params.data_simulator.session_config.num_speakers}" - ) - - def _add_file( - self, - audio_manifest: dict, - audio_file, - sentence_word_count: int, - max_word_count_in_sentence: int, - max_samples_in_sentence: int, - random_offset: bool = False, - ) -> Tuple[int, torch.Tensor]: - """ - Add audio file to current sentence (up to the desired number of words). - Uses the alignments to segment the audio file. 
- NOTE: 0 index is always silence in `audio_manifest['words']`, so we choose `offset_idx=1` as the first word - - Args: - audio_manifest (dict): Line from manifest file for current audio file - audio_file (tensor): Current loaded audio file - sentence_word_count (int): Running count for number of words in sentence - max_word_count_in_sentence (int): Maximum count for number of words in sentence - max_samples_in_sentence (int): Maximum length for sentence in terms of samples - - Returns: - sentence_word_count+current_word_count (int): Running word count - len(self._sentence) (tensor): Current length of the audio file - """ - # In general, random offset is not needed since random silence index has already been chosen - if random_offset: - offset_idx = np.random.randint(low=1, high=len(audio_manifest['words'])) - else: - offset_idx = 1 - - first_alignment = int(audio_manifest['alignments'][offset_idx - 1] * self._params.data_simulator.sr) - start_cutoff, start_window_amount = self._get_start_buffer_and_window(first_alignment) - if not self._params.data_simulator.session_params.start_window: # cut off the start of the sentence - start_window_amount = 0 - - # Ensure the desired number of words are added and the length of the output session isn't exceeded - sentence_samples = len(self._sentence) - - remaining_dur_samples = max_samples_in_sentence - sentence_samples - remaining_duration = max_word_count_in_sentence - sentence_word_count - prev_dur_samples, dur_samples, curr_dur_samples = 0, 0, 0 - current_word_count = 0 - word_idx = offset_idx - silence_count = 1 - while ( - current_word_count < remaining_duration - and dur_samples < remaining_dur_samples - and word_idx < len(audio_manifest['words']) - ): - dur_samples = int(audio_manifest['alignments'][word_idx] * self._params.data_simulator.sr) - start_cutoff - - # check the length of the generated sentence in terms of sample count (int). - if curr_dur_samples + dur_samples > remaining_dur_samples: - # if the upcoming loop will exceed the remaining sample count, break out of the loop. 
- break - - word = audio_manifest['words'][word_idx] - - if silence_count > 0 and word == "": - break - - self._words.append(word) - self._alignments.append( - float(sentence_samples * 1.0 / self._params.data_simulator.sr) - - float(start_cutoff * 1.0 / self._params.data_simulator.sr) - + audio_manifest['alignments'][word_idx] - ) - - if word == "": - word_idx += 1 - silence_count += 1 - continue - elif self._text == "": - self._text += word - else: - self._text += " " + word - - word_idx += 1 - current_word_count += 1 - prev_dur_samples = dur_samples - curr_dur_samples += dur_samples - - # add audio clip up to the final alignment - if self._params.data_simulator.session_params.window_type is not None: # cut off the start of the sentence - if start_window_amount > 0: # include window - window = self._get_window(start_window_amount, start=True) - self._sentence = self._sentence.to(self._device) - self._sentence = torch.cat( - ( - self._sentence, - torch.multiply(audio_file[start_cutoff : start_cutoff + start_window_amount], window), - ), - 0, - ) - self._sentence = torch.cat( - (self._sentence, audio_file[start_cutoff + start_window_amount : start_cutoff + prev_dur_samples],), 0, - ).to(self._device) - - else: - self._sentence = torch.cat( - (self._sentence, audio_file[start_cutoff : start_cutoff + prev_dur_samples]), 0 - ).to(self._device) - - # windowing at the end of the sentence - if ( - word_idx < len(audio_manifest['words']) - ) and self._params.data_simulator.session_params.window_type is not None: - release_buffer, end_window_amount = self._get_end_buffer_and_window( - prev_dur_samples, remaining_dur_samples, len(audio_file[start_cutoff + prev_dur_samples :]), - ) - self._sentence = torch.cat( - ( - self._sentence, - audio_file[start_cutoff + prev_dur_samples : start_cutoff + prev_dur_samples + release_buffer], - ), - 0, - ).to(self._device) - - if end_window_amount > 0: # include window - window = self._get_window(end_window_amount, start=False) - sig_start = start_cutoff + prev_dur_samples + release_buffer - sig_end = start_cutoff + prev_dur_samples + release_buffer + end_window_amount - windowed_audio_file = torch.multiply(audio_file[sig_start:sig_end], window) - self._sentence = torch.cat((self._sentence, windowed_audio_file), 0).to(self._device) - - del audio_file - return sentence_word_count + current_word_count, len(self._sentence) - - def _build_sentence( - self, - speaker_turn: int, - speaker_ids: List[str], - speaker_wav_align_map: Dict[str, list], - max_samples_in_sentence: int, - ): - """ - Build a new sentence by attaching utterance samples together until the sentence has reached a desired length. - While generating the sentence, alignment information is used to segment the audio. - - Args: - speaker_turn (int): Current speaker turn. - speaker_ids (list): LibriSpeech speaker IDs for each speaker in the current session. - speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments. 
- max_samples_in_sentence (int): Maximum length for sentence in terms of samples - """ - # select speaker length - sl = ( - np.random.negative_binomial( - self._params.data_simulator.session_params.sentence_length_params[0], - self._params.data_simulator.session_params.sentence_length_params[1], - ) - + 1 - ) - - # initialize sentence, text, words, alignments - self._sentence = torch.zeros(0, dtype=torch.float64, device=self._device) - self._text = "" - self._words, self._alignments = [], [] - sentence_word_count, sentence_samples = 0, 0 - - # build sentence - while sentence_word_count < sl and sentence_samples < max_samples_in_sentence: - audio_manifest = load_speaker_sample( - speaker_wav_align_map=speaker_wav_align_map, - speaker_ids=speaker_ids, - speaker_turn=speaker_turn, - min_alignment_count=self._min_alignment_count, - ) - - offset_index = get_random_offset_index( - audio_manifest=audio_manifest, - audio_read_buffer_dict=self._audio_read_buffer_dict, - offset_min=0, - max_audio_read_sec=self._max_audio_read_sec, - min_alignment_count=self._min_alignment_count, - ) - - audio_file, sr, audio_manifest = read_audio_from_buffer( - audio_manifest=audio_manifest, - buffer_dict=self._audio_read_buffer_dict, - offset_index=offset_index, - device=self._device, - max_audio_read_sec=self._max_audio_read_sec, - min_alignment_count=self._min_alignment_count, - read_subset=True, - ) - - # Step 6-2: Add optional perturbations to the specific audio segment (i.e. to `self._sentnece`) - if self._params.data_simulator.segment_augmentor.add_seg_aug: - audio_file = perturb_audio(audio_file, sr, self.segment_augmentor, device=self._device) - - sentence_word_count, sentence_samples = self._add_file( - audio_manifest, audio_file, sentence_word_count, sl, max_samples_in_sentence - ) - - # per-speaker normalization (accounting for active speaker time) - if self._params.data_simulator.session_params.normalize and torch.max(torch.abs(self._sentence)) > 0: - splits = get_split_points_in_alignments( - words=self._words, - alignments=self._alignments, - split_buffer=self._params.data_simulator.session_params.split_buffer, - sr=self._params.data_simulator.sr, - sentence_audio_len=len(self._sentence), - ) - self._sentence = per_speaker_normalize( - sentence_audio=self._sentence, - splits=splits, - speaker_turn=speaker_turn, - volume=self._volume, - device=self._device, - ) - - def _add_silence_or_overlap( - self, - speaker_turn: int, - prev_speaker: int, - start: int, - length: int, - session_len_samples: int, - prev_len_samples: int, - enforce: bool, - ) -> int: - """ - Returns new overlapped (or shifted) start position after inserting overlap or silence. - - Args: - speaker_turn (int): The integer index of the current speaker turn. - prev_speaker (int): The integer index of the previous speaker turn. - start (int): Current start of the audio file being inserted. - length (int): Length of the audio file being inserted. 
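`_build_sentence` above draws the target word count from a shifted negative binomial. The small sketch below shows how the two `sentence_length_params` values shape that draw; the `k, p` values here are placeholders, not the configured defaults.

```python
# Sketch: how sentence_length_params (k, p) drive the sampled word count.
# k and p below are hypothetical values, not the NeMo defaults.
import numpy as np

k, p = 2, 0.1                                   # sentence_length_params[0], [1]
lengths = np.random.negative_binomial(k, p, size=10000) + 1   # +1 as in _build_sentence

print("mean words/sentence:", lengths.mean())
print("p95 words/sentence:", np.percentile(lengths, 95))
# Mean of NB(k, p) is k * (1 - p) / p, so roughly k*(1-p)/p + 1 words per sentence.
```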
- session_len_samples (int): Maximum length of the session in terms of number of samples - prev_len_samples (int): Length of previous sentence (in terms of number of samples) - enforce (bool): Whether speaker enforcement mode is being used - Returns: - new_start (int): New starting position in the session accounting for overlap or silence - """ - running_len_samples = start + length - # `length` is the length of the current sentence to be added, so not included in self.sampler.running_speech_len_samples - non_silence_len_samples = self.sampler.running_speech_len_samples + length - - # compare silence and overlap ratios - add_overlap = self.sampler.silence_vs_overlap_selector(running_len_samples, non_silence_len_samples) - - # choose overlap if this speaker is not the same as the previous speaker and add_overlap is True. - if prev_speaker != speaker_turn and prev_speaker is not None and add_overlap: - desired_overlap_amount = self.sampler.sample_from_overlap_model(non_silence_len_samples) - new_start = start - desired_overlap_amount - - # avoid overlap at start of clip - if new_start < 0: - desired_overlap_amount -= 0 - new_start - self._missing_overlap += 0 - new_start - new_start = 0 - - # if same speaker ends up overlapping from any previous clip, pad with silence instead - if new_start < self._furthest_sample[speaker_turn]: - desired_overlap_amount -= self._furthest_sample[speaker_turn] - new_start - self._missing_overlap += self._furthest_sample[speaker_turn] - new_start - new_start = self._furthest_sample[speaker_turn] - - prev_start = start - prev_len_samples - prev_end = start - new_end = new_start + length - - # check overlap amount to calculate the actual amount of generated overlaps - overlap_amount = 0 - if is_overlap([prev_start, prev_end], [new_start, new_end]): - overlap_range = get_overlap_range([prev_start, prev_end], [new_start, new_end]) - overlap_amount = max(overlap_range[1] - overlap_range[0], 0) - - if overlap_amount < desired_overlap_amount: - self._missing_overlap += desired_overlap_amount - overlap_amount - self.sampler.running_overlap_len_samples += overlap_amount - - # if we are not adding overlap, add silence - else: - silence_amount = self.sampler.sample_from_silence_model(running_len_samples) - if start + length + silence_amount > session_len_samples and not enforce: - new_start = max(session_len_samples - length, start) - else: - new_start = start + silence_amount - return new_start - - def _get_session_meta_data(self, array: np.ndarray, snr: float) -> dict: - """ - Get meta data for the current session. - - Args: - array (np.ndarray): audio array - snr (float): signal-to-noise ratio - - Returns: - dict: meta data - """ - meta_data = { - "duration": array.shape[0] / self._params.data_simulator.sr, - "silence_mean": self.sampler.sess_silence_mean, - "overlap_mean": self.sampler.sess_overlap_mean, - "bg_snr": snr, - "speaker_ids": self._speaker_ids, - "speaker_volumes": list(self._volume), - } - return meta_data - - def _get_session_silence_from_rttm(self, rttm_list: List[str], running_len_samples: int): - """ - Calculate the total speech and silence duration in the current session using RTTM file. 
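The overlap bookkeeping in `_add_silence_or_overlap` reduces to measuring the intersection of the previous sentence's span with the shifted new span. A sketch with a plain stand-in for `is_overlap`/`get_overlap_range`:

```python
# Sketch of the overlap accounting: the achieved overlap is the length of the
# intersection of the previous sentence span and the shifted new span.
def overlap_amount(prev_span, new_span):
    """Return intersection length (in samples) of two [start, end) spans, 0 if disjoint."""
    start = max(prev_span[0], new_span[0])
    end = min(prev_span[1], new_span[1])
    return max(end - start, 0)

prev_span = (48000, 96000)       # previous sentence occupied 3..6 s at 16 kHz
new_span = (80000, 128000)       # new sentence shifted back to start at 5 s
print(overlap_amount(prev_span, new_span))   # 16000 samples -> 1 s of overlap
```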
- - Args: - rttm_list (list): - List of RTTM timestamps - running_len_samples (int): - Total number of samples generated so far in the current session - - Returns: - sess_speech_len_rttm (int): - The total number of speech samples in the current session - sess_silence_len_rttm (int): - The total number of silence samples in the current session - """ - all_sample_list = [] - for x_raw in rttm_list: - x = [token for token in x_raw.split()] - all_sample_list.append([float(x[0]), float(x[1])]) - - self._merged_speech_intervals = merge_float_intervals(all_sample_list) - total_speech_in_secs = sum([x[1] - x[0] for x in self._merged_speech_intervals]) - total_silence_in_secs = running_len_samples / self._params.data_simulator.sr - total_speech_in_secs - sess_speech_len = int(total_speech_in_secs * self._params.data_simulator.sr) - sess_silence_len = int(total_silence_in_secs * self._params.data_simulator.sr) - return sess_speech_len, sess_silence_len - - def _add_sentence_to_array( - self, start: int, length: int, array: torch.Tensor, is_speech: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, int]: - """ - Add a sentence to the session array containing time-series signal. - - Args: - start (int): Starting position in the session - length (int): Length of the sentence - array (torch.Tensor): Session array - is_speech (torch.Tensor): Session array containing speech/non-speech labels - - Returns: - array (torch.Tensor): Session array in torch.Tensor format - is_speech (torch.Tensor): Session array containing speech/non-speech labels in torch.Tensor format - """ - end = start + length - if end > len(array): # only occurs in enforce mode - array = torch.nn.functional.pad(array, (0, end - len(array))) - is_speech = torch.nn.functional.pad(is_speech, (0, end - len(is_speech))) - array[start:end] += self._sentence - is_speech[start:end] = 1 - return array, is_speech, end - - def _generate_session( - self, - idx: int, - basepath: str, - filename: str, - speaker_ids: List[str], - speaker_wav_align_map: Dict[str, list], - noise_samples: list, - device: torch.device, - enforce_counter: int = 2, - ): - """ - _generate_session function without RIR simulation. - Generate a multispeaker audio session and corresponding label files. - - Args: - idx (int): Index for current session (out of total number of sessions). - basepath (str): Path to output directory. - filename (str): Filename for output files. - speaker_ids (list): List of speaker IDs that will be used in this session. - speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments. - noise_samples (list): List of randomly sampled noise source files that will be used for generating this session. - device (torch.device): Device to use for generating this session. 
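`_get_session_silence_from_rttm` merges the accumulated speech intervals and treats the remainder of the running session length as silence. A sketch with `merge_intervals` standing in for NeMo's `merge_float_intervals`; the RTTM-style rows are made up:

```python
# Sketch of the RTTM-based accounting: merge speech intervals, then silence is
# whatever remains of the running session length.
def merge_intervals(intervals):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged

sr = 16000
rttm_rows = ["0.50 2.00 speaker_0", "1.50 3.00 speaker_1", "4.00 5.00 speaker_0"]
intervals = [[float(row.split()[0]), float(row.split()[1])] for row in rttm_rows]

speech_sec = sum(end - start for start, end in merge_intervals(intervals))   # 3.5 s
running_len_samples = int(6.0 * sr)
silence_sec = running_len_samples / sr - speech_sec                          # 2.5 s
print(int(speech_sec * sr), int(silence_sec * sr))
```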
- enforce_counter (int): In enforcement mode, dominance is increased by a factor of enforce_counter for unrepresented speakers - """ - random_seed = self._params.data_simulator.random_seed - np.random.seed(random_seed + idx) - - self._device = device - speaker_dominance = self._get_speaker_dominance() # randomly determine speaker dominance - base_speaker_dominance = np.copy(speaker_dominance) - self._set_speaker_volume() - - running_len_samples, prev_len_samples = 0, 0 - prev_speaker = None - self.annotator.init_annotation_lists() - self._noise_samples = noise_samples - self._furthest_sample = [0 for n in range(self._params.data_simulator.session_config.num_speakers)] - self._missing_silence = 0 - - # hold enforce until all speakers have spoken - enforce_time = np.random.uniform( - self._params.data_simulator.speaker_enforcement.enforce_time[0], - self._params.data_simulator.speaker_enforcement.enforce_time[1], - ) - enforce = self._params.data_simulator.speaker_enforcement.enforce_num_speakers - - session_len_samples = int( - (self._params.data_simulator.session_config.session_length * self._params.data_simulator.sr) - ) - array = torch.zeros(session_len_samples).to(self._device) - is_speech = torch.zeros(session_len_samples).to(self._device) - - self.sampler.get_session_silence_mean() - self.sampler.get_session_overlap_mean() - - while running_len_samples < session_len_samples or enforce: - # Step 1: Prepare parameters for sentence generation - # Enforce speakers depending on running length - if running_len_samples > enforce_time * session_len_samples and enforce: - speaker_dominance, enforce = self._increase_speaker_dominance(base_speaker_dominance, enforce_counter) - if enforce: - enforce_counter += 1 - - # Step 2: Select a speaker - speaker_turn = self._get_next_speaker(prev_speaker, speaker_dominance) - - # Calculate parameters for building a sentence (only add if remaining length > specific time) - max_samples_in_sentence = session_len_samples - running_len_samples - if enforce: - max_samples_in_sentence = float('inf') - elif ( - max_samples_in_sentence - < self._params.data_simulator.session_params.end_buffer * self._params.data_simulator.sr - ): - break - - # Step 3: Generate a sentence - self._build_sentence(speaker_turn, speaker_ids, speaker_wav_align_map, max_samples_in_sentence) - length = len(self._sentence) - - # Step 4: Generate a timestamp for either silence or overlap - start = self._add_silence_or_overlap( - speaker_turn=speaker_turn, - prev_speaker=prev_speaker, - start=running_len_samples, - length=length, - session_len_samples=session_len_samples, - prev_len_samples=prev_len_samples, - enforce=enforce, - ) - # step 5: add sentence to array - array, is_speech, end = self._add_sentence_to_array( - start=start, length=length, array=array, is_speech=is_speech, - ) - - # Step 6: Build entries for output files - new_rttm_entries = self.annotator.create_new_rttm_entry( - words=self._words, - alignments=self._alignments, - start=start / self._params.data_simulator.sr, - end=end / self._params.data_simulator.sr, - speaker_id=speaker_ids[speaker_turn], - ) - - self.annotator.annote_lists['rttm'].extend(new_rttm_entries) - - new_json_entry = self.annotator.create_new_json_entry( - text=self._text, - wav_filename=os.path.join(basepath, filename + '.wav'), - start=start / self._params.data_simulator.sr, - length=length / self._params.data_simulator.sr, - speaker_id=speaker_ids[speaker_turn], - rttm_filepath=os.path.join(basepath, filename + '.rttm'), - 
ctm_filepath=os.path.join(basepath, filename + '.ctm'), - ) - self.annotator.annote_lists['json'].append(new_json_entry) - - new_ctm_entries = self.annotator.create_new_ctm_entry( - words=self._words, - alignments=self._alignments, - session_name=filename, - speaker_id=speaker_ids[speaker_turn], - start=int(start / self._params.data_simulator.sr), - ) - - self.annotator.annote_lists['ctm'].extend(new_ctm_entries) - - running_len_samples = np.maximum(running_len_samples, end) - ( - self.sampler.running_speech_len_samples, - self.sampler.running_silence_len_samples, - ) = self._get_session_silence_from_rttm( - rttm_list=self.annotator.annote_lists['rttm'], running_len_samples=running_len_samples - ) - - self._furthest_sample[speaker_turn] = running_len_samples - prev_speaker = speaker_turn - prev_len_samples = length - - # Step 7-1: Add optional perturbations to the whole session, such as white noise. - if self._params.data_simulator.session_augmentor.add_sess_aug: - # NOTE: This perturbation is not reflected in the session SNR in meta dictionary. - array = perturb_audio(array, self._params.data_simulator.sr, self.session_augmentor, device=array.device) - - # Step 7-2: Additive background noise from noise manifest files - if self._params.data_simulator.background_noise.add_bg: - if len(self._noise_samples) > 0: - avg_power_array = torch.mean(array[is_speech == 1] ** 2) - bg, snr = get_background_noise( - len_array=len(array), - power_array=avg_power_array, - noise_samples=self._noise_samples, - audio_read_buffer_dict=self._audio_read_buffer_dict, - snr_min=self._params.data_simulator.background_noise.snr_min, - snr_max=self._params.data_simulator.background_noise.snr_max, - background_noise_snr=self._params.data_simulator.background_noise.snr, - seed=(random_seed + idx), - device=self._device, - ) - array += bg - else: - raise ValueError('No background noise samples found in self._noise_samples.') - else: - snr = "N/A" - - # Step 7: Normalize and write to disk - array = normalize_audio(array) - - if torch.is_tensor(array): - array = array.cpu().numpy() - sf.write(os.path.join(basepath, filename + '.wav'), array, self._params.data_simulator.sr) - - self.annotator.write_annotation_files( - basepath=basepath, filename=filename, meta_data=self._get_session_meta_data(array=array, snr=snr), - ) - - # Step 8: Clean up memory - del array - self.clean_up() - return basepath, filename - - def generate_sessions(self, random_seed: int = None): - """ - Generate several multispeaker audio sessions and corresponding list files. 
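Step 7-2 scales background noise to a sampled SNR inside `get_background_noise`. The exact scaling is not shown here, so the sketch below uses the standard recipe (scale the noise so speech power over noise power matches the target SNR in dB) purely as an illustration, not as the deleted implementation:

```python
# Common SNR-mixing recipe (illustration only, not necessarily get_background_noise):
# scale a noise track so that speech-power / noise-power hits a target SNR in dB.
import torch

def mix_at_snr(speech: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    noise = noise[: len(speech)]
    speech_power = torch.mean(speech ** 2)
    noise_power = torch.mean(noise ** 2) + 1e-12
    scale = torch.sqrt(speech_power / (noise_power * 10 ** (snr_db / 10.0)))
    return speech + scale * noise

speech = torch.randn(16000) * 0.1     # placeholder speech
noise = torch.randn(16000) * 0.3      # placeholder background noise
noisy = mix_at_snr(speech, noise, snr_db=20.0)
print(noisy.shape)
```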
- - Args: - random_seed (int): random seed for reproducibility - """ - logging.info(f"Generating Diarization Sessions") - if random_seed is None: - random_seed = self._params.data_simulator.random_seed - np.random.seed(random_seed) - - output_dir = self._params.data_simulator.outputs.output_dir - - basepath = get_cleaned_base_path( - output_dir, overwrite_output=self._params.data_simulator.outputs.overwrite_output - ) - OmegaConf.save(self._params, os.path.join(output_dir, "params.yaml")) - - tp = concurrent.futures.ProcessPoolExecutor(max_workers=self.num_workers) - futures = [] - - num_sessions = self._params.data_simulator.session_config.num_sessions - source_noise_manifest = read_noise_manifest( - add_bg=self._params.data_simulator.background_noise.add_bg, - background_manifest=self._params.data_simulator.background_noise.background_manifest, - ) - queue = [] - - # add radomly sampled arguments to a list(queue) for multiprocessing - for sess_idx in range(num_sessions): - filename = self._params.data_simulator.outputs.output_filename + f"_{sess_idx}" - speaker_ids = get_speaker_ids( - sess_idx=sess_idx, - speaker_samples=self._speaker_samples, - permutated_speaker_inds=self._permutated_speaker_inds, - ) - speaker_wav_align_map = get_speaker_samples(speaker_ids=speaker_ids, speaker_samples=self._speaker_samples) - noise_samples = self.sampler.sample_noise_manifest(noise_manifest=source_noise_manifest) - - if torch.cuda.is_available(): - device = torch.device(f"cuda:{sess_idx % torch.cuda.device_count()}") - else: - device = self._device - queue.append((sess_idx, basepath, filename, speaker_ids, speaker_wav_align_map, noise_samples, device)) - - # for multiprocessing speed, we avoid loading potentially huge manifest list and speaker sample files into each process. 
- if self.num_workers > 1: - self._manifest = None - self._speaker_samples = None - - # Chunk the sessions into smaller chunks for very large number of sessions (10K+ sessions) - for chunk_idx in range(self.chunk_count): - futures = [] - stt_idx, end_idx = ( - chunk_idx * self.multiprocessing_chunksize, - min((chunk_idx + 1) * self.multiprocessing_chunksize, num_sessions), - ) - for sess_idx in range(stt_idx, end_idx): - self._furthest_sample = [0 for n in range(self._params.data_simulator.session_config.num_speakers)] - self._audio_read_buffer_dict = {} - if self.num_workers > 1: - futures.append(tp.submit(self._generate_session, *queue[sess_idx])) - else: - futures.append(queue[sess_idx]) - - if self.num_workers > 1: - generator = concurrent.futures.as_completed(futures) - else: - generator = futures - - for future in tqdm( - generator, - desc=f"[{chunk_idx+1}/{self.chunk_count}] Waiting jobs from {stt_idx+1: 2} to {end_idx: 2}", - unit="jobs", - total=len(futures), - ): - if self.num_workers > 1: - basepath, filename = future.result() - else: - self._noise_samples = self.sampler.sample_noise_manifest(noise_manifest=source_noise_manifest,) - basepath, filename = self._generate_session(*future) - - self.annotator.add_to_filename_lists(basepath=basepath, filename=filename) - - # throw warning if number of speakers is less than requested - self._check_missing_speakers() - - tp.shutdown() - self.annotator.write_filelist_files(basepath=basepath) - logging.info(f"Data simulation has been completed, results saved at: {basepath}") - - -class RIRMultiSpeakerSimulator(MultiSpeakerSimulator): - """ - RIR Augmented Multispeaker Audio Session Simulator - simulates multispeaker audio sessions using single-speaker - audio files and corresponding word alignments, as well as simulated RIRs for augmentation. - - Args: - cfg: OmegaConf configuration loaded from yaml file. 
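`generate_sessions` splits the sessions into chunks and submits each chunk to a `ProcessPoolExecutor`. A stripped-down sketch of the same submit/`as_completed` pattern, with a trivial picklable worker in place of `_generate_session`:

```python
# Stripped-down sketch of the chunked submit / as_completed pattern used above.
import concurrent.futures
from tqdm import tqdm

def fake_generate_session(sess_idx: int, basepath: str, filename: str):
    return basepath, f"{filename}_{sess_idx}"

if __name__ == "__main__":
    queue = [(i, "/tmp/out", "session") for i in range(100)]   # placeholder arguments
    chunksize, num_workers = 25, 4

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as tp:
        for chunk_start in range(0, len(queue), chunksize):
            chunk = queue[chunk_start : chunk_start + chunksize]
            futures = [tp.submit(fake_generate_session, *args) for args in chunk]
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
                basepath, filename = future.result()
```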
- - Parameters (in addition to the base MultiSpeakerSimulator parameters): - rir_generation: - use_rir (bool): Whether to generate synthetic RIR - toolkit (str): Which toolkit to use ("pyroomacoustics", "gpuRIR") - room_config: - room_sz (list): Size of the shoebox room environment (1d array for specific, 2d array for random range to be - sampled from) - pos_src (list): Positions of the speakers in the simulated room environment (2d array for specific, 3d array - for random ranges to be sampled from) - noise_src_pos (list): Position in room for the ambient background noise source - mic_config: - num_channels (int): Number of output audio channels - pos_rcv (list): Microphone positions in the simulated room environment (1d/2d array for specific, 2d/3d array - for range assuming num_channels is 1/2+) - orV_rcv (list or null): Microphone orientations (needed for non-omnidirectional microphones) - mic_pattern (str): Microphone type ("omni" - omnidirectional) - currently only omnidirectional microphones are - supported for pyroomacoustics - absorbtion_params: (Note that only `T60` is used for pyroomacoustics simulations) - abs_weights (list): Absorption coefficient ratios for each surface - T60 (float): Room reverberation time (`T60` is the time it takes for the RIR to decay by 60DB) - att_diff (float): Starting attenuation (if this is different than att_max, the diffuse reverberation model is - used by gpuRIR) - att_max (float): End attenuation when using the diffuse reverberation model (gpuRIR) - """ - - def __init__(self, cfg): - super().__init__(cfg) - self._check_args_rir() - - def _check_args_rir(self): - """ - Checks RIR YAML arguments to ensure they are within valid ranges - """ - - if not (self._params.data_simulator.rir_generation.toolkit in ['pyroomacoustics', 'gpuRIR']): - raise Exception("Toolkit must be pyroomacoustics or gpuRIR") - if self._params.data_simulator.rir_generation.toolkit == 'pyroomacoustics' and not PRA: - raise ImportError("pyroomacoustics should be installed to run this simulator with RIR augmentation") - - if self._params.data_simulator.rir_generation.toolkit == 'gpuRIR' and not GPURIR: - raise ImportError("gpuRIR should be installed to run this simulator with RIR augmentation") - - if len(self._params.data_simulator.rir_generation.room_config.room_sz) != 3: - raise Exception("Incorrect room dimensions provided") - if self._params.data_simulator.rir_generation.mic_config.num_channels == 0: - raise Exception("Number of channels should be greater or equal to 1") - if len(self._params.data_simulator.rir_generation.room_config.pos_src) < 2: - raise Exception("Less than 2 provided source positions") - for sublist in self._params.data_simulator.rir_generation.room_config.pos_src: - if len(sublist) != 3: - raise Exception("Three coordinates must be provided for sources positions") - if len(self._params.data_simulator.rir_generation.mic_config.pos_rcv) == 0: - raise Exception("No provided mic positions") - for sublist in self._params.data_simulator.rir_generation.room_config.pos_src: - if len(sublist) != 3: - raise Exception("Three coordinates must be provided for mic positions") - - if self._params.data_simulator.session_config.num_speakers != len( - self._params.data_simulator.rir_generation.room_config.pos_src - ): - raise Exception("Number of speakers is not equal to the number of provided source positions") - if self._params.data_simulator.rir_generation.mic_config.num_channels != len( - self._params.data_simulator.rir_generation.mic_config.pos_rcv - ): - raise 
Exception("Number of channels is not equal to the number of provided microphone positions") - - if ( - not self._params.data_simulator.rir_generation.mic_config.orV_rcv - and self._params.data_simulator.rir_generation.mic_config.mic_pattern != 'omni' - ): - raise Exception("Microphone orientations must be provided if mic_pattern != omni") - if self._params.data_simulator.rir_generation.mic_config.orV_rcv is not None: - if len(self._params.data_simulator.rir_generation.mic_config.orV_rcv) != len( - self._params.data_simulator.rir_generation.mic_config.pos_rcv - ): - raise Exception("A different number of microphone orientations and microphone positions were provided") - for sublist in self._params.data_simulator.rir_generation.mic_config.orV_rcv: - if len(sublist) != 3: - raise Exception("Three coordinates must be provided for orientations") - - def _generate_rir_gpuRIR(self): - """ - Create simulated RIR using the gpuRIR library - - Returns: - RIR (tensor): Generated RIR - RIR_pad (int): Length of padding added when convolving the RIR with an audio file - """ - room_sz_tmp = np.array(self._params.data_simulator.rir_generation.room_config.room_sz) - if room_sz_tmp.ndim == 2: # randomize - room_sz = np.zeros(room_sz_tmp.shape[0]) - for i in range(room_sz_tmp.shape[0]): - room_sz[i] = np.random.uniform(room_sz_tmp[i, 0], room_sz_tmp[i, 1]) - else: - room_sz = room_sz_tmp - - pos_src_tmp = np.array(self._params.data_simulator.rir_generation.room_config.pos_src) - if pos_src_tmp.ndim == 3: # randomize - pos_src = np.zeros((pos_src_tmp.shape[0], pos_src_tmp.shape[1])) - for i in range(pos_src_tmp.shape[0]): - for j in range(pos_src_tmp.shape[1]): - pos_src[i] = np.random.uniform(pos_src_tmp[i, j, 0], pos_src_tmp[i, j, 1]) - else: - pos_src = pos_src_tmp - - if self._params.data_simulator.background_noise.add_bg: - pos_src = np.vstack((pos_src, self._params.data_simulator.rir_generation.room_config.noise_src_pos)) - - mic_pos_tmp = np.array(self._params.data_simulator.rir_generation.mic_config.pos_rcv) - if mic_pos_tmp.ndim == 3: # randomize - mic_pos = np.zeros((mic_pos_tmp.shape[0], mic_pos_tmp.shape[1])) - for i in range(mic_pos_tmp.shape[0]): - for j in range(mic_pos_tmp.shape[1]): - mic_pos[i] = np.random.uniform(mic_pos_tmp[i, j, 0], mic_pos_tmp[i, j, 1]) - else: - mic_pos = mic_pos_tmp - - orV_rcv = self._params.data_simulator.rir_generation.mic_config.orV_rcv - if orV_rcv: # not needed for omni mics - orV_rcv = np.array(orV_rcv) - mic_pattern = self._params.data_simulator.rir_generation.mic_config.mic_pattern - abs_weights = self._params.data_simulator.rir_generation.absorbtion_params.abs_weights - T60 = self._params.data_simulator.rir_generation.absorbtion_params.T60 - att_diff = self._params.data_simulator.rir_generation.absorbtion_params.att_diff - att_max = self._params.data_simulator.rir_generation.absorbtion_params.att_max - sr = self._params.data_simulator.sr - - beta = beta_SabineEstimation(room_sz, T60, abs_weights=abs_weights) # Reflection coefficients - Tdiff = att2t_SabineEstimator(att_diff, T60) # Time to start the diffuse reverberation model [s] - Tmax = att2t_SabineEstimator(att_max, T60) # Time to stop the simulation [s] - nb_img = t2n(Tdiff, room_sz) # Number of image sources in each dimension - RIR = simulateRIR( - room_sz, beta, pos_src, mic_pos, nb_img, Tmax, sr, Tdiff=Tdiff, orV_rcv=orV_rcv, mic_pattern=mic_pattern - ) - RIR_pad = RIR.shape[2] - 1 - return RIR, RIR_pad - - def _generate_rir_pyroomacoustics(self) -> Tuple[torch.Tensor, int]: - """ - Create simulated 
RIR using the pyroomacoustics library - - Returns: - RIR (tensor): Generated RIR - RIR_pad (int): Length of padding added when convolving the RIR with an audio file - """ - - rt60 = self._params.data_simulator.rir_generation.absorbtion_params.T60 # The desired reverberation time - sr = self._params.data_simulator.sr - - room_sz_tmp = np.array(self._params.data_simulator.rir_generation.room_config.room_sz) - if room_sz_tmp.ndim == 2: # randomize - room_sz = np.zeros(room_sz_tmp.shape[0]) - for i in range(room_sz_tmp.shape[0]): - room_sz[i] = np.random.uniform(room_sz_tmp[i, 0], room_sz_tmp[i, 1]) - else: - room_sz = room_sz_tmp - - pos_src_tmp = np.array(self._params.data_simulator.rir_generation.room_config.pos_src) - if pos_src_tmp.ndim == 3: # randomize - pos_src = np.zeros((pos_src_tmp.shape[0], pos_src_tmp.shape[1])) - for i in range(pos_src_tmp.shape[0]): - for j in range(pos_src_tmp.shape[1]): - pos_src[i] = np.random.uniform(pos_src_tmp[i, j, 0], pos_src_tmp[i, j, 1]) - else: - pos_src = pos_src_tmp - - # We invert Sabine's formula to obtain the parameters for the ISM simulator - e_absorption, max_order = pra.inverse_sabine(rt60, room_sz) - room = pra.ShoeBox(room_sz, fs=sr, materials=pra.Material(e_absorption), max_order=max_order) - - if self._params.data_simulator.background_noise.add_bg: - pos_src = np.vstack((pos_src, self._params.data_simulator.rir_generation.room_config.noise_src_pos)) - for pos in pos_src: - room.add_source(pos) - - # currently only supports omnidirectional microphones - mic_pattern = self._params.data_simulator.rir_generation.mic_config.mic_pattern - if self._params.data_simulator.rir_generation.mic_config.mic_pattern == 'omni': - mic_pattern = DirectivityPattern.OMNI - dir_vec = DirectionVector(azimuth=0, colatitude=90, degrees=True) - dir_obj = CardioidFamily(orientation=dir_vec, pattern_enum=mic_pattern,) - - mic_pos_tmp = np.array(self._params.data_simulator.rir_generation.mic_config.pos_rcv) - if mic_pos_tmp.ndim == 3: # randomize - mic_pos = np.zeros((mic_pos_tmp.shape[0], mic_pos_tmp.shape[1])) - for i in range(mic_pos_tmp.shape[0]): - for j in range(mic_pos_tmp.shape[1]): - mic_pos[i] = np.random.uniform(mic_pos_tmp[i, j, 0], mic_pos_tmp[i, j, 1]) - else: - mic_pos = mic_pos_tmp - - room.add_microphone_array(mic_pos.T, directivity=dir_obj) - - room.compute_rir() - rir_pad = 0 - for channel in room.rir: - for pos in channel: - if pos.shape[0] - 1 > rir_pad: - rir_pad = pos.shape[0] - 1 - return room.rir, rir_pad - - def _convolve_rir(self, input, speaker_turn: int, RIR: torch.Tensor) -> Tuple[list, int]: - """ - Augment one sentence (or background noise segment) using a synthetic RIR. - - Args: - input (torch.tensor): Input audio. - speaker_turn (int): Current speaker turn. - RIR (torch.tensor): Room Impulse Response. 
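The pyroomacoustics path above amounts to `inverse_sabine` → `ShoeBox` → add sources and microphones → `compute_rir`. A minimal end-to-end sketch with hypothetical room geometry and mic positions:

```python
# Minimal pyroomacoustics RIR sketch mirroring the calls above; the room size,
# source and mic positions are hypothetical placeholders.
import numpy as np
import pyroomacoustics as pra

sr, rt60 = 16000, 0.4
room_sz = [6.0, 4.0, 3.0]

e_absorption, max_order = pra.inverse_sabine(rt60, room_sz)
room = pra.ShoeBox(room_sz, fs=sr, materials=pra.Material(e_absorption), max_order=max_order)

room.add_source([2.0, 1.5, 1.5])                           # speaker 1
room.add_source([4.0, 2.5, 1.5])                           # speaker 2

mic_pos = np.array([[3.0, 2.0, 1.2], [3.1, 2.0, 1.2]])     # two mics, shape (num_mics, 3)
room.add_microphone_array(mic_pos.T)                       # pyroomacoustics expects (3, num_mics)

room.compute_rir()
# room.rir[channel][source] is a 1-D array; the longest one sets the padding
rir_pad = max(len(rir) - 1 for channel in room.rir for rir in channel)
print(rir_pad)
```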
- Returns: - output_sound (list): List of tensors containing augmented audio - length (int): Length of output audio channels (or of the longest if they have different lengths) - """ - output_sound = [] - length = 0 - for channel in range(self._params.data_simulator.rir_generation.mic_config.num_channels): - if self._params.data_simulator.rir_generation.toolkit == 'gpuRIR': - out_channel = convolve(input, RIR[speaker_turn, channel, : len(input)]).tolist() - elif self._params.data_simulator.rir_generation.toolkit == 'pyroomacoustics': - out_channel = convolve(input, RIR[channel][speaker_turn][: len(input)]).tolist() - if len(out_channel) > length: - length = len(out_channel) - output_sound.append(torch.tensor(out_channel)) - return output_sound, length - - def _generate_session( - self, - idx: int, - basepath: str, - filename: str, - speaker_ids: list, - speaker_wav_align_map: dict, - noise_samples: list, - device: torch.device, - enforce_counter: int = 2, - ): - """ - Generate a multispeaker audio session and corresponding label files. - - Args: - idx (int): Index for current session (out of total number of sessions). - basepath (str): Path to output directory. - filename (str): Filename for output files. - speaker_ids (list): List of speaker IDs that will be used in this session. - speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments. - noise_samples (list): List of randomly sampled noise source files that will be used for generating this session. - device (torch.device): Device to use for generating this session. - enforce_counter (int): In enforcement mode, dominance is increased by a factor of enforce_counter for unrepresented speakers - """ - random_seed = self._params.data_simulator.random_seed - np.random.seed(random_seed + idx) - - self._device = device - speaker_dominance = self._get_speaker_dominance() # randomly determine speaker dominance - base_speaker_dominance = np.copy(speaker_dominance) - self._set_speaker_volume() - - running_len_samples, prev_len_samples = 0, 0 # starting point for each sentence - prev_speaker = None - self.annotator.init_annotation_lists() - self._noise_samples = noise_samples - self._furthest_sample = [0 for n in range(self._params.data_simulator.session_config.num_speakers)] - - # Room Impulse Response Generation (performed once per batch of sessions) - if self._params.data_simulator.rir_generation.toolkit == 'gpuRIR': - RIR, RIR_pad = self._generate_rir_gpuRIR() - elif self._params.data_simulator.rir_generation.toolkit == 'pyroomacoustics': - RIR, RIR_pad = self._generate_rir_pyroomacoustics() - else: - raise Exception("Toolkit must be pyroomacoustics or gpuRIR") - - # hold enforce until all speakers have spoken - enforce_time = np.random.uniform( - self._params.data_simulator.speaker_enforcement.enforce_time[0], - self._params.data_simulator.speaker_enforcement.enforce_time[1], - ) - enforce = self._params.data_simulator.speaker_enforcement.enforce_num_speakers - - session_len_samples = int( - (self._params.data_simulator.session_config.session_length * self._params.data_simulator.sr) - ) - array = torch.zeros((session_len_samples, self._params.data_simulator.rir_generation.mic_config.num_channels)) - is_speech = torch.zeros(session_len_samples) - - while running_len_samples < session_len_samples or enforce: - # Step 1: Prepare parameters for sentence generation - # Enforce speakers depending on running length - if running_len_samples > enforce_time * session_len_samples and enforce: - 
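`_convolve_rir` convolves the dry sentence with each channel's impulse response, so the wet signal grows by `len(RIR) - 1` samples. A one-channel sketch using `scipy.signal.fftconvolve` as a stand-in for the module's `convolve` helper, with a toy impulse response:

```python
# One-channel sketch of RIR augmentation: the wet signal is the dry sentence
# convolved with that channel's impulse response.
import numpy as np
from scipy.signal import fftconvolve

sr = 16000
dry = np.random.randn(sr)                 # 1 s placeholder dry sentence
rir = np.zeros(2400)                      # 150 ms toy impulse response
rir[0] = 1.0
rir[800] = 0.5                            # a single 50 ms echo

wet = fftconvolve(dry, rir)
print(len(wet), len(dry) + len(rir) - 1)  # wet is RIR_pad = len(rir) - 1 samples longer
```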
speaker_dominance, enforce = self._increase_speaker_dominance(base_speaker_dominance, enforce_counter) - if enforce: - enforce_counter += 1 - - # Step 2: Select a speaker - speaker_turn = self._get_next_speaker(prev_speaker, speaker_dominance) - - # Calculate parameters for building a sentence (only add if remaining length > specific time) - max_samples_in_sentence = ( - session_len_samples - running_len_samples - RIR_pad - ) # sentence will be RIR_len - 1 longer than the audio was pre-augmentation - if enforce: - max_samples_in_sentence = float('inf') - elif ( - max_samples_in_sentence - < self._params.data_simulator.session_params.end_buffer * self._params.data_simulator.sr - ): - break - - # Step 3: Generate a sentence - self._build_sentence(speaker_turn, speaker_ids, speaker_wav_align_map, max_samples_in_sentence) - augmented_sentence, length = self._convolve_rir(self._sentence, speaker_turn, RIR) - - # Step 4: Generate a time-stamp for either silence or overlap - start = self._add_silence_or_overlap( - speaker_turn=speaker_turn, - prev_speaker=prev_speaker, - start=running_len_samples, - length=length, - session_len_samples=session_len_samples, - prev_len_samples=prev_len_samples, - enforce=enforce, - ) - # step 5: add sentence to array - end = start + length - if end > len(array): - array = torch.nn.functional.pad(array, (0, 0, 0, end - len(array))) - is_speech = torch.nn.functional.pad(is_speech, (0, end - len(is_speech))) - is_speech[start:end] = 1 - - for channel in range(self._params.data_simulator.rir_generation.mic_config.num_channels): - len_ch = len(augmented_sentence[channel]) # accounts for how channels are slightly different lengths - array[start : start + len_ch, channel] += augmented_sentence[channel] - - # Step 6: Build entries for output files - new_rttm_entries = self.annotator.create_new_rttm_entry( - self._words, - self._alignments, - start / self._params.data_simulator.sr, - end / self._params.data_simulator.sr, - speaker_ids[speaker_turn], - ) - - self.annotator.annote_lists['rttm'].extend(new_rttm_entries) - - new_json_entry = self.annotator.create_new_json_entry( - self._text, - os.path.join(basepath, filename + '.wav'), - start / self._params.data_simulator.sr, - length / self._params.data_simulator.sr, - speaker_ids[speaker_turn], - os.path.join(basepath, filename + '.rttm'), - os.path.join(basepath, filename + '.ctm'), - ) - self.annotator.annote_lists['json'].append(new_json_entry) - - new_ctm_entries = self.annotator.create_new_ctm_entry( - filename, speaker_ids[speaker_turn], start / self._params.data_simulator.sr - ) - self.annotator.annote_lists['ctm'].extend(new_ctm_entries) - - running_len_samples = np.maximum(running_len_samples, end) - self._furthest_sample[speaker_turn] = running_len_samples - prev_speaker = speaker_turn - prev_len_samples = length - - # Step 7-1: Add optional perturbations to the whole session, such as white noise. - if self._params.data_simulator.session_augmentor.add_sess_aug: - # NOTE: This perturbation is not reflected in the session SNR in meta dictionary. 
- array = perturb_audio(array, self._params.data_simulator.sr, self.session_augmentor) - - # Step 7-2: Additive background noise from noise manifest files - if self._params.data_simulator.background_noise.add_bg: - if len(self._noise_samples) > 0: - avg_power_array = torch.mean(array[is_speech == 1] ** 2) - bg, snr = get_background_noise( - len_array=len(array), - power_array=avg_power_array, - noise_samples=self._noise_samples, - audio_read_buffer_dict=self._audio_read_buffer_dict, - snr_min=self._params.data_simulator.background_noise.snr_min, - snr_max=self._params.data_simulator.background_noise.snr_max, - background_noise_snr=self._params.data_simulator.background_noise.snr, - seed=(random_seed + idx), - device=self._device, - ) - array += bg - length = array.shape[0] - bg, snr = self._get_background(length, avg_power_array) - augmented_bg, _ = self._convolve_rir(bg, -1, RIR) - for channel in range(self._params.data_simulator.rir_generation.mic_config.num_channels): - array[:, channel] += augmented_bg[channel][:length] - else: - snr = "N/A" - - # Step 7: Normalize and write to disk - array = normalize_audio(array) - - if torch.is_tensor(array): - array = array.cpu().numpy() - sf.write(os.path.join(basepath, filename + '.wav'), array, self._params.data_simulator.sr) - - self.annotator.write_annotation_files( - basepath=basepath, filename=filename, meta_data=self._get_session_meta_data(array=array, snr=snr), - ) - - del array - self.clean_up() - return basepath, filename - - -def check_angle(key: str, val: Union[float, Iterable[float]]) -> bool: - """Check if the angle value is within the expected range. Input - values are in degrees. - - Note: - azimuth: angle between a projection on the horizontal (xy) plane and - positive x axis. Increases counter-clockwise. Range: [-180, 180]. - elevation: angle between a vector an its projection on the horizontal (xy) plane. - Positive above, negative below, i.e., north=+90, south=-90. Range: [-90, 90] - yaw: rotation around the z axis. Defined accoding to right-hand rule. - Range: [-180, 180] - pitch: rotation around the yʹ axis. Defined accoding to right-hand rule. - Range: [-90, 90] - roll: rotation around the xʺ axis. Defined accoding to right-hand rule. - Range: [-180, 180] - - Args: - key: angle type - val: values in degrees - - Returns: - True if all values are within the expected range. - """ - if np.isscalar(val): - min_val = max_val = val - else: - min_val = min(val) - max_val = max(val) - - if key == 'azimuth' and -180 <= min_val <= max_val <= 180: - return True - if key == 'elevation' and -90 <= min_val <= max_val <= 90: - return True - if key == 'yaw' and -180 <= min_val <= max_val <= 180: - return True - if key == 'pitch' and -90 <= min_val <= max_val <= 90: - return True - if key == 'roll' and -180 <= min_val <= max_val <= 180: - return True - - raise ValueError(f'Invalid value for angle {key} = {val}') - - -def wrap_to_180(angle: float) -> float: - """Wrap an angle to range ±180 degrees. - - Args: - angle: angle in degrees - - Returns: - Angle in degrees wrapped to ±180 degrees. - """ - return angle - np.floor(angle / 360 + 1 / 2) * 360 - - -class ArrayGeometry(object): - """A class to simplify handling of array geometry. - - Supports translation and rotation of the array and calculation of - spherical coordinates of a given point relative to the internal - coordinate system of the array. - - Args: - mic_positions: 3D coordinates, with shape (num_mics, 3) - center: optional position of the center of the array. 
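A quick check of the `wrap_to_180` formula above on a few angles:

```python
# Quick check of the wrap_to_180 formula on a few angles (degrees).
import numpy as np

def wrap_to_180(angle: float) -> float:
    return angle - np.floor(angle / 360 + 1 / 2) * 360

for angle in (0, 90, 181, 270, -190, 720):
    print(angle, "->", wrap_to_180(angle))
# 181 -> -179, 270 -> -90, -190 -> 170, 720 -> 0
```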
Defaults to the average of the coordinates. - internal_cs: internal coordinate system for the array relative to the global coordinate system. - Defaults to (x, y, z), and is rotated with the array. - """ - - def __init__( - self, - mic_positions: Union[np.ndarray, List], - center: Optional[np.ndarray] = None, - internal_cs: Optional[np.ndarray] = None, - ): - if isinstance(mic_positions, Iterable): - mic_positions = np.array(mic_positions) - - if not mic_positions.ndim == 2: - raise ValueError( - f'Expecting a 2D array specifying mic positions, but received {mic_positions.ndim}-dim array' - ) - - if not mic_positions.shape[1] == 3: - raise ValueError(f'Expecting 3D positions, but received {mic_positions.shape[1]}-dim positions') - - mic_positions_center = np.mean(mic_positions, axis=0) - self.centered_positions = mic_positions - mic_positions_center - self.center = mic_positions_center if center is None else center - - # Internal coordinate system - if internal_cs is None: - # Initially aligned with the global - self.internal_cs = np.eye(3) - else: - self.internal_cs = internal_cs - - @property - def num_mics(self): - """Return the number of microphones for the current array. - """ - return self.centered_positions.shape[0] - - @property - def positions(self): - """Absolute positions of the microphones. - """ - return self.centered_positions + self.center - - @property - def internal_positions(self): - """Positions in the internal coordinate system. - """ - return np.matmul(self.centered_positions, self.internal_cs.T) - - @property - def radius(self): - """Radius of the array, relative to the center. - """ - return max(np.linalg.norm(self.centered_positions, axis=1)) - - @staticmethod - def get_rotation(yaw: float = 0, pitch: float = 0, roll: float = 0) -> Rotation: - """Get a Rotation object for given angles. - - All angles are defined according to the right-hand rule. - - Args: - yaw: rotation around the z axis - pitch: rotation around the yʹ axis - roll: rotation around the xʺ axis - - Returns: - A rotation object constructed using the provided angles. - """ - check_angle('yaw', yaw) - check_angle('pitch', pitch) - check_angle('roll', roll) - - return Rotation.from_euler('ZYX', [yaw, pitch, roll], degrees=True) - - def translate(self, to: np.ndarray): - """Translate the array center to a new point. - - Translation does not change the centered positions or the internal coordinate system. - - Args: - to: 3D point, shape (3,) - """ - self.center = to - - def rotate(self, yaw: float = 0, pitch: float = 0, roll: float = 0): - """Apply rotation on the mic array. - - This rotates the centered microphone positions and the internal - coordinate system, it doesn't change the center of the array. - - All angles are defined according to the right-hand rule. - For example, this means that a positive pitch will result in a rotation from z - to x axis, which will result in a reduced elevation with respect to the global - horizontal plane. 
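`ArrayGeometry.get_rotation` builds an intrinsic 'ZYX' (yaw, pitch, roll) rotation and `rotate` applies it to the centered mic positions. A sketch using the same scipy calls on a hypothetical two-microphone array:

```python
# Sketch of the yaw/pitch/roll rotation used by ArrayGeometry: intrinsic 'ZYX'
# Euler angles applied to centered mic positions via scipy's Rotation.
import numpy as np
from scipy.spatial.transform import Rotation

centered_positions = np.array([[0.05, 0.0, 0.0],     # 2-mic array along the x axis,
                               [-0.05, 0.0, 0.0]])   # 10 cm aperture, centered at origin

rotation = Rotation.from_euler('ZYX', [90, 0, 0], degrees=True)   # yaw=90, pitch=0, roll=0
rotated = rotation.apply(centered_positions)

print(np.round(rotated, 3))   # the array now lies along the y axis
```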
- - Args: - yaw: rotation around the z axis - pitch: rotation around the yʹ axis - roll: rotation around the xʺ axis - """ - # construct rotation using TB angles - rotation = self.get_rotation(yaw=yaw, pitch=pitch, roll=roll) - - # rotate centered positions - self.centered_positions = rotation.apply(self.centered_positions) - - # apply the same transformation on the internal coordinate system - self.internal_cs = rotation.apply(self.internal_cs) - - def new_rotated_array(self, yaw: float = 0, pitch: float = 0, roll: float = 0): - """Create a new array by rotating this array. - - Args: - yaw: rotation around the z axis - pitch: rotation around the yʹ axis - roll: rotation around the xʺ axis - - Returns: - A new ArrayGeometry object constructed using the provided angles. - """ - new_array = ArrayGeometry(mic_positions=self.positions, center=self.center, internal_cs=self.internal_cs) - new_array.rotate(yaw=yaw, pitch=pitch, roll=roll) - return new_array - - def spherical_relative_to_array( - self, point: np.ndarray, use_internal_cs: bool = True - ) -> Tuple[float, float, float]: - """Return spherical coordinates of a point relative to the internal coordinate system. - - Args: - point: 3D coordinate, shape (3,) - use_internal_cs: Calculate position relative to the internal coordinate system. - If `False`, the positions will be calculated relative to the - external coordinate system centered at `self.center`. - - Returns: - A tuple (distance, azimuth, elevation) relative to the mic array. - """ - rel_position = point - self.center - distance = np.linalg.norm(rel_position) - - if use_internal_cs: - # transform from the absolute coordinate system to the internal coordinate system - rel_position = np.matmul(self.internal_cs, rel_position) - - # get azimuth - azimuth = np.arctan2(rel_position[1], rel_position[0]) / np.pi * 180 - # get elevation - elevation = np.arcsin(rel_position[2] / distance) / np.pi * 180 - - return distance, azimuth, elevation - - def __str__(self): - with np.printoptions(precision=3, suppress=True): - desc = f"{type(self)}:\ncenter =\n{self.center}\ncentered positions =\n{self.centered_positions}\nradius = \n{self.radius:.3}\nabsolute positions =\n{self.positions}\ninternal coordinate system =\n{self.internal_cs}\n\n" - return desc - - def plot(self, elev=30, azim=-55, mic_size=25): - """Plot microphone positions. 
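The spherical coordinates returned by `spherical_relative_to_array` are just the distance, `arctan2`-azimuth and `arcsin`-elevation of the point relative to the array center. A worked example with made-up coordinates:

```python
# Sketch of the distance/azimuth/elevation computation in spherical_relative_to_array.
import numpy as np

center = np.array([3.0, 2.0, 1.2])        # hypothetical array center
point = np.array([4.0, 3.0, 2.2])         # hypothetical source position

rel = point - center
distance = np.linalg.norm(rel)
azimuth = np.degrees(np.arctan2(rel[1], rel[0]))
elevation = np.degrees(np.arcsin(rel[2] / distance))

print(round(distance, 3), round(azimuth, 1), round(elevation, 1))
# ~1.732 m away, 45.0 deg azimuth, ~35.3 deg elevation
```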
- - Args: - elev: elevation for the view of the plot - azim: azimuth for the view of the plot - mic_size: size of the microphone marker in the plot - """ - fig = plt.figure() - ax = fig.add_subplot(projection='3d') - - # show mic positions - for m in range(self.num_mics): - # show mic - ax.scatter( - self.positions[m, 0], - self.positions[m, 1], - self.positions[m, 2], - marker='o', - c='black', - s=mic_size, - depthshade=False, - ) - # add label - ax.text(self.positions[m, 0], self.positions[m, 1], self.positions[m, 2], str(m), c='red', zorder=10) - - # show the internal coordinate system - ax.quiver( - self.center[0], - self.center[1], - self.center[2], - self.internal_cs[:, 0], - self.internal_cs[:, 1], - self.internal_cs[:, 2], - length=self.radius, - label='internal cs', - normalize=False, - linestyle=':', - linewidth=1.0, - ) - for dim, label in enumerate(['x′', 'y′', 'z′']): - label_pos = self.center + self.radius * self.internal_cs[dim] - ax.text(label_pos[0], label_pos[1], label_pos[2], label, tuple(self.internal_cs[dim]), c='blue') - try: - # Unfortunately, equal aspect ratio has been added very recently to Axes3D - ax.set_aspect('equal') - except NotImplementedError: - logging.warning('Equal aspect ratio not supported by Axes3D') - # Set view - ax.view_init(elev=elev, azim=azim) - # Set reasonable limits for all axes, even for the case of an unequal aspect ratio - ax.set_xlim([self.center[0] - self.radius, self.center[0] + self.radius]) - ax.set_ylim([self.center[1] - self.radius, self.center[1] + self.radius]) - ax.set_zlim([self.center[2] - self.radius, self.center[2] + self.radius]) - - ax.set_xlabel('x/m') - ax.set_ylabel('y/m') - ax.set_zlabel('z/m') - ax.set_title('Microphone positions') - ax.legend() - plt.show() - - -def convert_placement_to_range( - placement: dict, room_dim: Iterable[float], object_radius: float = 0 -) -> List[List[float]]: - """Given a placement dictionary, return ranges for each dimension. - - Args: - placement: dictionary containing x, y, height, and min_to_wall - room_dim: dimensions of the room, shape (3,) - object_radius: radius of the object to be placed - - Returns - List with a range of values for each dimensions. 
- """ - if not np.all(np.array(room_dim) > 0): - raise ValueError(f'Room dimensions must be positive: {room_dim}') - - if object_radius < 0: - raise ValueError(f'Object radius must be non-negative: {object_radius}') - - placement_range = [None] * 3 - min_to_wall = placement.get('min_to_wall', 0) - - if min_to_wall < 0: - raise ValueError(f'Min distance to wall must be positive: {min_to_wall}') - - for idx, key in enumerate(['x', 'y', 'height']): - # Room dimension - dim = room_dim[idx] - # Construct the range - val = placement.get(key) - if val is None: - # No constrained specified on the coordinate of the mic center - min_val, max_val = 0, dim - elif np.isscalar(val): - min_val = max_val = val - else: - if len(val) != 2: - raise ValueError(f'Invalid value for placement for dim {idx}/{key}: {str(placement)}') - min_val, max_val = val - - # Make sure the array is not too close to a wall - min_val = max(min_val, min_to_wall + object_radius) - max_val = min(max_val, dim - min_to_wall - object_radius) - - if min_val > max_val or min(min_val, max_val) < 0: - raise ValueError(f'Invalid range dim {idx}/{key}: min={min_val}, max={max_val}') - - placement_range[idx] = [min_val, max_val] - - return placement_range - - -class RIRCorpusGenerator(object): - """Creates a corpus of RIRs based on a defined configuration of rooms and microphone array. - - RIRs are generated using `generate` method. - """ - - def __init__(self, cfg: DictConfig): - """ - Args: - cfg: dictionary with parameters of the simulation - """ - logging.info("Initialize RIRCorpusGenerator") - self._cfg = cfg - self.check_cfg() - - @property - def cfg(self): - """Property holding the internal config of the object. - - Note: - Changes to this config are not reflected in the state of the object. - Please create a new model with the updated config. - """ - return self._cfg - - @property - def sample_rate(self): - return self._cfg.sample_rate - - @cfg.setter - def cfg(self, cfg): - """Property holding the internal config of the object. - - Note: - Changes to this config are not reflected in the state of the object. - Please create a new model with the updated config. - """ - self._cfg = cfg - - def check_cfg(self): - """ - Checks provided configuration to ensure it has the minimal required - configuration the values are in a reasonable range. 
- """ - # sample rate - sample_rate = self.cfg.get('sample_rate') - if sample_rate is None: - raise ValueError('Sample rate not provided.') - elif sample_rate < 0: - raise ValueError(f'Sample rate must to be positive: {sample_rate}') - - # room configuration - room_cfg = self.cfg.get('room') - if room_cfg is None: - raise ValueError('Room configuration not provided') - - if room_cfg.get('num') is None: - raise ValueError('Number of rooms per subset not provided') - - if room_cfg.get('dim') is None: - raise ValueError('Room dimensions not provided') - - for idx, key in enumerate(['width', 'length', 'height']): - dim = room_cfg.dim.get(key) - - if dim is None: - # not provided - raise ValueError(f'Room {key} needs to be a scalar or a range, currently it is None') - elif np.isscalar(dim) and dim <= 0: - # fixed dimension - raise ValueError(f'A fixed dimension must be positive for {key}: {dim}') - elif len(dim) != 2 or not 0 < dim[0] < dim[1]: - # not a valid range - raise ValueError(f'Range must be specified with two positive increasing elements for {key}: {dim}') - - rt60 = room_cfg.get('rt60') - if rt60 is None: - # not provided - raise ValueError(f'RT60 needs to be a scalar or a range, currently it is None') - elif np.isscalar(rt60) and rt60 <= 0: - # fixed dimension - raise ValueError(f'RT60 must be positive: {rt60}') - elif len(rt60) != 2 or not 0 < rt60[0] < rt60[1]: - # not a valid range - raise ValueError(f'RT60 range must be specified with two positive increasing elements: {rt60}') - - # mic array - mic_cfg = self.cfg.get('mic_array') - if mic_cfg is None: - raise ValueError('Mic configuration not provided') - - if mic_cfg.get('positions') == 'random': - # Only num_mics and placement are required - mic_cfg_keys = ['num_mics', 'placement'] - else: - mic_cfg_keys = ['positions', 'placement', 'orientation'] - - for key in mic_cfg_keys: - if key not in mic_cfg: - raise ValueError(f'Mic array {key} not provided') - - # source - source_cfg = self.cfg.get('source') - if source_cfg is None: - raise ValueError('Source configuration not provided') - - if source_cfg.get('num') is None: - raise ValueError('Number of sources per room not provided') - elif source_cfg.num <= 0: - raise ValueError(f'Number of sources must be positive: {source_cfg.num}') - - if 'placement' not in source_cfg: - raise ValueError('Source placement dictionary not provided') - - # anechoic - if self.cfg.get('anechoic') is None: - raise ValueError(f'Anechoic configuratio not provided.') - - def generate_room_params(self) -> dict: - """Generate randomized room parameters based on the provided - configuration. 
- """ - # Prepare room sim parameters - if not PRA: - raise ImportError('pyroomacoustics is required for room simulation') - - room_cfg = self.cfg.room - - # Prepare rt60 - if room_cfg.rt60 is None: - raise ValueError(f'Room RT60 needs to be a scalar or a range, currently it is None') - - if np.isscalar(room_cfg.rt60): - assert room_cfg.rt60 > 0, f'RT60 should be positive: {room_cfg.rt60}' - rt60 = room_cfg.rt60 - elif len(room_cfg.rt60) == 2: - assert ( - 0 < room_cfg.rt60[0] <= room_cfg.rt60[1] - ), f'Expecting two non-decreasing values for RT60, received {room_cfg.rt60}' - rt60 = self.random.uniform(low=room_cfg.rt60[0], high=room_cfg.rt60[1]) - else: - raise ValueError(f'Unexpected value for RT60: {room_cfg.rt60}') - - # Generate a room with random dimensions - num_retries = self.cfg.get('num_retries', 20) - - for n in range(num_retries): - - # width, length, height - room_dim = np.zeros(3) - - # prepare dimensions - for idx, key in enumerate(['width', 'length', 'height']): - # get configured dimension - dim = room_cfg.dim[key] - - # set a value - if dim is None: - raise ValueError(f'Room {key} needs to be a scalar or a range, currently it is None') - elif np.isscalar(dim): - assert dim > 0, f'Dimension should be positive for {key}: {dim}' - room_dim[idx] = dim - elif len(dim) == 2: - assert 0 < dim[0] <= dim[1], f'Expecting two non-decreasing values for {key}, received {dim}' - # Reduce dimension if the previous attempt failed - room_dim[idx] = self.random.uniform(low=dim[0], high=dim[1] - n * (dim[1] - dim[0]) / num_retries) - else: - raise ValueError(f'Unexpected value for {key}: {dim}') - - try: - # Get parameters from size and RT60 - room_absorption, room_max_order = pra.inverse_sabine(rt60, room_dim) - break - except Exception as e: - logging.debug('Inverse sabine failed: %s', str(e)) - # Inverse sabine may fail if the room is too large for the selected RT60. - # Try again by generate a smaller room. - room_absorption = room_max_order = None - continue - - if room_absorption is None or room_max_order is None: - raise RuntimeError(f'Evaluation of parameters failed for RT60 {rt60}s and room size {room_dim}.') - - # Return the required values - room_params = { - 'dim': room_dim, - 'absorption': room_absorption, - 'max_order': room_max_order, - 'rt60_theoretical': rt60, - 'anechoic_absorption': self.cfg.anechoic.absorption, - 'anechoic_max_order': self.cfg.anechoic.max_order, - 'sample_rate': self.cfg.sample_rate, - } - return room_params - - def generate_array(self, room_dim: Iterable[float]) -> ArrayGeometry: - """Generate array placement for the current room and config. - - Args: - room_dim: dimensions of the room, [width, length, height] - - Returns: - Randomly placed microphone array. 
- """ - mic_cfg = self.cfg.mic_array - - if mic_cfg.positions == 'random': - # Create a radom set of microphones - num_mics = mic_cfg.num_mics - mic_positions = [] - - # Each microphone is placed individually - placement_range = convert_placement_to_range( - placement=mic_cfg.placement, room_dim=room_dim, object_radius=0 - ) - - # Randomize mic placement - for m in range(num_mics): - position_m = [None] * 3 - for idx in range(3): - position_m[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) - mic_positions.append(position_m) - - mic_array = ArrayGeometry(mic_positions) - - else: - mic_array = ArrayGeometry(mic_cfg.positions) - - # Randomize center placement - center = np.zeros(3) - placement_range = convert_placement_to_range( - placement=mic_cfg.placement, room_dim=room_dim, object_radius=mic_array.radius - ) - - for idx in range(len(center)): - center[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) - - # Place the array at the configured center point - mic_array.translate(to=center) - - # Randomize orientation - orientation = dict() - for key in ['yaw', 'roll', 'pitch']: - # angle for current orientation - angle = mic_cfg.orientation[key] - - if angle is None: - raise ValueError(f'Mic array {key} should be a scalar or a range, currently it is set to None.') - - # check it's within the expected range - check_angle(key, angle) - - if np.isscalar(angle): - orientation[key] = angle - elif len(angle) == 2: - assert angle[0] <= angle[1], f"Expecting two non-decreasing values for {key}, received {angle}" - # generate integer values, for easier bucketing, if necessary - orientation[key] = self.random.uniform(low=angle[0], high=angle[1]) - else: - raise ValueError(f'Unexpected value for orientation {key}: {angle}') - - # Rotate the array to match the selected orientation - mic_array.rotate(**orientation) - - return mic_array - - def generate_source_position(self, room_dim: Iterable[float]) -> List[List[float]]: - """Generate position for all sources in a room. - - Args: - room_dim: dimensions of a 3D shoebox room - - Returns: - List of source positions, with each position characterized with a 3D coordinate - """ - source_cfg = self.cfg.source - placement_range = convert_placement_to_range(placement=source_cfg.placement, room_dim=room_dim) - source_position = [] - - for n in range(source_cfg.num): - # generate a random point withing the range - s_pos = [None] * 3 - for idx in range(len(s_pos)): - s_pos[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) - source_position.append(s_pos) - - return source_position - - def generate(self): - """Generate RIR corpus. - - This method will prepare randomized examples based on the current configuration, - run room simulations and save results to output_dir. 
- """ - logging.info("Generate RIR corpus") - - # Initialize - self.random = default_rng(seed=self.cfg.random_seed) - - # Prepare output dir - output_dir = self.cfg.output_dir - if output_dir.endswith('.yaml'): - output_dir = output_dir[:-5] - - # Create absolute path - logging.info('Output dir set to: %s', output_dir) - - # Generate all cases - for subset, num_rooms in self.cfg.room.num.items(): - - output_dir_subset = os.path.join(output_dir, subset) - examples = [] - - if not os.path.exists(output_dir_subset): - logging.info('Creating output directory: %s', output_dir_subset) - os.makedirs(output_dir_subset) - elif os.path.isdir(output_dir_subset) and len(os.listdir(output_dir_subset)) > 0: - raise RuntimeError(f'Output directory {output_dir_subset} is not empty.') - - # Generate examples - for n_room in range(num_rooms): - - # room info - room_params = self.generate_room_params() - - # array placement - mic_array = self.generate_array(room_params['dim']) - - # source placement - source_position = self.generate_source_position(room_params['dim']) - - # file name for the file - room_filepath = os.path.join(output_dir_subset, f'{subset}_room_{n_room:06d}.h5') - - # prepare example - example = { - 'room_params': room_params, - 'mic_array': mic_array, - 'source_position': source_position, - 'room_filepath': room_filepath, - } - examples.append(example) - - # Simulation - if (num_workers := self.cfg.get('num_workers')) is None: - num_workers = os.cpu_count() - 1 - - if num_workers > 1: - logging.info(f'Simulate using {num_workers} workers') - with multiprocessing.Pool(processes=num_workers) as pool: - metadata = list(tqdm(pool.imap(simulate_room_kwargs, examples), total=len(examples))) - - else: - logging.info('Simulate using a single worker') - metadata = [] - for example in tqdm(examples, total=len(examples)): - metadata.append(simulate_room(**example)) - - # Save manifest - manifest_filepath = os.path.join(output_dir, f'{subset}_manifest.json') - - if os.path.exists(manifest_filepath) and os.path.isfile(manifest_filepath): - raise RuntimeError(f'Manifest config file exists: {manifest_filepath}') - - # Make all paths in the manifest relative to the output dir - for data in metadata: - data['room_filepath'] = os.path.relpath(data['room_filepath'], start=output_dir) - - write_manifest(manifest_filepath, metadata) - - # Generate plots with information about generated data - plot_filepath = os.path.join(output_dir, f'{subset}_info.png') - - if os.path.exists(plot_filepath) and os.path.isfile(plot_filepath): - raise RuntimeError(f'Plot file exists: {plot_filepath}') - - plot_rir_manifest_info(manifest_filepath, plot_filepath=plot_filepath) - - # Save used configuration for reference - config_filepath = os.path.join(output_dir, 'config.yaml') - if os.path.exists(config_filepath) and os.path.isfile(config_filepath): - raise RuntimeError(f'Output config file exists: {config_filepath}') - - OmegaConf.save(self.cfg, config_filepath, resolve=True) - - -def simulate_room_kwargs(kwargs: dict) -> dict: - """Wrapper around `simulate_room` to handle kwargs. - - `pool.map(simulate_room_kwargs, examples)` would be - equivalent to `pool.starstarmap(simulate_room, examples)` - if `starstarmap` would exist. 
- - Args: - kwargs: kwargs that are forwarded to `simulate_room` - - Returns: - Dictionary with metadata, see `simulate_room` - """ - return simulate_room(**kwargs) - - -def simulate_room( - room_params: dict, mic_array: ArrayGeometry, source_position: Iterable[Iterable[float]], room_filepath: str, -) -> dict: - """Simulate room - - Args: - room_params: parameters of the room to be simulated - mic_array: defines positions of the microphones - source_positions: positions for all sources to be simulated - room_filepath: results are saved to this path - - Returns: - Dictionary with metadata based on simulation setup - and simulation results. Used to create the corresponding - manifest file. - """ - # room with the selected parameters - room_sim = pra.ShoeBox( - room_params['dim'], - fs=room_params['sample_rate'], - materials=pra.Material(room_params['absorption']), - max_order=room_params['max_order'], - ) - - # same geometry for generating anechoic responses - room_anechoic = pra.ShoeBox( - room_params['dim'], - fs=room_params['sample_rate'], - materials=pra.Material(room_params['anechoic_absorption']), - max_order=room_params['anechoic_max_order'], - ) - - # Compute RIRs - for room in [room_sim, room_anechoic]: - # place the array - room.add_microphone_array(mic_array.positions.T) - - # place the sources - for s_pos in source_position: - room.add_source(s_pos) - - # generate RIRs - room.compute_rir() - - # Get metadata for sources - source_distance = [] - source_azimuth = [] - source_elevation = [] - for s_pos in source_position: - distance, azimuth, elevation = mic_array.spherical_relative_to_array(s_pos) - source_distance.append(distance) - source_azimuth.append(azimuth) - source_elevation.append(elevation) - - # RIRs - rir_dataset = { - 'rir': convert_rir_to_multichannel(room_sim.rir), - 'anechoic': convert_rir_to_multichannel(room_anechoic.rir), - } - - # Prepare metadata dict and return - metadata = { - 'room_filepath': room_filepath, - 'sample_rate': room_params['sample_rate'], - 'dim': room_params['dim'], - 'rir_absorption': room_params['absorption'], - 'rir_max_order': room_params['max_order'], - 'rir_rt60_theory': room_sim.rt60_theory(), - 'rir_rt60_measured': room_sim.measure_rt60().mean(axis=0), # average across mics for each source - 'anechoic_rt60_theory': room_anechoic.rt60_theory(), - 'anechoic_rt60_measured': room_anechoic.measure_rt60().mean(axis=0), # average across mics for each source - 'anechoic_absorption': room_params['anechoic_absorption'], - 'anechoic_max_order': room_params['anechoic_max_order'], - 'mic_positions': mic_array.positions, - 'mic_center': mic_array.center, - 'source_position': source_position, - 'source_distance': source_distance, - 'source_azimuth': source_azimuth, - 'source_elevation': source_elevation, - 'num_sources': len(source_position), - } - - # Save simulated RIR - save_rir_simulation(room_filepath, rir_dataset, metadata) - - return convert_numpy_to_serializable(metadata) - - -def save_rir_simulation(filepath: str, rir_dataset: Dict[str, List[np.array]], metadata: dict): - """Save simulated RIRs and metadata. - - Args: - filepath: Path to the file where the data will be saved. - rir_dataset: Dictionary with RIR data. Each item is a set of multi-channel RIRs. - metadata: Dictionary with related metadata. 
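    Note:
        As written below, the HDF5 file contains one group per RIR set (e.g., 'rir'
        and 'anechoic') with per-source datasets named '0', '1', ..., and a separate
        'metadata' group holding one dataset per metadata key.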
- """ - if os.path.exists(filepath): - raise RuntimeError(f'Output file exists: {room_filepath}') - - num_sources = metadata['num_sources'] - - with h5py.File(filepath, 'w') as h5f: - # Save RIRs, each RIR set in a separate group - for rir_key, rir_value in rir_dataset.items(): - if len(rir_value) != num_sources: - raise ValueError( - f'Each RIR dataset should have exactly {num_sources} elements. Current RIR {key} has {len(rir_value)} elements' - ) - - rir_group = h5f.create_group(rir_key) - - # RIRs for different sources are saved under [group]['idx'] - for idx, rir in enumerate(rir_value): - rir_group.create_dataset(f'{idx}', data=rir_value[idx]) - - # Save metadata - metadata_group = h5f.create_group('metadata') - for key, value in metadata.items(): - metadata_group.create_dataset(key, data=value) - - -def load_rir_simulation(filepath: str, source: int = 0, rir_key: str = 'rir') -> Tuple[np.ndarray, float]: - """Load simulated RIRs and metadata. - - Args: - filepath: Path to simulated RIR data - source: Index of a source. - rir_key: String to denote which RIR to load, if there are multiple available. - - Returns: - Multichannel RIR as ndarray with shape (num_samples, num_channels) and scalar sample rate. - """ - with h5py.File(filepath, 'r') as h5f: - # Load RIR - rir = h5f[rir_key][f'{source}'][:] - - # Load metadata - sample_rate = h5f['metadata']['sample_rate'][()] - - return rir, sample_rate - - -def convert_numpy_to_serializable(data: Union[dict, float, np.ndarray]) -> Union[dict, float, np.ndarray]: - """Convert all numpy estries to list. - Can be used to preprocess data before writing to a JSON file. - - Args: - data: Dictionary, array or scalar. - - Returns: - The same structure, but converted to list if - the input is np.ndarray, so `data` can be seralized. - """ - if isinstance(data, dict): - for key, val in data.items(): - data[key] = convert_numpy_to_serializable(val) - elif isinstance(data, list): - data = [convert_numpy_to_serializable(d) for d in data] - elif isinstance(data, np.ndarray): - data = data.tolist() - elif isinstance(data, np.integer): - data = int(data) - elif isinstance(data, np.floating): - data = float(data) - elif isinstance(data, np.generic): - data = data.item() - - return data - - -def convert_rir_to_multichannel(rir: List[List[np.ndarray]]) -> List[np.ndarray]: - """Convert RIR to a list of arrays. - - Args: - rir: list of lists, each element is a single-channel RIR - - Returns: - List of multichannel RIRs - """ - num_mics = len(rir) - num_sources = len(rir[0]) - - mc_rir = [None] * num_sources - - for n_source in range(num_sources): - rir_len = [len(rir[m][n_source]) for m in range(num_mics)] - max_len = max(rir_len) - mc_rir[n_source] = np.zeros((max_len, num_mics)) - for n_mic, len_mic in enumerate(rir_len): - mc_rir[n_source][:len_mic, n_mic] = rir[n_mic][n_source] - - return mc_rir - - -def plot_rir_manifest_info(filepath: str, plot_filepath: str = None): - """Plot distribution of parameters from manifest file. 
- - Args: - filepath: path to a RIR corpus manifest file - plot_filepath: path to save the plot at - """ - metadata = read_manifest(filepath) - - # source placement - source_distance = [] - source_azimuth = [] - source_elevation = [] - source_height = [] - - # room config - rir_rt60_theory = [] - rir_rt60_measured = [] - anechoic_rt60_theory = [] - anechoic_rt60_measured = [] - - # get the required data - for data in metadata: - # source config - source_distance += data['source_distance'] - source_azimuth += data['source_azimuth'] - source_elevation += data['source_elevation'] - source_height += [s_pos[2] for s_pos in data['source_position']] - - # room config - rir_rt60_theory.append(data['rir_rt60_theory']) - rir_rt60_measured += data['rir_rt60_measured'] - anechoic_rt60_theory.append(data['anechoic_rt60_theory']) - anechoic_rt60_measured += data['anechoic_rt60_measured'] - - # plot - plt.figure(figsize=(12, 6)) - - plt.subplot(2, 4, 1) - plt.hist(source_distance, label='distance') - plt.xlabel('distance / m') - plt.ylabel('# examples') - plt.title('Source-to-array center distance') - - plt.subplot(2, 4, 2) - plt.hist(source_azimuth, label='azimuth') - plt.xlabel('azimuth / deg') - plt.ylabel('# examples') - plt.title('Source-to-array center azimuth') - - plt.subplot(2, 4, 3) - plt.hist(source_elevation, label='elevation') - plt.xlabel('elevation / deg') - plt.ylabel('# examples') - plt.title('Source-to-array center elevation') - - plt.subplot(2, 4, 4) - plt.hist(source_height, label='source height') - plt.xlabel('height / m') - plt.ylabel('# examples') - plt.title('Source height') - - plt.subplot(2, 4, 5) - plt.hist(rir_rt60_theory, label='theory') - plt.xlabel('RT60 / s') - plt.ylabel('# examples') - plt.title('RT60 theory') - - plt.subplot(2, 4, 6) - plt.hist(rir_rt60_measured, label='measured') - plt.xlabel('RT60 / s') - plt.ylabel('# examples') - plt.title('RT60 measured') - - plt.subplot(2, 4, 7) - plt.hist(anechoic_rt60_theory, label='theory') - plt.xlabel('RT60 / s') - plt.ylabel('# examples') - plt.title('RT60 theory (anechoic)') - - plt.subplot(2, 4, 8) - plt.hist(anechoic_rt60_measured, label='measured') - plt.xlabel('RT60 / s') - plt.ylabel('# examples') - plt.title('RT60 measured (anechoic)') - - for n in range(8): - plt.subplot(2, 4, n + 1) - plt.grid() - plt.legend(loc='lower left') - - plt.tight_layout() - - if plot_filepath is not None: - plt.savefig(plot_filepath) - plt.close() - logging.info('Plot saved at %s', plot_filepath) - - -class RIRMixGenerator(object): - """Creates a dataset of mixed signals at the microphone - by combining target speech, background noise and interference. - - Correspnding signals are are generated and saved - using the `generate` method. - - Input configuration is expexted to have the following structure - ``` - sample_rate: sample rate used for simulation - room: - subset: manifest for RIR data - target: - subset: manifest for target source data - noise: - subset: manifest for noise data - interference: - subset: manifest for interference data - interference_probability: probability that interference is present - max_num_interferers: max number of interferers, randomly selected between 0 and max - mix: - subset: - num: number of examples to generate - rsnr: range of RSNR - rsir: range of RSIR - ref_mic: reference microphone - ref_mic_rms: desired RMS at ref_mic - ``` - """ - - def __init__(self, cfg: DictConfig): - """ - Instantiate a RIRMixGenerator object. 
- - Args: - cfg: generator configuration defining data for room, - target signal, noise, interference and mixture - """ - logging.info("Initialize RIRMixGenerator") - self._cfg = cfg - self.check_cfg() - - self.subsets = self.cfg.room.keys() - logging.info('Initialized with %d subsets: %s', len(self.subsets), str(self.subsets)) - - # load manifests - self.metadata = dict() - for subset in self.subsets: - subset_data = dict() - - logging.info('Loading data for %s', subset) - for key in ['room', 'target', 'noise', 'interference']: - try: - subset_data[key] = read_manifest(self.cfg[key][subset]) - logging.info('\t%-*s: \t%d files', 15, key, len(subset_data[key])) - except Exception as e: - subset_data[key] = None - logging.info('\t%-*s: \t0 files', 15, key) - logging.warning('\t\tManifest data not loaded. Exception: %s', str(e)) - - self.metadata[subset] = subset_data - - logging.info('Loaded all manifests') - - self.num_retries = self.cfg.get('num_retries', 5) - - @property - def cfg(self): - """Property holding the internal config of the object. - - Note: - Changes to this config are not reflected in the state of the object. - Please create a new model with the updated config. - """ - return self._cfg - - @property - def sample_rate(self): - return self._cfg.sample_rate - - @cfg.setter - def cfg(self, cfg): - """Property holding the internal config of the object. - - Note: - Changes to this config are not reflected in the state of the object. - Please create a new model with the updated config. - """ - self._cfg = cfg - - def check_cfg(self): - """ - Checks provided configuration to ensure it has the minimal required - configuration the values are in a reasonable range. - """ - # sample rate - sample_rate = self.cfg.get('sample_rate') - if sample_rate is None: - raise ValueError('Sample rate not provided.') - elif sample_rate < 0: - raise ValueError(f'Sample rate must be positive: {sample_rate}') - - # room configuration - room_cfg = self.cfg.get('room') - if not room_cfg: - raise ValueError( - 'Room configuration not provided. Expecting RIR manifests in format {subset: path_to_manifest}' - ) - - # target configuration - target_cfg = self.cfg.get('target') - if not target_cfg: - raise ValueError( - 'Target configuration not provided. Expecting audio manifests in format {subset: path_to_manifest}' - ) - - for key in ['azimuth', 'elevation', 'distance']: - value = target_cfg.get(key) - - if value is None or np.isscalar(value): - # no constraint or a fixed dimension is ok - pass - elif len(value) != 2 or not value[0] < value[1]: - # not a valid range - raise ValueError(f'Range must be specified with two positive increasing elements for {key}: {value}') - - # noise configuration - noise_cfg = self.cfg.get('noise') - if not noise_cfg: - raise ValueError( - 'Noise configuration not provided. Expecting audio manifests in format {subset: path_to_manifest}' - ) - - # interference configuration - interference_cfg = self.cfg.get('interference') - if not interference_cfg: - logging.info('Interference configuration not provided.') - else: - interference_probability = interference_cfg.get('interference_probability', 0) - max_num_interferers = interference_cfg.get('max_num_interferers', 0) - min_azimuth_to_target = interference_cfg.get('min_azimuth_to_target', 0) - if interference_probability is not None: - if interference_probability < 0: - raise ValueError( - f'Interference probability must be non-negative. 
Current value: {interference_probability}'
-                    )
-                elif interference_probability > 0:
-                    assert (
-                        max_num_interferers is not None and max_num_interferers > 0
-                    ), f'Max number of interferers must be positive. Current value: {max_num_interferers}'
-                    assert (
-                        min_azimuth_to_target is not None and min_azimuth_to_target >= 0
-                    ), f'Min azimuth to target must be non-negative. Current value: {min_azimuth_to_target}'
-
-        # mix configuration
-        mix_cfg = self.cfg.get('mix')
-        if not mix_cfg:
-            raise ValueError('Mix configuration not provided. Expecting configuration for each subset.')
-        if 'ref_mic' not in mix_cfg:
-            raise ValueError('Reference microphone not defined.')
-        if 'ref_mic_rms' not in mix_cfg:
-            raise ValueError('Reference microphone RMS not defined.')
-
-    def generate_target(self, subset: str) -> dict:
-        """
-        Prepare a dictionary with target configuration.
-
-        The output dictionary contains the following information
-        ```
-        room_index: index of the selected room from the RIR corpus
-        room_filepath: path to the room simulation file
-        source: index of the selected source for the target
-        rt60: reverberation time of the selected room
-        num_mics: number of microphones
-        azimuth: azimuth of the target source, relative to the microphone array
-        elevation: elevation of the target source, relative to the microphone array
-        distance: distance of the target source, relative to the microphone array
-        audio_filepath: path to the audio file for the target source
-        text: text for the target source audio signal, if available
-        duration: duration of the target source audio signal
-        ```
-
-        Args:
-            subset: string denoting a subset which will be used to select target
-                audio and room parameters.
-
-        Returns:
-            Dictionary with target configuration, including room, source index, and audio information.
-        """
-        # Utility function
-        def select_target_source(room_metadata, room_indices):
-            """Find a room and a source that satisfies the constraints.
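            Iterates over the candidate rooms and, within each room, over randomly
            chosen candidate sources; returns the first (source, room_index) pair whose
            azimuth, elevation and distance satisfy the configured target constraints,
            or (None, None) if no feasible pair is found.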
- """ - for room_index in room_indices: - # Select room - room_data = room_metadata[room_index] - - # Candidate sources - sources = self.random.choice(room_data['num_sources'], size=self.num_retries, replace=False) - - # Select target source in this room - for source in sources: - # Check constraints - constraints_met = [] - for constraint in ['azimuth', 'elevation', 'distance']: - if self.cfg.target.get(constraint) is not None: - # Check that the selected source is in the range - source_value = room_data[f'source_{constraint}'][source] - if self.cfg.target[constraint][0] <= source_value <= self.cfg.target[constraint][1]: - constraints_met.append(True) - else: - constraints_met.append(False) - # No need to check the remaining constraints - break - - # Check if a feasible source is found - if all(constraints_met): - # A feasible source has been found - return source, room_index - - return None, None - - # Prepare room & source position - room_metadata = self.metadata[subset]['room'] - room_indices = self.random.choice(len(room_metadata), size=self.num_retries, replace=False) - source, room_index = select_target_source(room_metadata, room_indices) - - if source is None: - raise RuntimeError(f'Could not find a feasible source given target constraints {self.cfg.target}') - - room_data = room_metadata[room_index] - - # Optional: select subset of channels - num_available_mics = len(room_data['mic_positions']) - if 'mic_array' in self.cfg: - num_mics = self.cfg.mic_array['num_mics'] - mic_selection = self.cfg.mic_array['selection'] - - if mic_selection == 'random': - logging.debug('Randomly selecting %d mics', num_mics) - selected_mics = self.random.choice(num_available_mics, size=num_mics, replace=False) - elif isinstance(mic_selection, Iterable): - logging.debug('Using explicitly selected mics: %s', str(mic_selection)) - assert ( - 0 <= min(mic_selection) < num_available_mics - ), f'Expecting mic_selection in range [0,{num_available_mics}), current value: {mic_selection}' - selected_mics = np.array(mic_selection) - else: - raise ValueError(f'Unexpected value for mic_selection: {mic_selection}') - else: - logging.debug('Using all %d available mics', num_available_mics) - num_mics = num_available_mics - selected_mics = np.arange(num_mics) - - # Double-check the number of mics is as expected - assert ( - len(selected_mics) == num_mics - ), f'Expecting {num_mics} mics, but received {len(selected_mics)} mics: {selected_mics}' - logging.debug('Selected mics: %s', str(selected_mics)) - - # Calculate distance from the source to each microphone - mic_positions = np.array(room_data['mic_positions'])[selected_mics] - source_position = np.array(room_data['source_position'][source]) - distance_source_to_mic = np.linalg.norm(mic_positions - source_position, axis=1) - - # Handle relative paths - room_filepath = room_data['room_filepath'] - if not os.path.isabs(room_filepath): - manifest_dir = os.path.dirname(self.cfg.room[subset]) - room_filepath = os.path.join(manifest_dir, room_filepath) - - target_cfg = { - 'room_index': int(room_index), - 'room_filepath': room_filepath, - 'source': source, - 'rt60': room_data['rir_rt60_measured'][source], - 'selected_mics': selected_mics.tolist(), - # Positions - 'source_position': source_position.tolist(), - 'mic_positions': mic_positions.tolist(), - # Relative to center of the array - 'azimuth': room_data['source_azimuth'][source], - 'elevation': room_data['source_elevation'][source], - 'distance': room_data['source_distance'][source], - # Relative to mics - 
'distance_source_to_mic': distance_source_to_mic, - } - - return target_cfg - - def generate_interference(self, subset: str, target_cfg: dict) -> List[dict]: - """ - Prepare a list of dictionaries with interference configuration. - - Args: - subset: string denoting a subset which will be used to select interference audio. - target_cfg: dictionary with target configuration. This is used to determine - the minimal required duration for the noise signal. - - Returns: - List of dictionary with interference configuration, including source index and audio information - for one or more interference sources. - """ - if (interference_metadata := self.metadata[subset]['interference']) is None: - # No interference to be configured - return None - - # Configure interfering sources - max_num_sources = self.cfg.interference.get('max_num_interferers', 0) - interference_probability = self.cfg.interference.get('interference_probability', 0) - - if ( - max_num_sources >= 1 - and interference_probability > 0 - and self.random.uniform(low=0.0, high=1.0) < interference_probability - ): - # interference present - num_interferers = self.random.integers(low=1, high=max_num_sources + 1) - else: - # interference not present - return None - - # Room setup: same room as target - room_index = target_cfg['room_index'] - room_data = self.metadata[subset]['room'][room_index] - feasible_sources = list(range(room_data['num_sources'])) - # target source is not eligible - feasible_sources.remove(target_cfg['source']) - - # Constraints for interfering sources - min_azimuth_to_target = self.cfg.interference.get('min_azimuth_to_target', 0) - - # Prepare interference configuration - interference_cfg = [] - for n in range(num_interferers): - - # Select a source - source = None - while len(feasible_sources) > 0 and source is None: - - # Select a potential source for the target - source = self.random.choice(feasible_sources) - feasible_sources.remove(source) - - # Check azimuth separation - if min_azimuth_to_target > 0: - source_azimuth = room_data['source_azimuth'][source] - azimuth_diff = wrap_to_180(source_azimuth - target_cfg['azimuth']) - if abs(azimuth_diff) < min_azimuth_to_target: - # Try again - source = None - continue - - if source is None: - logging.warning('Could not select a feasible interference source %d of %s', n, num_interferers) - - # Return what we have for now or None - return interference_cfg if interference_cfg else None - - # Current source setup - interfering_source = { - 'source': source, - 'selected_mics': target_cfg['selected_mics'], - 'position': room_data['source_position'][source], - 'azimuth': room_data['source_azimuth'][source], - 'elevation': room_data['source_elevation'][source], - 'distance': room_data['source_distance'][source], - } - - # Done with interference for this source - interference_cfg.append(interfering_source) - - return interference_cfg - - def generate_mix(self, subset: str, target_cfg: dict) -> dict: - """Generate scaling parameters for mixing - the target speech at the microphone, background noise - and interference signal at the microphone. 
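        For each of `rsnr`, `rsir`, `ref_mic`, `ref_mic_rms` and `min_duration`, a
        subset-specific value takes precedence over the global `mix` value; scalars are
        used as-is, two-element lists are treated as an inclusive range to sample an
        integer from, and longer lists as a set of choices (see the implementation
        below). If `ref_mic` is 'closest', the microphone nearest to the target source
        is used.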
- - The output dictionary contains the following information - ``` - rsnr: reverberant signal-to-noise ratio - rsir: reverberant signal-to-interference ratio - ref_mic: reference microphone for calculating the metrics - ref_mic_rms: RMS of the signal at the reference microphone - ``` - - Args: - subset: string denoting the subset of configuration - target_cfg: dictionary with target configuration - - Returns: - Dictionary containing configured RSNR, RSIR, ref_mic - and RMS on ref_mic. - """ - mix_cfg = dict() - - for key in ['rsnr', 'rsir', 'ref_mic', 'ref_mic_rms', 'min_duration']: - if key in self.cfg.mix[subset]: - # Take the value from subset config - value = self.cfg.mix[subset].get(key) - else: - # Take the global value - value = self.cfg.mix.get(key) - - if value is None: - mix_cfg[key] = None - elif np.isscalar(value): - mix_cfg[key] = value - elif len(value) == 2: - # Select from the given range, including the upper bound - mix_cfg[key] = self.random.integers(low=value[0], high=value[1] + 1) - else: - # Select one of the multiple values - mix_cfg[key] = self.random.choice(value) - - if mix_cfg['ref_mic'] == 'closest': - # Select the closest mic as the reference - mix_cfg['ref_mic'] = np.argmin(target_cfg['distance_source_to_mic']) - - # Configuration for saving individual components - mix_cfg['save'] = OmegaConf.to_object(self.cfg.mix['save']) if 'save' in self.cfg.mix else {} - - return mix_cfg - - def generate(self): - """Generate a corpus of microphone signals by mixing target, background noise - and interference signals. - - This method will prepare randomized examples based on the current configuration, - run simulations and save results to output_dir. - """ - logging.info('Generate mixed signals') - - # Initialize - self.random = default_rng(seed=self.cfg.random_seed) - - # Prepare output dir - output_dir = self.cfg.output_dir - if output_dir.endswith('.yaml'): - output_dir = output_dir[:-5] - - # Create absolute path - logging.info('Output dir set to: %s', output_dir) - - # Generate all cases - for subset in self.subsets: - - output_dir_subset = os.path.join(output_dir, subset) - examples = [] - - if not os.path.exists(output_dir_subset): - logging.info('Creating output directory: %s', output_dir_subset) - os.makedirs(output_dir_subset) - elif os.path.isdir(output_dir_subset) and len(os.listdir(output_dir_subset)) > 0: - raise RuntimeError(f'Output directory {output_dir_subset} is not empty.') - - num_examples = self.cfg.mix[subset].num - logging.info('Preparing %d examples for subset %s', num_examples, subset) - - # Generate examples - for n_example in tqdm(range(num_examples), total=num_examples, desc=f'Preparing {subset}'): - # prepare configuration - target_cfg = self.generate_target(subset) - interference_cfg = self.generate_interference(subset, target_cfg) - mix_cfg = self.generate_mix(subset, target_cfg) - - # base file name - base_output_filepath = os.path.join(output_dir_subset, f'{subset}_example_{n_example:09d}') - - # prepare example - example = { - 'sample_rate': self.sample_rate, - 'target_cfg': target_cfg, - 'interference_cfg': interference_cfg, - 'mix_cfg': mix_cfg, - 'base_output_filepath': base_output_filepath, - } - - examples.append(example) - - # Audio data - audio_metadata = { - 'target': self.metadata[subset]['target'], - 'target_dir': os.path.dirname(self.cfg.target[subset]), # manifest_dir - 'noise': self.metadata[subset]['noise'], - 'noise_dir': os.path.dirname(self.cfg.noise[subset]), # manifest_dir - } - - if interference_cfg is not None: - 
audio_metadata.update( - { - 'interference': self.metadata[subset]['interference'], - 'interference_dir': os.path.dirname(self.cfg.interference[subset]), # manifest_dir - } - ) - - # Simulation - if (num_workers := self.cfg.get('num_workers')) is None: - num_workers = os.cpu_count() - 1 - - if num_workers is not None and num_workers > 1: - logging.info(f'Simulate using {num_workers} workers') - examples_and_audio_metadata = zip(examples, itertools.repeat(audio_metadata, len(examples))) - with multiprocessing.Pool(processes=num_workers) as pool: - metadata = list( - tqdm( - pool.imap(simulate_room_mix_helper, examples_and_audio_metadata), - total=len(examples), - desc=f'Simulating {subset}', - ) - ) - else: - logging.info('Simulate using a single worker') - metadata = [] - for example in tqdm(examples, total=len(examples), desc=f'Simulating {subset}'): - metadata.append(simulate_room_mix(**example, audio_metadata=audio_metadata)) - - # Save manifest - manifest_filepath = os.path.join(output_dir, f'{os.path.basename(output_dir)}_{subset}.json') - - if os.path.exists(manifest_filepath) and os.path.isfile(manifest_filepath): - raise RuntimeError(f'Manifest config file exists: {manifest_filepath}') - - # Make all paths in the manifest relative to the output dir - for data in tqdm(metadata, total=len(metadata), desc=f'Making filepaths relative {subset}'): - for key, val in data.items(): - if key.endswith('_filepath') and val is not None: - data[key] = os.path.relpath(val, start=output_dir) - - write_manifest(manifest_filepath, metadata) - - # Generate plots with information about generated data - plot_filepath = os.path.join(output_dir, f'{os.path.basename(output_dir)}_{subset}_info.png') - - if os.path.exists(plot_filepath) and os.path.isfile(plot_filepath): - raise RuntimeError(f'Plot file exists: {plot_filepath}') - - plot_mix_manifest_info(manifest_filepath, plot_filepath=plot_filepath) - - # Save used configuration for reference - config_filepath = os.path.join(output_dir, 'config.yaml') - if os.path.exists(config_filepath) and os.path.isfile(config_filepath): - raise RuntimeError(f'Output config file exists: {config_filepath}') - - OmegaConf.save(self.cfg, config_filepath, resolve=True) - - -def convolve_rir(signal: np.ndarray, rir: np.ndarray) -> np.ndarray: - """Convolve signal with a possibly multichannel IR in rir, i.e., - calculate the following for each channel m: - - signal_m = rir_m \ast signal - - Args: - signal: single-channel signal (samples,) - rir: single- or multi-channel IR, (samples,) or (samples, channels) - - Returns: - out: same length as signal, same number of channels as rir, shape (samples, channels) - """ - num_samples = len(signal) - if rir.ndim == 1: - # convolve and trim to length - out = convolve(signal, rir)[:num_samples] - elif rir.ndim == 2: - num_channels = rir.shape[1] - out = np.zeros((num_samples, num_channels)) - for m in range(num_channels): - out[:, m] = convolve(signal, rir[:, m])[:num_samples] - - else: - raise RuntimeError(f'RIR with {rir.ndim} not supported') - - return out - - -def calculate_drr(rir: np.ndarray, sample_rate: float, n_direct: List[int], n_0_ms=2.5) -> List[float]: - """Calculate direct-to-reverberant ratio (DRR) from the measured RIR. - - Calculation is done as in eq. (3) from [1]. 
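    In this implementation, the direct-path energy for channel m is taken from a window
    of +/- n_0 samples around the direct-path delay n_direct[m], and

        DRR_m = 10 * log10(E_direct_m / E_reverberant_m),

    where E_direct_m sums |rir[n, m]|^2 inside the window and E_reverberant_m sums
    |rir[n, m]|^2 over the remaining samples.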
- - Args: - rir: room impulse response, shape (num_samples, num_channels) - sample_rate: sample rate for the impulse response - n_direct: direct path delay - n_0_ms: window around n_direct for calculating the direct path energy - - Returns: - Calculated DRR for each channel of the input RIR. - - References: - [1] Eaton et al, The ACE challenge: Corpus description and performance evaluation, WASPAA 2015 - """ - # Define a window around the direct path delay - n_0 = int(n_0_ms * sample_rate / 1000) - - len_rir, num_channels = rir.shape - drr = [None] * num_channels - for m in range(num_channels): - - # Window around the direct path - dir_start = max(n_direct[m] - n_0, 0) - dir_end = n_direct[m] + n_0 - - # Power of the direct component - pow_dir = np.sum(np.abs(rir[dir_start:dir_end, m]) ** 2) / len_rir - - # Power of the reverberant component - pow_reverberant = (np.sum(np.abs(rir[0:dir_start, m]) ** 2) + np.sum(np.abs(rir[dir_end:, m]) ** 2)) / len_rir - - # DRR in dB - drr[m] = pow2db(pow_dir / pow_reverberant) - - return drr - - -def normalize_max(x: np.ndarray, max_db: float = 0, eps: float = 1e-16) -> np.ndarray: - """Normalize max input value to max_db full scale (±1). - - Args: - x: input signal - max_db: desired max magnitude compared to full scale - eps: small regularization constant - - Returns: - Normalized signal with max absolute value max_db. - """ - max_val = db2mag(max_db) - return max_val * x / (np.max(np.abs(x)) + eps) - - -def simultaneously_active_rms( - x: np.ndarray, - y: np.ndarray, - sample_rate: float, - rms_threshold_db: float = -60, - window_len_ms: float = 200, - min_active_duration: float = 0.5, -) -> Tuple[float, float]: - """Calculate RMS over segments where both input signals are active. - - Args: - x: first input signal - y: second input signal - sample_rate: sample rate for input signals in Hz - rms_threshold_db: threshold for determining activity of the signal, relative - to max absolute value - window_len_ms: window length in milliseconds, used for calculating segmental RMS - min_active_duration: minimal duration of the active segments - - Returns: - RMS value over active segments for x and y. 
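    Raises:
        RuntimeError: if the inputs have different lengths, or if the two signals are
            simultaneously active for less than `min_active_duration` seconds.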
- """ - if len(x) != len(y): - raise RuntimeError(f'Expecting signals of same length: len(x)={len(x)}, len(y)={len(y)}') - window_len = int(window_len_ms * sample_rate / 1000) - rms_threshold = db2mag(rms_threshold_db) # linear scale - - x_normalized = normalize_max(x) - y_normalized = normalize_max(y) - - x_active_power = y_active_power = active_len = 0 - for start in range(0, len(x) - window_len, window_len): - window = slice(start, start + window_len) - - # check activity on the scaled signal - x_window_rms = rms(x_normalized[window]) - y_window_rms = rms(y_normalized[window]) - - if x_window_rms > rms_threshold and y_window_rms > rms_threshold: - # sum the power of the original non-scaled signal - x_active_power += np.sum(np.abs(x[window]) ** 2) - y_active_power += np.sum(np.abs(y[window]) ** 2) - active_len += window_len - - if active_len < int(min_active_duration * sample_rate): - raise RuntimeError( - f'Signals are simultaneously active less than {min_active_duration} s: only {active_len/sample_rate} s' - ) - - # normalize - x_active_power /= active_len - y_active_power /= active_len - - return np.sqrt(x_active_power), np.sqrt(y_active_power) - - -def scaled_disturbance( - signal: np.ndarray, - disturbance: np.ndarray, - sdr: float, - sample_rate: float = None, - ref_channel: int = 0, - eps: float = 1e-16, -) -> np.ndarray: - """ - Args: - signal: numpy array, shape (num_samples, num_channels) - disturbance: numpy array, same shape as signal - sdr: desired signal-to-disturbance ration - sample_rate: sample rate of the input signals - ref_channel: ref mic used to calculate RMS - eps: regularization constant - - Returns: - Scaled disturbance, so that signal-to-disturbance ratio at ref_channel - is approximately equal to input SDR during simultaneously active - segment of signal and disturbance. - """ - if signal.shape != disturbance.shape: - raise ValueError(f'Signal and disturbance shapes do not match: {signal.shape} != {disturbance.shape}') - - # set scaling based on RMS at ref_mic - signal_rms, disturbance_rms = simultaneously_active_rms( - signal[:, ref_channel], disturbance[:, ref_channel], sample_rate=sample_rate - ) - disturbance_gain = db2mag(-sdr) * signal_rms / (disturbance_rms + eps) - # scale disturbance - scaled_disturbance = disturbance_gain * disturbance - return scaled_disturbance - - -def prepare_source_signal( - signal_type: str, - sample_rate: int, - audio_data: List[dict], - audio_dir: Optional[str] = None, - min_duration: Optional[int] = None, - ref_signal: Optional[np.ndarray] = None, - mic_positions: Optional[np.ndarray] = None, - num_retries: int = 10, -) -> tuple: - """Prepare an audio signal for a source. 
- - Args: - signal_type: 'point' or 'diffuse' - sample_rate: Sampling rate for the signal - audio_data: List of audio items, each is a dictionary with audio_filepath, duration, offset and optionally text - audio_dir: Base directory for resolving paths, e.g., manifest basedir - min_duration: Minimal duration to be loaded if ref_signal is not provided, in seconds - ref_signal: Optional, used to determine the length of the signal - mic_positions: Optional, used to prepare approximately diffuse signal - num_retries: Number of retries when selecting the source files - - Returns: - (audio_signal, metadata), where audio_signal is an ndarray and metadata is a dictionary - with audio filepaths, durations and offsets - """ - if not signal_type in ['point', 'diffuse']: - raise ValueError(f'Unexpected signal type {signal_type}.') - - if audio_data is None: - # No data to load - return None - - metadata = {} - - if ref_signal is None: - audio_signal = None - # load at least one sample if min_duration is not provided - samples_to_load = int(min_duration * sample_rate) if min_duration is not None else 1 - source_signals_metadata = {'audio_filepath': [], 'duration': [], 'offset': [], 'text': []} - - while samples_to_load > 0: - # Select a random item and load the audio - item = random.choice(audio_data) - - audio_filepath = item['audio_filepath'] - if not os.path.isabs(audio_filepath) and audio_dir is not None: - audio_filepath = os.path.join(audio_dir, audio_filepath) - - # Load audio - check_min_sample_rate(audio_filepath, sample_rate) - audio_segment = AudioSegment.from_file( - audio_file=audio_filepath, - target_sr=sample_rate, - duration=item['duration'], - offset=item.get('offset', 0), - ) - - if signal_type == 'point': - if audio_segment.num_channels > 1: - raise RuntimeError( - f'Expecting single-channel source signal, but received {audio_segment.num_channels}. 
File: {audio_filepath}' - ) - else: - raise ValueError(f'Unexpected signal type {signal_type}.') - - source_signals_metadata['audio_filepath'].append(audio_filepath) - source_signals_metadata['duration'].append(item['duration']) - source_signals_metadata['duration'].append(item.get('offset', 0)) - source_signals_metadata['text'].append(item.get('text')) - - # not perfect, since different files may have different distributions - segment_samples = normalize_max(audio_segment.samples) - # concatenate - audio_signal = ( - np.concatenate((audio_signal, segment_samples)) if audio_signal is not None else segment_samples - ) - # remaining samples - samples_to_load -= len(segment_samples) - - # Finally, we need only the metadata for the complete signal - metadata = { - 'duration': sum(source_signals_metadata['duration']), - 'offset': 0, - } - - # Add text only if all source signals have text - if all([isinstance(tt, str) for tt in source_signals_metadata['text']]): - metadata['text'] = ' '.join(source_signals_metadata['text']) - else: - # Load a signal with total_len samples and ensure it has enough simultaneous activity/overlap with ref_signal - # Concatenate multiple files if necessary - total_len = len(ref_signal) - - for n in range(num_retries): - - audio_signal = None - source_signals_metadata = {'audio_filepath': [], 'duration': [], 'offset': []} - - if signal_type == 'point': - samples_to_load = total_len - elif signal_type == 'diffuse': - # Load longer signal so it can be reshaped into (samples, mics) and - # used to generate approximately diffuse noise field - num_mics = len(mic_positions) - samples_to_load = num_mics * total_len - - while samples_to_load > 0: - # Select an audio file - item = random.choice(audio_data) - - audio_filepath = item['audio_filepath'] - if not os.path.isabs(audio_filepath) and audio_dir is not None: - audio_filepath = os.path.join(audio_dir, audio_filepath) - - # Load audio signal - check_min_sample_rate(audio_filepath, sample_rate) - - if (max_offset := item['duration'] - np.ceil(samples_to_load / sample_rate)) > 0: - # Load with a random offset if the example is longer than samples_to_load - offset = random.uniform(0, max_offset) - duration = -1 - else: - # Load the whole file - offset, duration = 0, item['duration'] - audio_segment = AudioSegment.from_file( - audio_file=audio_filepath, target_sr=sample_rate, duration=duration, offset=offset - ) - - # Prepare a single-channel signal - if audio_segment.num_channels == 1: - # Take all samples - segment_samples = audio_segment.samples - else: - # Take a random channel - selected_channel = random.choice(range(audio_segment.num_channels)) - segment_samples = audio_segment.samples[:, selected_channel] - - source_signals_metadata['audio_filepath'].append(audio_filepath) - source_signals_metadata['duration'].append(len(segment_samples) / sample_rate) - source_signals_metadata['offset'].append(offset) - - # not perfect, since different files may have different distributions - segment_samples = normalize_max(segment_samples) - # concatenate - audio_signal = ( - np.concatenate((audio_signal, segment_samples)) if audio_signal is not None else segment_samples - ) - # remaining samples - samples_to_load -= len(segment_samples) - - if signal_type == 'diffuse' and num_mics > 1: - try: - # Trim and reshape to num_mics to prepare num_mics source signals - audio_signal = audio_signal[: num_mics * total_len].reshape(num_mics, -1).T - - # Make spherically diffuse noise - audio_signal = generate_approximate_noise_field( - 
mic_positions=np.array(mic_positions), noise_signal=audio_signal, sample_rate=sample_rate - ) - except Exception as e: - logging.info('Failed to generate approximate noise field: %s', str(e)) - logging.info('Try again.') - # Try again - audio_signal, source_signals_metadata = None, {} - continue - - # Trim to length - audio_signal = audio_signal[:total_len, ...] - - # Include the channel dimension if the reference includes it - if ref_signal.ndim == 2 and audio_signal.ndim == 1: - audio_signal = audio_signal[:, None] - - try: - # Signal and ref_signal should be simultaneously active - simultaneously_active_rms(ref_signal, audio_signal, sample_rate=sample_rate) - # We have enough overlap - break - except Exception as e: - # Signal and ref_signal are not overlapping, try again - logging.info('Exception: %s', str(e)) - logging.info('Signals are not overlapping, try again.') - audio_signal, source_signals_metadata = None, {} - continue - - if audio_signal is None: - logging.warning('Audio signal not set: %s.', signal_type) - - metadata['source_signals'] = source_signals_metadata - - return audio_signal, metadata - - -def check_min_sample_rate(filepath: str, sample_rate: float): - """Make sure the file's sample rate is at least sample_rate. - This will make sure that we have only downsampling if loading - this file, while upsampling is not permitted. - - Args: - filepath: path to a file - sample_rate: desired sample rate - """ - file_sample_rate = librosa.get_samplerate(path=filepath) - if file_sample_rate < sample_rate: - raise RuntimeError( - f'Sample rate ({file_sample_rate}) is lower than the desired sample rate ({sample_rate}). File: {filepath}.' - ) - - -def simulate_room_mix( - sample_rate: int, - target_cfg: dict, - interference_cfg: dict, - mix_cfg: dict, - audio_metadata: dict, - base_output_filepath: str, - max_amplitude: float = 0.999, - eps: float = 1e-16, -) -> dict: - """Simulate mixture signal at the microphone, including target, noise and - interference signals and mixed at specific RSNR and RSIR. - - Args: - sample_rate: Sample rate for all signals - target_cfg: Dictionary with configuration of the target. Includes - room_filepath, source index, audio_filepath, duration - noise_cfg: List of dictionaries, where each item includes audio_filepath, - offset and duration. - interference_cfg: List of dictionaries, where each item contains source - index - mix_cfg: Dictionary with the mixture configuration. Includes RSNR, RSIR, - ref_mic and ref_mic_rms. - audio_metadata: Dictionary with a list of files for target, noise and interference - base_output_filepath: All output audio files will be saved with this prefix by - adding a diffierent suffix for each component, e.g., _mic.wav. - max_amplitude: Maximum amplitude of the mic signal, used to prevent clipping. - eps: Small regularization constant. - - Returns: - Dictionary with metadata based on the mixture setup and - simulation results. This corresponds to a line of the - output manifest file. - """ - # Local utilities - def load_rir( - room_filepath: str, source: int, selected_mics: list, sample_rate: float, rir_key: str = 'rir' - ) -> np.ndarray: - """Load a RIR and check that the sample rate is matching the desired sample rate - - Args: - room_filepath: Path to a room simulation in an h5 file - source: Index of the desired source - sample_rate: Sample rate of the simulation - rir_key: Key of the RIR to load from the simulation. 
- - Returns: - Numpy array with shape (num_samples, num_channels) - """ - rir, rir_sample_rate = load_rir_simulation(room_filepath, source=source, rir_key=rir_key) - if rir_sample_rate != sample_rate: - raise RuntimeError( - f'RIR sample rate ({sample_rate}) is not matching the expected sample rate ({sample_rate}). File: {room_filepath}' - ) - return rir[:, selected_mics] - - def get_early_rir( - rir: np.ndarray, rir_anechoic: np.ndarray, sample_rate: int, early_duration: float = 0.050 - ) -> np.ndarray: - """Return only the early part of the RIR. - """ - early_len = int(early_duration * sample_rate) - direct_path_delay = np.min(np.argmax(rir_anechoic, axis=0)) - rir_early = rir.copy() - rir_early[direct_path_delay + early_len :, :] = 0 - return rir_early - - def save_audio( - base_path: str, - tag: str, - audio_signal: Optional[np.ndarray], - sample_rate: int, - save: str = 'all', - ref_mic: Optional[int] = None, - format: str = 'wav', - subtype: str = 'float', - ): - """Save audio signal and return filepath. - """ - if (audio_signal is None) or (not save): - return None - - if save == 'ref_mic': - # save only ref_mic - audio_signal = audio_signal[:, ref_mic] - - audio_filepath = base_path + f'_{tag}.{format}' - sf.write(audio_filepath, audio_signal, sample_rate, subtype) - - return audio_filepath - - # Target RIRs - target_rir = load_rir( - target_cfg['room_filepath'], - source=target_cfg['source'], - selected_mics=target_cfg['selected_mics'], - sample_rate=sample_rate, - ) - target_rir_anechoic = load_rir( - target_cfg['room_filepath'], - source=target_cfg['source'], - sample_rate=sample_rate, - selected_mics=target_cfg['selected_mics'], - rir_key='anechoic', - ) - target_rir_early = get_early_rir(rir=target_rir, rir_anechoic=target_rir_anechoic, sample_rate=sample_rate) - - # Target signals - target_signal, target_metadata = prepare_source_signal( - signal_type='point', - sample_rate=sample_rate, - audio_data=audio_metadata['target'], - audio_dir=audio_metadata['target_dir'], - min_duration=mix_cfg['min_duration'], - ) - source_signals_metadata = {'target': target_metadata['source_signals']} - - # Convolve target - target_reverberant = convolve_rir(target_signal, target_rir) - target_anechoic = convolve_rir(target_signal, target_rir_anechoic) - target_early = convolve_rir(target_signal, target_rir_early) - - # Prepare noise signal - noise, noise_metadata = prepare_source_signal( - signal_type='diffuse', - sample_rate=sample_rate, - mic_positions=target_cfg['mic_positions'], - audio_data=audio_metadata['noise'], - audio_dir=audio_metadata['noise_dir'], - ref_signal=target_reverberant, - ) - source_signals_metadata['noise'] = noise_metadata['source_signals'] - - # Prepare interference signal - if interference_cfg is None: - interference = None - else: - # Load interference signals - interference = 0 - source_signals_metadata['interference'] = [] - for i_cfg in interference_cfg: - # Load single-channel signal for directional interference - i_signal, i_metadata = prepare_source_signal( - signal_type='point', - sample_rate=sample_rate, - audio_data=audio_metadata['interference'], - audio_dir=audio_metadata['interference_dir'], - ref_signal=target_signal, - ) - source_signals_metadata['interference'].append(i_metadata['source_signals']) - # Load RIR from the same room as the target, but a difference source - i_rir = load_rir( - target_cfg['room_filepath'], - source=i_cfg['source'], - selected_mics=i_cfg['selected_mics'], - sample_rate=sample_rate, - ) - # Convolve interference - 
i_reverberant = convolve_rir(i_signal, i_rir) - # Sum - interference += i_reverberant - - # Scale and add components of the signal - mic = target_reverberant.copy() - - if noise is not None: - noise = scaled_disturbance( - signal=target_reverberant, - disturbance=noise, - sdr=mix_cfg['rsnr'], - sample_rate=sample_rate, - ref_channel=mix_cfg['ref_mic'], - ) - # Update mic signal - mic += noise - - if interference is not None: - interference = scaled_disturbance( - signal=target_reverberant, - disturbance=interference, - sdr=mix_cfg['rsir'], - sample_rate=sample_rate, - ref_channel=mix_cfg['ref_mic'], - ) - # Update mic signal - mic += interference - - # Set the final mic signal level - mic_rms = rms(mic[:, mix_cfg['ref_mic']]) - global_gain = db2mag(mix_cfg['ref_mic_rms']) / (mic_rms + eps) - mic_max = np.max(np.abs(mic)) - if (clipped_max := mic_max * global_gain) > max_amplitude: - # Downscale the global gain to prevent clipping + adjust ref_mic_rms accordingly - clipping_prevention_gain = max_amplitude / clipped_max - global_gain *= clipping_prevention_gain - mix_cfg['ref_mic_rms'] += mag2db(clipping_prevention_gain) - - logging.debug( - 'Clipping prevented for example %s (protection gain: %.2f dB)', - base_output_filepath, - mag2db(clipping_prevention_gain), - ) - - # save signals - signals = { - 'mic': mic, - 'target_reverberant': target_reverberant, - 'target_anechoic': target_anechoic, - 'target_early': target_early, - 'noise': noise, - 'interference': interference, - } - - metadata = {} - - for tag, signal in signals.items(): - - if signal is not None: - # scale all signal components with the global gain - signal = global_gain * signal - - audio_filepath = save_audio( - base_path=base_output_filepath, - tag=tag, - audio_signal=signal, - sample_rate=sample_rate, - save=mix_cfg['save'].get(tag, 'all'), - ref_mic=mix_cfg['ref_mic'], - format=mix_cfg['save'].get('format', 'wav'), - subtype=mix_cfg['save'].get('subtype', 'float'), - ) - - if tag == 'mic': - metadata['audio_filepath'] = audio_filepath - else: - metadata[tag + '_filepath'] = audio_filepath - - # Add metadata - metadata.update( - { - 'text': target_metadata.get('text'), - 'duration': target_metadata['duration'], - 'target_cfg': target_cfg, - 'interference_cfg': interference_cfg, - 'mix_cfg': mix_cfg, - 'ref_channel': mix_cfg.get('ref_mic'), - 'rt60': target_cfg.get('rt60'), - 'drr': calculate_drr(target_rir, sample_rate, n_direct=np.argmax(target_rir_anechoic, axis=0)), - 'rsnr': None if noise is None else mix_cfg['rsnr'], - 'rsir': None if interference is None else mix_cfg['rsir'], - 'source_signals': source_signals_metadata, - } - ) - - return convert_numpy_to_serializable(metadata) - - -def simulate_room_mix_helper(example_and_audio_metadata: tuple) -> dict: - """Wrapper around `simulate_room_mix` for pool.imap. - - Args: - args: example and audio_metadata that are forwarded to `simulate_room_mix` - - Returns: - Dictionary with metadata, see `simulate_room_mix` - """ - example, audio_metadata = example_and_audio_metadata - return simulate_room_mix(**example, audio_metadata=audio_metadata) - - -def plot_mix_manifest_info(filepath: str, plot_filepath: str = None): - """Plot distribution of parameters from the manifest file. 
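    The figure summarizes target distance, azimuth, elevation and duration, RT60, DRR,
    and, when available, RSNR and RSIR histograms across the manifest.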
- - Args: - filepath: path to a RIR corpus manifest file - plot_filepath: path to save the plot at - """ - metadata = read_manifest(filepath) - - # target info - target_distance = [] - target_azimuth = [] - target_elevation = [] - target_duration = [] - - # room config - rt60 = [] - drr = [] - - # noise - rsnr = [] - rsir = [] - - # get the required data - for data in metadata: - # target info - target_distance.append(data['target_cfg']['distance']) - target_azimuth.append(data['target_cfg']['azimuth']) - target_elevation.append(data['target_cfg']['elevation']) - target_duration.append(data['duration']) - - # room config - rt60.append(data['rt60']) - drr += data['drr'] # average DRR across all mics - - # noise - if data['rsnr'] is not None: - rsnr.append(data['rsnr']) - - if data['rsir'] is not None: - rsir.append(data['rsir']) - - # plot - plt.figure(figsize=(12, 6)) - - plt.subplot(2, 4, 1) - plt.hist(target_distance, label='distance') - plt.xlabel('distance / m') - plt.ylabel('# examples') - plt.title('Target-to-array distance') - - plt.subplot(2, 4, 2) - plt.hist(target_azimuth, label='azimuth') - plt.xlabel('azimuth / deg') - plt.ylabel('# examples') - plt.title('Target-to-array azimuth') - - plt.subplot(2, 4, 3) - plt.hist(target_elevation, label='elevation') - plt.xlabel('elevation / deg') - plt.ylabel('# examples') - plt.title('Target-to-array elevation') - - plt.subplot(2, 4, 4) - plt.hist(target_duration, label='duration') - plt.xlabel('time / s') - plt.ylabel('# examples') - plt.title('Target duration') - - plt.subplot(2, 4, 5) - plt.hist(rt60, label='RT60') - plt.xlabel('RT60 / s') - plt.ylabel('# examples') - plt.title('RT60') - - plt.subplot(2, 4, 6) - plt.hist(drr, label='DRR') - plt.xlabel('DRR / dB') - plt.ylabel('# examples') - plt.title('DRR [avg over mics]') - - if len(rsnr) > 0: - plt.subplot(2, 4, 7) - plt.hist(rsnr, label='RSNR') - plt.xlabel('RSNR / dB') - plt.ylabel('# examples') - plt.title(f'RSNR [{100 * len(rsnr) / len(rt60):.0f}% ex]') - - if len(rsir): - plt.subplot(2, 4, 8) - plt.hist(rsir, label='RSIR') - plt.xlabel('RSIR / dB') - plt.ylabel('# examples') - plt.title(f'RSIR [{100 * len(rsir) / len(rt60):.0f}% ex]') - - for n in range(8): - plt.subplot(2, 4, n + 1) - plt.grid() - plt.legend(loc='lower left') - - plt.tight_layout() - - if plot_filepath is not None: - plt.savefig(plot_filepath) - plt.close() - logging.info('Plot saved at %s', plot_filepath) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label.py deleted file mode 100644 index 058d0157fcbd123f9029d08b2ad32037d6375b40..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label.py +++ /dev/null @@ -1,497 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Dict, List, Optional - -import torch - -from nemo.collections.asr.parts.preprocessing.feature_loader import ExternalFeatureLoader -from nemo.collections.common.parts.preprocessing import collections -from nemo.core.classes import Dataset -from nemo.core.neural_types import AcousticEncodedRepresentation, LabelsType, LengthsType, NeuralType -from nemo.utils import logging - - -def _feature_collate_fn(batch): - """collate batch of feat sig, feat len, labels, labels len, assuming all features have the same shape. - Args: - batch (FloatTensor, LongTensor, LongTensor, LongTensor): A tuple of tuples of feature, feature lengths, - encoded labels, and encoded labels length. - """ - packed_batch = list(zip(*batch)) - if len(packed_batch) == 5: - _, feat_lengths, _, labels_lengths, sample_ids = packed_batch - elif len(packed_batch) == 4: - sample_ids = None - _, feat_lengths, _, labels_lengths = packed_batch - else: - raise ValueError("Expects 4 or 5 tensors in the batch!") - - features, labels = [], [] - for b in batch: - feat_i, labels_i = b[0], b[2] - features.append(feat_i) - labels.append(labels_i) - - features = torch.stack(features) - feat_lengths = torch.stack(feat_lengths) - - labels = torch.stack(labels) - labels_lengths = torch.stack(labels_lengths) - - if sample_ids is None: - return features, feat_lengths, labels, labels_lengths - else: - sample_ids = torch.tensor(sample_ids, dtype=torch.int32) - return features, feat_lengths, labels, labels_lengths, sample_ids - - -def _audio_feature_collate_fn(batch, feat_pad_val, label_pad_id): - """collate batch of audio feature, audio len, labels, labels len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of feature, feature lengths, - labels, and label lengths. This collate func assumes the - features are torch tensors of Log-Melspectrogram (i.e. [N_MEL, T]). 
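    A sketch of the expected shapes, assuming a batch of B examples with D mel bins:
    features are padded along time with `feat_pad_val` to the longest length in the
    batch and labels with `label_pad_id`, so the function returns features [B, D, T_max],
    feat_lengths [B], labels [B, L_max], labels_lengths [B], and sample_ids [B] when
    provided.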
- """ - packed_batch = list(zip(*batch)) - if len(packed_batch) == 5: - _, feat_lengths, _, labels_lengths, sample_ids = packed_batch - elif len(packed_batch) == 4: - sample_ids = None - _, feat_lengths, _, labels_lengths = packed_batch - else: - raise ValueError("Expects 4 or 5 tensors in the batch!") - max_feat_len = 0 - has_feat = feat_lengths[0] is not None - if has_feat: - max_feat_len = max(feat_lengths).item() - max_labels_len = max(labels_lengths).item() - - features, labels = [], [] - for b in batch: - feat_i, feat_i_len, label_i, label_i_len = b[0], b[1], b[2], b[3] - - if has_feat: - feat_i_len = feat_i_len.item() - if feat_i_len < max_feat_len: - pad = (0, max_feat_len - feat_i_len) - feat_i = torch.nn.functional.pad(feat_i, pad, value=feat_pad_val) - features.append(feat_i) - - label_i_len = label_i_len.item() - if label_i_len < max_labels_len: - pad = (0, max_labels_len - label_i_len) - label_i = torch.nn.functional.pad(label_i, pad, value=label_pad_id) - labels.append(label_i) - - if has_feat: - features = torch.stack(features) - feature_lengths = torch.stack(feat_lengths) - else: - features, feat_lengths = None, None - labels = torch.stack(labels) - labels_lengths = torch.stack(labels_lengths) - - if sample_ids is None: - return features, feature_lengths, labels, labels_lengths - else: - sample_ids = torch.tensor(sample_ids, dtype=torch.int32) - return features, feature_lengths, labels, labels_lengths, sample_ids - - -def _vad_feature_segment_collate_fn(batch, window_length_in_sec, shift_length_in_sec, frame_unit_in_sec): - """collate batch of audio features, features len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). - batch size equals to 1. - """ - slice_length = int(window_length_in_sec / frame_unit_in_sec) - audio_features, feat_lengths, _, tokens_lengths = zip(*batch) - - slice_length = int(min(slice_length, max(feat_lengths))) - shift = int(shift_length_in_sec / frame_unit_in_sec) - has_audio = feat_lengths[0] is not None - - f_dim = audio_features[0].shape[0] - audio_features, num_slices, tokens, feat_lengths = [], [], [], [] - append_len_start = torch.div(slice_length, 2, rounding_mode='trunc') - append_len_end = slice_length - torch.div(slice_length, 2, rounding_mode='trunc') - for feat_i, feat_i_len, tokens_i, _ in batch: - start = torch.zeros(f_dim, append_len_start) - end = torch.zeros(f_dim, append_len_end) - feat_i = torch.cat((start, feat_i, end), dim=1) - feat_i_len += slice_length - - if has_audio: - slices = max(1, torch.div(feat_i_len - slice_length, shift, rounding_mode='trunc')) - - for slice_id in range(slices): - start_idx = slice_id * shift - end_idx = start_idx + slice_length - feat_slice = feat_i[:, start_idx:end_idx] - audio_features.append(feat_slice) - - num_slices.append(slices) - tokens.extend([tokens_i] * slices) - feat_lengths.extend([slice_length] * slices) - - if has_audio: - audio_features = torch.stack(audio_features) - feat_lengths = torch.tensor(feat_lengths) - else: - audio_features, feat_lengths = None, None - - tokens = torch.stack(tokens) - tokens_lengths = torch.tensor(num_slices) - return audio_features, feat_lengths, tokens, tokens_lengths - - -class _FeatureSeqSpeakerLabelDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to feature files, sequences of labels. 
- Each new line is a different sample. Example below: - and their target labels. JSON files should be of the following format: - {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....} \ - ... - {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n} - target_seq_label_n is the string of sequence of speaker label, separated by space. - - Args: - manifest_filepath (str): Dataset parameter. Path to JSON containing data. - labels (Optional[list]): Dataset parameter. List of unique labels collected from all samples. - feature_loader : Dataset parameter. Feature loader to load (external) feature. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - # TODO output type for external features - output_types = { - 'external_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - 'feat_length': NeuralType(tuple('B'), LengthsType()), - } - - if self.is_speaker_emb: - output_types.update( - { - 'embs': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - 'embs_length': NeuralType(tuple('B'), LengthsType()), - 'label': NeuralType(('B', 'T'), LabelsType()), - 'label_length': NeuralType(tuple('B'), LengthsType()), - } - ) - else: - output_types.update( - {'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),} - ) - - return output_types - - def __init__( - self, *, manifest_filepath: str, labels: List[str], feature_loader, is_speaker_emb: bool = False, - ): - super().__init__() - self.collection = collections.ASRFeatureSequenceLabel(manifests_files=manifest_filepath.split(','),) - - self.feature_loader = feature_loader - self.labels = labels if labels else self.collection.uniq_labels - self.is_speaker_emb = is_speaker_emb - - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - - def __len__(self): - return len(self.collection) - - def __getitem__(self, index): - sample = self.collection[index] - - features = self.feature_loader.process(sample.feature_file) - f, fl = features, torch.tensor(features.shape[0]).long() - - t = torch.tensor(sample.seq_label).float() - tl = torch.tensor(len(sample.seq_label)).long() - - return f, fl, t, tl - - -class FeatureToSeqSpeakerLabelDataset(_FeatureSeqSpeakerLabelDataset): - """ - Dataset that loads tensors via a json file containing paths to feature - files and sequence of speakers. Each new line is a - different sample. Example below: - {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....} \ - ... - {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n} - target_seq_label_n is the string of sequence of speaker label, separated by space. - - Args: - manifest_filepath (str): Path to manifest json as described above. Canbe comma-separated paths. - labels (Optional[list]): String containing all the possible labels to map to - if None then automatically picks from ASRFeatureSequenceLabel collection. - feature_loader, Feature load to loader (external) feature. 
- - """ - - def _collate_fn(self, batch): - return _feature_collate_fn(batch) - - -class FeatureToLabelDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to feature files and their labels. - Each new line is a different sample. Example below: - and their target labels. JSON files should be of the following format: - {"feature_filepath": "/path/to/audio_feature.pt", "label": "1"} - ... - {"feature_filepath": "/path/to/audio_feature.pt", "label": "0"} - Args: - manifest_filepath (str): Path to JSON containing data. - labels (Optional[list]): List of unique labels collected from all samples. - augmentor (Optional): feature augmentation - window_length_in_sec (float): Window length in seconds. - shift_length_in_sec (float): Shift length in seconds. - is_regression_task (bool): if True, the labels are treated as for a regression task. - cal_labels_occurrence (bool): if True, the labels occurrence will be calculated. - zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram. - min_duration (float): Minimum duration of the audio file in seconds. - max_duration (float): Maximum duration of the audio file in seconds. - """ - - ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal - FRAME_UNIT_TIME_SECS = 0.01 - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - output_types = { - 'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - 'feat_length': NeuralType(tuple('B'), LengthsType()), - 'labels': NeuralType(('B'), LabelsType()), - 'labels_length': NeuralType(tuple('B'), LengthsType()), - } - - return output_types - - def __init__( - self, - *, - manifest_filepath: str, - labels: List[str] = None, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - window_length_in_sec: float = 0.63, - shift_length_in_sec: float = 0.01, - is_regression_task: bool = False, - cal_labels_occurrence: Optional[bool] = False, - zero_spec_db_val: float = -16.635, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - ): - super().__init__() - self.window_length_in_sec = window_length_in_sec - self.shift_length_in_sec = shift_length_in_sec - self.zero_spec_db_val = zero_spec_db_val - - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(',') - - self.collection = collections.ASRFeatureLabel( - manifests_files=manifest_filepath, - is_regression_task=is_regression_task, - cal_labels_occurrence=cal_labels_occurrence, - min_duration=min_duration, - max_duration=max_duration, - ) - - self.feature_loader = ExternalFeatureLoader(augmentor=augmentor) - self.labels = labels if labels else self.collection.uniq_labels - - self.is_regression_task = is_regression_task - - if not is_regression_task: - self.labels = labels if labels else self.collection.uniq_labels - self.num_classes = len(self.labels) if self.labels is not None else 1 - self.label2id, self.id2label = {}, {} - self.id2occurrence, self.labels_occurrence = {}, [] - - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - if cal_labels_occurrence: - self.id2occurrence[label_id] = self.collection.labels_occurrence[label] - - if cal_labels_occurrence: - self.labels_occurrence = [self.id2occurrence[k] for k in sorted(self.id2occurrence)] - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, 
self.id2label[idx])) - else: - self.labels = [] - self.num_classes = 1 - - def __len__(self): - return len(self.collection) - - def __getitem__(self, index): - sample = self.collection[index] - - features = self.feature_loader.process(sample.feature_file) - f, fl = features, torch.tensor(features.shape[1]).long() - - t = torch.tensor(self.label2id[sample.label]) - tl = torch.tensor(1).long() - - return f, fl, t, tl - - def _collate_fn(self, batch): - return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0) - - def _vad_segment_collate_fn(self, batch): - return _vad_feature_segment_collate_fn( - batch, self.window_length_in_sec, self.shift_length_in_sec, self.FRAME_UNIT_TIME_SECS - ) - - -class FeatureToMultiLabelDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to feature files and their labels. - Each new line is a different sample. Example below: - and their target labels. JSON files should be of the following format: - {"feature_filepath": "/path/to/audio_feature.pt", "label": "1 1 0 0 1"} - ... - {"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"} - Args: - manifest_filepath (str): Path to JSON containing data. - labels (Optional[list]): List of unique labels collected from all samples. - augmentor (Optional): feature augmentation - delimiter (str): delimiter to split the labels. - is_regression_task (bool): if True, the labels are treated as for a regression task. - cal_labels_occurrence (bool): if True, the labels occurrence will be calculated. - zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram. - min_duration (float): Minimum duration of the audio file in seconds. - max_duration (float): Maximum duration of the audio file in seconds. - """ - - ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - output_types = { - 'audio_feat': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - 'feat_length': NeuralType(tuple('B'), LengthsType()), - 'labels': NeuralType(('B', 'T'), LabelsType()), - 'labels_length': NeuralType(tuple('B'), LengthsType()), - } - - return output_types - - def __init__( - self, - *, - manifest_filepath: str, - labels: List[str] = None, - augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, - delimiter: Optional[str] = None, - is_regression_task: bool = False, - cal_labels_occurrence: Optional[bool] = False, - zero_spec_db_val: float = -16.635, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - ): - super().__init__() - self.delimiter = delimiter - self.zero_spec_db_val = zero_spec_db_val - - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(',') - - self.collection = collections.ASRFeatureLabel( - manifests_files=manifest_filepath, - is_regression_task=is_regression_task, - cal_labels_occurrence=cal_labels_occurrence, - delimiter=delimiter, - min_duration=min_duration, - max_duration=max_duration, - ) - - self.is_regression_task = is_regression_task - self.feature_loader = ExternalFeatureLoader(augmentor=augmentor) - self.labels = labels if labels else self.collection.uniq_labels - - self.label2id, self.id2label = {}, {} - if not is_regression_task: - self.labels = labels if labels else self._get_label_set() - self.num_classes = len(self.labels) if self.labels is not None else 1 - self.label2id, self.id2label = {}, {} - for label_id, label in enumerate(self.labels): - self.label2id[label] = label_id - self.id2label[label_id] = label - if cal_labels_occurrence: - self.id2occurrence[label_id] = self.collection.labels_occurrence[label] - self.labels_occurrence.append(self.id2occurrence[label_id]) - - for idx in range(len(self.labels[:5])): - logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - else: - self.labels = [] - self.num_classes = 1 - - def _get_label_set(self): - labels = [] - for sample in self.collection: - label_str = sample.label - if label_str: - label_str_list = label_str.split(self.delimiter) if self.delimiter else label_str.split() - labels.extend(label_str_list) - return sorted(set(labels)) - - def _label_str_to_tensor(self, label_str: str): - labels = label_str.split(self.delimiter) if self.delimiter else label_str.split() - - if self.is_regression_task: - labels = [float(s) for s in labels] - labels = torch.tensor(labels).float() - else: - labels = [self.label2id[s] for s in labels] - labels = torch.tensor(labels).long() - return labels - - def __len__(self): - return len(self.collection) - - def __getitem__(self, index): - sample = self.collection[index] - - features = self.feature_loader.process(sample.feature_file) - f, fl = features, torch.tensor(features.shape[1]).long() - - t = self._label_str_to_tensor(sample.label) - tl = torch.tensor(t.size(0)).long() - - return f, fl, t, tl - - def _collate_fn(self, batch): - return _audio_feature_collate_fn(batch, self.zero_spec_db_val, 0) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label_dataset.py deleted file mode 100644 index 08803f43ce8d8fc65e69afb785ecdc2290b267ab..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_label_dataset.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional - -from nemo.collections.asr.data import feature_to_label - - -def get_feature_seq_speakerlabel_dataset( - feature_loader, config: dict -) -> feature_to_label.FeatureToSeqSpeakerLabelDataset: - """ - Instantiates a FeatureSeqSpeakerLabelDataset. - Args: - config: Config of the FeatureToSeqSpeakerLabelDataset. - - Returns: - An instance of FeatureToSeqSpeakerLabelDataset. - """ - dataset = feature_to_label.FeatureToSeqSpeakerLabelDataset( - manifest_filepath=config['manifest_filepath'], labels=config['labels'], feature_loader=feature_loader, - ) - return dataset - - -def get_feature_label_dataset( - config: dict, augmentor: Optional['FeatureAugmentor'] = None -) -> feature_to_label.FeatureToLabelDataset: - dataset = feature_to_label.FeatureToLabelDataset( - manifest_filepath=config['manifest_filepath'], - labels=config['labels'], - augmentor=augmentor, - window_length_in_sec=config.get("window_length_in_sec", 0.63), - shift_length_in_sec=config.get("shift_length_in_sec", 0.08), - is_regression_task=config.get("is_regression_task", False), - cal_labels_occurrence=config.get("cal_labels_occurrence", False), - zero_spec_db_val=config.get("zero_spec_db_val", -16.635), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - ) - return dataset - - -def get_feature_multi_label_dataset( - config: dict, augmentor: Optional['FeatureAugmentor'] = None -) -> feature_to_label.FeatureToMultiLabelDataset: - dataset = feature_to_label.FeatureToMultiLabelDataset( - manifest_filepath=config['manifest_filepath'], - labels=config['labels'], - augmentor=augmentor, - delimiter=config.get('delimiter', None), - is_regression_task=config.get("is_regression_task", False), - cal_labels_occurrence=config.get("cal_labels_occurrence", False), - zero_spec_db_val=config.get("zero_spec_db_val", -16.635), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - ) - return dataset diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text.py deleted file mode 100644 index a7e295051ae805f312146e386df775dc4e68edcc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text.py +++ /dev/null @@ -1,488 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
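The factory functions in `feature_to_label_dataset.py` above build datasets from a plain config dict. A minimal usage sketch of `get_feature_label_dataset` follows; the manifest path and label set are placeholders, and the optional keys simply restate defaults already visible in the function.

```python
# Minimal sketch, assuming a feature manifest exists at the placeholder path.
from nemo.collections.asr.data.feature_to_label_dataset import get_feature_label_dataset

config = {
    "manifest_filepath": "/path/to/feature_manifest.json",  # placeholder path
    "labels": ["0", "1"],                                   # illustrative label set
    "window_length_in_sec": 0.63,                           # optional, default shown above
    "shift_length_in_sec": 0.08,                            # optional, default shown above
    "cal_labels_occurrence": False,                         # optional
}

dataset = get_feature_label_dataset(config=config, augmentor=None)
feat, feat_len, label, label_len = dataset[0]  # see FeatureToLabelDataset.__getitem__
```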
- -from typing import Callable, Dict, List, Optional, Tuple, Union - -import torch - -from nemo.collections.asr.data.feature_to_label import _audio_feature_collate_fn -from nemo.collections.asr.parts.preprocessing.feature_loader import ExternalFeatureLoader -from nemo.collections.asr.parts.preprocessing.features import normalize_batch -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.asr.parts.utils.vad_utils import load_speech_segments_from_rttm -from nemo.collections.common import tokenizers -from nemo.collections.common.parts.preprocessing import collections, parsers -from nemo.core.classes import Dataset -from nemo.core.neural_types import AcousticEncodedRepresentation, LabelsType, LengthsType, NeuralType - - -class ASRFeatureManifestProcessor: - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: int = 0, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - index_by_file_id: bool = False, - ): - self.parser = parser - self.collection = collections.ASRFeatureText( - manifests_files=manifest_filepath, - parser=parser, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - index_by_file_id=index_by_file_id, - ) - - self.eos_id = eos_id - self.bos_id = bos_id - self.pad_id = pad_id - - def process_text_by_id(self, index: int) -> Tuple[List[int], int]: - sample = self.collection[index] - return self.process_text_by_sample(sample) - - def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]: - manifest_idx = self.collection.mapping[file_id][0] - sample = self.collection[manifest_idx] - return self.process_text_by_sample(sample) - - def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -> Tuple[List[int], int]: - t, tl = sample.text_tokens, len(sample.text_tokens) - - if self.bos_id is not None: - t = [self.bos_id] + t - tl += 1 - if self.eos_id is not None: - t = t + [self.eos_id] - tl += 1 - - return t, tl - - -class _FeatureTextDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to audio feature files, transcripts, - durations (in seconds) and optional RTTM files. Each new line is a different sample. Example below: - {"feature_filepath": "/path/to/audio_feature.pt", "text_filepath": "/path/to/audio.txt", - "rttm_filepath": "/path/to/audio_rttm.rttm", "duration": 23.147} - ... - {"feature_filepath": "/path/to/audio_feature.pt", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - Args: - manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths. - parser: Str for a language specific preprocessor or a callable. - normalize (bool): whether and where to normalize feature, must be one of [None, "post_norm", "pre_norm"] - normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm (bool): whether to use RTTM files if there is any, default to False - rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' - feat_min_len (int): minimum length of feature when rttm_mode=deop, default to 4. 
- feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram - frame_unit_time_secs (float): time in seconds for each frame - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio - max_duration (float): If audio exceeds this length, do not include in dataset - min_duration (float): If audio is less than this length, do not include in dataset - max_utts (int): Limit number of utterances - trim (bool): whether or not to trim silence. Defaults to False - bos_id (int): Id of beginning of sequence symbol to append if not None - eos_id (int): Id of end of sequence symbol to append if not None - pad_id (int): Id of pad symbol. Defaults to 0 - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal - NORM_MODES = ["pre_norm", "post_norm"] - RTTM_MODES = ["mask", "drop"] - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'features': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - 'feature_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - normalize: Optional[str] = "post_norm", - normalize_type: Union[str, dict] = "per_feature", - use_rttm: bool = False, - rttm_mode: str = "mask", - feat_min_len: int = 4, - feat_mask_val: Optional[float] = None, - frame_unit_time_secs: float = 0.01, - sample_rate: Optional[int] = 16000, - augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if type(manifest_filepath) == str: - manifest_filepath = manifest_filepath.split(",") - - self.sample_rate = sample_rate - self.normalize = normalize - self.normalize_type = normalize_type - self.use_rttm = use_rttm - self.rttm_mode = rttm_mode - if self.use_rttm and self.rttm_mode not in self.RTTM_MODES: - raise ValueError(f"`rttm_mode` must be one of {self.RTTM_MODES}, got `{rttm_mode}` instead") - - self.feat_min_len = feat_min_len - if feat_mask_val is not None: - self.feat_mask_val = feat_mask_val - elif normalize == "pre_norm": - self.feat_mask_val = 0.0 # similar to SpectralAugmentation - else: - self.feat_mask_val = self.ZERO_LEVEL_SPEC_DB_VAL - - if normalize is not None and normalize not in self.NORM_MODES: - raise ValueError(f"`normalize` must be one of {self.NORM_MODES}, got `{normalize}` instead") - - self.frame_unit_time_secs = frame_unit_time_secs - - self.manifest_processor = 
ASRFeatureManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - ) - self.featurizer = ExternalFeatureLoader(augmentor=augmentor) - self.trim = trim - self.return_sample_id = return_sample_id - self.channel_selector = channel_selector - - def get_manifest_sample(self, sample_id): - return self.manifest_processor.collection[sample_id] - - def __getitem__(self, index): - sample = self.manifest_processor.collection[index] - offset = sample.offset - - if offset is None: - offset = 0 - - features = self.featurizer.process(sample.feature_file) - - f, fl = features, torch.tensor(features.shape[1]).long() - - t, tl = self.manifest_processor.process_text_by_sample(sample=sample) - - # Feature normalization - if self.normalize is None: - if self.use_rttm and sample.rttm_file: - f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) - elif self.normalize == "post_norm": - # (Optional) Masking based on RTTM file - if self.use_rttm and sample.rttm_file: - f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) - - f = self.normalize_feature(f) - else: # pre-norm - f = self.normalize_feature(f) - # (Optional) Masking based on RTTM file - if self.use_rttm and sample.rttm_file: - f = self.process_features_with_rttm(f, offset, sample.rttm_file, self.feat_mask_val) - - if self.return_sample_id: - output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index - else: - output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long() - - return output - - def process_features_with_rttm(self, features, offset, rttm_file, mask_val): - segments = load_speech_segments_from_rttm(rttm_file) - new_features = features.clone() - sid, fid = 0, 0 - for i in range(features.size(1)): - t = offset + i * self.frame_unit_time_secs - while sid < len(segments) - 1 and segments[sid][1] < t: - sid += 1 - if segments[sid][1] == 0 or t < segments[sid][0] or t > segments[sid][1]: - # not in speech segment - if self.rttm_mode == "drop": - # drop the frame - continue - else: - # mask the frame with specified value - new_features[:, i] = mask_val - fid += 1 - else: - # in speech segment - new_features[:, fid] = features[:, i] - fid += 1 - - if fid < self.feat_min_len and self.rttm_mode == "drop": - new_features[:, : self.feat_min_len] = mask_val - return new_features[:, : self.feat_min_len] - return new_features[:, :fid] - - def __len__(self): - return len(self.manifest_processor.collection) - - def _collate_fn(self, batch): - return _audio_feature_collate_fn( - batch, feat_pad_val=self.feat_mask_val, label_pad_id=self.manifest_processor.pad_id - ) - - def normalize_feature(self, feat): - """ - Args: - feat: feature tensor of shape [M, T] - """ - feat = feat.unsqueeze(0) # add batch dim - feat, _, _ = normalize_batch(feat, torch.tensor([feat.size(-1)]), self.normalize_type) - return feat.squeeze(0) # delete batch dim - - -class FeatureToCharDataset(_FeatureTextDataset): - """ - Dataset that loads tensors via a json file containing paths to audio feature - files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a - different sample. Example below: - {"feature_filepath": "/path/to/audio_feature.pt", "text_filepath": - "/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",} - ... 
- {"feature_filepath": "/path/to/audio_feature.pt", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath (str): Path to manifest json as described above. Can - be comma-separated paths. - labels (str): String containing all the possible characters to map to - normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"] - normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm (bool): whether to use RTTM files if there is any, default to False - rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' - feat_min_len (int): minimum length of feature, default to 4 - feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram - frame_unit_time_secs: time in seconds for each frame - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include - in dataset - max_utts: Limit number of utterances - blank_index: blank character index, default = -1 - unk_index: unk_character index, default = -1 - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. 
- """ - - def __init__( - self, - manifest_filepath: str, - labels: Union[str, List[str]], - normalize: Optional[str] = "post_norm", - normalize_type: Union[str, dict] = "per_feature", - use_rttm: bool = False, - rttm_mode: str = "mask", - feat_min_len: int = 4, - feat_mask_val: Optional[float] = None, - frame_unit_time_secs: float = 0.01, - sample_rate: Optional[int] = 16000, - augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - blank_index: int = -1, - unk_index: int = -1, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - parser: Union[str, Callable] = 'en', - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - self.labels = labels - - parser = parsers.make_parser( - labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize - ) - - super().__init__( - manifest_filepath=manifest_filepath, - parser=parser, - normalize=normalize, - normalize_type=normalize_type, - use_rttm=use_rttm, - rttm_mode=rttm_mode, - feat_min_len=feat_min_len, - feat_mask_val=feat_mask_val, - frame_unit_time_secs=frame_unit_time_secs, - sample_rate=sample_rate, - augmentor=augmentor, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) - - -class FeatureToBPEDataset(_FeatureTextDataset): - """ - Dataset that loads tensors via a json file containing paths to audio feature - files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a different sample. - Example below: - {"audio_filepath": "/path/to/audio.wav", "text_filepath": - "/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",} - ... - {"audio_filepath": "/path/to/audio.wav", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - In practice, the dataset and manifest used for character encoding and byte pair encoding - are exactly the same. The only difference lies in how the dataset tokenizes the text in - the manifest. - - Args: - manifest_filepath (str): Path to manifest json as described above. Can - be comma-separated paths. - tokenizer: A subclass of the Tokenizer wrapper found in the common collection, - nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of - all available tokenizers. - normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"] - normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch` - use_rttm (bool): whether to use RTTM files if there is any, default to False - rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask' - feat_min_len (int): minimum length of feature, default to 4 - feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram - frame_unit_time_secs: time in seconds for each frame - sample_rate (int): Sample rate to resample loaded audio to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. 
- augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor - object used to augment loaded audio - max_duration: If audio exceeds this length, do not include in dataset - min_duration: If audio is less than this length, do not include - in dataset - max_utts: Limit number of utterances - trim: Whether to trim silence segments - use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] - tokens to beginning and ending of speech respectively. - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - def __init__( - self, - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - normalize: Optional[str] = "post_norm", - normalize_type: Union[str, dict] = "per_feature", - use_rttm: bool = False, - rttm_mode: str = "mask", - feat_min_len: int = 4, - feat_mask_val: Optional[float] = None, - frame_unit_time_secs: float = 0.01, - sample_rate: Optional[int] = 16000, - augmentor: 'nemo.collections.asr.parts.perturb.FeatureAugmentor' = None, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - use_start_end_token: bool = True, - trim: bool = False, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): - self.is_aggregate = True - else: - self.is_aggregate = False - self._tokenizer = tokenizer - - def __call__(self, *args): - if isinstance(args[0], List) and self.is_aggregate: - t = [] - for span in args[0]: - t.extend(self._tokenizer.text_to_ids(span['str'], span['lang'])) - return t - - t = self._tokenizer.text_to_ids(*args) - return t - - super().__init__( - manifest_filepath=manifest_filepath, - parser=TokenizerWrapper(tokenizer), - normalize=normalize, - normalize_type=normalize_type, - use_rttm=use_rttm, - rttm_mode=rttm_mode, - feat_min_len=feat_min_len, - feat_mask_val=feat_mask_val, - frame_unit_time_secs=frame_unit_time_secs, - sample_rate=sample_rate, - augmentor=augmentor, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text_dataset.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text_dataset.py deleted file mode 100644 index 6bc03bc0b33df3e08f16e383d8dc522c68563bce..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/feature_to_text_dataset.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from nemo.collections.asr.data.feature_to_text import FeatureToBPEDataset, FeatureToCharDataset -from nemo.utils import logging - - -def get_char_dataset(config: dict, augmentor: Optional['FeatureAugmentor'] = None) -> FeatureToCharDataset: - """ - Instantiates a Character Encoding based FeatureToCharDataset. - - Args: - config: Config of the FeatureToCharDataset. - augmentor: Optional AudioAugmentor object for augmentations on audio data. - - Returns: - An instance of FeatureToCharDataset. - """ - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - dataset = FeatureToCharDataset( - manifest_filepath=config['manifest_filepath'], - labels=config.get('labels', None), - normalize=config.get('normalize', 'post_norm'), - normalize_type=config.get('normalize_type', 'per_feature'), - use_rttm=config.get('use_rttm', False), - rttm_mode=config.get('rttm_mode', 'mask'), - feat_min_len=config.get('feat_min_len', 4), - feat_mask_val=config.get('feat_mask_val', None), - frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01), - sample_rate=config.get('sample_rate', 16000), - augmentor=augmentor, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - blank_index=config.get('blank_index', -1), - unk_index=config.get('unk_index', -1), - trim=config.get('trim_silence', False), - parser=config.get('parser', 'en'), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset - - -def get_bpe_dataset( - config: dict, tokenizer: 'TokenizerSpec', augmentor: Optional['FeatureAugmentor'] = None -) -> FeatureToBPEDataset: - """ - Instantiates a Byte Pair Encoding / Word Piece Encoding based FeatureoToBPEDataset. - - Args: - config: Config of the FeatureToBPEDataset. - tokenizer: An instance of a TokenizerSpec object. - augmentor: Optional FeatureAugmentor object for augmentations on audio features. - - Returns: - An instance of FeatureToBPEDataset. 
- """ - dataset = FeatureToBPEDataset( - manifest_filepath=config['manifest_filepath'], - tokenizer=tokenizer, - normalize=config.get('normalize', 'post_norm'), - normalize_type=config.get('normalize_type', 'per_feature'), - use_rttm=config.get('use_rttm', False), - rttm_mode=config.get('rttm_mode', 'mask'), - feat_min_len=config.get('feat_min_len', 4), - feat_mask_val=config.get('feat_mask_val', None), - frame_unit_time_secs=config.get('frame_unit_time_secs', 0.01), - sample_rate=config.get('sample_rate', 16000), - augmentor=augmentor, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - trim=config.get('trim_silence', False), - use_start_end_token=config.get('use_start_end_token', True), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/data/text_to_text.py b/SoundScribe/SpeakerID/nemo/collections/asr/data/text_to_text.py deleted file mode 100644 index 88b417ea21bc5e1a0ead0dc186d1ab60cc6ddb22..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/data/text_to_text.py +++ /dev/null @@ -1,482 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import concurrent.futures -import copy -import gc -import json -import math -import random -from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, NamedTuple, Optional, Set, Union - -import numpy as np -import torch -import torch.utils.data -from torch.nn.utils.rnn import pad_sequence -from tqdm.auto import tqdm - -from nemo.collections.asr.data.audio_to_text import _speech_collate_fn -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.core.classes import Dataset, IterableDataset -from nemo.utils import logging - -try: - from nemo_text_processing.text_normalization.normalize import Normalizer -except Exception as e: - pass # Normalizer imported only for annotation purposes, error can be ignored - -AnyPath = Union[Path, str] - - -class TextToTextItem(NamedTuple): - tts_text: torch.Tensor # normalized and tokenized text for TTS - transcript: torch.Tensor # tokenized text for ASR - speaker: int # speaker id for multi-speaker TTS - - -class TextToTextBatch(NamedTuple): - tts_texts: torch.Tensor # tokenized texts for tts - tts_text_lengths: torch.Tensor - transcripts: torch.Tensor # tokenized texts for ASR - transcript_lengths: torch.Tensor - speakers: torch.Tensor # speaker ids for multi-speaker TTS - - @staticmethod - def collate_fn(batch: List[TextToTextItem], asr_pad_id: int, tts_text_pad_id: int) -> TextToTextBatch: - return TextToTextBatch( - tts_texts=pad_sequence([item.tts_text for item in batch], batch_first=True, padding_value=tts_text_pad_id), - tts_text_lengths=torch.tensor([item.tts_text.shape[0] for item in batch]).long(), - transcripts=pad_sequence([item.transcript for item in batch], batch_first=True, padding_value=asr_pad_id), - transcript_lengths=torch.tensor([item.transcript.shape[0] for item in batch]).long(), - speakers=torch.tensor([item.speaker for item in batch]).long(), - ) - - -class TextOrAudioToTextBatch(NamedTuple): - audio_signals: torch.Tensor - audio_signal_lengths: torch.Tensor - tts_texts: torch.Tensor - tts_text_lengths: torch.Tensor - speakers: torch.Tensor - transcripts: torch.Tensor - transcript_lengths: torch.Tensor - - @staticmethod - def collate_fn( - batch: List[Union[TextToTextItem, tuple]], tts_text_pad_id: int, asr_pad_id: int - ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]: - """ - Collate function for dataloader - Can accept mixed batch of text-to-text items and audio-text items (typical for ASR) - """ - text_items: List[TextToTextItem] = [item for item in batch if isinstance(item, TextToTextItem)] - if not text_items: - # pure audio-text batch - return _speech_collate_fn(batch=batch, pad_id=asr_pad_id) - - asr_items = [item for item in batch if not isinstance(item, TextToTextItem)] - - if not asr_items: - # pure text-to-text batch - return TextToTextBatch.collate_fn(batch=text_items, asr_pad_id=asr_pad_id, tts_text_pad_id=tts_text_pad_id) - - # mixed batch - - # each asr item is a tuple: - # audio_signal (0), audio_length (1), transcript (2), transcript_length (3), sample_id (4, optional) - audio_signals = pad_sequence([item[0] for item in asr_items], batch_first=True, padding_value=0.0) - audio_signal_lengths = torch.tensor([item[1] for item in asr_items]).long() - - tts_texts = pad_sequence( - [item.tts_text for item in text_items], batch_first=True, padding_value=tts_text_pad_id - ) - tts_text_lengths = torch.tensor([item.tts_text.shape[0] for item in text_items]).long() - speakers = torch.tensor([item.speaker for item in text_items]).long() - 
- transcripts = pad_sequence( - [item.transcript for item in text_items] + [item[2] for item in asr_items], - batch_first=True, - padding_value=asr_pad_id, - ) - transcript_lengths = torch.tensor( - [item.transcript.shape[0] for item in text_items] + [item[3] for item in asr_items] - ).long() - - return TextOrAudioToTextBatch( - audio_signals=audio_signals, - audio_signal_lengths=audio_signal_lengths, - tts_texts=tts_texts, - tts_text_lengths=tts_text_lengths, - speakers=speakers, - transcripts=transcripts, - transcript_lengths=transcript_lengths, - ) - - -def _asr_text_to_tokens(text: str) -> np.ndarray: - """ - Helper function for asr tokenization with multiprocessing pool only. - Must be defined on the top level. - Expects asr_tokenizer_global, asr_bos_id_global, asr_eos_id_global to exist in the current pool process - """ - ids = asr_tokenizer_global.text_to_ids(text) - if asr_bos_id_global is not None: - ids = [asr_bos_id_global] + ids - if asr_eos_id_global is not None: - ids.append(asr_eos_id_global) - return np.asarray(ids) - - -def _tts_text_to_tokens(text: str) -> np.ndarray: - """ - Helper function for asr tokenization with multiprocessing pool only. - Must be defined on the top level. - Expects tts_tokenizer_global to exist in the current pool process - """ - return np.asarray(tts_tokenizer_global(text)) - - -def _iterate_manifest(filepath: AnyPath) -> Iterable[Dict[str, Any]]: - """ - Helper function to iterate manifest - """ - with open(filepath, "r", encoding="utf-8") as f: - for line in f: - record = json.loads(line) - yield record - - -class TextToTextDatasetBase: - """ - Base class for loading text-to-text manifests - Map-style and Iterable datasets should inherit this class - """ - - asr_pad_id: int - tts_text_pad_id: int - asr_bos_id: Optional[int] = None - asr_eos_id: Optional[int] = None - data: List[Dict[str, Any]] - - def __init__( - self, - manifest_filepath: Union[AnyPath, List[AnyPath]], - speakers_filepath: Union[AnyPath, List[AnyPath]], - asr_tokenizer: TokenizerSpec, - asr_use_start_end_token: bool, - tts_parser: Callable, - tts_text_pad_id: int, - tts_text_normalizer: "Normalizer", - tts_text_normalizer_call_kwargs: Dict, - min_words: int = 1, - max_words: int = 1_000_000, - tokenizer_workers: int = 1, - num_parts: int = 1, - current_part_index: int = 0, - ): - super().__init__() - # ASR tokenizer setup - if asr_use_start_end_token and hasattr(asr_tokenizer, 'bos_token'): - self.asr_bos_id = asr_tokenizer.bos_id - - if asr_use_start_end_token and hasattr(asr_tokenizer, 'eos_token'): - self.asr_eos_id = asr_tokenizer.eos_id - - if hasattr(asr_tokenizer, 'pad_token'): - self.asr_pad_id = asr_tokenizer.pad_id - else: - self.asr_pad_id = 0 - - self.asr_tokenizer = asr_tokenizer - - # TTS tokenizer setup - self.tts_parser = tts_parser - self.tts_normalizer = tts_text_normalizer - self.tts_normalizer_kwargs = tts_text_normalizer_call_kwargs - self.tts_text_pad_id = tts_text_pad_id - - # Load speakers - if isinstance(speakers_filepath, str): - speakers_filepath = speakers_filepath.split(",") - elif isinstance(speakers_filepath, Path): - speakers_filepath = [speakers_filepath] - speakers: Set[int] = set() - for filepath in speakers_filepath: - with open(Path(filepath).expanduser(), "r") as f: - speakers.update(map(int, f.read().split())) - self.speakers = np.asarray(sorted(speakers)) - logging.info(f"Loaded {len(self.speakers)} speakers") - - # Load manifest - if isinstance(manifest_filepath, str): - manifest_filepath = manifest_filepath.split(",") - elif 
isinstance(manifest_filepath, Path): - manifest_filepath = [manifest_filepath] - self.manifest_paths = [Path(filepath) for filepath in manifest_filepath] - - num_skipped_words = 0 - num_skipped_utterances = 0 - asr_texts = [] - tts_texts = [] - need_normalization = False - - for manifest_path in self.manifest_paths: - for tmp_item in tqdm(_iterate_manifest(manifest_path)): - text = tmp_item["text"] - num_words = len(text.split()) - # skip if number of works not in desired range - # TODO: maybe it would be valuable to sample sub-utterances from long utterances - if not (min_words <= num_words <= max_words): - num_skipped_words += num_words - num_skipped_utterances += 1 - continue - asr_texts.append(tmp_item["text"]) - if "tts_text_normalized" in tmp_item: - tts_texts.append(tmp_item["tts_text_normalized"]) - else: - tts_texts.append(tmp_item["tts_text"]) - need_normalization = True - - if need_normalization: - logging.warning("TTS normalization is extremely slow! It is recommended to normalize TTS text") - - if num_skipped_utterances: - logging.warning(f"Skipped {num_skipped_utterances} utterances " f"with {num_skipped_words}") - - num_utterances = len(asr_texts) - # preprocessing is very costly, if we need only part - remove unnecessary utterances - if num_parts > 1: - # NB: floor division, full dataset can contain fewer utterances than original, like in tarred dataset - num_utterances_part = num_utterances // num_parts - start = num_utterances_part * current_part_index - end = start + num_utterances_part - logging.info( - f"Taking part of the dataset: {current_part_index} index, total {num_parts} from {start} to {end}" - ) - asr_texts = asr_texts[start:end] - tts_texts = tts_texts[start:end] - num_utterances = num_utterances_part - - self.data = [dict() for _ in range(num_utterances)] - - if len(asr_texts) == 0: - # no data was loaded - logging.warning("Text-to-text dataset is empty") - return - - if tokenizer_workers == 1: - logging.warning( - "Preprocessing large text with tokenizer_workers=1 may be slow with TTS tokenizer. " - "Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)" - ) - for i, tokenized_text in enumerate( - tqdm((self._asr_text_to_tokens(text) for text in asr_texts), total=len(asr_texts)) - ): - self.data[i]["asr_text_tokens"] = tokenized_text - else: - # Multiprocessing hack: use global variables for every process (not really global in program context) - def _init_asr_tokenize_process(tokenizer, bos_id, eos_id): - global asr_tokenizer_global, asr_bos_id_global, asr_eos_id_global # process-global - # deepcopy to avoid serialization of parent models - asr_tokenizer_global = copy.deepcopy(tokenizer) - asr_bos_id_global = copy.deepcopy(bos_id) - asr_eos_id_global = copy.deepcopy(eos_id) - - with concurrent.futures.ProcessPoolExecutor( - initializer=_init_asr_tokenize_process, - initargs=(asr_tokenizer, self.asr_bos_id, self.asr_eos_id), - max_workers=tokenizer_workers, - ) as pool: - # chunk size for pool map is empirically chosen as a trade-off between speed and responsiveness - for i, tokenized_text in enumerate( - tqdm(pool.map(_asr_text_to_tokens, asr_texts, chunksize=1000), total=len(asr_texts)) - ): - self.data[i]["asr_text_tokens"] = tokenized_text - # force free memory - del asr_texts - gc.collect() - - if tokenizer_workers == 1: - logging.warning( - "Preprocessing large text with tokenizer_workers=1 may be slow with TTS tokenizer. 
" - "Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)" - ) - for i, tokenized_text in enumerate( - tqdm( - (self._tts_text_to_tokens(text, normalize=need_normalization) for text in tts_texts), - total=len(tts_texts), - ) - ): - self.data[i]["tts_text_tokens"] = tokenized_text - else: - if need_normalization: - # TODO: implement, if we really need normalization inplace - raise NotImplementedError( - "Normalization with tokenizer_workers > 1 is not implemented. " - "It is not recommended to use normalization on the fly at all, since it's extremely slow" - ) - - def _init_tts_tokenize_process(tokenizer): - global tts_tokenizer_global # process-global - tts_tokenizer_global = copy.deepcopy(tokenizer) - - with concurrent.futures.ProcessPoolExecutor( - initializer=_init_tts_tokenize_process, initargs=(tts_parser,), max_workers=tokenizer_workers, - ) as pool: - # chunk size for pool map is empirically chosen as a trade-off between speed and responsiveness - for i, tokenized_text in enumerate( - tqdm(pool.map(_tts_text_to_tokens, tts_texts, chunksize=1000), total=len(tts_texts)) - ): - self.data[i]["tts_text_tokens"] = tokenized_text - # force free memory - del tts_texts - gc.collect() - - def _asr_text_to_tokens(self, text: str) -> np.ndarray: - ids = self.asr_tokenizer.text_to_ids(text) - if self.asr_bos_id is not None: - ids = [self.asr_bos_id] + ids - if self.asr_eos_id is not None: - ids.append(self.asr_eos_id) - return np.asarray(ids) - - def _tts_text_to_tokens(self, text: str, normalize=True) -> np.ndarray: - if normalize: - text = self.tts_normalizer.normalize(text, **self.tts_normalizer_kwargs) - tokens = self.tts_parser(text) - return np.asarray(tokens) - - def __getitem__(self, index): - item = self.data[index] - return TextToTextItem( - transcript=torch.from_numpy(item["asr_text_tokens"]).long(), - tts_text=torch.from_numpy(item["tts_text_tokens"]).long(), - speaker=random.choice(self.speakers), - ) - - def __len__(self): - return len(self.data) - - -class TextToTextDataset(TextToTextDatasetBase, Dataset): - """Text-to-Text Map-style Dataset for hybrid ASR-TTS models""" - - def __init__( - self, - manifest_filepath: Union[AnyPath, List[AnyPath]], - speakers_filepath: Union[AnyPath, List[AnyPath]], - asr_tokenizer: TokenizerSpec, - asr_use_start_end_token: bool, - tts_parser: Callable, - tts_text_pad_id: int, - tts_text_normalizer: "Normalizer", - tts_text_normalizer_call_kwargs: Dict, - min_words: int = 1, - max_words: int = 1_000_000, - tokenizer_workers: int = 1, - ): - super().__init__( - manifest_filepath=manifest_filepath, - speakers_filepath=speakers_filepath, - asr_tokenizer=asr_tokenizer, - asr_use_start_end_token=asr_use_start_end_token, - tts_parser=tts_parser, - tts_text_pad_id=tts_text_pad_id, - tts_text_normalizer=tts_text_normalizer, - tts_text_normalizer_call_kwargs=tts_text_normalizer_call_kwargs, - min_words=min_words, - max_words=max_words, - tokenizer_workers=tokenizer_workers, - num_parts=1, - ) - - def collate_fn( - self, batch: List[Union[TextToTextItem, tuple]] - ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]: - """ - Collate function for dataloader - Can accept mixed batch of text-to-text items and audio-text items (typical for ASR) - """ - return TextOrAudioToTextBatch.collate_fn( - batch=batch, asr_pad_id=self.asr_pad_id, tts_text_pad_id=self.tts_text_pad_id - ) - - -class TextToTextIterableDataset(TextToTextDatasetBase, IterableDataset): - """ - Text-to-Text Iterable Dataset for hybrid ASR-TTS models - Only part necessary for current 
process should be loaded and stored - """ - - def __init__( - self, - manifest_filepath: Union[AnyPath, List[AnyPath]], - speakers_filepath: Union[AnyPath, List[AnyPath]], - asr_tokenizer: TokenizerSpec, - asr_use_start_end_token: bool, - tts_parser: Callable, - tts_text_pad_id: int, - tts_text_normalizer: "Normalizer", - tts_text_normalizer_call_kwargs: Dict, - min_words: int = 1, - max_words: int = 1_000_000, - tokenizer_workers: int = 1, - num_parts: int = 1, - current_part_index: int = 0, - ): - super().__init__( - manifest_filepath=manifest_filepath, - speakers_filepath=speakers_filepath, - asr_tokenizer=asr_tokenizer, - asr_use_start_end_token=asr_use_start_end_token, - tts_parser=tts_parser, - tts_text_pad_id=tts_text_pad_id, - tts_text_normalizer=tts_text_normalizer, - tts_text_normalizer_call_kwargs=tts_text_normalizer_call_kwargs, - min_words=min_words, - max_words=max_words, - tokenizer_workers=tokenizer_workers, - num_parts=num_parts, - current_part_index=current_part_index, - ) - - def __iter__(self): - # Implementation based on docs: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset - worker_info = torch.utils.data.get_worker_info() - if worker_info is None: # single-process data loading, return the full iterator - start = 0 - end = len(self) - else: # in a worker process - # split workload - per_worker = int(math.ceil(len(self) / float(worker_info.num_workers))) - worker_id = worker_info.id - start = worker_id * per_worker - end = min(start + per_worker, len(self)) - indices = np.arange(start, end) - np.random.shuffle(indices) - return map(self.__getitem__, indices) - - def collate_fn( - self, batch: List[Union[TextToTextItem, tuple]] - ) -> Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]: - """ - Collate function for dataloader - Can accept mixed batch of text-to-text items and audio-text items (typical for ASR) - """ - return TextOrAudioToTextBatch.collate_fn( - batch=batch, asr_pad_id=self.asr_pad_id, tts_text_pad_id=self.tts_text_pad_id - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__init__.py deleted file mode 100644 index 3e50cea1d6926e0777bc7a6c29915f41ea98477a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
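The `TextToTextIterableDataset.__iter__` above follows the standard PyTorch `IterableDataset` sharding recipe: each DataLoader worker inspects `get_worker_info()` and iterates only its own contiguous slice of indices, shuffled within that slice. A minimal, self-contained sketch of the same pattern (illustrative `ShardedIterable` class, not the NeMo dataset):

```python
import math

import numpy as np
import torch
from torch.utils.data import DataLoader, IterableDataset


class ShardedIterable(IterableDataset):
    """Toy iterable dataset that yields integer indices, sharded per worker."""

    def __init__(self, n_items: int):
        self.n_items = n_items

    def __len__(self):
        return self.n_items

    def __iter__(self):
        info = torch.utils.data.get_worker_info()
        if info is None:
            # Single-process loading: iterate over everything.
            start, end = 0, len(self)
        else:
            # Worker process: take a contiguous, disjoint slice of the indices.
            per_worker = int(math.ceil(len(self) / float(info.num_workers)))
            start = info.id * per_worker
            end = min(start + per_worker, len(self))
        indices = np.arange(start, end)
        np.random.shuffle(indices)  # shuffle only within this worker's shard
        return map(int, indices)


if __name__ == "__main__":
    # Each of the two workers yields only its own half of the 10 indices,
    # so no index is produced twice.
    for batch in DataLoader(ShardedIterable(10), batch_size=4, num_workers=2):
        print(batch)
```

Without the `get_worker_info()` branch every worker would replay the full index range and duplicate samples; shuffling inside the shard keeps ordering random without any coordination between workers.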
- -from nemo.collections.asr.losses.angularloss import AngularSoftmaxLoss -from nemo.collections.asr.losses.audio_losses import SDRLoss -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.losses.lattice_losses import LatticeLoss -from nemo.collections.asr.losses.ssl_losses.contrastive import ContrastiveLoss -from nemo.collections.asr.losses.ssl_losses.ctc import CTCLossForSSL -from nemo.collections.asr.losses.ssl_losses.mlm import MLMLoss -from nemo.collections.asr.losses.ssl_losses.rnnt import RNNTLossForSSL diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c1f75058b36a049841ec9a2f778d219dcddfb7cb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/angularloss.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/angularloss.cpython-310.pyc deleted file mode 100644 index a2fd66b67acf0ecc07870d666fbf7c3aa47fcd9b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/angularloss.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/audio_losses.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/audio_losses.cpython-310.pyc deleted file mode 100644 index 76c216a2d13cf93c1664ce52c4020233b7911e32..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/audio_losses.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/ctc.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/ctc.cpython-310.pyc deleted file mode 100644 index d737419416e8759ca79f8b62a3484a9394a2f399..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/ctc.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/lattice_losses.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/lattice_losses.cpython-310.pyc deleted file mode 100644 index a261eb540f0a2bee890af1a71f88212a8b070beb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/lattice_losses.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt.cpython-310.pyc deleted file mode 100644 index b13995e87d3b234a6bd21ef3410979b9942c7b7f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt_pytorch.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt_pytorch.cpython-310.pyc deleted file mode 100644 index 7538b96bcf5553738a9de95f13fbbb23d3624593..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/__pycache__/rnnt_pytorch.cpython-310.pyc and /dev/null differ diff 
--git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/angularloss.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/angularloss.py deleted file mode 100644 index e2aee9bba6eaa391573acb70be1bfcb53e2940bd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/angularloss.py +++ /dev/null @@ -1,68 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.core.classes import Loss, Typing, typecheck -from nemo.core.neural_types import LabelsType, LogitsType, LossType, NeuralType - -__all__ = ['AngularSoftmaxLoss'] - - -class AngularSoftmaxLoss(Loss, Typing): - """ - Computes ArcFace Angular softmax angle loss - reference: https://openaccess.thecvf.com/content_CVPR_2019/papers/Deng_ArcFace_Additive_Angular_Margin_Loss_for_Deep_Face_Recognition_CVPR_2019_paper.pdf - args: - scale: scale value for cosine angle - margin: margin value added to cosine angle - """ - - @property - def input_types(self): - """Input types definitions for AnguarLoss. - """ - return { - "logits": NeuralType(('B', 'D'), LogitsType()), - "labels": NeuralType(('B',), LabelsType()), - } - - @property - def output_types(self): - """Output types definitions for AngularLoss. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, scale=20.0, margin=1.35): - super().__init__() - - self.eps = 1e-7 - self.scale = scale - self.margin = margin - - @typecheck() - def forward(self, logits, labels): - numerator = self.scale * torch.cos( - torch.acos(torch.clamp(torch.diagonal(logits.transpose(0, 1)[labels]), -1.0 + self.eps, 1 - self.eps)) - + self.margin - ) - excl = torch.cat( - [torch.cat((logits[i, :y], logits[i, y + 1 :])).unsqueeze(0) for i, y in enumerate(labels)], dim=0 - ) - denominator = torch.exp(numerator) + torch.sum(torch.exp(self.scale * excl), dim=1) - L = numerator - torch.log(denominator) - return -torch.mean(L) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/audio_losses.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/audio_losses.py deleted file mode 100644 index 62ce4a9f7eddedb34c99a9eb0796b8412f28bb3b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/audio_losses.py +++ /dev/null @@ -1,412 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
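The deleted `angularloss.py` above implements an ArcFace-style additive angular margin objective: the logits are cosine similarities, the target class's angle is shifted by `margin`, everything is scaled by `scale`, and a softmax cross-entropy is taken. A minimal functional sketch of the same computation (illustrative `arcface_loss` helper, not the NeMo `AngularSoftmaxLoss` class):

```python
import torch
import torch.nn.functional as F


def arcface_loss(cos_logits: torch.Tensor, labels: torch.Tensor,
                 scale: float = 20.0, margin: float = 1.35,
                 eps: float = 1e-7) -> torch.Tensor:
    """cos_logits: (B, C) cosine similarities in [-1, 1]; labels: (B,) class ids."""
    # Shift the target class's angle by the additive margin, then rescale.
    target_cos = cos_logits.gather(1, labels.unsqueeze(1)).squeeze(1)
    target_cos = target_cos.clamp(-1.0 + eps, 1.0 - eps)
    numerator = scale * torch.cos(torch.acos(target_cos) + margin)

    # Non-target logits are scaled but keep their original angles;
    # masking the target column with -inf makes exp() drop it from the sum.
    one_hot = F.one_hot(labels, num_classes=cos_logits.shape[1]).bool()
    excl = cos_logits.masked_fill(one_hot, float("-inf"))
    denominator = torch.exp(numerator) + torch.exp(scale * excl).sum(dim=1)

    return -(numerator - torch.log(denominator)).mean()


# Example: 4 utterance embeddings scored against 10 speaker prototypes.
emb = F.normalize(torch.randn(4, 128))
protos = F.normalize(torch.randn(10, 128))
print(arcface_loss(emb @ protos.t(), torch.tensor([0, 3, 7, 1])))
```

Because the margin penalizes only the target class, minimizing this loss pulls embeddings of the same speaker closer together on the hypersphere than a plain softmax would.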
- -import math -from typing import List, Optional - -import numpy as np -import torch - -from nemo.collections.asr.parts.preprocessing.features import make_seq_mask_like -from nemo.collections.asr.parts.utils.audio_utils import toeplitz -from nemo.core.classes import Loss, Typing, typecheck -from nemo.core.neural_types import AudioSignal, LengthsType, LossType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['SDRLoss'] - - -def temporal_mean( - input: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - keepdim: bool = False, - eps: float = 1e-10, -) -> torch.Tensor: - """Calculate mean along temporal dimension with optionally - averaging only over valid samples (based on the input length). - - Args: - input: Batch of signals, shape (B, C, T) - input_length: Optional, length of each example in the batch, shape (B,) - mask: Optional, temporal mask for each example in the batch, shape (B, T) - keepdim: Whether to keep the temporal dimension - eps: Regularization to avoid division by zero - - Returns: - (B, C, 1) if keepdim=True, otherwise (B, C) - """ - if input_length is not None: - if mask is not None: - raise RuntimeError( - 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' - ) - # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=input, time_dim=-1, valid_ones=True).squeeze(1) - - if mask is None: - # No length information, assume all samples are valid - mean = torch.mean(input, dim=-1, keepdim=keepdim) - else: - # Average using temporal mask - mean = mask.unsqueeze(1) * input - mean = torch.sum(mean, axis=-1, keepdim=keepdim) - normalization = torch.sum(mask, axis=-1, keepdim=keepdim) - mean = mean / (normalization.unsqueeze(1) + eps) - - return mean - - -def scale_invariant_target( - estimate: torch.Tensor, - target: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - eps: float = 1e-10, -) -> torch.Tensor: - """Calculate optimal scale-invariant target. - Assumes time dimension is the last dimension in the array. - - Calculate scaled target obtained by solving - - min_scale || scale * target - estimate ||^2 - - for each example in batch and each channel (b, c). - - Args: - estimate: tensor, shape (B, C, T) - target: tensor, shape (B, C, T) - input_length: optional, length of valid samples, shape (B,) - mask: optional, mask for input samples, shape (B, T) - eps: regularization constant - - Returns: - Scaled target, shape (B, C, T) - """ - if input_length is not None: - if mask is not None: - raise RuntimeError( - 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' 
- ) - - # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) - - estimate_dot_target = temporal_mean(estimate * target, mask=mask, keepdim=True, eps=eps) - target_pow = temporal_mean(torch.abs(target) ** 2, mask=mask, keepdim=True, eps=eps) - scale = estimate_dot_target / (target_pow + eps) - target_scaled = scale * target - - # Mask to keep only the valid samples - if mask is not None: - target_scaled = mask.unsqueeze(1) * target_scaled - - return target_scaled - - -def convolution_invariant_target( - estimate: torch.Tensor, - target: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - filter_length: int = 512, - diag_reg: float = 1e-6, - eps: float = 1e-8, -) -> torch.Tensor: - """Calculate optimal convolution-invariant target for a given estimate. - Assumes time dimension is the last dimension in the array. - - Calculate target filtered with a linear f obtained by solving - - min_filter || conv(filter, target) - estimate ||^2 - - for each example in batch and each channel (b, c). - - Args: - estimate: tensor, shape (B, C, T) - target: tensor, shape (B, C, T) - input_length: optional, length of valid samples, shape (B,) - mask: optional, mask for input samples, shape (B, T) - filter_length: length of the (convolutional) filter for target - diag_reg: relative diagonal regularization for the linear system - eps: absolute regularization for the diagonal - - Returns: - Filtered target, shape (B, C, T) - - Reference: - C. Boeddeker et al., Convolutive Transfer Function Invariant SDR training criteria for Multi-Channel Reverberant Speech Separation, 2021 - """ - if input_length is not None: - if mask is not None: - raise RuntimeError( - 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' - ) - - if torch.min(input_length) < filter_length: - logging.warning( - 'Current min input_length (%d) is smaller than filter_length (%d). 
This will result in a singular linear system.', - torch.min(input_length), - filter_length, - ) - - # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) - - # Apply a mask, if available - if mask is not None: - estimate = mask.unsqueeze(1) * estimate - target = mask.unsqueeze(1) * target - - # Calculate filtered target - input_shape = estimate.shape - estimate = estimate.view(-1, input_shape[-1]) - target = target.view(-1, input_shape[-1]) - - n_fft = 2 ** math.ceil(math.log2(2 * input_shape[-1] - 1)) - - T = torch.fft.rfft(target, n=n_fft) - E = torch.fft.rfft(estimate, n=n_fft) - - # Target autocorrelation - tt_corr = torch.fft.irfft(torch.abs(T) ** 2, n=n_fft) - # Target-estimate crosscorrelation - te_corr = torch.fft.irfft(T.conj() * E, n=n_fft) - - # Use only filter_length - tt_corr = tt_corr[..., :filter_length] - te_corr = te_corr[..., :filter_length] - - # Diagonal regularization - if diag_reg is not None: - tt_corr[..., 0] += diag_reg * tt_corr[..., 0] + eps - - # Construct the Toeplitz system matrix - TT = toeplitz(tt_corr) - - # Solve the linear system for the optimal filter - filt = torch.linalg.solve(TT, te_corr) - - # Calculate filtered target - T_filt = T * torch.fft.rfft(filt, n=n_fft) - target_filt = torch.fft.irfft(T_filt, n=n_fft) - - # Reshape to the original format - target_filt = target_filt[..., : input_shape[-1]].view(*input_shape) - - # Mask to keep only the valid samples - if mask is not None: - target_filt = mask.unsqueeze(1) * target_filt - - return target_filt - - -def calculate_sdr_batch( - estimate: torch.Tensor, - target: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - scale_invariant: bool = False, - convolution_invariant: bool = False, - convolution_filter_length: Optional[int] = 512, - remove_mean: bool = True, - sdr_max: Optional[float] = None, - eps: float = 1e-8, -) -> torch.Tensor: - """Calculate signal-to-distortion ratio per channel. - - SDR = 10 * log10( ||t||_2^2 / (||e-t||_2^2 + alpha * ||t||^2) - - where - alpha = 10^(-sdr_max/10) - - Optionally, use scale- or convolution- invariant target signal. - - Args: - estimate: estimated signal, shape (B, C, T) - target: target signal, shape (B, C, T) - input_length: Optional, length of valid samples, shape (B,) - mask: Optional, temporal mask, shape (B, T) - scale_invariant: Use scale invariant SDR - convolution_invariant: Use convolution invariant SDR - convolution_filter_length: Filter length for convolution invariant SDR - remove_mean: If True, mean will be removed before calculating SDR - eps: Small regularization constant - - Returns: - SDR in dB for each channel, shape (B, C) - """ - if scale_invariant and convolution_invariant: - raise ValueError(f'Arguments scale_invariant and convolution_invariant cannot be used simultaneously.') - - assert ( - estimate.shape == target.shape - ), f'Estimate shape ({estimate.shape}) not matching target shape ({target.shape})' - - if input_length is not None: - if mask is not None: - raise RuntimeError( - 'Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.' 
- ) - - # Construct a binary mask - mask = make_seq_mask_like(lengths=input_length, like=estimate, time_dim=-1, valid_ones=True).squeeze(1) - - if remove_mean: - estimate = estimate - temporal_mean(estimate, mask=mask, keepdim=True, eps=eps) - target = target - temporal_mean(target, mask=mask, keepdim=True, eps=eps) - - if scale_invariant or (convolution_invariant and convolution_filter_length == 1): - target = scale_invariant_target(estimate=estimate, target=target, mask=mask, eps=eps) - elif convolution_invariant: - target = convolution_invariant_target( - estimate=estimate, target=target, mask=mask, filter_length=convolution_filter_length, eps=eps, - ) - - distortion = estimate - target - - target_pow = temporal_mean(torch.abs(target) ** 2, mask=mask, eps=eps) - distortion_pow = temporal_mean(torch.abs(distortion) ** 2, mask=mask, eps=eps) - - if sdr_max is not None: - distortion_pow = distortion_pow + 10 ** (-sdr_max / 10) * target_pow - - sdr = target_pow / (distortion_pow + eps) - sdr = 10 * torch.log10(sdr + eps) - - return sdr - - -class SDRLoss(Loss, Typing): - """ - Computes signal-to-distortion ratio (SDR) loss with weighted average across channels. - - Args: - weight: weight for SDR of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging). - reduction: batch reduction. Defaults to `mean` over the batch. - scale_invariant: If `True`, use scale-invariant SDR. Defaults to `False`. - remove_mean: Remove mean before calculating the loss. Defaults to `True`. - sdr_max: Soft thresholding of the loss to SDR_max. - eps: Small value for regularization. - """ - - def __init__( - self, - weight: Optional[List[float]] = None, - reduction: str = 'mean', - scale_invariant: bool = False, - convolution_invariant: bool = False, - convolution_filter_length: Optional[int] = 512, - remove_mean: bool = True, - sdr_max: Optional[float] = None, - eps: float = 1e-8, - ): - super().__init__() - - # SDR weight buffer - if weight is not None: - if any([w <= 0 for w in weight]): - raise ValueError(f'Weight must be positive! Current value: {weight}') - elif not np.isclose(sum(weight), 1, atol=1e-6): - raise ValueError(f'Weight should add to one, current weight: {weight}') - weight = torch.tensor(weight).reshape(1, -1) - logging.info(f'Channel weight set to %s', weight) - self.register_buffer('weight', weight) - self.weight: Optional[Tensor] - - # Batch reduction - self.reduction = reduction - if reduction == 'mean': - self.reduce = torch.mean - else: - raise ValueError(f'Unexpected reduction mode {reduction}.') - - # SDR calculation setup - if scale_invariant and convolution_invariant: - raise ValueError( - f'{self.__class__.__name__}: arguments scale_invariant and convolution_invariant cannot be used simultaneously.' - ) - self.scale_invariant = scale_invariant - self.convolution_invariant = convolution_invariant - self.convolution_filter_length = convolution_filter_length - self.remove_mean = remove_mean - self.sdr_max = sdr_max - self.eps = eps - - @property - def input_types(self): - """Input types definitions for SDRLoss. - """ - signal_shape = ('B', 'C', 'T') - return { - "estimate": NeuralType(signal_shape, AudioSignal()), - "target": NeuralType(signal_shape, AudioSignal()), - "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "mask": NeuralType(('B', 'T'), MaskType(), optional=True), - } - - @property - def output_types(self): - """Output types definitions for SDRLoss. 
- loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - @typecheck() - def forward( - self, - estimate: torch.Tensor, - target: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """For input batch of multi-channel signals, calculate SDR between estimate and target for each channel, - perform averaging across channels (weighting optional), and apply reduction across the batch. - - Args: - estimate: Batch of signals, shape (B, T, C) - target: Batch of signals, shape (B, T, C) - input_length: Batch of lengths, shape (B,) - mask: Batch of temporal masks, shape (B, T) - - Returns: - Scalar loss. - """ - - sdr = calculate_sdr_batch( - estimate=estimate, - target=target, - input_length=input_length, - mask=mask, - scale_invariant=self.scale_invariant, - convolution_invariant=self.convolution_invariant, - convolution_filter_length=self.convolution_filter_length, - remove_mean=self.remove_mean, - sdr_max=self.sdr_max, - eps=self.eps, - ) - - # channel averaging - if self.weight is None: - sdr = torch.mean(sdr, dim=1) - else: - # weighting across channels - sdr = sdr * self.weight - sdr = torch.sum(sdr, dim=1) - - # reduction - sdr = self.reduce(sdr) - - return -sdr diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/bce_loss.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/bce_loss.py deleted file mode 100644 index 30e31b8610ec650d5cfc9a283588f37edd24da78..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/bce_loss.py +++ /dev/null @@ -1,73 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.core.classes import Loss, Typing, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LossType, NeuralType, ProbsType - -__all__ = ['BCELoss'] - - -class BCELoss(Loss, Typing): - """ - Computes Binary Cross Entropy (BCE) loss. The BCELoss class expects output from Sigmoid function. - """ - - @property - def input_types(self): - """Input types definitions for AnguarLoss. - """ - return { - "probs": NeuralType(('B', 'T', 'C'), ProbsType()), - 'labels': NeuralType(('B', 'T', 'C'), LabelsType()), - "signal_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """ - Output types definitions for binary cross entropy loss. Weights for labels can be set using weight variables. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, reduction='sum', alpha=1.0, weight=torch.tensor([0.5, 0.5])): - super().__init__() - self.reduction = reduction - self.loss_weight = weight - self.loss_f = torch.nn.BCELoss(weight=self.loss_weight, reduction=self.reduction) - - @typecheck() - def forward(self, probs, labels, signal_lengths): - """ - Calculate binary cross entropy loss based on probs, labels and signal_lengths variables. 
- - Args: - probs (torch.tensor) - Predicted probability value which ranges from 0 to 1. Sigmoid output is expected. - labels (torch.tensor) - Groundtruth label for the predicted samples. - signal_lengths (torch.tensor): - The actual length of the sequence without zero-padding. - - Returns: - loss (NeuralType) - Binary cross entropy loss value. - """ - probs_list = [probs[k, : signal_lengths[k], :] for k in range(probs.shape[0])] - targets_list = [labels[k, : signal_lengths[k], :] for k in range(labels.shape[0])] - probs = torch.cat(probs_list, dim=0) - labels = torch.cat(targets_list, dim=0) - return self.loss_f(probs, labels) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ctc.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ctc.py deleted file mode 100644 index 8a1f72448893fba9eda61d29b9decf2392d688b0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ctc.py +++ /dev/null @@ -1,82 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch import nn - -from nemo.core.classes import Serialization, Typing, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType - -__all__ = ['CTCLoss'] - - -class CTCLoss(nn.CTCLoss, Serialization, Typing): - @property - def input_types(self): - """Input types definitions for CTCLoss. - """ - return { - "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "input_lengths": NeuralType(tuple('B'), LengthsType()), - "target_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for CTCLoss. 
- loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, zero_infinity=False, reduction='mean_batch'): - self._blank = num_classes - # Don't forget to properly call base constructor - if reduction not in ['none', 'mean', 'sum', 'mean_batch', 'mean_volume']: - raise ValueError('`reduction` must be one of [mean, sum, mean_batch, mean_volume]') - - self.config_reduction = reduction - if reduction == 'mean_batch' or reduction == 'mean_volume': - ctc_reduction = 'none' - self._apply_reduction = True - elif reduction in ['sum', 'mean', 'none']: - ctc_reduction = reduction - self._apply_reduction = False - super().__init__(blank=self._blank, reduction=ctc_reduction, zero_infinity=zero_infinity) - - def reduce(self, losses, target_lengths): - if self.config_reduction == 'mean_batch': - losses = losses.mean() # global batch size average - elif self.config_reduction == 'mean_volume': - losses = losses.sum() / target_lengths.sum() # same as above but longer samples weigh more - - return losses - - @typecheck() - def forward(self, log_probs, targets, input_lengths, target_lengths): - # override forward implementation - # custom logic, if necessary - input_lengths = input_lengths.long() - target_lengths = target_lengths.long() - targets = targets.long() - # here we transpose because we expect [B, T, D] while PyTorch assumes [T, B, D] - log_probs = log_probs.transpose(1, 0) - loss = super().forward( - log_probs=log_probs, targets=targets, input_lengths=input_lengths, target_lengths=target_lengths - ) - if self._apply_reduction: - loss = self.reduce(loss, target_lengths) - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/lattice_losses.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/lattice_losses.py deleted file mode 100644 index 7dae44bfd1d18a0946a34efa25077ad9bacad947..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/lattice_losses.py +++ /dev/null @@ -1,184 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -import torch -from omegaconf import DictConfig - -from nemo.core.classes import Loss, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType - - -class LatticeLoss(Loss): - """Family of loss functions based on various lattice scores. - - Note: - Requires k2 v1.14 or later to be installed to use this loss function. - - Losses can be selected via the config, and optionally be passed keyword arguments as follows. - - Examples: - .. code-block:: yaml - - model: # Model config - ... - graph_module_cfg: # Config for graph modules, e.g. 
LatticeLoss - criterion_type: "map" - loss_type: "mmi" - split_batch_size: 0 - backend_cfg: - topo_type: "default" # other options: "compact", "shared_blank", "minimal" - topo_with_self_loops: true - token_lm: # must be provided for criterion_type: "map" - - Args: - num_classes: Number of target classes for the decoder network to predict. - (Excluding the blank token). - - reduction: Type of reduction to perform on loss. Possible values are `mean_batch`, `mean`, `sum`, or None. - None will return a torch vector comprising the individual loss values of the batch. - - backend: Which backend to use for loss calculation. Currently only `k2` is supported. - - criterion_type: Type of criterion to use. Choices: `ml` and `map`, - with `ml` standing for Maximum Likelihood and `map` for Maximum A Posteriori Probability. - - loss_type: Type of the loss function to use. Choices: `ctc` and `rnnt` for `ml`, and `mmi` for `map`. - - split_batch_size: Local batch size. Used for memory consumption reduction at the cost of speed performance. - Effective if complies 0 < split_batch_size < batch_size. - - graph_module_cfg: Optional Dict of (str, value) pairs that are passed to the backend loss function. - """ - - @property - def input_types(self): - """Input types definitions for LatticeLoss. - """ - return { - "log_probs": NeuralType(("B", "T", "D") if self._3d_input else ("B", "T", "T", "D"), LogprobsType()), - "targets": NeuralType(("B", "T"), LabelsType()), - "input_lengths": NeuralType(tuple("B"), LengthsType()), - "target_lengths": NeuralType(tuple("B"), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for LatticeLoss. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, - num_classes: int, - reduction: str = "mean_batch", - backend: str = "k2", - criterion_type: str = "ml", - loss_type: str = "ctc", - split_batch_size: int = 0, - graph_module_cfg: Optional[DictConfig] = None, - ): - super().__init__() - self._blank = num_classes - self.split_batch_size = split_batch_size - inner_reduction = None - if reduction == "mean_batch": - inner_reduction = "none" - self._apply_batch_mean = True - elif reduction in ["sum", "mean", "none"]: - inner_reduction = reduction - self._apply_batch_mean = False - - # we assume that self._blank + 1 == num_classes - if backend == "k2": - if criterion_type == "ml": - if loss_type == "ctc": - from nemo.collections.asr.parts.k2.ml_loss import CtcLoss as K2Loss - elif loss_type == "rnnt": - from nemo.collections.asr.parts.k2.ml_loss import RnntLoss as K2Loss - else: - raise ValueError(f"Unsupported `loss_type`: {loss_type}.") - elif criterion_type == "map": - if loss_type == "ctc": - from nemo.collections.asr.parts.k2.map_loss import CtcMmiLoss as K2Loss - else: - raise ValueError(f"Unsupported `loss_type`: {loss_type}.") - else: - raise ValueError(f"Unsupported `criterion_type`: {criterion_type}.") - - self._loss = K2Loss( - num_classes=self._blank + 1, blank=self._blank, reduction=inner_reduction, cfg=graph_module_cfg, - ) - elif backend == "gtn": - raise NotImplementedError(f"Backend {backend} is not supported.") - else: - raise ValueError(f"Invalid value of `backend`: {backend}.") - - self.criterion_type = criterion_type - self.loss_type = loss_type - self._3d_input = self.loss_type != "rnnt" - - if self.split_batch_size > 0: - # don't need to guard grad_utils - from nemo.collections.asr.parts.k2.grad_utils import PartialGrad - - self._partial_loss = PartialGrad(self._loss) 
- - def update_graph(self, graph): - """Updates graph of the backend loss function. - """ - if self.criterion_type != "ml": - self._loss.update_graph(graph) - - @typecheck() - def forward(self, log_probs, targets, input_lengths, target_lengths): - # override forward implementation - # custom logic, if necessary - - assert not (torch.isnan(log_probs).any() or torch.isinf(log_probs).any()) - - log_probs = log_probs.float() - input_lengths = input_lengths.long() - target_lengths = target_lengths.long() - targets = targets.long() - batch_size = log_probs.shape[0] - if self.split_batch_size > 0 and self.split_batch_size <= batch_size: - loss_list = [] - for batch_idx in range(0, batch_size, self.split_batch_size): - begin = batch_idx - end = min(begin + self.split_batch_size, batch_size) - input_lengths_part = input_lengths[begin:end] - log_probs_part = log_probs[begin:end, : input_lengths_part.max()] - target_lengths_part = target_lengths[begin:end] - targets_part = targets[begin:end, : target_lengths_part.max()] - loss_part, _ = ( - self._partial_loss(log_probs_part, targets_part, input_lengths_part, target_lengths_part) - if log_probs_part.requires_grad - else self._loss(log_probs_part, targets_part, input_lengths_part, target_lengths_part) - ) - del log_probs_part, targets_part, input_lengths_part, target_lengths_part - loss_list.append(loss_part) - loss = torch.cat(loss_list, 0) - else: - loss, _ = self._loss( - log_probs=log_probs, targets=targets, input_lengths=input_lengths, target_lengths=target_lengths, - ) - if self._apply_batch_mean: - # torch.mean gives nan if loss is empty - loss = torch.mean(loss) if loss.nelement() > 0 else torch.sum(loss) - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt.py deleted file mode 100644 index 894be6319c99f82bd564fa765a50dde312eb3a19..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt.py +++ /dev/null @@ -1,508 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
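`LatticeLoss.forward` above optionally walks the batch in `split_batch_size` slices, trims each slice's padding, and concatenates the per-example losses, trading speed for lower peak memory. The generic pattern, as a small sketch (illustrative `chunked_loss` helper; `loss_fn` is assumed to return per-example losses, i.e. `reduction='none'`):

```python
import torch


def chunked_loss(loss_fn, log_probs, targets, input_lengths, target_lengths,
                 split_batch_size: int) -> torch.Tensor:
    """Apply a per-example loss in slices of the batch to cap peak memory."""
    batch_size = log_probs.shape[0]
    parts = []
    for begin in range(0, batch_size, split_batch_size):
        end = min(begin + split_batch_size, batch_size)
        lens = input_lengths[begin:end]
        tgt_lens = target_lengths[begin:end]
        # Trim padding to the longest example inside this slice before the call.
        parts.append(loss_fn(
            log_probs[begin:end, : lens.max()],
            targets[begin:end, : tgt_lens.max()],
            lens,
            tgt_lens,
        ))
    return torch.cat(parts, dim=0)
```

Each slice only keeps activations for its own longest example, so the memory saved grows with how unevenly padded the batch is.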
- -import inspect -import operator -from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Set - -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.losses.rnnt_pytorch import MultiblankRNNTLossPytorch, RNNTLossPytorch, TDTLossPytorch -from nemo.core.classes import Loss, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType -from nemo.core.utils import numba_utils -from nemo.core.utils.k2_utils import K2_INSTALLATION_MESSAGE -from nemo.core.utils.numba_utils import NUMBA_INSTALLATION_MESSAGE -from nemo.utils import logging, logging_mode, model_utils - -try: - import warprnnt_pytorch as warprnnt - - WARP_RNNT_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - WARP_RNNT_AVAILABLE = False - -try: - from nemo.collections.asr.parts.numba.rnnt_loss import MultiblankRNNTLossNumba, RNNTLossNumba, TDTLossNumba - - NUMBA_RNNT_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - NUMBA_RNNT_AVAILABLE = False - -try: - from nemo.collections.asr.parts.k2.graph_transducer import GraphRnntLoss - from nemo.collections.asr.parts.k2.w_transducer import GraphWTransducerLoss - - K2_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - K2_AVAILABLE = False - -WARP_RNNT_INSTALLATION_MESSAGE = ( - "Could not import `warprnnt_pytorch`.\n" - "Please visit https://github.com/HawkAaron/warp-transducer " - "and follow the steps in the readme to build and install the " - "pytorch bindings for RNNT Loss, or use the provided docker " - "container that supports RNN-T loss." -) - - -@dataclass -class RNNTLossConfig: - loss_name: str - lib_name: str - is_available: bool = False - installation_msg: str = "" - min_version: Optional[str] = None - force_float32: bool = True # default True for now for all losses except graph-based - - -# Resolved list of available RNNT losses -RNNT_LOSS_RESOLVER = { - "warprnnt": RNNTLossConfig( - loss_name="warprnnt", - lib_name="warprnnt_pytorch", - is_available=WARP_RNNT_AVAILABLE, - installation_msg=WARP_RNNT_INSTALLATION_MESSAGE, - force_float32=True, - ), - "warprnnt_numba": RNNTLossConfig( - loss_name="warprnnt_numba", - lib_name="numba", - min_version='0.53.0', - is_available=NUMBA_RNNT_AVAILABLE, - installation_msg=NUMBA_INSTALLATION_MESSAGE, - force_float32=False, # This is only temporarily false, will be dynamically updated during resolution - ), - "pytorch": RNNTLossConfig( - loss_name="pytorch", - lib_name="torch", - min_version='0.0', - is_available=True, - installation_msg="Pure Pytorch implementation of RNN-T loss. Slow and for debugging purposes only.", - force_float32=True, - ), - "multiblank_rnnt": RNNTLossConfig( - loss_name="multiblank_rnnt", - lib_name="numba", - min_version='0.53.0', - is_available=NUMBA_RNNT_AVAILABLE, - installation_msg=NUMBA_INSTALLATION_MESSAGE, - force_float32=True, - ), - "multiblank_rnnt_pytorch": RNNTLossConfig( - loss_name="pytorch", - lib_name="torch", - min_version='0.0', - is_available=True, - installation_msg="Pure Pytorch implementation of Multiblank RNN-T loss. 
Slow and for debugging purposes only.", - force_float32=True, - ), - "graph_w_transducer": RNNTLossConfig( - loss_name="graph_w_transducer", - lib_name="k2", - is_available=K2_AVAILABLE, - installation_msg=K2_INSTALLATION_MESSAGE, - force_float32=False, - ), - "graph_rnnt": RNNTLossConfig( - loss_name="graph_rnnt", - lib_name="k2", - is_available=K2_AVAILABLE, - installation_msg=K2_INSTALLATION_MESSAGE, - force_float32=False, - ), - "tdt": RNNTLossConfig( - loss_name="tdt", - lib_name="numba", - min_version='0.53.0', - is_available=NUMBA_RNNT_AVAILABLE, - installation_msg=NUMBA_INSTALLATION_MESSAGE, - ), - "tdt_pytorch": RNNTLossConfig( - loss_name="tdt_pytorch", - lib_name="torch", - min_version='0.0', - is_available=True, - installation_msg="Pure Pytorch implementation of TDT loss. Slow and for debugging purposes only.", - ), -} - -RNNT_LOSS_RESOLVER['default'] = RNNT_LOSS_RESOLVER['warprnnt_numba'] - - -def _warn_unused_additional_kwargs(loss_name, kwargs): - if len(kwargs) > 0: - logging.warning( - f"Loss function `{loss_name}` was provided with following additional kwargs,\n" - f"however they were ignored as it is unused.\n" - f"{kwargs}" - ) - - -def _clean_kwargs( - loss_name: str, kwargs: Optional[Dict[str, Any]], init_method: Callable, ignore_params: Optional[Set[str]] = None -) -> Dict[str, Any]: - """ - Cleans kwargs for the given loss function. Warn if there are unused kwargs. - - Args: - loss_name: name of the loss function - kwargs: kwargs to clean - init_method: LossClass.__init__ method - ignore_params: set of argument names for init_method to ignore - - Returns: - only used kwargs for the given `init_method` - """ - if not kwargs: - return {} - init_params = set(inspect.signature(init_method).parameters.keys()) - {"self"} - if ignore_params is not None: - init_params -= ignore_params - unused_kwargs = dict() - used_kwargs = dict() - for key, value in kwargs.items(): - if key not in init_params: - unused_kwargs[key] = value - else: - used_kwargs[key] = value - if len(unused_kwargs) > 0: - _warn_unused_additional_kwargs(loss_name, unused_kwargs) - return used_kwargs - - -def resolve_rnnt_default_loss_name() -> str: - return RNNT_LOSS_RESOLVER['default'].loss_name - - -def resolve_rnnt_loss(loss_name: str, blank_idx: int, loss_kwargs: dict = None) -> torch.nn.Module: - loss_function_names = list(RNNT_LOSS_RESOLVER.keys()) - - if loss_name not in loss_function_names: - raise ValueError( - f"Provided `loss_name` {loss_name} not in list of available RNNT losses \n" f"{loss_function_names}" - ) - - all_available_losses = {name: config for name, config in RNNT_LOSS_RESOLVER.items() if config.is_available} - - loss_config = RNNT_LOSS_RESOLVER[loss_name] # type: RNNTLossConfig - - # Re-raise import error with installation message - if not loss_config.is_available: - msg = ( - f"Installed RNNT losses are : {list(all_available_losses.keys())}.\n" - f"****************************************************************\n" - f"To install the selected loss function, please follow the steps below:\n" - f"{loss_config.installation_msg}" - ) - raise ImportError(msg) - - # Library version check - if loss_config.min_version is not None: - ver_matched, msg = model_utils.check_lib_version( - loss_config.lib_name, checked_version=loss_config.min_version, operator=operator.ge - ) - - if ver_matched is False: - msg = ( - f"{msg}\n" - f"****************************************************************\n" - f"To update the selected loss function, please follow the steps below:\n" - 
f"{loss_config.installation_msg}" - ) - raise RuntimeError(msg) - - # Resolve loss functions sequentially - loss_kwargs = {} if loss_kwargs is None else loss_kwargs - - if isinstance(loss_kwargs, DictConfig): - loss_kwargs = OmegaConf.to_container(loss_kwargs, resolve=True) - - # Get actual loss name for `default` - if loss_name == 'default': - loss_name = loss_config.loss_name - - """ - Resolve RNNT loss functions - """ - if loss_name == 'warprnnt': - loss_func = warprnnt.RNNTLoss(blank=blank_idx, reduction='none') - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'warprnnt_numba': - # Update loss config's forced float32 flag if set to None - loss_config.force_float32 = not numba_utils.is_numba_cuda_fp16_supported() - - fastemit_lambda = loss_kwargs.pop('fastemit_lambda', 0.0) - clamp = loss_kwargs.pop('clamp', -1.0) - loss_func = RNNTLossNumba(blank=blank_idx, reduction='none', fastemit_lambda=fastemit_lambda, clamp=clamp) - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'pytorch': - loss_func = RNNTLossPytorch(blank=blank_idx, reduction='none') - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'multiblank_rnnt': - fastemit_lambda = loss_kwargs.pop('fastemit_lambda', 0.0) - clamp = loss_kwargs.pop('clamp', -1.0) - big_blank_durations = loss_kwargs.pop('big_blank_durations', None) - sigma = loss_kwargs.pop('sigma', 0.0) - loss_func = MultiblankRNNTLossNumba( - blank=blank_idx, - big_blank_durations=big_blank_durations, - reduction='none', - fastemit_lambda=fastemit_lambda, - clamp=clamp, - sigma=sigma, - ) - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'multiblank_rnnt_pytorch': - big_blank_durations = loss_kwargs.pop('big_blank_durations', None) - sigma = loss_kwargs.pop('sigma', 0.0) - loss_func = MultiblankRNNTLossPytorch( - blank=blank_idx, big_blank_durations=big_blank_durations, reduction='none', sigma=sigma - ) - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'tdt': - fastemit_lambda = loss_kwargs.pop('fastemit_lambda', 0.0) - clamp = loss_kwargs.pop('clamp', -1.0) - durations = loss_kwargs.pop('durations', None) - sigma = loss_kwargs.pop('sigma', 0.0) - omega = loss_kwargs.pop('omega', 0.0) - loss_func = TDTLossNumba( - blank=blank_idx, - durations=durations, - reduction='none', - fastemit_lambda=fastemit_lambda, - clamp=clamp, - sigma=sigma, - omega=omega, - ) - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == 'tdt_pytorch': - durations = loss_kwargs.pop('durations', None) - sigma = loss_kwargs.pop('sigma', 0.0) - loss_func = TDTLossPytorch(blank=blank_idx, durations=durations, reduction='none', sigma=sigma) - _warn_unused_additional_kwargs(loss_name, loss_kwargs) - - elif loss_name == "graph_rnnt": - loss_kwargs = _clean_kwargs(loss_name, loss_kwargs, GraphRnntLoss.__init__, ignore_params={"blank"}) - loss_func = GraphRnntLoss(blank=blank_idx, **loss_kwargs) - elif loss_name == "graph_w_transducer": - loss_kwargs = _clean_kwargs(loss_name, loss_kwargs, GraphWTransducerLoss.__init__, ignore_params={"blank"}) - loss_func = GraphWTransducerLoss(blank=blank_idx, **loss_kwargs) - else: - raise ValueError( - f"Invalid value of `loss_name`: {loss_name}. Allowed loss names are :" f"{loss_function_names}" - ) - - return loss_func - - -class RNNTLoss(Loss): - @property - def input_types(self): - """Input types definitions for CTCLoss. 
- """ - return { - "log_probs": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "input_lengths": NeuralType(tuple('B'), LengthsType()), - "target_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for CTCLoss. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_classes, reduction: str = 'mean_batch', loss_name: str = "default", loss_kwargs=None): - """ - RNN-T Loss function based on https://github.com/HawkAaron/warp-transducer. - Optionally, can utilize a numba implementation of the same loss without having to compile the loss, - albiet there is a small speed penalty for JIT numba compile. - - Note: - Requires Numba 0.53.0 or later to be installed to use this loss function. - - Losses can be selected via the config, and optionally be passed keyword arguments as follows. - - Examples: - .. code-block:: yaml - - model: # RNNT Model config - ... - loss: - loss_name: "warprnnt_numba" - warprnnt_numba_kwargs: - fastemit_lambda: 0.0 - - Warning: - In the case that GPU memory is exhausted in order to compute RNNTLoss, it might cause - a core dump at the cuda level with the following error message. - - ``` - ... - costs = costs.to(acts.device) - RuntimeError: CUDA error: an illegal memory access was encountered - terminate called after throwing an instance of 'c10::Error' - ``` - - Please kill all remaining python processes after this point, and use a smaller batch size - for train, validation and test sets so that CUDA memory is not exhausted. - - Args: - num_classes: Number of target classes for the joint network to predict. - In all cases (conventional RNNT, multi-blank RNNT, and TDT model), this equals the token-id - for the standard "blank" symbol. In particular, say V is the number of non-blank tokens in - the vocabulary, then in the case of, - standard RNNT: num_classes = V - multiblank RNNT: num_classes = V + number-big-blanks (since we store big-blanks before - standard blank, and the standard blank is the last symbol in the vocab) - TDT: num_classes = V. Note, V here does not include any of the "duration outputs". - - reduction: Type of reduction to perform on loss. Possible values are - `mean_batch`, 'mean_volume`, `mean`, `sum` or None. - `None` will return a torch vector comprising the individual loss values of the batch. - `mean_batch` will average the losses in the batch - `mean` will divide each loss by the target length and then average - `mean_volume` will add up all the losses and divide by sum of target lengths - - loss_name: String that is resolved into an RNNT loss function. Available list of losses - is ininitialized in `RNNT_LOSS_RESOLVER` dictionary. - - loss_kwargs: Optional Dict of (str, value) pairs that are passed to the instantiated loss - function. 
- """ - super(RNNTLoss, self).__init__() - - if reduction not in [None, 'mean', 'sum', 'mean_batch', 'mean_volume']: - raise ValueError('`reduction` must be one of [mean, sum, mean_batch, mean_volume]') - - self._blank = num_classes - self.reduction = reduction - self._loss = resolve_rnnt_loss(loss_name, blank_idx=self._blank, loss_kwargs=loss_kwargs) - self._force_float32 = RNNT_LOSS_RESOLVER[loss_name].force_float32 - self._fp16_compat_checked = False - - def reduce(self, losses, target_lengths): - - if isinstance(losses, List): - losses = torch.cat(losses, 0) - target_lengths = torch.cat(target_lengths, 0) - - if self.reduction == 'mean_batch': - losses = losses.mean() # global batch size average - elif self.reduction == 'mean': - losses = torch.div(losses, target_lengths).mean() - elif self.reduction == 'sum': - losses = losses.sum() - elif self.reduction == 'mean_volume': - losses = losses.sum() / target_lengths.sum() # same as above but longer samples weigh more - - return losses - - @typecheck() - def forward(self, log_probs, targets, input_lengths, target_lengths): - # Cast to int 64 - targets = targets.long() - input_lengths = input_lengths.long() - target_lengths = target_lengths.long() - - max_logit_len = input_lengths.max() - max_targets_len = target_lengths.max() - - # Force cast joint to float32 - if not self._force_float32 and numba_utils.is_numba_cuda_fp16_supported(): - # Execute the kernel in fp16 - pass - elif self._force_float32 and log_probs.dtype != torch.float32: - # Log just once if fp16 tensor was passed and fp16 Numba CUDA loss could not be used. - if log_probs.dtype == torch.float16 and not self._fp16_compat_checked: - _, reason = numba_utils.is_numba_cuda_fp16_supported(return_reason=True) - logging.warning( - f"Provided RNNT Joint tensor is of dtype {log_probs.dtype}, but RNNT loss could not be calculated " - f"in fp16 due to following reason stated below. Loss will be calculated in fp32. \n\n" - f"{reason}", - mode=logging_mode.ONCE, - ) - self._fp16_compat_checked = True - - # Upcast the activation tensor and compute loss and grads in fp32 - logits_orig = log_probs - log_probs = log_probs.float() - del logits_orig # save memory *before* computing the loss - - # Ensure that shape mismatch does not occur due to padding - # Due to padding and subsequent downsampling, it may be possible that - # max sequence length computed does not match the actual max sequence length - # of the log_probs tensor, therefore we increment the input_lengths by the difference. - # This difference is generally small. - if log_probs.shape[1] != max_logit_len: - log_probs = log_probs.narrow(dim=1, start=0, length=max_logit_len).contiguous() - - # Reduce transcript length to correct alignment if additional padding was applied. 
- # Transcript: [B, L] -> [B, L']; If L' < L - if not targets.is_contiguous(): - targets = targets.contiguous() - - if targets.shape[1] != max_targets_len: - targets = targets.narrow(dim=1, start=0, length=max_targets_len).contiguous() - - # Temporarily override loss reduction - loss_reduction = self._loss.reduction - self._loss.reduction = None - - # Compute RNNT loss - loss = self._loss(acts=log_probs, labels=targets, act_lens=input_lengths, label_lens=target_lengths) - - # Loss reduction can be dynamic, so reset it after call - self._loss.reduction = loss_reduction - - # reduce here using our own reduction function - if self.reduction is not None: - loss = self.reduce(loss, target_lengths) - - # del new variables that may have been created - del ( - log_probs, - targets, - input_lengths, - target_lengths, - ) - - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt_pytorch.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt_pytorch.py deleted file mode 100644 index c8eee90a2eb5ca571d82ded1ccf02a1585edc0ed..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/rnnt_pytorch.py +++ /dev/null @@ -1,374 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import torch - -from nemo.core.classes import Loss -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType - - -class RNNTLossPytorch(Loss): - @property - def input_types(self): - """Input types definitions for CTCLoss. - """ - return { - "acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "act_lens": NeuralType(tuple('B'), LengthsType()), - "label_lens": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for CTCLoss. 
- loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, blank, reduction): - super().__init__() - self.blank = blank - self.reduction = reduction - - def forward(self, acts, labels, act_lens, label_lens): - # CPU patch for FP16 - if not acts.is_cuda and acts.dtype == torch.float16: - acts = acts.float() - - acts = torch.log_softmax(acts, -1) - - forward_logprob = self.compute_forward_prob(acts, labels, act_lens, label_lens) - losses = -forward_logprob - if self.reduction == 'mean_batch': - losses = losses.mean() # global batch size average - elif self.reduction == 'mean': - losses = torch.div(losses, label_lens).mean() - elif self.reduction == 'sum': - losses = losses.sum() - elif self.reduction == 'mean_volume': - losses = losses.sum() / label_lens.sum() # same as above but longer samples weigh more - - return losses - - def compute_forward_prob(self, acts, labels, act_lens, label_lens): - B, T, U, _ = acts.shape - - log_alpha = torch.zeros(B, T, U) - log_alpha = log_alpha.to(acts.device) - - for t in range(T): - for u in range(U): - if u == 0: - if t == 0: - # this is the base case: (t=0, u=0) with log-alpha = 0. - log_alpha[:, t, u] = 0.0 - else: - # this is case for (t = 0, u > 0), reached by (t, u - 1) - # emitting a blank symbol. - log_alpha[:, t, u] = log_alpha[:, t - 1, u] + acts[:, t - 1, 0, self.blank] - else: - if t == 0: - # in case of (u > 0, t = 0), this is only reached from - # (t, u - 1) with a label emission. - gathered = torch.gather( - acts[:, t, u - 1], dim=1, index=labels[:, u - 1].view(-1, 1).type(torch.int64) - ).reshape(-1) - log_alpha[:, t, u] = log_alpha[:, t, u - 1] + gathered.to(log_alpha.device) - else: - # here both t and u are > 0, this state is reachable - # with two possibilities: (t - 1, u) with a blank emission - # or (t, u - 1) with a label emission. - log_alpha[:, t, u] = torch.logsumexp( - torch.stack( - [ - log_alpha[:, t - 1, u] + acts[:, t - 1, u, self.blank], - log_alpha[:, t, u - 1] - + torch.gather( - acts[:, t, u - 1], dim=1, index=labels[:, u - 1].view(-1, 1).type(torch.int64) - ).reshape(-1), - ] - ), - dim=0, - ) - - log_probs = [] - for b in range(B): - # here we need to add the final blank emission weights. - to_append = ( - log_alpha[b, act_lens[b] - 1, label_lens[b]] + acts[b, act_lens[b] - 1, label_lens[b], self.blank] - ) - log_probs.append(to_append) - log_prob = torch.stack(log_probs) - - return log_prob - - -class TDTLossPytorch(Loss): - """ - Pure Python implementation of TDT loss (https://arxiv.org/pdf/2304.06795.pdf) - """ - - @property - def input_types(self): - """Input types definitions for CTCLoss. - """ - return { - "acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "act_lens": NeuralType(tuple('B'), LengthsType()), - "label_lens": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for CTCLoss. 
- loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, blank: int, durations: List[int] = [], reduction: str = 'sum', sigma: float = 0.0): - super().__init__() - self.blank = blank - self.durations = durations - self.n_durations = len(durations) - self.reduction = reduction - self.sigma = sigma - - def forward(self, acts, labels, act_lens, label_lens): - label_acts = acts[:, :, :, : -self.n_durations] - duration_acts = acts[:, :, :, -self.n_durations :] - - # the - self.sigma here is for logit-undernormalization. Check the paper for details. - label_acts = torch.log_softmax(label_acts, -1) - self.sigma - - duration_acts = torch.log_softmax(duration_acts, -1) - - forward_logprob, _ = self.compute_forward_prob(label_acts, duration_acts, labels, act_lens, label_lens) - losses = -forward_logprob - if self.reduction == 'mean_batch': - losses = losses.mean() # global batch size average - elif self.reduction == 'mean': - losses = torch.div(losses, label_lens).mean() - elif self.reduction == 'sum': - losses = losses.sum() - elif self.reduction == 'mean_volume': - losses = losses.sum() / label_lens.sum() # same as above but longer samples weigh more - - return losses - - def logsumexp(self, a, b): - ret = torch.logsumexp(torch.stack([a, b]), dim=0) - return ret - - def compute_forward_prob(self, acts, duration_acts, labels, act_lens, label_lens): - """This function implements Equation 7 in the TDT paper https://arxiv.org/pdf/2304.06795.pdf, - Simply put, for each alpha(t, u), it sums over the contribution from all incoming blank arcs and non-blank arcs. - """ - B, T, U, _ = acts.shape - - log_alpha = torch.zeros(B, T, U) - log_alpha = log_alpha.cuda() - for b in range(B): - for t in range(T): - for u in range(U): - if u == 0: - if t == 0: - # both t and u are 0, this is the base case for alphas. - log_alpha[b, t, u] = 0.0 - else: - # u = 0 and t != 0: only considers blank emissions. - log_alpha[b, t, u] = -1000.0 - for n, l in enumerate(self.durations): - if ( - t - l >= 0 and l > 0 - ): # checking conditions for blank emission, l has to be at least 1 - tmp = ( - log_alpha[b, t - l, u] - + acts[b, t - l, u, self.blank] - + duration_acts[b, t - l, u, n] - ) - log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) - - else: - # u != 0 here, need to consider both blanks and non-blanks. - log_alpha[b, t, u] = -1000.0 - for n, l in enumerate(self.durations): - if t - l >= 0: - if l > 0: # for blank emissions. Need to ensure index is not out-of-bound. - tmp = ( - log_alpha[b, t - l, u] - + acts[b, t - l, u, self.blank] - + duration_acts[b, t - l, u, n] - ) - log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) - - # non-blank emissions. - tmp = ( - log_alpha[b, t - l, u - 1] - + acts[b, t - l, u - 1, labels[b, u - 1]] - + duration_acts[b, t - l, u - 1, n] - ) - log_alpha[b, t, u] = self.logsumexp(tmp, 1.0 * log_alpha[b, t, u]) - - log_probs = [] - for b in range(B): - tt = torch.Tensor([-1000.0]).cuda()[0] - - # need to loop over all possible ways that blank with different durations contributes to the final loss. 
- for n, l in enumerate(self.durations): - if act_lens[b] - l >= 0 and l > 0: - bb = ( - log_alpha[b, act_lens[b] - l, label_lens[b]] - + acts[b, act_lens[b] - l, label_lens[b], self.blank] - + duration_acts[b, act_lens[b] - l, label_lens[b], n] - ) - - tt = self.logsumexp(bb, 1.0 * tt) - - log_probs.append(tt) - - log_prob = torch.stack(log_probs) - - return log_prob, log_alpha - - -class MultiblankRNNTLossPytorch(Loss): - """ - Pure Python implementation of multi-blank transducer loss (https://arxiv.org/pdf/2211.03541.pdf) - """ - - @property - def input_types(self): - """Input types definitions for CTCLoss. - """ - return { - "acts": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "labels": NeuralType(('B', 'T'), LabelsType()), - "act_lens": NeuralType(tuple('B'), LengthsType()), - "label_lens": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Output types definitions for CTCLoss. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, blank, big_blank_durations, reduction: str = "sum", sigma: float = 0.0): - super().__init__() - self.blank = blank - self.big_blank_durations = big_blank_durations - self.reduction = reduction - self.sigma = sigma - - def forward(self, acts, labels, act_lens, label_lens): - acts = torch.log_softmax(acts, -1) - self.sigma - forward_logprob, _ = self.compute_forward_prob(acts, labels, act_lens, label_lens) - - losses = -forward_logprob - if self.reduction == 'mean_batch': - losses = losses.mean() # global batch size average - elif self.reduction == 'mean': - losses = torch.div(losses, label_lens).mean() - elif self.reduction == 'sum': - losses = losses.sum() - elif self.reduction == 'mean_volume': - losses = losses.sum() / label_lens.sum() # same as above but longer samples weigh more - - return losses - - def compute_forward_prob(self, acts, labels, act_lens, label_lens): - B, T, U, _ = acts.shape - - log_alpha = torch.zeros(B, T, U, device=acts.device) - for t in range(T): - for u in range(U): - if u == 0: - if t == 0: - # this is the base case: (t=0, u=0) with log-alpha = 0. - log_alpha[:, t, u] = 0.0 - else: - # this is case for (t = 0, u > 0), reached by (t, u - d) - # emitting a blank symbol of duration d. - log_alpha[:, t, u] = log_alpha[:, t - 1, u] + acts[:, t - 1, 0, self.blank] - for i, d in enumerate(self.big_blank_durations): - if t >= d: - tt = log_alpha[:, t - d, u] + acts[:, t - d, 0, self.blank - 1 - i] - log_alpha[:, t, u] = torch.logsumexp( - torch.stack([1.0 * log_alpha[:, t, u], tt]), dim=0 - ) - - else: - if t == 0: - # in case of (u > 0, t = 0), this is only reached from - # (t, u - 1) with a label emission. - gathered = torch.gather( - acts[:, t, u - 1], dim=1, index=labels[:, u - 1].view(-1, 1).type(torch.int64) - ).reshape(-1) - log_alpha[:, t, u] = log_alpha[:, t, u - 1] + gathered - else: - # here both t and u are > 0, this state is reachable - # with two possibilities: (t - d, u) with emission of - # blank with duration d, or (t, u - 1) with a label emission. - - # first we take care of the standard blank. - log_alpha[:, t, u] = torch.logsumexp( - torch.stack( - [ - log_alpha[:, t - 1, u] + acts[:, t - 1, u, self.blank], - log_alpha[:, t, u - 1] - + torch.gather( - acts[:, t, u - 1], dim=1, index=labels[:, u - 1].view(-1, 1).type(torch.int64) - ).reshape(-1), - ] - ), - dim=0, - ) - - # now we go over all big blanks. They need to be considered if current t >= blank duration d. 
- for i, d in enumerate(self.big_blank_durations): - if t >= d: - tt = log_alpha[:, t - d, u] + acts[:, t - d, u, self.blank - 1 - i] - log_alpha[:, t, u] = torch.logsumexp( - torch.stack([1.0 * log_alpha[:, t, u], tt]), dim=0 - ) - - log_probs = [] - for b in range(B): - # here we need to add the final blank emission weights, which needs - # to consider all possible blank durations. - to_append = ( - log_alpha[b, act_lens[b] - 1, label_lens[b]] + acts[b, act_lens[b] - 1, label_lens[b], self.blank] - ) - - for i, d in enumerate(self.big_blank_durations): - if act_lens[b] >= d: - tt = ( - log_alpha[b, act_lens[b] - d, label_lens[b]] - + acts[b, act_lens[b] - d, label_lens[b], self.blank - 1 - i] - ) - to_append = torch.logsumexp(torch.stack([1.0 * to_append, tt]), dim=0) - - log_probs.append(to_append) - log_prob = torch.stack(log_probs) - - return log_prob, log_alpha diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__init__.py deleted file mode 100644 index 2497903efb6d87cc21b9f7bdcfe2f68ca7c1043e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
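Editor's note: for readers skimming this hunk, here is a minimal usage sketch of the pure-Python transducer losses deleted above. It assumes a NeMo checkout in which this module still exists so the import resolves; all tensor sizes are illustrative, not taken from the repository.

```python
# Minimal usage sketch for the pure-Python RNNT loss removed above (illustrative sizes).
import torch

from nemo.collections.asr.losses.rnnt_pytorch import RNNTLossPytorch

B, T, U1, V = 2, 10, 5, 28                 # batch, acoustic frames, target length + 1, vocab incl. blank
acts = torch.randn(B, T, U1, V)            # raw joint-network logits; log_softmax is applied inside forward()
labels = torch.randint(0, V - 1, (B, U1 - 1))
act_lens = torch.tensor([10, 8])
label_lens = torch.tensor([4, 3])

loss_fn = RNNTLossPytorch(blank=V - 1, reduction="mean_batch")
loss = loss_fn(acts=acts, labels=labels, act_lens=act_lens, label_lens=label_lens)
print(loss)

# TDTLossPytorch and MultiblankRNNTLossPytorch take the same four tensors plus
# duration / big-blank settings; note the TDT variant moves its alphas to CUDA internally.
```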
- -from nemo.collections.asr.losses.ssl_losses.contrastive import ContrastiveLoss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 181d2ad486b961f57b2407d9ec2c9023329a6e1f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/contrastive.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/contrastive.cpython-310.pyc deleted file mode 100644 index 7a59ee228816be303fc20a5f5e684624b59c5c51..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/contrastive.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/ctc.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/ctc.cpython-310.pyc deleted file mode 100644 index db78ffc61afcb938ad110fdf2082c75e3f68d6c7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/ctc.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/mlm.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/mlm.cpython-310.pyc deleted file mode 100644 index 9797621aab5e3ca10686093919a12d169318c013..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/mlm.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/rnnt.cpython-310.pyc deleted file mode 100644 index 16ade138bd7674d5ab77ce4f6d343c148a4b3b0f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/__pycache__/rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/contrastive.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/contrastive.py deleted file mode 100644 index bab691913c0ad64f46b6ed6353fcac36f0749b4a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/contrastive.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn.functional as F -from torch import nn - -from nemo.core import Loss, typecheck -from nemo.core.neural_types import AcousticEncodedRepresentation, LengthsType, LossType, NeuralType, SpectrogramType - -__all__ = ["ContrastiveLoss"] - - -class ContrastiveLoss(Loss): - @property - def input_types(self): - """Input types definitions for Contrastive. - """ - return { - "spectrograms": NeuralType(("B", "D", "T"), SpectrogramType()), - "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), - "decoder_outputs": NeuralType(("B", "T", "D"), AcousticEncodedRepresentation()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self): - """Output types definitions for Contrastive. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - @property - def needs_labels(self): - return False - - def __init__( - self, - in_dim: int, - proj_dim: int = 128, - combine_time_steps: int = 1, - num_negatives: int = 100, - quantized_targets: bool = False, - codebook_size: int = 320, - prob_ppl_weight: float = 0.1, - logit_temp: float = 0.1, - reduce: str = "sum", - sample_from_same_utterance_only: bool = True, - sample_from_non_masked: bool = False, - sample_from_codebook: bool = False, - group_loss: bool = False, - num_groups: int = 2, - quantizer_temp_start: float = 2, - quantizer_temp_min: float = 0.5, - quantizer_temp_decay: float = 0.999995, - mask_threshold: float = 0.8, - store_ids: bool = True, - reduce_ids: bool = False, - multiplier: float = 16.0, - ): - """ - Loss function representing the contrastive task of identifying the true latent speech representation of - the masked spectrogram steps from a set of sampled distractors. - - Args: - in_dim: Number of spectrogram channels. - proj_dim: Number of channels in the model outputs. - combine_time_steps: How many time steps should be combined into a single representation. - num_negatives: Number of sampled negatives for each target. - quantized_targets: Bool that determines if the targets should be quantized. - codebook_size: Number of vectors in the codebook per group. - prob_ppl_weight: Float multiplier on the perplexity loss for target quantization. - logit_temp: Float temperature for normalizing logits. - reduce: String representing the type of reduction used for cross entropy. - sample_from_same_utterance_only: Bool that determines if negatives should be sampled only from same utterance. - sample_from_non_masked: Bool that determines if negatives should be sampled from non-masked steps of the spectrogram. - sample_from_codebook: Bool that determines if negatives should be sampled from entire codebook. - group_loss: Bool that determines if loss should be computed separately for each group in the quantizer codebook. - num_groups: Number of groups in the quantizer codebook. - quantizer_temp_start: Starting temperature in quantizer. - quantizer_temp_min: Minimum temperature in quantizer. - quantizer_temp_decay: Decay rate of quantizer temperature per global step. - mask_threshold: Float threshold for determining if a time step of the spectrogram is masked based on percent of masked channels. - store_ids: Bool that determines if the quantizer ids will be stored to be potentially used by other losses. - reduce_ids: Bool that determines if we convert any sequence of consecutive equivalent ids to a single occurence of that id. 
- multiplier: Float multipler on final loss - """ - - super().__init__() - quantizer_temp = (quantizer_temp_start, quantizer_temp_min, quantizer_temp_decay) - self.quantized_targets = quantized_targets - self.num_negatives = num_negatives - self.prob_ppl_weight = prob_ppl_weight - if self.quantized_targets: - quantizer_cfg = { - "_target_": "nemo.collections.asr.parts.submodules.ssl_quantizers.GumbelVectorQuantizer", - "dim": in_dim * combine_time_steps, - "vq_dim": proj_dim, - "num_vars": codebook_size, - "groups": num_groups, - "temp": quantizer_temp, - "combine_groups": True, - "time_first": True, - } - self.quantizer = ContrastiveLoss.from_config_dict(quantizer_cfg) - self.prob_ppl_weight = prob_ppl_weight - self.logit_temp = logit_temp - self.reduce = reduce - self.combine_time_steps = combine_time_steps - self.sample_from_same_utterance_only = sample_from_same_utterance_only - self.sample_from_non_masked = sample_from_non_masked - self.sample_from_codebook = sample_from_codebook - self.group_loss = group_loss - self.mask_threshold = mask_threshold - self.multiplier = multiplier - - self.store_ids = store_ids - self.reduce_ids = reduce_ids - - if not self.quantized_targets: - self.target_proj = nn.Linear(in_dim * combine_time_steps, proj_dim) - - def sample_negatives(self, y, num): - # y - T'xBxC or T'xC - - high = y.shape[0] - neg_idxs = torch.multinomial(torch.ones((num, high), device=y.device), self.num_negatives) - - negs = y[neg_idxs.view(-1)] - negs = negs.view((num, self.num_negatives) + y.shape[1:]) - negs = negs.transpose(0, 1) - # negs - NxT'xBxC or NxT'xC - - return negs, neg_idxs - - @typecheck() - def forward(self, spectrograms, spec_masks, decoder_outputs, decoder_lengths=None): - spec_in = spectrograms.transpose(-2, -1) - masks = spec_masks.transpose(-2, -1) - targets = spec_in - # BxTxC - - targets = targets.reshape(targets.shape[0], targets.shape[1] // self.combine_time_steps, -1) - masks = masks.reshape(targets.shape[0], targets.shape[1], -1) - - if self.quantized_targets: - if self.store_ids: - # store ids for use by other losses - targets, prob_ppl_loss, cur_codebook_temp, self.target_ids = self.quantizer(targets, return_ids=True) - - if self.reduce_ids: - # reduce consecutive equivalent ids to a single occurence - _, indices = torch.unique_consecutive(self.target_ids, return_inverse=True) - indices -= indices.min(dim=1, keepdims=True)[0] - reduced_ids = torch.zeros_like(self.target_ids) - reduced_ids = reduced_ids.scatter_(1, indices, self.target_ids) - reduced_lens = indices.max(dim=-1)[0] + 1 - - self.target_ids = reduced_ids.narrow(1, 0, reduced_lens.max()) - self.target_lengths = reduced_lens - - else: - self.target_lengths = None - - else: - targets, prob_ppl_loss, cur_codebook_temp = self.quantizer(targets) - else: - targets = self.target_proj(targets) - - if self.sample_from_same_utterance_only: - bs = decoder_outputs.shape[0] - masks = masks.mean(-1) > self.mask_threshold - out_masked_only = decoder_outputs[masks] - targets_masked_only = targets[masks] - out_masked_only = out_masked_only.reshape(bs, -1, out_masked_only.shape[-1]) - targets_masked_only = targets_masked_only.reshape(bs, -1, targets_masked_only.shape[-1]) - - # BxT'xC - # number of masked time steps to predict (T') - # -> T'xBxC - - out_masked_only = out_masked_only.transpose(0, 1) - targets_masked_only = targets_masked_only.transpose(0, 1) - # -> T'xBxC - - if self.sample_from_non_masked: - # sample from all steps in utterance - negatives, _ = self.sample_negatives( - targets.transpose(0, 
1), targets_masked_only.size(0), # TxBxC # T' - ) - else: - # only sample from masked steps in utterance - negatives, _ = self.sample_negatives(targets_masked_only, targets_masked_only.size(0)) # T'xBxC # T' - # NxT'xBxC - - out_masked_only = out_masked_only.reshape(-1, out_masked_only.shape[-1]) - targets_masked_only = targets_masked_only.reshape(-1, targets_masked_only.shape[-1]) - negatives = negatives.reshape(self.num_negatives, -1, negatives.shape[-1]) - - # T'BxC and NxT'BxC - - else: - masks = masks.mean(-1) > self.mask_threshold - out_masked_only = decoder_outputs[masks] - targets_masked_only = targets[masks] - - # T'xC - # number of masked time steps to predict (T') - - if self.group_loss: - num_groups = self.quantizer.groups - negatives = self.quantizer.vars.reshape(num_groups, self.quantizer.num_vars, -1) - # GxNx(C//G) - negatives = negatives.transpose(0, 1) - # NxGx(C//G) - negatives = negatives.unsqueeze(1).expand(-1, out_masked_only.shape[0], -1, -1) - # NxT'xGx(C//G) - negatives = negatives.reshape(negatives.shape[0], -1, negatives.shape[-1]) - # NxT'Gx(C//G) - - out_masked_only = out_masked_only.reshape(-1, out_masked_only.shape[-1] // num_groups) - targets_masked_only = targets_masked_only.reshape(-1, targets_masked_only.shape[-1] // num_groups) - # T'Gx(C//G) - elif self.sample_from_codebook: - # sample from the full codebook - negatives = self.quantizer.sample_from_codebook(self.num_negatives, targets_masked_only.size(0)) - elif self.sample_from_non_masked: - # sample from all steps in batch - negatives, _ = self.sample_negatives( - targets.reshape(targets.shape[0] * targets.shape[1], -1), targets_masked_only.size(0), # BTxC - ) # T' - else: - # only sample from masked steps - negatives, _ = self.sample_negatives(targets_masked_only, targets_masked_only.size(0)) # T'xC # T' - # NxT'xC - - # Calculate similarity between outputs and all targets - similarity_scores = self._calculate_similarity(out_masked_only, negatives, targets_masked_only) - # (1+N)xT' - # cosine similarity of outs with targets + N negatives - - # Create targets of size T - similarity_targets = decoder_outputs.new_zeros(similarity_scores.size(1), dtype=torch.long) - # T' - # targets are 0, since it's the first, followed by N sampled negatives - - # Transpose similarity scores to TxF for loss - similarity_scores = similarity_scores.transpose(0, 1) - # T'x(1+N) - - loss = F.cross_entropy(similarity_scores, similarity_targets, reduction=self.reduce) - - sample_size = similarity_targets.numel() - - if self.prob_ppl_weight != 0 and self.quantized_targets: - prob_ppl_loss = self.prob_ppl_weight * prob_ppl_loss * sample_size - loss += prob_ppl_loss - - if not isinstance(loss, torch.Tensor): - loss = torch.Tensor([0]).to(device=decoder_outputs.device) - - batch_size = spectrograms.shape[0] - loss *= self.multiplier / batch_size - - return loss - - def _calculate_similarity(self, logits, negatives, targets): - neg_is_pos = (targets == negatives).all(-1) - # NxT' - true where the negative is actually the positive - targets = targets.unsqueeze(0) - # 1xT'xC - targets = torch.cat([targets, negatives], dim=0) - # (1+N)xT'XC - logits = torch.cosine_similarity( - logits.float().unsqueeze(0).expand(targets.shape[0], -1, -1), targets.float(), dim=-1 - ).type_as(logits) - # (1+N)xT' - logits /= self.logit_temp - if neg_is_pos.any(): - logits[1:][neg_is_pos] = float("-inf") - return logits - - def set_num_updates(self, num_updates): - if self.quantized_targets: - self.quantizer.set_num_updates(num_updates) diff --git 
a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/ctc.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/ctc.py deleted file mode 100644 index e71d60ac4956ecc9a4983dc44d0481963358287d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/ctc.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.losses import CTCLoss -from nemo.core import Loss, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LossType, NeuralType, SpectrogramType, VoidType - -__all__ = ["CTCLossForSSL"] - - -class CTCLossForSSL(Loss): - @property - def input_types(self): - """Input types definitions for Contrastive. - """ - return { - "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), - "decoder_outputs": NeuralType(("B", "T", "D"), VoidType()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "target_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self): - """Output types definitions for Contrastive. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - @property - def needs_labels(self): - return True - - def __init__(self, num_classes, zero_infinity=True, reduction='mean_batch'): - super().__init__() - self.loss = CTCLoss(num_classes=num_classes, reduction=reduction, zero_infinity=zero_infinity) - - @typecheck() - def forward(self, spec_masks, decoder_outputs, targets, decoder_lengths=None, target_lengths=None): - loss = self.loss( - log_probs=decoder_outputs, targets=targets, input_lengths=decoder_lengths, target_lengths=target_lengths - ) - - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/mlm.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/mlm.py deleted file mode 100644 index 89de01dc1b341b2a0023d82d4374e8447129e7b5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/mlm.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
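Editor's note: the distractor-sampling step at the heart of the ContrastiveLoss removed a few hunks above is compact enough to show standalone. The sketch below mirrors its sample_negatives helper using only PyTorch; the free-function form and the sizes are mine, not part of the original file.

```python
# Standalone sketch of the negative (distractor) sampling used by the deleted ContrastiveLoss.
import torch


def sample_negatives(y: torch.Tensor, num: int, num_negatives: int) -> torch.Tensor:
    """Sample `num_negatives` distractor vectors for each of `num` target positions.

    y: pool of candidate targets, shape (T', C) or (T', B, C).
    Returns negatives of shape (num_negatives, num, *y.shape[1:]).
    """
    high = y.shape[0]
    # For every target position, draw indices uniformly (without replacement) from the pool.
    neg_idxs = torch.multinomial(torch.ones((num, high), device=y.device), num_negatives)
    negs = y[neg_idxs.view(-1)]                               # (num * num_negatives, ...)
    negs = negs.view((num, num_negatives) + y.shape[1:])      # (num, N, ...)
    return negs.transpose(0, 1)                               # (N, num, ...)


pool = torch.randn(50, 128)                  # e.g. 50 masked-step targets with 128-dim projections
negatives = sample_negatives(pool, num=50, num_negatives=10)
print(negatives.shape)                       # torch.Size([10, 50, 128])
```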
- -import torch -import torch.nn.functional as F -from torch import nn - -from nemo.core import Loss, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType, SpectrogramType - -__all__ = ["MLMLoss"] - - -class MLMLoss(Loss): - @property - def input_types(self): - """Input types definitions for Contrastive. - """ - return { - "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), - "decoder_outputs": NeuralType(("B", "T", "D"), LogprobsType()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "target_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self): - """Output types definitions for Contrastive. - loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - @property - def needs_labels(self): - return True - - def __init__( - self, combine_time_steps: int = 1, mask_threshold: float = 0.8, - ): - super().__init__() - self.nll_loss = nn.NLLLoss() - self.combine_time_steps = combine_time_steps - self.mask_threshold = mask_threshold - - @typecheck() - def forward(self, spec_masks, decoder_outputs, targets, decoder_lengths=None, target_lengths=None): - - # outputs are log_probs - masks = spec_masks.transpose(-2, -1) - # BxTxC - - masks = masks.reshape(masks.shape[0], masks.shape[1] // self.combine_time_steps, -1) - masks = masks.mean(-1) > self.mask_threshold - - out_masked_only = decoder_outputs[masks] - targets = F.pad(targets, (0, masks.shape[-1] - targets.shape[-1])) - targets_masked_only = targets[masks] - - loss = self.nll_loss(out_masked_only, targets_masked_only) - loss = torch.mean(loss) - - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/rnnt.py deleted file mode 100644 index 0336063638f747bdc83613c0625a95731d4316aa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/losses/ssl_losses/rnnt.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.core import Loss, typecheck -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, LossType, NeuralType, SpectrogramType - -__all__ = ["RNNTLossForSSL"] - - -class RNNTLossForSSL(Loss): - @property - def input_types(self): - """Input types definitions for Contrastive. - """ - return { - "spec_masks": NeuralType(("B", "D", "T"), SpectrogramType()), - "decoder_outputs": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "target_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self): - """Output types definitions for Contrastive. 
- loss: - NeuralType(None) - """ - return {"loss": NeuralType(elements_type=LossType())} - - @property - def needs_labels(self): - return True - - def __init__(self, num_classes): - super().__init__() - self.loss = RNNTLoss(num_classes=num_classes) - - @typecheck() - def forward(self, spec_masks, decoder_outputs, targets, decoder_lengths=None, target_lengths=None): - - loss = self.loss( - log_probs=decoder_outputs, targets=targets, input_lengths=decoder_lengths, target_lengths=target_lengths - ) - - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__init__.py deleted file mode 100644 index 9e3250071955216f6abc505e6181fb59931baa8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 912d2f02dd071671befe88fec23f7c4d55bb7c0b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/audio.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/audio.cpython-310.pyc deleted file mode 100644 index ba4b15576822bc57bfd7e0c328cf7d1771a89b77..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/audio.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/der.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/der.cpython-310.pyc deleted file mode 100644 index 9927a1fceb8806a85d757b5e06405a4a505e2661..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/der.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/multi_binary_acc.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/multi_binary_acc.cpython-310.pyc deleted file mode 100644 index c85bc557cf55881e1f12015ff214d33f4f60045b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/multi_binary_acc.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer.cpython-310.pyc deleted file mode 100644 index f7fd9aad01f3281eee4beb560b0da64e9063fb69..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer_bpe.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer_bpe.cpython-310.pyc deleted file mode 100644 index 55558e95d12dcb3f24c03beba9853832b8e593bd..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/rnnt_wer_bpe.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer.cpython-310.pyc deleted file mode 100644 index 1d5d5d75f56cebb52b6af8b000e519af6d92bf33..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer_bpe.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer_bpe.cpython-310.pyc deleted file mode 100644 index 8e24dad5b0dff5cc9c1f1271d310cc948358f802..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/__pycache__/wer_bpe.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/audio.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/audio.py deleted file mode 100644 index 5e8c2915e3fad6334953ffc0a0faa4cc2636d7ef..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/audio.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Iterable, List, Optional, Tuple - -import torch -from torchmetrics import Metric -from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality -from torchmetrics.audio.pit import PermutationInvariantTraining -from torchmetrics.audio.sdr import ScaleInvariantSignalDistortionRatio, SignalDistortionRatio -from torchmetrics.audio.snr import ScaleInvariantSignalNoiseRatio, SignalNoiseRatio -from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility - -from nemo.utils import logging - -__all__ = ['AudioMetricWrapper'] - -__VERIFIED_METRICS__ = [ - PermutationInvariantTraining, - ScaleInvariantSignalDistortionRatio, - SignalDistortionRatio, - ScaleInvariantSignalNoiseRatio, - SignalNoiseRatio, - PerceptualEvaluationSpeechQuality, - ShortTimeObjectiveIntelligibility, -] - - -class AudioMetricWrapper(Metric): - """A wrapper around an audio metric enabling selection of a specific channel - and handling of examples in a batch with varying valid input length. - - Note: - This class assumes that the underlying metric uses averaging to calculate the - value over a batch. 
This assumption is only used by `forward` and does not - impact other methods, such as `update` and `compute`. - - Args: - metric: base metric that should be wrapped. It is assumed that calculation - of the metric over a batch is done by averaging. - channel: Optional, for selecting a channel from `preds` and `target` signals. - If None, all channels are used. - metric_using_batch_averaging: Optional, used to denote that the base metric - is using averaging to calculate the metric value - for a batch. - """ - - full_state_update: bool = False - - def __init__( - self, metric: Metric, channel: Optional[int] = None, metric_using_batch_averaging: Optional[bool] = None - ): - super().__init__() - if not isinstance(metric, Metric): - raise ValueError(f"Expected argument `metric` to be an instance of `torchmetrics.Metric` but got {metric}") - - if not metric_using_batch_averaging and type(metric) not in __VERIFIED_METRICS__: - raise ValueError( - f'Metric {metric} is not in verified metrics. {self.__class__.__name__} assumes reduction over batch is calculated using averaging. \n' - 'This should not affect the final results, but values for a single batch obtained using `forward` may be inaccurate if using `input_length`. \n' - 'To suppress this message, please confirm the used metric is using batch averaging and set "metric_using_batch_averaging = True"' - ) - - self._metric = metric - self._channel = channel - logging.debug('Setup metric %s, channel %s', metric, str(channel)) - - def _select_channel(self, preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Select a single channel from input signals. - - Args: - preds: tensor with shape (B, C, T) - target: tensor with shape (B, C, T) - - Returns: - Original tensors if self.channel is None, shape (B, C, T). - A single channel from input tensors if self.channel is set, shape (B, T) - """ - if self._channel is None: - return preds, target - else: - return preds[:, self._channel, ...], target[:, self._channel, ...] - - @staticmethod - def _trim_inputs( - preds: torch.Tensor, target: torch.Tensor, input_length: torch.Tensor - ) -> Iterable[Tuple[torch.Tensor, torch.Tensor]]: - """Trim input tensors to input_length samples. - - Args: - preds: tensor with shape (B, C, T) - target: tensor with shape (B, C, T) - - Returns: - An iterable with tuples of (preds, target) with - the correct length. - """ - # Each example has a different length - for b_idx, b_len in enumerate(input_length): - b_preds = preds[b_idx, ..., :b_len] - b_target = target[b_idx, ..., :b_len] - - yield b_preds, b_target - - @staticmethod - def _batch_reduction(batch_values: List[torch.Tensor]) -> torch.Tensor: - """Reduce metric values for each example in a batch to a single - value for the whole batch. - - Args: - batch_values: list of metric values for each example in a batch - - Returns: - Average metric value over the batch. - """ - return sum(batch_values) / len(batch_values) - - def update(self, preds: torch.Tensor, target: torch.Tensor, input_length: Optional[torch.Tensor] = None) -> None: - """Update the underlying metric by taking into account channel selector and input length. - - Args: - preds: tensor with predictions, shape (B, C, T) - target: tensor with target signals, shape (B, C, T) - input_length: Optional, input tensor with length (in samples) of each signal in the batch, shape (B,). - If not provided, it is assumed that all samples are valid. 
- """ - preds, target = self._select_channel(preds=preds, target=target) - - if input_length is None: - self._metric.update(preds=preds, target=target) - else: - # Each example in this batch has a different length - for b_preds, b_target in self._trim_inputs(preds=preds, target=target, input_length=input_length): - self._metric.update(preds=b_preds, target=b_target) - - def compute(self) -> torch.Tensor: - """Compute the underlying metric. - """ - return self._metric.compute() - - def forward( - self, preds: torch.Tensor, target: torch.Tensor, input_length: Optional[torch.Tensor] = None - ) -> torch.Tensor: - """Call underlying forward method to add the batch statistics to the accumulated metric state - and return the result for the current batch. - - Args: - preds: tensor with predictions, shape (B, C, T) - target: tensor with target signals, shape (B, C, T) - input_length: Optional, input tensor with length (in samples) of each signal in the batch, shape (B,). - If not provided, it is assumed that all samples are valid. - - Returns: - Underlying metric averaged on the current batch. - """ - preds, target = self._select_channel(preds=preds, target=target) - - if input_length is None: - return self._metric(preds=preds, target=target) - else: - # Each example in this batch has a different length - batch_values = [] - for b_preds, b_target in self._trim_inputs(preds=preds, target=target, input_length=input_length): - batch_values.append(self._metric(preds=b_preds, target=b_target)) - # Average over the batch - return self._batch_reduction(batch_values) - - def reset(self) -> None: - """Reset the underlying metric. - """ - self._metric.reset() - - def __repr__(self) -> str: - """Return string representation of the object. - """ - _op_metric = f"(metric: {repr(self._metric)}, channel: {self._channel})" - repr_str = self.__class__.__name__ + _op_metric - - return repr_str - - def _wrap_compute(self, compute: Callable) -> Callable: - """Overwrite to do nothing, as in CompositionalMetric. - """ - return compute diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/der.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/der.py deleted file mode 100644 index fc5cded970d07f786dac958086024030fb32de6a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/der.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -from itertools import permutations -from typing import Dict, List, Optional, Tuple - -import numpy as np -import torch -from pyannote.core import Segment, Timeline -from pyannote.metrics.diarization import DiarizationErrorRate - -from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.parts.utils.optimization_utils import linear_sum_assignment - -from nemo.utils import logging - -__all__ = [ - 'score_labels', - 'calculate_session_cpWER', - 'calculate_session_cpWER_bruteforce', - 'concat_perm_word_error_rate', -] - - -def get_partial_ref_labels(pred_labels: List[str], ref_labels: List[str]) -> List[str]: - """ - For evaluation of online diarization performance, generate partial reference labels - from the last prediction time. - - Args: - pred_labels (list[str]): list of partial prediction labels - ref_labels (list[str]): list of full reference labels - - Returns: - ref_labels_out (list[str]): list of partial reference labels - """ - # If there is no reference, return empty list - if len(ref_labels) == 0: - return [] - - # If there is no prediction, set the last prediction time to 0 - if len(pred_labels) == 0: - last_pred_time = 0 - else: - # The lastest prediction time in the prediction labels - last_pred_time = max([float(labels.split()[1]) for labels in pred_labels]) - ref_labels_out = [] - for label in ref_labels: - start, end, speaker = label.split() - start, end = float(start), float(end) - # If the current [start, end] interval extends beyond the end of hypothesis time stamps - if start < last_pred_time: - end_time = min(end, last_pred_time) - label = f"{start} {end_time} {speaker}" - ref_labels_out.append(label) - # Other cases where the current [start, end] interval is before the last prediction time - elif end < last_pred_time: - ref_labels_out.append(label) - return ref_labels_out - - -def get_online_DER_stats( - DER: float, - CER: float, - FA: float, - MISS: float, - diar_eval_count: int, - der_stat_dict: Dict[str, float], - deci: int = 3, -) -> Tuple[Dict[str, float], Dict[str, float]]: - """ - For evaluation of online diarization performance, add cumulative, average, and maximum DER/CER. 
- - Args: - DER (float): Diarization Error Rate from the start to the current point - CER (float): Confusion Error Rate from the start to the current point - FA (float): False Alarm from the start to the current point - MISS (float): Miss rate from the start to the current point - diar_eval_count (int): Number of evaluation sessions - der_stat_dict (dict): Dictionary containing cumulative, average, and maximum DER/CER - deci (int): Number of decimal places to round - - Returns: - der_dict (dict): Dictionary containing DER, CER, FA, and MISS - der_stat_dict (dict): Dictionary containing cumulative, average, and maximum DER/CER - """ - der_dict = { - "DER": round(100 * DER, deci), - "CER": round(100 * CER, deci), - "FA": round(100 * FA, deci), - "MISS": round(100 * MISS, deci), - } - der_stat_dict['cum_DER'] += DER - der_stat_dict['cum_CER'] += CER - der_stat_dict['avg_DER'] = round(100 * der_stat_dict['cum_DER'] / diar_eval_count, deci) - der_stat_dict['avg_CER'] = round(100 * der_stat_dict['cum_CER'] / diar_eval_count, deci) - der_stat_dict['max_DER'] = round(max(der_dict['DER'], der_stat_dict['max_DER']), deci) - der_stat_dict['max_CER'] = round(max(der_dict['CER'], der_stat_dict['max_CER']), deci) - return der_dict, der_stat_dict - - -def uem_timeline_from_file(uem_file, uniq_name=''): - """ - Generate pyannote timeline segments for uem file - - file format - UNIQ_SPEAKER_ID CHANNEL START_TIME END_TIME - """ - timeline = Timeline(uri=uniq_name) - with open(uem_file, 'r') as f: - lines = f.readlines() - for line in lines: - line = line.strip() - speaker_id, channel, start_time, end_time = line.split() - timeline.add(Segment(float(start_time), float(end_time))) - - return timeline - - -def score_labels( - AUDIO_RTTM_MAP, all_reference, all_hypothesis, collar=0.25, ignore_overlap=True, verbose: bool = True -) -> Optional[Tuple[DiarizationErrorRate, Dict]]: - """ - Calculate DER, CER, FA and MISS rate from hypotheses and references. Hypothesis results are - coming from Pyannote-formatted speaker diarization results and References are coming from - Pyannote-formatted RTTM data. - - - Args: - AUDIO_RTTM_MAP (dict): Dictionary containing information provided from manifestpath - all_reference (list[uniq_name,Annotation]): reference annotations for score calculation - all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation - verbose (bool): Warns if RTTM file is not found. - - Returns: - metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object. This object contains detailed scores of each audiofile. - mapping (dict): Mapping dict containing the mapping speaker label for each audio input - - < Caveat > - Unlike md-eval.pl, "no score" collar in pyannote.metrics is the maximum length of - "no score" collar from left to right. Therefore, if 0.25s is applied for "no score" - collar in md-eval.pl, 0.5s should be applied for pyannote.metrics. 
- """ - metric = None - if len(all_reference) == len(all_hypothesis): - metric = DiarizationErrorRate(collar=2 * collar, skip_overlap=ignore_overlap) - - mapping_dict = {} - for (reference, hypothesis) in zip(all_reference, all_hypothesis): - ref_key, ref_labels = reference - _, hyp_labels = hypothesis - uem = AUDIO_RTTM_MAP[ref_key].get('uem_filepath', None) - if uem is not None: - uem = uem_timeline_from_file(uem_file=uem, uniq_name=ref_key) - metric(ref_labels, hyp_labels, uem=uem, detailed=True) - mapping_dict[ref_key] = metric.optimal_mapping(ref_labels, hyp_labels) - - DER = abs(metric) - CER = metric['confusion'] / metric['total'] - FA = metric['false alarm'] / metric['total'] - MISS = metric['missed detection'] / metric['total'] - itemized_errors = (DER, CER, FA, MISS) - - logging.info( - "Cumulative Results for collar {} sec and ignore_overlap {}: \n FA: {:.4f}\t MISS {:.4f}\t \ - Diarization ER: {:.4f}\t, Confusion ER:{:.4f}".format( - collar, ignore_overlap, FA, MISS, DER, CER - ) - ) - - return metric, mapping_dict, itemized_errors - elif verbose: - logging.warning( - "Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate" - ) - return None - - -def evaluate_der(audio_rttm_map_dict, all_reference, all_hypothesis, diar_eval_mode='all'): - """ - Evaluate with a selected diarization evaluation scheme - - AUDIO_RTTM_MAP (dict): - Dictionary containing information provided from manifestpath - all_reference (list[uniq_name,annotation]): - reference annotations for score calculation - all_hypothesis (list[uniq_name,annotation]): - hypothesis annotations for score calculation - diar_eval_mode (str): - Diarization evaluation modes - - diar_eval_mode == "full": - DIHARD challenge style evaluation, the most strict way of evaluating diarization - (collar, ignore_overlap) = (0.0, False) - diar_eval_mode == "fair": - Evaluation setup used in VoxSRC challenge - (collar, ignore_overlap) = (0.25, False) - diar_eval_mode == "forgiving": - Traditional evaluation setup - (collar, ignore_overlap) = (0.25, True) - diar_eval_mode == "all": - Compute all three modes (default) - """ - eval_settings = [] - if diar_eval_mode == "full": - eval_settings = [(0.0, False)] - elif diar_eval_mode == "fair": - eval_settings = [(0.25, False)] - elif diar_eval_mode == "forgiving": - eval_settings = [(0.25, True)] - elif diar_eval_mode == "all": - eval_settings = [(0.0, False), (0.25, False), (0.25, True)] - else: - raise ValueError("`diar_eval_mode` variable contains an unsupported value") - - for collar, ignore_overlap in eval_settings: - diar_score = score_labels( - AUDIO_RTTM_MAP=audio_rttm_map_dict, - all_reference=all_reference, - all_hypothesis=all_hypothesis, - collar=collar, - ignore_overlap=ignore_overlap, - ) - return diar_score - - -def calculate_session_cpWER_bruteforce(spk_hypothesis: List[str], spk_reference: List[str]) -> Tuple[float, str, str]: - """ - Calculate cpWER with actual permutations in brute-force way when LSA algorithm cannot deliver the correct result. - - Args: - spk_hypothesis (list): - List containing the hypothesis transcript for each speaker. A list containing the sequence - of words is assigned for each speaker. - - Example: - >>> spk_hypothesis = ["hey how are you we that's nice", "i'm good yes hi is your sister"] - - spk_reference (list): - List containing the reference transcript for each speaker. A list containing the sequence - of words is assigned for each speaker. 
- - Example: - >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"] - - Returns: - cpWER (float): - cpWER value for the given session. - min_perm_hyp_trans (str): - Hypothesis transcript containing the permutation that minimizes WER. Words are separated by spaces. - ref_trans (str): - Reference transcript in an arbitrary permutation. Words are separated by spaces. - """ - p_wer_list, permed_hyp_lists = [], [] - ref_word_list = [] - - # Concatenate the hypothesis transcripts into a list - for spk_id, word_list in enumerate(spk_reference): - ref_word_list.append(word_list) - ref_trans = " ".join(ref_word_list) - - # Calculate WER for every permutation - for hyp_word_list in permutations(spk_hypothesis): - hyp_trans = " ".join(hyp_word_list) - permed_hyp_lists.append(hyp_trans) - - # Calculate a WER value of the permuted and concatenated transcripts - p_wer = word_error_rate(hypotheses=[hyp_trans], references=[ref_trans]) - p_wer_list.append(p_wer) - - # Find the lowest WER and its hypothesis transcript - argmin_idx = np.argmin(p_wer_list) - min_perm_hyp_trans = permed_hyp_lists[argmin_idx] - cpWER = p_wer_list[argmin_idx] - return cpWER, min_perm_hyp_trans, ref_trans - - -def calculate_session_cpWER( - spk_hypothesis: List[str], spk_reference: List[str], use_lsa_only: bool = False -) -> Tuple[float, str, str]: - """ - Calculate a session-level concatenated minimum-permutation word error rate (cpWER) value. cpWER is - a scoring method that can evaluate speaker diarization and speech recognition performance at the same time. - cpWER is calculated by going through the following steps. - - 1. Concatenate all utterances of each speaker for both reference and hypothesis files. - 2. Compute the WER between the reference and all possible speaker permutations of the hypothesis. - 3. Pick the lowest WER among them (this is assumed to be the best permutation: `min_perm_hyp_trans`). - - cpWER was proposed in the following article: - CHiME-6 Challenge: Tackling Multispeaker Speech Recognition for Unsegmented Recordings - https://arxiv.org/pdf/2004.09249.pdf - - Implementation: - - Brute force permutation method for calculating cpWER has a time complexity of `O(n!)`. - - To reduce the computational burden, linear sum assignment (LSA) algorithm is applied - (also known as Hungarian algorithm) to find the permutation that leads to the lowest WER. - - In this implementation, instead of calculating all WER values for all permutation of hypotheses, - we only calculate WER values of (estimated number of speakers) x (reference number of speakers) - combinations with `O(n^2)`) time complexity and then select the permutation that yields the lowest - WER based on LSA algorithm. - - LSA algorithm has `O(n^3)` time complexity in the worst case. - - We cannot use LSA algorithm to find the best permutation when there are more hypothesis speakers - than reference speakers. In this case, we use the brute-force permutation method instead. - - Example: - >>> transcript_A = ['a', 'b', 'c', 'd', 'e', 'f'] # 6 speakers - >>> transcript_B = ['a c b d', 'e f'] # 2 speakers - - [case1] hypothesis is transcript_A, reference is transcript_B - [case2] hypothesis is transcript_B, reference is transcript_A - - LSA algorithm based cpWER is: - [case1] 4/6 (4 deletion) - [case2] 2/6 (2 substitution) - brute force permutation based cpWER is: - [case1] 0 - [case2] 2/6 (2 substitution) - - Args: - spk_hypothesis (list): - List containing the hypothesis transcript for each speaker. 
A list containing the sequence - of words is assigned for each speaker. - - Example: - >>> spk_hypothesis = ["hey how are you we that's nice", "i'm good yes hi is your sister"] - - spk_reference (list): - List containing the reference transcript for each speaker. A list containing the sequence - of words is assigned for each speaker. - - Example: - >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"] - - Returns: - cpWER (float): - cpWER value for the given session. - min_perm_hyp_trans (str): - Hypothesis transcript containing the permutation that minimizes WER. Words are separated by spaces. - ref_trans (str): - Reference transcript in an arbitrary permutation. Words are separated by spaces. - """ - # Get all pairs of (estimated num of spks) x (reference num of spks) combinations - hyp_ref_pair = [spk_hypothesis, spk_reference] - all_pairs = list(itertools.product(*hyp_ref_pair)) - - num_hyp_spks, num_ref_spks = len(spk_hypothesis), len(spk_reference) - - if not use_lsa_only and num_ref_spks < num_hyp_spks: - # Brute force algorithm when there are more speakers in the hypothesis - cpWER, min_perm_hyp_trans, ref_trans = calculate_session_cpWER_bruteforce(spk_hypothesis, spk_reference) - else: - # Calculate WER for each speaker in hypothesis with reference - # There are (number of hyp speakers) x (number of ref speakers) combinations - lsa_wer_list = [] - for (spk_hyp_trans, spk_ref_trans) in all_pairs: - spk_wer = word_error_rate(hypotheses=[spk_hyp_trans], references=[spk_ref_trans]) - lsa_wer_list.append(spk_wer) - - # Make a cost matrix and calculate a linear sum assignment on the cost matrix. - # Row is hypothesis index and column is reference index - cost_wer = torch.tensor(lsa_wer_list).reshape([len(spk_hypothesis), len(spk_reference)]) - row_hyp_ind, col_ref_ind = linear_sum_assignment(cost_wer) - - # In case where hypothesis has more speakers, add words from residual speakers - hyp_permed = [spk_hypothesis[k] for k in np.argsort(col_ref_ind)] - min_perm_hyp_trans = " ".join(hyp_permed) - - # Concatenate the reference transcripts into a string variable - ref_trans = " ".join(spk_reference) - - # Calculate a WER value from the permutation that yields the lowest WER. - cpWER = word_error_rate(hypotheses=[min_perm_hyp_trans], references=[ref_trans]) - - return cpWER, min_perm_hyp_trans, ref_trans - - -def concat_perm_word_error_rate( - spk_hypotheses: List[List[str]], spk_references: List[List[str]] -) -> Tuple[List[float], List[str], List[str]]: - """ - Launcher function for `calculate_session_cpWER`. Calculate session-level cpWER and average cpWER. - For detailed information about cpWER, see docstrings of `calculate_session_cpWER` function. - - As opposed to `cpWER`, `WER` is the regular WER value where the hypothesis transcript contains - words in temporal order regardless of the speakers. `WER` value can be different from cpWER value, - depending on the speaker diarization results. - - Args: - spk_hypotheses (list): - List containing the lists of speaker-separated hypothesis transcripts. - spk_references (list): - List containing the lists of speaker-separated reference transcripts. 
- - Returns: - cpWER (float): - List containing cpWER values for each session - min_perm_hyp_trans (list): - List containing transcripts that lead to the minimum WER in string format - ref_trans (list): - List containing concatenated reference transcripts - """ - if len(spk_hypotheses) != len(spk_references): - raise ValueError( - "In concatenated-minimum permutation word error rate calculation, " - "hypotheses and reference lists must have the same number of elements. But got arguments:" - f"{len(spk_hypotheses)} and {len(spk_references)} correspondingly" - ) - cpWER_values, hyps_spk, refs_spk = [], [], [] - for (spk_hypothesis, spk_reference) in zip(spk_hypotheses, spk_references): - cpWER, min_hypothesis, concat_reference = calculate_session_cpWER(spk_hypothesis, spk_reference) - cpWER_values.append(cpWER) - hyps_spk.append(min_hypothesis) - refs_spk.append(concat_reference) - return cpWER_values, hyps_spk, refs_spk diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/multi_binary_acc.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/multi_binary_acc.py deleted file mode 100644 index 8cc21c53ad82121e8be2c0dc8483a92757ad1e0e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/multi_binary_acc.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -import torch -from torchmetrics import Metric - -__all__ = ['MultiBinaryAccuracy'] - - -class MultiBinaryAccuracy(Metric): - """ - This metric computes accuracies that are needed to evaluate multiple binary outputs. - For example, if a model returns a set of multiple sigmoid outputs per each sample or at each time step, - F1 score can be calculated to monitor Type 1 error and Type 2 error together. - - Example: - def validation_step(self, batch, batch_idx): - ... - signals, signal_lengths, targets = batch - preds, _ = self.forward(input_signal=signals, - signal_lengths=signal_lengths, - targets=targets) - loss = self.loss(logits=preds, labels=targets) - self._accuracy_valid(preds, targets, signal_lengths) - f1_acc = self._accuracy.compute() - self.val_outputs = {'val_loss': loss, 'val_f1_acc': f1_acc} - return self.val_outputs - - def on_validation_epoch_end(self): - ... - val_loss_mean = torch.stack([x['val_loss'] for x in self.val_outputs]).mean() - correct_counts = torch.stack([x['val_correct_counts'] for x in self.val_outputs]).sum(axis=0) - total_counts = torch.stack([x['val_total_counts'] for x in self.val_outputs]).sum(axis=0) - - self._accuracy_valid.correct_counts_k = correct_counts - self._accuracy_valid.total_counts_k = total_counts - f1_acc = self._accuracy_valid.compute() - self._accuracy_valid.reset() - - self.log('val_loss', val_loss_mean) - self.log('val_f1_acc', f1_acc) - self.val_outputs.clear() # free memory - return {'val_loss': val_loss_mean, 'val_f1_acc': f1_acc} - - Args: - preds (torch.Tensor): - Predicted values which should be in range of [0, 1]. 
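The LSA-based speaker mapping described in `calculate_session_cpWER` can be illustrated with a short, self-contained sketch. Here `toy_wer` is a stand-in for NeMo's `word_error_rate` (word-level Levenshtein distance divided by the reference length), and every `toy_*` name is illustrative rather than part of the deleted module:

```python
import editdistance
import numpy as np
from scipy.optimize import linear_sum_assignment


def toy_wer(hyp: str, ref: str) -> float:
    # Word-level Levenshtein distance normalized by the reference length.
    h, r = hyp.split(), ref.split()
    return editdistance.eval(h, r) / max(len(r), 1)


def toy_session_cpwer(spk_hypothesis, spk_reference):
    # O(n^2) cost matrix: WER of every (hypothesis speaker, reference speaker) pair.
    cost = np.array([[toy_wer(h, r) for r in spk_reference] for h in spk_hypothesis])
    # Hungarian algorithm picks the speaker assignment with the lowest total WER.
    _, col_ind = linear_sum_assignment(cost)
    # Reorder hypothesis speakers to follow the reference order, then concatenate and re-score.
    hyp_permed = [spk_hypothesis[k] for k in np.argsort(col_ind)]
    min_perm_hyp_trans = " ".join(hyp_permed)
    ref_trans = " ".join(spk_reference)
    return toy_wer(min_perm_hyp_trans, ref_trans), min_perm_hyp_trans, ref_trans


cpwer, hyp_trans, ref_trans = toy_session_cpwer(
    ["i'm good yes hi is your sister", "hey how are you we that's nice"],
    ["hi how are you well that's nice", "i'm good yeah how is your sister"],
)
```

As in the deleted implementation, the brute-force `permutations` path is reserved for the case where the hypothesis contains more speakers than the reference; otherwise the assignment above supplies the permutation that gets scored.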
- targets (torch.Tensor): - Target values which should be in range of [0, 1]. - signal_lengths (torch.Tensor): - Length of each sequence in the batch input. signal_lengths values are used to - filter out zero-padded parts in each sequence. - - Returns: - f1_score (torch.Tensor): - F1 score calculated from the predicted value and binarized target values. - """ - - full_state_update = False - - def __init__(self, dist_sync_on_step=False): - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.total_correct_counts = 0 - self.total_sample_counts = 0 - self.true_positive_count = 0 - self.false_positive_count = 0 - self.false_negative_count = 0 - - def update(self, preds: torch.Tensor, targets: torch.Tensor, signal_lengths: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - preds_list = [preds[k, : signal_lengths[k], :] for k in range(preds.shape[0])] - targets_list = [targets[k, : signal_lengths[k], :] for k in range(targets.shape[0])] - self.preds = torch.cat(preds_list, dim=0) - self.targets = torch.cat(targets_list, dim=0) - - self.true = self.preds.round().bool() == self.targets.round().bool() - self.false = self.preds.round().bool() != self.targets.round().bool() - self.positive = self.preds.round().bool() == 1 - self.negative = self.preds.round().bool() == 0 - - self.positive_count = torch.sum(self.preds.round().bool() == True) - self.true_positive_count += torch.sum(torch.logical_and(self.true, self.positive)) - self.false_positive_count += torch.sum(torch.logical_and(self.false, self.positive)) - self.false_negative_count += torch.sum(torch.logical_and(self.false, self.negative)) - - self.total_correct_counts += torch.sum(self.preds.round().bool() == self.targets.round().bool()) - self.total_sample_counts += torch.prod(torch.tensor(self.targets.shape)) - - def compute(self): - """ - Compute F1 score from the accumulated values. Return -1 if the F1 score is NaN. - """ - self.precision = self.true_positive_count / (self.true_positive_count + self.false_positive_count) - self.recall = self.true_positive_count / (self.true_positive_count + self.false_negative_count) - self.f1_score = 2 * self.precision * self.recall / (self.precision + self.recall) - if torch.isnan(self.f1_score): - logging.warn("self.f1_score contains NaN value. Returning -1 instead of NaN value.") - self.f1_score = -1 - return self.f1_score diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer.py deleted file mode 100644 index 5518c8f0a25cf509bb207cb8c2a2dac2a2e0d53b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer.py +++ /dev/null @@ -1,1326 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
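The F1 computation in `MultiBinaryAccuracy` above reduces to thresholding sigmoid outputs and targets at 0.5 and accumulating true-positive, false-positive and false-negative counts. A framework-free sketch of that arithmetic, without the `torchmetrics` state synchronization (the `toy_` name is illustrative):

```python
import torch


def toy_multi_binary_f1(preds: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    # Binarize at 0.5, mirroring the .round().bool() calls in the metric above.
    p = preds.round().bool()
    t = targets.round().bool()
    tp = (p & t).sum()
    fp = (p & ~t).sum()
    fn = (~p & t).sum()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)


preds = torch.tensor([[0.9, 0.2], [0.4, 0.8]])
targets = torch.tensor([[1.0, 0.0], [1.0, 1.0]])
print(toy_multi_binary_f1(preds, targets))  # tensor(0.8000)
```

The deleted class additionally slices each sequence to `signal_lengths[k]` before counting, so zero-padded frames never contribute to the statistics.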
- -import copy -import re -from abc import abstractmethod -from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Tuple, Union - -import editdistance -import numpy as np -import torch -from omegaconf import OmegaConf -from torchmetrics import Metric - -from nemo.collections.asr.metrics.wer import move_dimension_to_the_front -from nemo.collections.asr.parts.submodules import rnnt_beam_decoding as beam_decode -from nemo.collections.asr.parts.submodules import rnnt_greedy_decoding as greedy_decode -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMixin -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses -from nemo.utils import logging - -__all__ = ['RNNTDecoding', 'RNNTWER'] - - -class AbstractRNNTDecoding(ConfidenceMixin): - """ - Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy, greedy_batch (for greedy decoding). - - beam, tsd, alsd (for beam search decoding). - - compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded - tokens as well as the decoded string. Default is False in order to avoid double decoding - unless required. - - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function - with the `return_hypotheses` flag set to True. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrete intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated. - Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps and "all" (default), for both character level and word level time stamps. - - word_seperator: Str token representing the seperator between words. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of ints. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding (sample / batched). 
When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - The config may further contain the following sub-dictionaries: - "greedy": - max_symbols: int, describing the maximum number of target tokens to decode per - timestep during greedy decoding. Setting to larger values allows longer sentences - to be decoded, at the cost of increased execution time. - preserve_frame_confidence: Same as above, overrides above value. 
- confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. This might be slightly different - results compared to the greedy search above. - - score_norm: optional bool, whether to normalize the returned beam score in the hypotheses. - Set to True by default. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. This flag is set by default. - - tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols - per timestep of the acoustic model. Larger values will allow longer sentences to be decoded, - at increased cost to execution time. - - alsd_max_target_len: optional int or float, determines the potential maximum target sequence length. - If an integer is provided, it can decode sequences of that particular maximum length. - If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), - where seq_len is the length of the acoustic model output (T). - - NOTE: - If a float is provided, it can be greater than 1! - By default, a float of 2.0 is used so that a target sequence can be at most twice - as long as the acoustic model output length T. - - maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, - and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 - in order to reduce expensive beam search cost later. int >= 0. - - maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. - Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next step. - - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) - where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be - predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for - expansion apart from the "most likely" candidate. - Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed - but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, - thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally - tuned on a validation set. - - softmax_temperature: Scales the logits of the joint prior to computing log_softmax. - - decoder: The Decoder/Prediction network module. - joint: The Joint network module. - blank_id: The id of the RNNT blank token. 
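The Gibbs, Tsallis and Rényi options above have closed-form expressions, so a single-frame sketch of the raw entropy values is easy to write down. The snippet below follows the formulas in this docstring only; it skips the 'lin'/'exp' normalization into [0, 1] and assumes α ≠ 1 for the Tsallis and Rényi branches, so it is not a substitute for NeMo's confidence-measure code:

```python
import torch


def toy_frame_entropy(probs: torch.Tensor, kind: str = "gibbs", alpha: float = 0.5) -> float:
    # probs: 1-D probability distribution over the vocabulary (plus blank) for one frame.
    p_a = probs.pow(alpha)
    if kind == "gibbs":
        h = -(p_a * p_a.log()).sum()           # H_a = -sum_i p_i^a * log(p_i^a)
    elif kind == "tsallis":
        h = (1.0 - p_a.sum()) / (alpha - 1.0)  # H_a = 1/(a-1) * (1 - sum_i p_i^a)
    elif kind == "renyi":
        h = p_a.sum().log2() / (1.0 - alpha)   # H_a = 1/(1-a) * log2(sum_i p_i^a)
    else:
        raise ValueError(f"unknown entropy type: {kind}")
    return float(h)


probs = torch.softmax(torch.tensor([2.0, 0.5, 0.1, 0.1]), dim=-1)
for kind in ("gibbs", "tsallis", "renyi"):
    print(kind, toy_frame_entropy(probs, kind=kind))
```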
- """ - - def __init__(self, decoding_cfg, decoder, joint, blank_id: int): - super(AbstractRNNTDecoding, self).__init__() - - # Convert dataclass to config object - if is_dataclass(decoding_cfg): - decoding_cfg = OmegaConf.structured(decoding_cfg) - - self.cfg = decoding_cfg - self.blank_id = blank_id - self.num_extra_outputs = joint.num_extra_outputs - self.big_blank_durations = self.cfg.get("big_blank_durations", None) - self.durations = self.cfg.get("durations", None) - self.compute_hypothesis_token_set = self.cfg.get("compute_hypothesis_token_set", False) - self.compute_langs = decoding_cfg.get('compute_langs', False) - self.preserve_alignments = self.cfg.get('preserve_alignments', None) - self.joint_fused_batch_size = self.cfg.get('fused_batch_size', None) - self.compute_timestamps = self.cfg.get('compute_timestamps', None) - self.word_seperator = self.cfg.get('word_seperator', ' ') - - if self.durations is not None: # this means it's a TDT model. - if blank_id == 0: - raise ValueError("blank_id must equal len(non_blank_vocabs) for TDT models") - if self.big_blank_durations is not None: - raise ValueError("duration and big_blank_durations can't both be not None") - if self.cfg.strategy not in ['greedy', 'greedy_batch']: - raise ValueError("currently only greedy and greedy_batch inference is supported for TDT models") - - if self.big_blank_durations is not None: # this means it's a multi-blank model. - if blank_id == 0: - raise ValueError("blank_id must equal len(vocabs) for multi-blank RNN-T models") - if self.cfg.strategy not in ['greedy', 'greedy_batch']: - raise ValueError( - "currently only greedy and greedy_batch inference is supported for multi-blank models" - ) - - possible_strategies = ['greedy', 'greedy_batch', 'beam', 'tsd', 'alsd', 'maes'] - if self.cfg.strategy not in possible_strategies: - raise ValueError(f"Decoding strategy must be one of {possible_strategies}") - - # Update preserve alignments - if self.preserve_alignments is None: - if self.cfg.strategy in ['greedy', 'greedy_batch']: - self.preserve_alignments = self.cfg.greedy.get('preserve_alignments', False) - - elif self.cfg.strategy in ['beam', 'tsd', 'alsd', 'maes']: - self.preserve_alignments = self.cfg.beam.get('preserve_alignments', False) - - # Update compute timestamps - if self.compute_timestamps is None: - if self.cfg.strategy in ['greedy', 'greedy_batch']: - self.compute_timestamps = self.cfg.greedy.get('compute_timestamps', False) - - elif self.cfg.strategy in ['beam', 'tsd', 'alsd', 'maes']: - self.compute_timestamps = self.cfg.beam.get('compute_timestamps', False) - - # Test if alignments are being preserved for RNNT - if self.compute_timestamps is True and self.preserve_alignments is False: - raise ValueError("If `compute_timesteps` flag is set, then `preserve_alignments` flag must also be set.") - - # initialize confidence-related fields - self._init_confidence(self.cfg.get('confidence_cfg', None)) - - # Confidence estimation is not implemented for these strategies - if ( - not self.preserve_frame_confidence - and self.cfg.strategy in ['beam', 'tsd', 'alsd', 'maes'] - and self.cfg.beam.get('preserve_frame_confidence', False) - ): - raise NotImplementedError(f"Confidence calculation is not supported for strategy `{self.cfg.strategy}`") - - if self.cfg.strategy == 'greedy': - if self.big_blank_durations is None: - if self.durations is None: - self.decoding = greedy_decode.GreedyRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - max_symbols_per_step=( - 
self.cfg.greedy.get('max_symbols', None) - or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - else: - self.decoding = greedy_decode.GreedyTDTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - durations=self.durations, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) - or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - else: - self.decoding = greedy_decode.GreedyMultiblankRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - big_blank_durations=self.big_blank_durations, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - - elif self.cfg.strategy == 'greedy_batch': - if self.big_blank_durations is None: - if self.durations is None: - self.decoding = greedy_decode.GreedyBatchedRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) - or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - else: - self.decoding = greedy_decode.GreedyBatchedTDTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - durations=self.durations, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) - or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - - else: - self.decoding = greedy_decode.GreedyBatchedMultiblankRNNTInfer( - decoder_model=decoder, - joint_model=joint, - blank_index=self.blank_id, - big_blank_durations=self.big_blank_durations, - max_symbols_per_step=( - self.cfg.greedy.get('max_symbols', None) or self.cfg.greedy.get('max_symbols_per_step', None) - ), - preserve_alignments=self.preserve_alignments, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - - elif self.cfg.strategy == 'beam': - - self.decoding = beam_decode.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='default', - score_norm=self.cfg.beam.get('score_norm', True), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ) - - elif self.cfg.strategy == 'tsd': - - self.decoding = beam_decode.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='tsd', - score_norm=self.cfg.beam.get('score_norm', True), - tsd_max_sym_exp_per_step=self.cfg.beam.get('tsd_max_sym_exp', 10), - 
softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ) - - elif self.cfg.strategy == 'alsd': - - self.decoding = beam_decode.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='alsd', - score_norm=self.cfg.beam.get('score_norm', True), - alsd_max_target_len=self.cfg.beam.get('alsd_max_target_len', 2), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ) - - elif self.cfg.strategy == 'maes': - - self.decoding = beam_decode.BeamRNNTInfer( - decoder_model=decoder, - joint_model=joint, - beam_size=self.cfg.beam.beam_size, - return_best_hypothesis=decoding_cfg.beam.get('return_best_hypothesis', True), - search_type='maes', - score_norm=self.cfg.beam.get('score_norm', True), - maes_num_steps=self.cfg.beam.get('maes_num_steps', 2), - maes_prefix_alpha=self.cfg.beam.get('maes_prefix_alpha', 1), - maes_expansion_gamma=self.cfg.beam.get('maes_expansion_gamma', 2.3), - maes_expansion_beta=self.cfg.beam.get('maes_expansion_beta', 2.0), - softmax_temperature=self.cfg.beam.get('softmax_temperature', 1.0), - preserve_alignments=self.preserve_alignments, - ngram_lm_model=self.cfg.beam.get('ngram_lm_model', None), - ngram_lm_alpha=self.cfg.beam.get('ngram_lm_alpha', 0.0), - hat_subtract_ilm=self.cfg.beam.get('hat_subtract_ilm', False), - hat_ilm_weight=self.cfg.beam.get('hat_ilm_weight', 0.0), - ) - - else: - - raise ValueError( - f"Incorrect decoding strategy supplied. Must be one of {possible_strategies}\n" - f"but was provided {self.cfg.strategy}" - ) - - # Update the joint fused batch size or disable it entirely if needed. - self.update_joint_fused_batch_size() - - def rnnt_decoder_predictions_tensor( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - return_hypotheses: bool = False, - partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: - """ - Decode an encoder output by autoregressive decoding of the Decoder+Joint networks. - - Args: - encoder_output: torch.Tensor of shape [B, D, T]. - encoded_lengths: torch.Tensor containing lengths of the padded encoder outputs. Shape [B]. - return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses - - Returns: - If `return_best_hypothesis` is set: - A tuple (hypotheses, None): - hypotheses - list of Hypothesis (best hypothesis per sample). - Look at rnnt_utils.Hypothesis for more information. - - If `return_best_hypothesis` is not set: - A tuple(hypotheses, all_hypotheses) - hypotheses - list of Hypothesis (best hypothesis per sample). - Look at rnnt_utils.Hypothesis for more information. - all_hypotheses - list of NBestHypotheses. Each NBestHypotheses further contains a sorted - list of all the hypotheses of the model per sample. - Look at rnnt_utils.NBestHypotheses for more information. 
- """ - # Compute hypotheses - with torch.inference_mode(): - hypotheses_list = self.decoding( - encoder_output=encoder_output, encoded_lengths=encoded_lengths, partial_hypotheses=partial_hypotheses - ) # type: [List[Hypothesis]] - - # extract the hypotheses - hypotheses_list = hypotheses_list[0] # type: List[Hypothesis] - - prediction_list = hypotheses_list - - if isinstance(prediction_list[0], NBestHypotheses): - hypotheses = [] - all_hypotheses = [] - - for nbest_hyp in prediction_list: # type: NBestHypotheses - n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample - decoded_hyps = self.decode_hypothesis(n_hyps) # type: List[str] - - # If computing timestamps - if self.compute_timestamps is True: - timestamp_type = self.cfg.get('rnnt_timestamp_type', 'all') - for hyp_idx in range(len(decoded_hyps)): - decoded_hyps[hyp_idx] = self.compute_rnnt_timestamps(decoded_hyps[hyp_idx], timestamp_type) - - hypotheses.append(decoded_hyps[0]) # best hypothesis - all_hypotheses.append(decoded_hyps) - - if return_hypotheses: - return hypotheses, all_hypotheses - - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text - - else: - hypotheses = self.decode_hypothesis(prediction_list) # type: List[str] - - # If computing timestamps - if self.compute_timestamps is True: - timestamp_type = self.cfg.get('rnnt_timestamp_type', 'all') - for hyp_idx in range(len(hypotheses)): - hypotheses[hyp_idx] = self.compute_rnnt_timestamps(hypotheses[hyp_idx], timestamp_type) - - if return_hypotheses: - # greedy decoding, can get high-level confidence scores - if self.preserve_frame_confidence and ( - self.preserve_word_confidence or self.preserve_token_confidence - ): - hypotheses = self.compute_confidence(hypotheses) - return hypotheses, None - - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None - - def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: - """ - Decode a list of hypotheses into a list of strings. - - Args: - hypotheses_list: List of Hypothesis. - - Returns: - A list of strings. - """ - for ind in range(len(hypotheses_list)): - # Extract the integer encoded hypothesis - prediction = hypotheses_list[ind].y_sequence - - if type(prediction) != list: - prediction = prediction.tolist() - - # RNN-T sample level is already preprocessed by implicit RNNT decoding - # Simply remove any blank and possibly big blank tokens - if self.big_blank_durations is not None: # multi-blank RNNT - num_extra_outputs = len(self.big_blank_durations) - prediction = [p for p in prediction if p < self.blank_id - num_extra_outputs] - elif self.durations is not None: # TDT model. - prediction = [p for p in prediction if p < self.blank_id] - else: # standard RNN-T - prediction = [p for p in prediction if p != self.blank_id] - - # De-tokenize the integer tokens; if not computing timestamps - if self.compute_timestamps is True: - # keep the original predictions, wrap with the number of repetitions per token and alignments - # this is done so that `rnnt_decoder_predictions_tensor()` can process this hypothesis - # in order to compute exact time stamps. 
- alignments = copy.deepcopy(hypotheses_list[ind].alignments) - token_repetitions = [1] * len(alignments) # preserve number of repetitions per token - hypothesis = (prediction, alignments, token_repetitions) - else: - hypothesis = self.decode_tokens_to_str(prediction) - - # TODO: remove - # collapse leading spaces before . , ? for PC models - hypothesis = re.sub(r'(\s+)([\.\,\?])', r'\2', hypothesis) - - if self.compute_hypothesis_token_set: - hypotheses_list[ind].tokens = self.decode_ids_to_tokens(prediction) - - # De-tokenize the integer tokens - hypotheses_list[ind].text = hypothesis - - return hypotheses_list - - def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothesis]: - """ - Computes high-level (per-token and/or per-word) confidence scores for a list of hypotheses. - Assumes that `frame_confidence` is present in the hypotheses. - - Args: - hypotheses_list: List of Hypothesis. - - Returns: - A list of hypotheses with high-level confidence scores. - """ - if self.exclude_blank_from_confidence: - for hyp in hypotheses_list: - hyp.token_confidence = hyp.non_blank_frame_confidence - else: - for hyp in hypotheses_list: - offset = 0 - token_confidence = [] - if len(hyp.timestep) > 0: - for ts, te in zip(hyp.timestep, hyp.timestep[1:] + [len(hyp.frame_confidence)]): - if ts != te: - # tokens are considered to belong to the last non-blank token, if any. - token_confidence.append( - self._aggregate_confidence( - [hyp.frame_confidence[ts][offset]] - + [fc[0] for fc in hyp.frame_confidence[ts + 1 : te]] - ) - ) - offset = 0 - else: - token_confidence.append(hyp.frame_confidence[ts][offset]) - offset += 1 - hyp.token_confidence = token_confidence - if self.preserve_word_confidence: - for hyp in hypotheses_list: - hyp.word_confidence = self._aggregate_token_confidence(hyp) - return hypotheses_list - - @abstractmethod - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token id list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - raise NotImplementedError() - - @abstractmethod - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded tokens. - """ - raise NotImplementedError() - - @abstractmethod - def decode_tokens_to_lang(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to - compute the most likely language ID (LID) string given the tokens. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded LID string. - """ - raise NotImplementedError() - - @abstractmethod - def decode_ids_to_langs(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to - decode a token id list into language ID (LID) list. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded LIDS. - """ - raise NotImplementedError() - - def update_joint_fused_batch_size(self): - if self.joint_fused_batch_size is None: - # do nothing and let the Joint itself handle setting up of the fused batch - return - - if not hasattr(self.decoding.joint, 'set_fused_batch_size'): - logging.warning( - "The joint module does not have `set_fused_batch_size(int)` as a setter function.\n" - "Ignoring update of joint fused batch size." 
- ) - return - - if not hasattr(self.decoding.joint, 'set_fuse_loss_wer'): - logging.warning( - "The joint module does not have `set_fuse_loss_wer(bool, RNNTLoss, RNNTWER)` " - "as a setter function.\n" - "Ignoring update of joint fused batch size." - ) - return - - if self.joint_fused_batch_size > 0: - self.decoding.joint.set_fused_batch_size(self.joint_fused_batch_size) - else: - logging.info("Joint fused batch size <= 0; Will temporarily disable fused batch step in the Joint.") - self.decoding.joint.set_fuse_loss_wer(False) - - def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = "all"): - assert timestamp_type in ['char', 'word', 'all'] - - # Unpack the temporary storage - decoded_prediction, alignments, token_repetitions = hypothesis.text - - # Retrieve offsets - char_offsets = word_offsets = None - char_offsets = self._compute_offsets(hypothesis, token_repetitions, self.blank_id) - - # finally, set the flattened decoded predictions to text field for later text decoding - hypothesis.text = decoded_prediction - - # Assert number of offsets and hypothesis tokens are 1:1 match. - num_flattened_tokens = 0 - for t in range(len(char_offsets)): - # Subtract one here for the extra RNNT BLANK token emitted to designate "End of timestep" - num_flattened_tokens += len(char_offsets[t]['char']) - 1 - - if num_flattened_tokens != len(hypothesis.text): - raise ValueError( - f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" - " have to be of the same length, but are: " - f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:" - f" {len(hypothesis.text)}" - ) - - encoded_char_offsets = copy.deepcopy(char_offsets) - - # Correctly process the token ids to chars/subwords. - for i, offsets in enumerate(char_offsets): - decoded_chars = [] - for char in offsets['char'][:-1]: # ignore the RNNT Blank token at end of every timestep with -1 subset - decoded_chars.append(self.decode_tokens_to_str([int(char)])) - char_offsets[i]["char"] = decoded_chars - - # detect char vs subword models - lens = [] - for v in char_offsets: - tokens = v["char"] - # each token may be either 1 unicode token or multiple unicode token - # for character based models, only 1 token is used - # for subword, more than one token can be used. - # Computing max, then summing up total lens is a test to check for char vs subword - # For char models, len(lens) == sum(lens) - # but this is violated for subword models. - max_len = max(len(c) for c in tokens) - lens.append(max_len) - - # array of one or more chars implies subword based model with multiple char emitted per TxU step (via subword) - if sum(lens) > len(lens): - text_type = 'subword' - else: - # full array of ones implies character based model with 1 char emitted per TxU step - text_type = 'char' - - # retrieve word offsets from character offsets - word_offsets = None - if timestamp_type in ['word', 'all']: - if text_type == 'char': - word_offsets = self._get_word_offsets_chars(char_offsets, word_delimiter_char=self.word_seperator) - else: - # utilize the copy of char offsets with the correct integer ids for tokens - # so as to avoid tokenize -> detokenize -> compare -> merge steps. 
- word_offsets = self._get_word_offsets_subwords_sentencepiece( - encoded_char_offsets, - hypothesis, - decode_ids_to_tokens=self.decode_ids_to_tokens, - decode_tokens_to_str=self.decode_tokens_to_str, - ) - - # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep - else: - timestep_info = [] - - # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} - - # Add char / subword time stamps - if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets - - # Add word time stamps - if word_offsets is not None and timestamp_type in ['word', 'all']: - hypothesis.timestep['word'] = word_offsets - - # Convert the flattened token indices to text - hypothesis.text = self.decode_tokens_to_str(hypothesis.text) - - return hypothesis - - @staticmethod - def _compute_offsets( - hypothesis: Hypothesis, token_repetitions: List[int], rnnt_token: int - ) -> List[Dict[str, Union[str, int]]]: - """ - Utility method that calculates the indidual time indices where a token starts and ends. - - Args: - hypothesis: A Hypothesis object that contains `text` field that holds the character / subword token - emitted at every time step after rnnt collapse. - token_repetitions: A list of ints representing the number of repetitions of each emitted token. - rnnt_token: The integer of the rnnt blank token used during rnnt collapse. - - Returns: - - """ - start_index = 0 - - # If the exact timestep information is available, utilize the 1st non-rnnt blank token timestep - # as the start index. - if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - start_index = max(0, hypothesis.timestep[0] - 1) - - # Construct the start and end indices brackets - end_indices = np.asarray(token_repetitions).cumsum() - start_indices = np.concatenate(([start_index], end_indices[:-1])) - - # Process the TxU dangling alignment tensor, containing pairs of (logits, label) - alignment_labels = [al_logits_labels for al_logits_labels in hypothesis.text[1]] - for t in range(len(alignment_labels)): - for u in range(len(alignment_labels[t])): - alignment_labels[t][u] = alignment_labels[t][u][1] # pick label from (logit, label) tuple - - # Merge the results per token into a list of dictionaries - offsets = [ - {"char": a, "start_offset": s, "end_offset": e} - for a, s, e in zip(alignment_labels, start_indices, end_indices) - ] - - # Filter out RNNT token (blank at [t][0] position). This is because blank can only occur at end of a - # time step for RNNT, so if 0th token is blank, then that timestep is skipped. - offsets = list(filter(lambda offsets: offsets["char"][0] != rnnt_token, offsets)) - return offsets - - @staticmethod - def _get_word_offsets_chars( - offsets: Dict[str, Union[str, float]], word_delimiter_char: str = " " - ) -> Dict[str, Union[str, float]]: - """ - Utility method which constructs word time stamps out of character time stamps. - - References: - This code is a port of the Hugging Face code for word time stamp construction. - - Args: - offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset". - word_delimiter_char: Character token that represents the word delimiter. By default, " ". - - Returns: - A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and - "end_offset". 
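The index bookkeeping in `_compute_offsets` boils down to a cumulative sum over per-token repetition counts. A toy, self-contained version of that step, with made-up tokens and counts, shows how the start/end brackets are produced:

```python
import numpy as np

# Three emitted tokens spanning 2, 1 and 3 time steps respectively (made-up values).
token_repetitions = [2, 1, 3]
start_index = 0  # would be max(0, hypothesis.timestep[0] - 1) when timestep info is available

end_indices = np.asarray(token_repetitions).cumsum()               # [2, 3, 6]
start_indices = np.concatenate(([start_index], end_indices[:-1]))  # [0, 2, 3]

offsets = [
    {"char": tok, "start_offset": int(s), "end_offset": int(e)}
    for tok, s, e in zip(["he", "llo", "!"], start_indices, end_indices)
]
```

In the deleted method the `char` entries are the per-step alignment labels rather than pre-decoded strings, and any step whose first label is the RNNT blank is filtered out afterwards.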
- """ - word_offsets = [] - - last_state = "SPACE" - word = "" - start_offset = 0 - end_offset = 0 - for i, offset in enumerate(offsets): - chars = offset["char"] - for char in chars: - state = "SPACE" if char == word_delimiter_char else "WORD" - - if state == last_state: - # If we are in the same state as before, we simply repeat what we've done before - end_offset = offset["end_offset"] - word += char - else: - # Switching state - if state == "SPACE": - # Finishing a word - word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset}) - else: - # Starting a new word - start_offset = offset["start_offset"] - end_offset = offset["end_offset"] - word = char - - last_state = state - - if last_state == "WORD": - word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset}) - - return word_offsets - - @staticmethod - def _get_word_offsets_subwords_sentencepiece( - offsets: Dict[str, Union[str, float]], - hypothesis: Hypothesis, - decode_ids_to_tokens: Callable[[List[int]], str], - decode_tokens_to_str: Callable[[List[int]], str], - ) -> Dict[str, Union[str, float]]: - """ - Utility method which constructs word time stamps out of sub-word time stamps. - - **Note**: Only supports Sentencepiece based tokenizers ! - - Args: - offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset". - hypothesis: Hypothesis object that contains `text` field, where each token is a sub-word id - after rnnt collapse. - decode_ids_to_tokens: A Callable function that accepts a list of integers and maps it to a sub-word. - decode_tokens_to_str: A Callable function that accepts a list of integers and maps it to text / str. - - Returns: - A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and - "end_offset". - """ - word_offsets = [] - built_token = [] - previous_token_index = 0 - # For every offset token - for i, offset in enumerate(offsets): - # For every subword token in offset token list (ignoring the RNNT Blank token at the end) - for char in offset['char'][:-1]: - char = int(char) - - # Compute the sub-word text representation, and the decoded text (stripped of sub-word markers). - token = decode_ids_to_tokens([char])[0] - token_text = decode_tokens_to_str([char]) - - # It is a sub-word token, or contains an identifier at the beginning such as _ or ## that was stripped - # after forcing partial text conversion of the token. - if token != token_text: - # If there are any partially or fully built sub-word token ids, construct to text. - # Note: This is "old" subword, that occurs *after* current sub-word has started. - if built_token: - word_offsets.append( - { - "word": decode_tokens_to_str(built_token), - "start_offset": offsets[previous_token_index]["start_offset"], - "end_offset": offsets[i]["start_offset"], - } - ) - - # Prepare list of new sub-word ids - built_token.clear() - built_token.append(char) - previous_token_index = i - else: - # If the token does not contain any sub-word start mark, then the sub-word has not completed yet - # Append to current sub-word list. - built_token.append(char) - - # Inject the start offset of the first token to word offsets - # This is because we always skip the delay the injection of the first sub-word due to the loop - # condition and check whether built token is ready or not. - # Therefore without this forced injection, the start_offset appears as off by 1. - # This should only be done when these arrays contain more than one element. 
- if offsets and word_offsets: - word_offsets[0]["start_offset"] = offsets[0]["start_offset"] - - # If there are any remaining tokens left, inject them all into the final word offset. - # The start offset of this token is the start time of the next token to process. - # The end offset of this token is the end time of the last token from offsets. - # Note that built_token is a flat list; but offsets contains a nested list which - # may have different dimensionality. - # As such, we can't rely on the length of the list of built_token to index offsets. - if built_token: - # start from the previous token index as this hasn't been committed to word_offsets yet - # if we still have content in built_token - start_offset = offsets[previous_token_index]["start_offset"] - word_offsets.append( - { - "word": decode_tokens_to_str(built_token), - "start_offset": start_offset, - "end_offset": offsets[-1]["end_offset"], - } - ) - built_token.clear() - - return word_offsets - - -class RNNTDecoding(AbstractRNNTDecoding): - """ - Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy, greedy_batch (for greedy decoding). - - beam, tsd, alsd (for beam search decoding). - - compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded - tokens as well as the decoded string. Default is False in order to avoid double decoding - unless required. - - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function - with the `return_hypotheses` flag set to True. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. 
- preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - The config may further contain the following sub-dictionaries: - "greedy": - max_symbols: int, describing the maximum number of target tokens to decode per - timestep during greedy decoding. Setting to larger values allows longer sentences - to be decoded, at the cost of increased execution time. - - preserve_frame_confidence: Same as above, overrides above value. - - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. This might be slightly different - results compared to the greedy search above. - - score_norm: optional bool, whether to normalize the returned beam score in the hypotheses. - Set to True by default. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. This flag is set by default. - - tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols - per timestep of the acoustic model. 
Larger values will allow longer sentences to be decoded, - at increased cost to execution time. - - alsd_max_target_len: optional int or float, determines the potential maximum target sequence length. - If an integer is provided, it can decode sequences of that particular maximum length. - If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), - where seq_len is the length of the acoustic model output (T). - - NOTE: - If a float is provided, it can be greater than 1! - By default, a float of 2.0 is used so that a target sequence can be at most twice - as long as the acoustic model output length T. - - maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, - and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 - in order to reduce expensive beam search cost later. int >= 0. - - maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. - Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next step. - - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) - where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be - predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for - expansion apart from the "most likely" candidate. - Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed - but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, - thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally - tuned on a validation set. - - softmax_temperature: Scales the logits of the joint prior to computing log_softmax. - - decoder: The Decoder/Prediction network module. - joint: The Joint network module. - vocabulary: The vocabulary (excluding the RNNT blank token) which will be used for decoding. - """ - - def __init__( - self, decoding_cfg, decoder, joint, vocabulary, - ): - # we need to ensure blank is the last token in the vocab for the case of RNNT and Multi-blank RNNT. - blank_id = len(vocabulary) + joint.num_extra_outputs - - if hasattr(decoding_cfg, 'model_type') and decoding_cfg.model_type == 'tdt': - blank_id = len(vocabulary) - - self.labels_map = dict([(i, vocabulary[i]) for i in range(len(vocabulary))]) - - super(RNNTDecoding, self).__init__( - decoding_cfg=decoding_cfg, decoder=decoder, joint=joint, blank_id=blank_id, - ) - - if isinstance(self.decoding, beam_decode.BeamRNNTInfer): - self.decoding.set_decoding_type('char') - - def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: - """ - Implemented by subclass in order to aggregate token confidence to a word-level confidence. - - Args: - hypothesis: Hypothesis - - Returns: - A list of word-level confidence scores. 
- """ - return self._aggregate_token_confidence_chars(hypothesis.words, hypothesis.token_confidence) - - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - hypothesis = ''.join(self.decode_ids_to_tokens(tokens)) - return hypothesis - - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded tokens. - """ - token_list = [self.labels_map[c] for c in tokens if c < self.blank_id - self.num_extra_outputs] - return token_list - - def decode_tokens_to_lang(self, tokens: List[int]) -> str: - """ - Compute the most likely language ID (LID) string given the tokens. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded LID string. - """ - lang = self.tokenizer.ids_to_lang(tokens) - return lang - - def decode_ids_to_langs(self, tokens: List[int]) -> List[str]: - """ - Decode a token id list into language ID (LID) list. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded LIDS. - """ - lang_list = self.tokenizer.ids_to_text_and_langs(tokens) - return lang_list - - -class RNNTWER(Metric): - """ - This metric computes numerator and denominator for Overall Word Error Rate (WER) between prediction and reference texts. - When doing distributed training/evaluation the result of res=WER(predictions, targets, target_lengths) calls - will be all-reduced between all workers using SUM operations. - Here contains two numbers res=[wer_numerator, wer_denominator]. WER=wer_numerator/wer_denominator. - - If used with PytorchLightning LightningModule, include wer_numerator and wer_denominators inside validation_step results. - Then aggregate (sum) then at the end of validation epoch to correctly compute validation WER. - - Example: - def validation_step(self, batch, batch_idx): - ... - wer_num, wer_denom = self.__wer(predictions, transcript, transcript_len) - self.val_outputs = {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom} - return self.val_outputs - - def on_validation_epoch_end(self): - ... - wer_num = torch.stack([x['val_wer_num'] for x in self.val_outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in self.val_outputs]).sum() - tensorboard_logs = {'validation_loss': val_loss_mean, 'validation_avg_wer': wer_num / wer_denom} - self.val_outputs.clear() # free memory - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} - - Args: - decoding: RNNTDecoding object that will perform autoregressive decoding of the RNNT model. - batch_dim_index: Index of the batch dimension. - use_cer: Whether to use Character Error Rate isntead of Word Error Rate. - log_prediction: Whether to log a single decoded sample per call. - - Returns: - res: a tuple of 3 zero dimensional float32 ``torch.Tensor` objects: a WER score, a sum of Levenshtein's - distances for all prediction - reference pairs, total number of words in all references. 
- """ - - full_state_update = True - - def __init__( - self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False - ): - super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step) - self.decoding = decoding - self.batch_dim_index = batch_dim_index - self.use_cer = use_cer - self.log_prediction = log_prediction - self.blank_id = self.decoding.blank_id - self.labels_map = self.decoding.labels_map - - self.add_state("scores", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - self.add_state("words", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - - def update( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - ) -> torch.Tensor: - words = 0 - scores = 0 - references = [] - with torch.no_grad(): - # prediction_cpu_tensor = tensors[0].long().cpu() - targets_cpu_tensor = targets.long().cpu() - targets_cpu_tensor = move_dimension_to_the_front(targets_cpu_tensor, self.batch_dim_index) - tgt_lenths_cpu_tensor = target_lengths.long().cpu() - - # iterate over batch - for ind in range(targets_cpu_tensor.shape[0]): - tgt_len = tgt_lenths_cpu_tensor[ind].item() - target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - - reference = self.decoding.decode_tokens_to_str(target) - references.append(reference) - - hypotheses, _ = self.decoding.rnnt_decoder_predictions_tensor(encoder_output, encoded_lengths) - - if self.log_prediction: - logging.info(f"\n") - logging.info(f"reference :{references[0]}") - logging.info(f"predicted :{hypotheses[0]}") - - for h, r in zip(hypotheses, references): - if self.use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - words += len(r_list) - # Compute Levenshtein's distance - scores += editdistance.eval(h_list, r_list) - - self.scores += torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) - self.words += torch.tensor(words, device=self.words.device, dtype=self.words.dtype) - # return torch.tensor([scores, words]).to(predictions.device) - - def compute(self): - wer = self.scores.float() / self.words - return wer, self.scores.detach(), self.words.detach() - - -@dataclass -class RNNTDecodingConfig: - model_type: str = "rnnt" # one of "rnnt", "multiblank" or "tdt" - strategy: str = "greedy_batch" - - compute_hypothesis_token_set: bool = False - - # preserve decoding alignments - preserve_alignments: Optional[bool] = None - - # confidence config - confidence_cfg: ConfidenceConfig = field(default_factory=lambda: ConfidenceConfig()) - - # RNNT Joint fused batch size - fused_batch_size: Optional[int] = None - - # compute RNNT time stamps - compute_timestamps: Optional[bool] = None - - # compute language IDs - compute_langs: bool = False - - # token representing word seperator - word_seperator: str = " " - - # type of timestamps to calculate - rnnt_timestamp_type: str = "all" # can be char, word or all for both - - # greedy decoding config - greedy: greedy_decode.GreedyRNNTInferConfig = field(default_factory=lambda: greedy_decode.GreedyRNNTInferConfig()) - - # beam decoding config - beam: beam_decode.BeamRNNTInferConfig = field(default_factory=lambda: beam_decode.BeamRNNTInferConfig(beam_size=4)) - - # can be used to change temperature for decoding - temperature: float = 1.0 diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer_bpe.py deleted file mode 
100644 index e8ea8f399b99f0c33e62bae60241f4ab83404271..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ /dev/null @@ -1,430 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Union - -import editdistance -import torch -from torchmetrics import Metric - -from nemo.collections.asr.metrics.rnnt_wer import AbstractRNNTDecoding, RNNTDecodingConfig -from nemo.collections.asr.metrics.wer import move_dimension_to_the_front -from nemo.collections.asr.parts.submodules import rnnt_beam_decoding -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses -from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - -__all__ = ['RNNTBPEDecoding', 'RNNTBPEWER'] - - -class RNNTBPEDecoding(AbstractRNNTDecoding): - """ - Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy, greedy_batch (for greedy decoding). - - beam, tsd, alsd (for beam search decoding). - - compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded - tokens as well as the decoded string. Default is False in order to avoid double decoding - unless required. - - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function - with the `return_hypotheses` flag set to True. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrete intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - compute_langs: a bool flag, which allows to compute language id (LID) information per token, - word, and the entire sample (most likely language id). The LIDS will be available - in the returned Hypothesis object as a dictionary - - rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated. 
- Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps and "all" (default), for both character level and word level time stamps. - - word_seperator: Str token representing the seperator between words. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of ints. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. 
- Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - The config may further contain the following sub-dictionaries: - "greedy": - max_symbols: int, describing the maximum number of target tokens to decode per - timestep during greedy decoding. Setting to larger values allows longer sentences - to be decoded, at the cost of increased execution time. - - preserve_frame_confidence: Same as above, overrides above value. - - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. This might be slightly different - results compared to the greedy search above. - - score_norm: optional bool, whether to normalize the returned beam score in the hypotheses. - Set to True by default. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. - - tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols - per timestep of the acoustic model. Larger values will allow longer sentences to be decoded, - at increased cost to execution time. - - alsd_max_target_len: optional int or float, determines the potential maximum target sequence length. - If an integer is provided, it can decode sequences of that particular maximum length. - If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len), - where seq_len is the length of the acoustic model output (T). - - NOTE: - If a float is provided, it can be greater than 1! - By default, a float of 2.0 is used so that a target sequence can be at most twice - as long as the acoustic model output length T. - - maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient, - and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0. - - maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep this as 1 - in order to reduce expensive beam search cost later. int >= 0. - - maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. - Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next step. - - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) - where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be - predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for - expansion apart from the "most likely" candidate. 
- Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed - but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, - thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally - tuned on a validation set. - - softmax_temperature: Scales the logits of the joint prior to computing log_softmax. - - decoder: The Decoder/Prediction network module. - joint: The Joint network module. - tokenizer: The tokenizer which will be used for decoding. - """ - - def __init__(self, decoding_cfg, decoder, joint, tokenizer: TokenizerSpec): - blank_id = tokenizer.tokenizer.vocab_size # RNNT or TDT models. - - # multi-blank RNNTs - if hasattr(decoding_cfg, 'model_type') and decoding_cfg.model_type == 'multiblank': - blank_id = tokenizer.tokenizer.vocab_size + joint.num_extra_outputs - - self.tokenizer = tokenizer - - super(RNNTBPEDecoding, self).__init__( - decoding_cfg=decoding_cfg, decoder=decoder, joint=joint, blank_id=blank_id - ) - - if isinstance(self.decoding, rnnt_beam_decoding.BeamRNNTInfer): - self.decoding.set_decoding_type('subword') - - def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: - """ - Implemented by subclass in order to reduce token confidence to a word-level confidence. - - **Note**: Only supports Sentencepiece based tokenizers! - - Args: - hypothesis: Hypothesis - - Returns: - A list of word-level confidence scores. - """ - return self._aggregate_token_confidence_subwords_sentencepiece( - hypothesis.words, hypothesis.token_confidence, hypothesis.y_sequence - ) - - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - hypothesis = self.tokenizer.ids_to_text(tokens) - return hypothesis - - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded tokens. - """ - token_list = self.tokenizer.ids_to_tokens(tokens) - return token_list - - def decode_tokens_to_lang(self, tokens: List[int]) -> str: - """ - Compute the most likely language ID (LID) string given the tokens. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded LID string. - """ - lang = self.tokenizer.ids_to_lang(tokens) - return lang - - def decode_ids_to_langs(self, tokens: List[int]) -> List[str]: - """ - Decode a token id list into language ID (LID) list. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded LIDS. - """ - lang_list = self.tokenizer.ids_to_text_and_langs(tokens) - return lang_list - - def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hypothesis, NBestHypotheses]]: - """ - Decode a list of hypotheses into a list of strings. - Overrides the super() method optionally adding lang information - - Args: - hypotheses_list: List of Hypothesis. - - Returns: - A list of strings. 
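The constructor above derives the blank index from the tokenizer vocabulary size; the hedged helper below (a hypothetical `resolve_subword_blank_id`, not part of NeMo) restates that convention in isolation.

```python
def resolve_subword_blank_id(vocab_size: int, model_type: str = "rnnt",
                             num_extra_outputs: int = 0) -> int:
    """Standard RNNT and TDT subword models place the blank right after the
    tokenizer vocabulary; multi-blank models reserve extra indices beyond it."""
    if model_type == "multiblank":
        return vocab_size + num_extra_outputs
    return vocab_size  # "rnnt" and "tdt"

assert resolve_subword_blank_id(1024) == 1024
assert resolve_subword_blank_id(1024, "multiblank", num_extra_outputs=4) == 1028
```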
- """ - hypotheses = super().decode_hypothesis(hypotheses_list) - if self.compute_langs: - if isinstance(self.tokenizer, AggregateTokenizer): - for ind in range(len(hypotheses_list)): - # Extract the integer encoded hypothesis - prediction = hypotheses_list[ind].y_sequence - - if type(prediction) != list: - prediction = prediction.tolist() - - # RNN-T sample level is already preprocessed by implicit RNNT decoding - # Simply remove any blank tokens - prediction = [p for p in prediction if p != self.blank_id] - - hypotheses[ind].langs = self.decode_tokens_to_lang(prediction) - hypotheses[ind].langs_chars = self.decode_ids_to_langs(prediction) - else: - logging.warning( - "Ignoring request for lang output in hypotheses since the model does not use an aggregate tokenizer" - ) - - return hypotheses - - -class RNNTBPEWER(Metric): - """ - This metric computes numerator and denominator for Overall Word Error Rate (WER) between prediction and reference texts. - When doing distributed training/evaluation the result of res=WER(predictions, targets, target_lengths) calls - will be all-reduced between all workers using SUM operations. - Here contains two numbers res=[wer_numerator, wer_denominator]. WER=wer_numerator/wer_denominator. - - If used with PytorchLightning LightningModule, include wer_numerator and wer_denominators inside validation_step results. - Then aggregate (sum) then at the end of validation epoch to correctly compute validation WER. - - Example: - def validation_step(self, batch, batch_idx): - ... - wer_num, wer_denom = self.__wer(predictions, transcript, transcript_len) - self.val_outputs = {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom} - return self.val_outputs - - def on_validation_epoch_end(self): - ... - wer_num = torch.stack([x['val_wer_num'] for x in self.val_outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in self.val_outputs]).sum() - tensorboard_logs = {'validation_loss': val_loss_mean, 'validation_avg_wer': wer_num / wer_denom} - self.val_outputs.clear() # free memory - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} - - Args: - decoding: RNNTBPEDecoding object that will perform autoregressive decoding of the RNNT model. - batch_dim_index: Index of the batch dimension. - use_cer: Whether to use Character Error Rate isntead of Word Error Rate. - log_prediction: Whether to log a single decoded sample per call. - - Returns: - res: a tuple of 3 zero dimensional float32 ``torch.Tensor` objects: a WER score, a sum of Levenstein's - distances for all prediction - reference pairs, total number of words in all references. 
- """ - - full_state_update = True - - def __init__( - self, - decoding: RNNTBPEDecoding, - batch_dim_index=0, - use_cer: bool = False, - log_prediction: bool = True, - dist_sync_on_step=False, - ): - super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step) - self.decoding = decoding - self.batch_dim_index = batch_dim_index - self.use_cer = use_cer - self.log_prediction = log_prediction - self.blank_id = self.decoding.blank_id - self.tokenizer = self.decoding.tokenizer - - self.add_state("scores", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - self.add_state("words", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - - def update( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - ) -> torch.Tensor: - words = 0 - scores = 0 - references = [] - with torch.no_grad(): - # prediction_cpu_tensor = tensors[0].long().cpu() - targets_cpu_tensor = targets.long().cpu() - targets_cpu_tensor = move_dimension_to_the_front(targets_cpu_tensor, self.batch_dim_index) - tgt_lenths_cpu_tensor = target_lengths.long().cpu() - - # iterate over batch - for ind in range(targets_cpu_tensor.shape[0]): - tgt_len = tgt_lenths_cpu_tensor[ind].item() - target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - reference = self.decoding.decode_tokens_to_str(target) - references.append(reference) - - hypotheses, _ = self.decoding.rnnt_decoder_predictions_tensor(encoder_output, encoded_lengths) - - if self.log_prediction: - logging.info(f"\n") - logging.info(f"reference :{references[0]}") - logging.info(f"predicted :{hypotheses[0]}") - - for h, r in zip(hypotheses, references): - if self.use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - words += len(r_list) - # Compute Levenshtein's distance - scores += editdistance.eval(h_list, r_list) - - del hypotheses - - self.scores += torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) - self.words += torch.tensor(words, device=self.words.device, dtype=self.words.dtype) - # return torch.tensor([scores, words]).to(predictions.device) - - def compute(self): - wer = self.scores.float() / self.words - return wer, self.scores.detach(), self.words.detach() - - -@dataclass -class RNNTBPEDecodingConfig(RNNTDecodingConfig): - pass diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer.py deleted file mode 100644 index 46984ff86435c0dc0bc5c9bba4e6c6195c608fee..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer.py +++ /dev/null @@ -1,1313 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re -from abc import abstractmethod -from dataclasses import dataclass, field, is_dataclass -from typing import Callable, Dict, List, Optional, Tuple, Union - -import editdistance -import jiwer -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf -from torchmetrics import Metric - -from nemo.collections.asr.parts.submodules import ctc_beam_decoding, ctc_greedy_decoding -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, ConfidenceMixin -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis, NBestHypotheses -from nemo.utils import logging, logging_mode - -__all__ = ['word_error_rate', 'word_error_rate_detail', 'WER', 'move_dimension_to_the_front'] - - -def word_error_rate(hypotheses: List[str], references: List[str], use_cer=False) -> float: - """ - Computes Average Word Error rate between two texts represented as - corresponding lists of string. - - Hypotheses and references must have same length. - - Args: - hypotheses (list): list of hypotheses - references(list) : list of references - use_cer (bool): set True to enable cer - - Returns: - wer (float): average word error rate - """ - scores = 0 - words = 0 - if len(hypotheses) != len(references): - raise ValueError( - "In word error rate calculation, hypotheses and reference" - " lists must have the same number of elements. But I got:" - "{0} and {1} correspondingly".format(len(hypotheses), len(references)) - ) - for h, r in zip(hypotheses, references): - if use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - words += len(r_list) - # May deprecate using editdistance in future release for here and rest of codebase - # once we confirm jiwer is reliable. - scores += editdistance.eval(h_list, r_list) - if words != 0: - wer = 1.0 * scores / words - else: - wer = float('inf') - return wer - - -def word_error_rate_detail( - hypotheses: List[str], references: List[str], use_cer=False -) -> Tuple[float, int, float, float, float]: - """ - Computes Average Word Error Rate with details (insertion rate, deletion rate, substitution rate) - between two texts represented as corresponding lists of string. - - Hypotheses and references must have same length. - - Args: - hypotheses (list): list of hypotheses - references(list) : list of references - use_cer (bool): set True to enable cer - - Returns: - wer (float): average word error rate - words (int): Total number of words/charactors of given reference texts - ins_rate (float): average insertion error rate - del_rate (float): average deletion error rate - sub_rate (float): average substitution error rate - """ - scores = 0 - words = 0 - ops_count = {'substitutions': 0, 'insertions': 0, 'deletions': 0} - - if len(hypotheses) != len(references): - raise ValueError( - "In word error rate calculation, hypotheses and reference" - " lists must have the same number of elements. 
But I got:" - "{0} and {1} correspondingly".format(len(hypotheses), len(references)) - ) - - for h, r in zip(hypotheses, references): - if use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - - # To get rid of the issue that jiwer does not allow empty string - if len(r_list) == 0: - if len(h_list) != 0: - errors = len(h_list) - ops_count['insertions'] += errors - else: - errors = 0 - else: - if use_cer: - measures = jiwer.cer(r, h, return_dict=True) - else: - measures = jiwer.compute_measures(r, h) - - errors = measures['insertions'] + measures['deletions'] + measures['substitutions'] - ops_count['insertions'] += measures['insertions'] - ops_count['deletions'] += measures['deletions'] - ops_count['substitutions'] += measures['substitutions'] - - scores += errors - words += len(r_list) - - if words != 0: - wer = 1.0 * scores / words - ins_rate = 1.0 * ops_count['insertions'] / words - del_rate = 1.0 * ops_count['deletions'] / words - sub_rate = 1.0 * ops_count['substitutions'] / words - else: - wer, ins_rate, del_rate, sub_rate = float('inf'), float('inf'), float('inf'), float('inf') - - return wer, words, ins_rate, del_rate, sub_rate - - -def word_error_rate_per_utt(hypotheses: List[str], references: List[str], use_cer=False) -> Tuple[List[float], float]: - """ - Computes Word Error Rate per utterance and the average WER - between two texts represented as corresponding lists of string. - - Hypotheses and references must have same length. - - Args: - hypotheses (list): list of hypotheses - references(list) : list of references - use_cer (bool): set True to enable cer - - Returns: - wer_per_utt (List[float]): word error rate per utterance - avg_wer (float): average word error rate - """ - scores = 0 - words = 0 - wer_per_utt = [] - - if len(hypotheses) != len(references): - raise ValueError( - "In word error rate calculation, hypotheses and reference" - " lists must have the same number of elements. But I got:" - "{0} and {1} correspondingly".format(len(hypotheses), len(references)) - ) - - for h, r in zip(hypotheses, references): - if use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - - # To get rid of the issue that jiwer does not allow empty string - if len(r_list) == 0: - if len(h_list) != 0: - errors = len(h_list) - wer_per_utt.append(float('inf')) - else: - if use_cer: - measures = jiwer.cer(r, h, return_dict=True) - er = measures['cer'] - else: - measures = jiwer.compute_measures(r, h) - er = measures['wer'] - - errors = measures['insertions'] + measures['deletions'] + measures['substitutions'] - wer_per_utt.append(er) - - scores += errors - words += len(r_list) - - if words != 0: - avg_wer = 1.0 * scores / words - else: - avg_wer = float('inf') - - return wer_per_utt, avg_wer - - -def move_dimension_to_the_front(tensor, dim_index): - all_dims = list(range(tensor.ndim)) - return tensor.permute(*([dim_index] + all_dims[:dim_index] + all_dims[dim_index + 1 :])) - - -class AbstractCTCDecoding(ConfidenceMixin): - """ - Used for performing CTC auto-regressive / non-auto-regressive decoding of the logprobs. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy (for greedy decoding). - - beam (for DeepSpeed KenLM based decoding). 
- - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrite intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - ctc_timestamp_type: A str value, which represents the types of timestamps that should be calculated. - Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps and "all" (default), for both character level and word level time stamps. - - word_seperator: Str token representing the seperator between words. - - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `logprobs` is a torch.Tensors. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `ctc_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding. When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. 
- More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - batch_dim_index: Index of the batch dimension of ``targets`` and ``predictions`` parameters of - ``ctc_decoder_predictions_tensor`` methods. Can be either 0 or 1. - - The config may further contain the following sub-dictionaries: - "greedy": - preserve_alignments: Same as above, overrides above value. - compute_timestamps: Same as above, overrides above value. - preserve_frame_confidence: Same as above, overrides above value. - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. This might be slightly different - results compared to the greedy search above. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. This flag is set by default. - - beam_alpha: float, the strength of the Language model on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - beam_beta: float, the strength of the sequence length penalty on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - kenlm_path: str, path to a KenLM ARPA or .binary file (depending on the strategy chosen). - If the path is invalid (file is not found at path), will raise a deferred error at the moment - of calculation of beam search, so that users may update / change the decoding strategy - to point to the correct file. - - blank_id: The id of the RNNT blank token. - """ - - def __init__(self, decoding_cfg, blank_id: int): - super().__init__() - - # Convert dataclas to config - if is_dataclass(decoding_cfg): - decoding_cfg = OmegaConf.structured(decoding_cfg) - - if not isinstance(decoding_cfg, DictConfig): - decoding_cfg = OmegaConf.create(decoding_cfg) - - OmegaConf.set_struct(decoding_cfg, False) - - # update minimal config - minimal_cfg = ['greedy'] - for item in minimal_cfg: - if item not in decoding_cfg: - decoding_cfg[item] = OmegaConf.create({}) - - self.cfg = decoding_cfg - self.blank_id = blank_id - self.preserve_alignments = self.cfg.get('preserve_alignments', None) - self.compute_timestamps = self.cfg.get('compute_timestamps', None) - self.batch_dim_index = self.cfg.get('batch_dim_index', 0) - self.word_seperator = self.cfg.get('word_seperator', ' ') - - possible_strategies = ['greedy', 'beam', 'pyctcdecode', 'flashlight'] - if self.cfg.strategy not in possible_strategies: - raise ValueError(f"Decoding strategy must be one of {possible_strategies}. 
Given {self.cfg.strategy}") - - # Update preserve alignments - if self.preserve_alignments is None: - if self.cfg.strategy in ['greedy']: - self.preserve_alignments = self.cfg.greedy.get('preserve_alignments', False) - else: - self.preserve_alignments = self.cfg.beam.get('preserve_alignments', False) - - # Update compute timestamps - if self.compute_timestamps is None: - if self.cfg.strategy in ['greedy']: - self.compute_timestamps = self.cfg.greedy.get('compute_timestamps', False) - elif self.cfg.strategy in ['beam']: - self.compute_timestamps = self.cfg.beam.get('compute_timestamps', False) - - # initialize confidence-related fields - self._init_confidence(self.cfg.get('confidence_cfg', None)) - - # Confidence estimation is not implemented for strategies other than `greedy` - if ( - not self.preserve_frame_confidence - and self.cfg.strategy != 'greedy' - and self.cfg.beam.get('preserve_frame_confidence', False) - ): - raise NotImplementedError(f"Confidence calculation is not supported for strategy `{self.cfg.strategy}`") - - # we need timestamps to extract non-blank per-frame confidence - if self.compute_timestamps is not None: - self.compute_timestamps |= self.preserve_frame_confidence - - if self.cfg.strategy == 'greedy': - - self.decoding = ctc_greedy_decoding.GreedyCTCInfer( - blank_id=self.blank_id, - preserve_alignments=self.preserve_alignments, - compute_timestamps=self.compute_timestamps, - preserve_frame_confidence=self.preserve_frame_confidence, - confidence_method_cfg=self.confidence_method_cfg, - ) - - elif self.cfg.strategy == 'beam': - - self.decoding = ctc_beam_decoding.BeamCTCInfer( - blank_id=blank_id, - beam_size=self.cfg.beam.get('beam_size', 1), - search_type='default', - return_best_hypothesis=self.cfg.beam.get('return_best_hypothesis', True), - preserve_alignments=self.preserve_alignments, - compute_timestamps=self.compute_timestamps, - beam_alpha=self.cfg.beam.get('beam_alpha', 1.0), - beam_beta=self.cfg.beam.get('beam_beta', 0.0), - kenlm_path=self.cfg.beam.get('kenlm_path', None), - ) - - self.decoding.override_fold_consecutive_value = False - - elif self.cfg.strategy == 'pyctcdecode': - - self.decoding = ctc_beam_decoding.BeamCTCInfer( - blank_id=blank_id, - beam_size=self.cfg.beam.get('beam_size', 1), - search_type='pyctcdecode', - return_best_hypothesis=self.cfg.beam.get('return_best_hypothesis', True), - preserve_alignments=self.preserve_alignments, - compute_timestamps=self.compute_timestamps, - beam_alpha=self.cfg.beam.get('beam_alpha', 1.0), - beam_beta=self.cfg.beam.get('beam_beta', 0.0), - kenlm_path=self.cfg.beam.get('kenlm_path', None), - pyctcdecode_cfg=self.cfg.beam.get('pyctcdecode_cfg', None), - ) - - self.decoding.override_fold_consecutive_value = False - - elif self.cfg.strategy == 'flashlight': - - self.decoding = ctc_beam_decoding.BeamCTCInfer( - blank_id=blank_id, - beam_size=self.cfg.beam.get('beam_size', 1), - search_type='flashlight', - return_best_hypothesis=self.cfg.beam.get('return_best_hypothesis', True), - preserve_alignments=self.preserve_alignments, - compute_timestamps=self.compute_timestamps, - beam_alpha=self.cfg.beam.get('beam_alpha', 1.0), - beam_beta=self.cfg.beam.get('beam_beta', 0.0), - kenlm_path=self.cfg.beam.get('kenlm_path', None), - flashlight_cfg=self.cfg.beam.get('flashlight_cfg', None), - ) - - self.decoding.override_fold_consecutive_value = False - - else: - raise ValueError( - f"Incorrect decoding strategy supplied. 
Must be one of {possible_strategies}\n" - f"but was provided {self.cfg.strategy}" - ) - - def ctc_decoder_predictions_tensor( - self, - decoder_outputs: torch.Tensor, - decoder_lengths: torch.Tensor = None, - fold_consecutive: bool = True, - return_hypotheses: bool = False, - ) -> Tuple[List[str], Optional[List[List[str]]], Optional[Union[Hypothesis, NBestHypotheses]]]: - """ - Decodes a sequence of labels to words - - Args: - decoder_outputs: An integer torch.Tensor of shape [Batch, Time, {Vocabulary}] (if ``batch_index_dim == 0``) or [Time, Batch] - (if ``batch_index_dim == 1``) of integer indices that correspond to the index of some character in the - label set. - decoder_lengths: Optional tensor of length `Batch` which contains the integer lengths - of the sequence in the padded `predictions` tensor. - fold_consecutive: Bool, determine whether to perform "ctc collapse", folding consecutive tokens - into a single token. - return_hypotheses: Bool flag whether to return just the decoding predictions of the model - or a Hypothesis object that holds information such as the decoded `text`, - the `alignment` of emited by the CTC Model, and the `length` of the sequence (if available). - May also contain the log-probabilities of the decoder (if this method is called via - transcribe()) - - Returns: - Either a list of str which represent the CTC decoded strings per sample, - or a list of Hypothesis objects containing additional information. - """ - - if isinstance(decoder_outputs, torch.Tensor): - decoder_outputs = move_dimension_to_the_front(decoder_outputs, self.batch_dim_index) - - if ( - hasattr(self.decoding, 'override_fold_consecutive_value') - and self.decoding.override_fold_consecutive_value is not None - ): - logging.info( - f"Beam search requires that consecutive ctc tokens are not folded. 
\n" - f"Overriding provided value of `fold_consecutive` = {fold_consecutive} to " - f"{self.decoding.override_fold_consecutive_value}", - mode=logging_mode.ONCE, - ) - fold_consecutive = self.decoding.override_fold_consecutive_value - - with torch.inference_mode(): - # Resolve the forward step of the decoding strategy - hypotheses_list = self.decoding( - decoder_output=decoder_outputs, decoder_lengths=decoder_lengths - ) # type: List[List[Hypothesis]] - - # extract the hypotheses - hypotheses_list = hypotheses_list[0] # type: List[Hypothesis] - - if isinstance(hypotheses_list[0], NBestHypotheses): - hypotheses = [] - all_hypotheses = [] - - for nbest_hyp in hypotheses_list: # type: NBestHypotheses - n_hyps = nbest_hyp.n_best_hypotheses # Extract all hypotheses for this sample - decoded_hyps = self.decode_hypothesis( - n_hyps, fold_consecutive - ) # type: List[Union[Hypothesis, NBestHypotheses]] - - # If computing timestamps - if self.compute_timestamps is True: - timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') - for hyp_idx in range(len(decoded_hyps)): - decoded_hyps[hyp_idx] = self.compute_ctc_timestamps(decoded_hyps[hyp_idx], timestamp_type) - - hypotheses.append(decoded_hyps[0]) # best hypothesis - all_hypotheses.append(decoded_hyps) - - if return_hypotheses: - return hypotheses, all_hypotheses - - best_hyp_text = [h.text for h in hypotheses] - all_hyp_text = [h.text for hh in all_hypotheses for h in hh] - return best_hyp_text, all_hyp_text - - else: - hypotheses = self.decode_hypothesis( - hypotheses_list, fold_consecutive - ) # type: List[Union[Hypothesis, NBestHypotheses]] - - # If computing timestamps - if self.compute_timestamps is True: - # greedy decoding, can get high-level confidence scores - if return_hypotheses and (self.preserve_word_confidence or self.preserve_token_confidence): - hypotheses = self.compute_confidence(hypotheses) - else: - # remove unused token_repetitions from Hypothesis.text - for hyp in hypotheses: - hyp.text = hyp.text[:2] - timestamp_type = self.cfg.get('ctc_timestamp_type', 'all') - for hyp_idx in range(len(hypotheses)): - hypotheses[hyp_idx] = self.compute_ctc_timestamps(hypotheses[hyp_idx], timestamp_type) - - if return_hypotheses: - return hypotheses, None - - best_hyp_text = [h.text for h in hypotheses] - return best_hyp_text, None - - def decode_hypothesis( - self, hypotheses_list: List[Hypothesis], fold_consecutive: bool - ) -> List[Union[Hypothesis, NBestHypotheses]]: - """ - Decode a list of hypotheses into a list of strings. - - Args: - hypotheses_list: List of Hypothesis. - fold_consecutive: Whether to collapse the ctc blank tokens or not. - - Returns: - A list of strings. 
- """ - for ind in range(len(hypotheses_list)): - # Extract the integer encoded hypothesis - hyp = hypotheses_list[ind] - prediction = hyp.y_sequence - predictions_len = hyp.length if hyp.length > 0 else None - - if fold_consecutive: - if type(prediction) != list: - prediction = prediction.numpy().tolist() - - if predictions_len is not None: - prediction = prediction[:predictions_len] - - # CTC decoding procedure - decoded_prediction = [] - token_lengths = [] # preserve token lengths - token_repetitions = [] # preserve number of repetitions per token - - previous = self.blank_id - last_length = 0 - last_repetition = 1 - - for pidx, p in enumerate(prediction): - if (p != previous or previous == self.blank_id) and p != self.blank_id: - decoded_prediction.append(p) - - token_lengths.append(pidx - last_length) - last_length = pidx - token_repetitions.append(last_repetition) - last_repetition = 1 - - if p == previous and previous != self.blank_id: - last_repetition += 1 - - previous = p - - if len(token_repetitions) > 0: - token_repetitions = token_repetitions[1:] + [last_repetition] - - else: - if predictions_len is not None: - prediction = prediction[:predictions_len] - decoded_prediction = prediction[prediction != self.blank_id].tolist() - token_lengths = [1] * len(decoded_prediction) # preserve number of repetitions per token - token_repetitions = [1] * len(decoded_prediction) # preserve number of repetitions per token - - # De-tokenize the integer tokens; if not computing timestamps - if self.compute_timestamps is True: - # keep the original predictions, wrap with the number of repetitions per token - # this is done so that `ctc_decoder_predictions_tensor()` can process this hypothesis - # in order to compute exact time stamps. - hypothesis = (decoded_prediction, token_lengths, token_repetitions) - else: - hypothesis = self.decode_tokens_to_str(decoded_prediction) - - # TODO: remove - # collapse leading spaces before . , ? for PC models - hypothesis = re.sub(r'(\s+)([\.\,\?])', r'\2', hypothesis) - - # Preserve this wrapped hypothesis or decoded text tokens. - hypotheses_list[ind].text = hypothesis - - return hypotheses_list - - def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothesis]: - """ - Computes high-level (per-token and/or per-word) confidence scores for a list of hypotheses. - Assumes that `frame_confidence` is present in the hypotheses. - - Args: - hypotheses_list: List of Hypothesis. - - Returns: - A list of hypotheses with high-level confidence scores. - """ - for hyp in hypotheses_list: - if not isinstance(hyp.text, tuple) or len(hyp.text) != 3: - # the method must have been called in the wrong place - raise ValueError( - """Wrong format of the `text` attribute of a hypothesis.\n - Expected: (decoded_prediction, token_repetitions)\n - The method invocation is expected between .decode_hypothesis() and .compute_ctc_timestamps()""" - ) - token_repetitions = hyp.text[2] - hyp.text = hyp.text[:2] - token_confidence = [] - if self.exclude_blank_from_confidence: - non_blank_frame_confidence = hyp.non_blank_frame_confidence - i = 0 - for tr in token_repetitions: - # token repetition can be zero - j = i + tr - token_confidence.append(self._aggregate_confidence(non_blank_frame_confidence[i:j])) - i = j - else: - # tokens are considered to belong to the last non-blank token, if any. 
- token_lengths = hyp.text[1] - if len(token_lengths) > 0: - ts = token_lengths[0] - for tl in token_lengths[1:] + [len(hyp.frame_confidence)]: - token_confidence.append(self._aggregate_confidence(hyp.frame_confidence[ts : ts + tl])) - ts += tl - hyp.token_confidence = token_confidence - if self.preserve_word_confidence: - for hyp in hypotheses_list: - hyp.word_confidence = self._aggregate_token_confidence(hyp) - return hypotheses_list - - @abstractmethod - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token id list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - raise NotImplementedError() - - @abstractmethod - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded tokens. - """ - raise NotImplementedError() - - def compute_ctc_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = "all"): - """ - Method to compute time stamps at char/subword, and word level given some hypothesis. - Requires the input hypothesis to contain a `text` field that is the tuple. The tuple contains - - the ctc collapsed integer ids, and the number of repetitions of each token. - - Args: - hypothesis: A Hypothesis object, with a wrapped `text` field. - The `text` field must contain a tuple with two values - - The ctc collapsed integer ids - A list of integers that represents the number of repetitions per token. - timestamp_type: A str value that represents the type of time stamp calculated. - Can be one of "char", "word" or "all" - - Returns: - A Hypothesis object with a modified `timestep` value, which is now a dictionary containing - the time stamp information. - """ - assert timestamp_type in ['char', 'word', 'all'] - - # Unpack the temporary storage, and set the decoded predictions - decoded_prediction, token_lengths = hypothesis.text - hypothesis.text = decoded_prediction - - # Retrieve offsets - char_offsets = word_offsets = None - char_offsets = self._compute_offsets(hypothesis, token_lengths, self.blank_id) - - # Assert number of offsets and hypothesis tokens are 1:1 match. - if len(char_offsets) != len(hypothesis.text): - raise ValueError( - f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" - " have to be of the same length, but are: " - f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:" - f" {len(hypothesis.text)}" - ) - - # Correctly process the token ids to chars/subwords. 
- for i, char in enumerate(hypothesis.text): - char_offsets[i]["char"] = self.decode_tokens_to_str([char]) - - # detect char vs subword models - lens = [len(list(v["char"])) > 1 for v in char_offsets] - if any(lens): - text_type = 'subword' - else: - text_type = 'char' - - # retrieve word offsets from character offsets - word_offsets = None - if timestamp_type in ['word', 'all']: - if text_type == 'char': - word_offsets = self._get_word_offsets_chars(char_offsets, word_delimiter_char=self.word_seperator) - else: - word_offsets = self._get_word_offsets_subwords_sentencepiece( - char_offsets, - hypothesis, - decode_ids_to_tokens=self.decode_ids_to_tokens, - decode_tokens_to_str=self.decode_tokens_to_str, - ) - - # attach results - if len(hypothesis.timestep) > 0: - timestep_info = hypothesis.timestep - else: - timestep_info = [] - - # Setup defaults - hypothesis.timestep = {"timestep": timestep_info} - - # Add char / subword time stamps - if char_offsets is not None and timestamp_type in ['char', 'all']: - hypothesis.timestep['char'] = char_offsets - - # Add word time stamps - if word_offsets is not None and timestamp_type in ['word', 'all']: - hypothesis.timestep['word'] = word_offsets - - # Convert the token indices to text - hypothesis.text = self.decode_tokens_to_str(hypothesis.text) - - return hypothesis - - @staticmethod - def _compute_offsets( - hypothesis: Hypothesis, token_lengths: List[int], ctc_token: int - ) -> List[Dict[str, Union[str, int]]]: - """ - Utility method that calculates the indidual time indices where a token starts and ends. - - Args: - hypothesis: A Hypothesis object that contains `text` field that holds the character / subword token - emitted at every time step after ctc collapse. - token_lengths: A list of ints representing the lengths of each emitted token. - ctc_token: The integer of the ctc blank token used during ctc collapse. - - Returns: - - """ - start_index = 0 - - # If the exact timestep information is available, utilize the 1st non-ctc blank token timestep - # as the start index. - if hypothesis.timestep is not None and len(hypothesis.timestep) > 0: - start_index = max(0, hypothesis.timestep[0] - 1) - - # Construct the start and end indices brackets - end_indices = np.asarray(token_lengths).cumsum() - start_indices = np.concatenate(([start_index], end_indices[:-1])) - - # Merge the results per token into a list of dictionaries - offsets = [ - {"char": t, "start_offset": s, "end_offset": e} - for t, s, e in zip(hypothesis.text, start_indices, end_indices) - ] - - # Filter out CTC token - offsets = list(filter(lambda offsets: offsets["char"] != ctc_token, offsets)) - return offsets - - @staticmethod - def _get_word_offsets_chars( - offsets: Dict[str, Union[str, float]], word_delimiter_char: str = " " - ) -> Dict[str, Union[str, float]]: - """ - Utility method which constructs word time stamps out of character time stamps. - - References: - This code is a port of the Hugging Face code for word time stamp construction. - - Args: - offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset". - word_delimiter_char: Character token that represents the word delimiter. By default, " ". - - Returns: - A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and - "end_offset". 
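Because the grouping below walks a small WORD/SPACE state machine, a hand-worked example of its expected input and output may help. The offsets are hypothetical, for the decoded text "hi u" with " " as the word delimiter.

```python
# Hypothetical character offsets for the decoded text "hi u":
char_offsets = [
    {"char": "h", "start_offset": 0, "end_offset": 2},
    {"char": "i", "start_offset": 2, "end_offset": 3},
    {"char": " ", "start_offset": 3, "end_offset": 5},
    {"char": "u", "start_offset": 5, "end_offset": 6},
]

# Consecutive non-delimiter characters are grouped into one entry per word, and the
# trailing word is flushed after the loop because it ends in the WORD state:
expected_word_offsets = [
    {"word": "hi", "start_offset": 0, "end_offset": 3},
    {"word": "u", "start_offset": 5, "end_offset": 6},
]
```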
- """ - word_offsets = [] - - last_state = "SPACE" - word = "" - start_offset = 0 - end_offset = 0 - for i, offset in enumerate(offsets): - char = offset["char"] - state = "SPACE" if char == word_delimiter_char else "WORD" - - if state == last_state: - # If we are in the same state as before, we simply repeat what we've done before - end_offset = offset["end_offset"] - word += char - else: - # Switching state - if state == "SPACE": - # Finishing a word - word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset}) - else: - # Starting a new word - start_offset = offset["start_offset"] - end_offset = offset["end_offset"] - word = char - - last_state = state - if last_state == "WORD": - word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset}) - - return word_offsets - - @staticmethod - def _get_word_offsets_subwords_sentencepiece( - offsets: Dict[str, Union[str, float]], - hypothesis: Hypothesis, - decode_ids_to_tokens: Callable[[List[int]], str], - decode_tokens_to_str: Callable[[List[int]], str], - ) -> Dict[str, Union[str, float]]: - """ - Utility method which constructs word time stamps out of sub-word time stamps. - - **Note**: Only supports Sentencepiece based tokenizers ! - - Args: - offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset". - hypothesis: Hypothesis object that contains `text` field, where each token is a sub-word id - after ctc collapse. - decode_ids_to_tokens: A Callable function that accepts a list of integers and maps it to a sub-word. - decode_tokens_to_str: A Callable function that accepts a list of integers and maps it to text / str. - - Returns: - A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and - "end_offset". - """ - word_offsets = [] - built_token = [] - previous_token_index = 0 - # For every collapsed sub-word token - for i, char in enumerate(hypothesis.text): - # Compute the sub-word text representation, and the decoded text (stripped of sub-word markers). - token = decode_ids_to_tokens([char])[0] - token_text = decode_tokens_to_str([char]) - - # It is a sub-word token, or contains an identifier at the beginning such as _ or ## that was stripped - # after forcing partial text conversion of the token. - if token != token_text: - # If there are any partially or fully built sub-word token ids, construct to text. - # Note: This is "old" subword, that occurs *after* current sub-word has started. - if len(built_token) > 0: - word_offsets.append( - { - "word": decode_tokens_to_str(built_token), - "start_offset": offsets[previous_token_index]["start_offset"], - "end_offset": offsets[i]["start_offset"], - } - ) - - # Prepare list of new sub-word ids - built_token.clear() - built_token.append(char) - previous_token_index = i - else: - # If the token does not contain any sub-word start mark, then the sub-word has not completed yet - # Append to current sub-word list. - built_token.append(char) - - # Inject the start offset of the first token to word offsets - # This is because we always skip the delay the injection of the first sub-word due to the loop - # condition and check whether built token is ready or not. - # Therefore without this forced injection, the start_offset appears as off by 1. 
- if len(word_offsets) == 0: - # alaptev: sometimes word_offsets can be empty - if len(built_token) > 0: - word_offsets.append( - { - "word": decode_tokens_to_str(built_token), - "start_offset": offsets[0]["start_offset"], - "end_offset": offsets[-1]["end_offset"], - } - ) - built_token.clear() - else: - word_offsets[0]["start_offset"] = offsets[0]["start_offset"] - - # If there are any remaining tokens left, inject them all into the final word offset. - # Note: The start offset of this token is the start time of the first token inside build_token. - # Note: The end offset of this token is the end time of the last token inside build_token - if len(built_token) > 0: - word_offsets.append( - { - "word": decode_tokens_to_str(built_token), - "start_offset": offsets[-(len(built_token))]["start_offset"], - "end_offset": offsets[-1]["end_offset"], - } - ) - built_token.clear() - - return word_offsets - - @property - def preserve_alignments(self): - return self._preserve_alignments - - @preserve_alignments.setter - def preserve_alignments(self, value): - self._preserve_alignments = value - - if hasattr(self, 'decoding'): - self.decoding.preserve_alignments = value - - @property - def compute_timestamps(self): - return self._compute_timestamps - - @compute_timestamps.setter - def compute_timestamps(self, value): - self._compute_timestamps = value - - if hasattr(self, 'decoding'): - self.decoding.compute_timestamps = value - - @property - def preserve_frame_confidence(self): - return self._preserve_frame_confidence - - @preserve_frame_confidence.setter - def preserve_frame_confidence(self, value): - self._preserve_frame_confidence = value - - if hasattr(self, 'decoding'): - self.decoding.preserve_frame_confidence = value - - -class CTCDecoding(AbstractCTCDecoding): - """ - Used for performing CTC auto-regressive / non-auto-regressive decoding of the logprobs for character - based models. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy (for greedy decoding). - - beam (for DeepSpeed KenLM based decoding). - - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrite intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - ctc_timestamp_type: A str value, which represents the types of timestamps that should be calculated. - Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps and "all" (default), for both character level and word level time stamps. - - word_seperator: Str token representing the seperator between words. - - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `logprobs` is a torch.Tensors. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `ctc_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding. 
When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - batch_dim_index: Index of the batch dimension of ``targets`` and ``predictions`` parameters of - ``ctc_decoder_predictions_tensor`` methods. Can be either 0 or 1. - - The config may further contain the following sub-dictionaries: - "greedy": - preserve_alignments: Same as above, overrides above value. - compute_timestamps: Same as above, overrides above value. - preserve_frame_confidence: Same as above, overrides above value. - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. 
This might be slightly different - results compared to the greedy search above. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. This flag is set by default. - - beam_alpha: float, the strength of the Language model on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - beam_beta: float, the strength of the sequence length penalty on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - kenlm_path: str, path to a KenLM ARPA or .binary file (depending on the strategy chosen). - If the path is invalid (file is not found at path), will raise a deferred error at the moment - of calculation of beam search, so that users may update / change the decoding strategy - to point to the correct file. - - blank_id: The id of the RNNT blank token. - """ - - def __init__( - self, decoding_cfg, vocabulary, - ): - blank_id = len(vocabulary) - self.vocabulary = vocabulary - self.labels_map = dict([(i, vocabulary[i]) for i in range(len(vocabulary))]) - - super().__init__(decoding_cfg=decoding_cfg, blank_id=blank_id) - - # Finalize Beam Search Decoding framework - if isinstance(self.decoding, ctc_beam_decoding.AbstractBeamCTCInfer): - self.decoding.set_vocabulary(self.vocabulary) - self.decoding.set_decoding_type('char') - - def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: - """ - Implemented by subclass in order to aggregate token confidence to a word-level confidence. - - Args: - hypothesis: Hypothesis - - Returns: - A list of word-level confidence scores. - """ - return self._aggregate_token_confidence_chars( - self.decode_tokens_to_str(hypothesis.text[0]).split(), hypothesis.token_confidence - ) - - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - hypothesis = ''.join(self.decode_ids_to_tokens(tokens)) - return hypothesis - - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. - - Returns: - A list of decoded tokens. - """ - token_list = [self.labels_map[c] for c in tokens if c != self.blank_id] - return token_list - - -class WER(Metric): - """ - This metric computes numerator and denominator for Overall Word Error Rate (WER) between prediction and reference - texts. When doing distributed training/evaluation the result of ``res=WER(predictions, targets, target_lengths)`` - calls will be all-reduced between all workers using SUM operations. Here ``res`` contains three numbers - ``res=[wer, total_levenstein_distance, total_number_of_words]``. - - If used with PytorchLightning LightningModule, include wer_numerator and wer_denominators inside validation_step - results. Then aggregate (sum) then at the end of validation epoch to correctly compute validation WER. - - Example: - def validation_step(self, batch, batch_idx): - ... - wer_num, wer_denom = self.__wer(predictions, transcript, transcript_len) - self.val_outputs = {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom} - return self.val_outputs - - def on_validation_epoch_end(self): - ... 
- wer_num = torch.stack([x['val_wer_num'] for x in self.val_outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in self.val_outputs]).sum() - tensorboard_logs = {'validation_loss': val_loss_mean, 'validation_avg_wer': wer_num / wer_denom} - self.val_outputs.clear() # free memory - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} - - Args: - decoding: An instance of CTCDecoding. - use_cer: Whether to use Character Error Rate instead of Word Error Rate. - log_prediction: Whether to log a single decoded sample per call. - fold_consecutive: Whether repeated consecutive characters should be folded into one when decoding. - - Returns: - res: a tuple of 3 zero dimensional float32 ``torch.Tensor` objects: a WER score, a sum of Levenstein's - distances for all prediction - reference pairs, total number of words in all references. - """ - - full_state_update: bool = True - - def __init__( - self, - decoding: CTCDecoding, - use_cer=False, - log_prediction=True, - fold_consecutive=True, - dist_sync_on_step=False, - ): - super().__init__(dist_sync_on_step=dist_sync_on_step) - - self.decoding = decoding - self.use_cer = use_cer - self.log_prediction = log_prediction - self.fold_consecutive = fold_consecutive - - self.add_state("scores", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - self.add_state("words", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - - def update( - self, - predictions: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - predictions_lengths: torch.Tensor = None, - ): - """ - Updates metric state. - Args: - predictions: an integer torch.Tensor of shape ``[Batch, Time, {Vocabulary}]`` (if ``batch_dim_index == 0``) or - ``[Time, Batch]`` (if ``batch_dim_index == 1``) - targets: an integer torch.Tensor of shape ``[Batch, Time]`` (if ``batch_dim_index == 0``) or - ``[Time, Batch]`` (if ``batch_dim_index == 1``) - target_lengths: an integer torch.Tensor of shape ``[Batch]`` - predictions_lengths: an integer torch.Tensor of shape ``[Batch]`` - """ - words = 0 - scores = 0 - references = [] - with torch.no_grad(): - # prediction_cpu_tensor = tensors[0].long().cpu() - targets_cpu_tensor = targets.long().cpu() - tgt_lenths_cpu_tensor = target_lengths.long().cpu() - - # iterate over batch - for ind in range(targets_cpu_tensor.shape[0]): - tgt_len = tgt_lenths_cpu_tensor[ind].item() - target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - reference = self.decoding.decode_tokens_to_str(target) - references.append(reference) - - hypotheses, _ = self.decoding.ctc_decoder_predictions_tensor( - predictions, predictions_lengths, fold_consecutive=self.fold_consecutive - ) - - if self.log_prediction: - logging.info(f"\n") - logging.info(f"reference:{references[0]}") - logging.info(f"predicted:{hypotheses[0]}") - - for h, r in zip(hypotheses, references): - if self.use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - words += len(r_list) - # Compute Levenstein's distance - scores += editdistance.eval(h_list, r_list) - - self.scores = torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) - self.words = torch.tensor(words, device=self.words.device, dtype=self.words.dtype) - # return torch.tensor([scores, words]).to(predictions.device) - - def compute(self): - scores = self.scores.detach().float() - words = self.words.detach().float() - return scores / words, scores, words - - -@dataclass -class CTCDecodingConfig: - strategy: str = "greedy" - - # 
preserve decoding alignments - preserve_alignments: Optional[bool] = None - - # compute ctc time stamps - compute_timestamps: Optional[bool] = None - - # token representing word seperator - word_seperator: str = " " - - # type of timestamps to calculate - ctc_timestamp_type: str = "all" # can be char, word or all for both - - # batch dimension - batch_dim_index: int = 0 - - # greedy decoding config - greedy: ctc_greedy_decoding.GreedyCTCInferConfig = field( - default_factory=lambda: ctc_greedy_decoding.GreedyCTCInferConfig() - ) - - # beam decoding config - beam: ctc_beam_decoding.BeamCTCInferConfig = field( - default_factory=lambda: ctc_beam_decoding.BeamCTCInferConfig(beam_size=4) - ) - - # confidence config - confidence_cfg: ConfidenceConfig = field(default_factory=lambda: ConfidenceConfig()) - - # can be used to change temperature for decoding - temperature: float = 1.0 diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer_bpe.py b/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer_bpe.py deleted file mode 100644 index b95bb62008aeb69b0bc3410da161a5bfe3f060d5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/metrics/wer_bpe.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List - -import editdistance -import torch -from torchmetrics import Metric - -from nemo.collections.asr.metrics.wer import AbstractCTCDecoding, CTCDecodingConfig -from nemo.collections.asr.parts.submodules import ctc_beam_decoding -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.collections.common.tokenizers.aggregate_tokenizer import DummyTokenizer -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - - -class CTCBPEDecoding(AbstractCTCDecoding): - """ - Used for performing CTC auto-regressive / non-auto-regressive decoding of the logprobs for subword based - models. - - Args: - decoding_cfg: A dict-like object which contains the following key-value pairs. - strategy: str value which represents the type of decoding that can occur. - Possible values are : - - greedy (for greedy decoding). - - beam (for DeepSpeed KenLM based decoding). - - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrite intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - ctc_timestamp_type: A str value, which represents the types of timestamps that should be calculated. - Can take the following values - "char" for character/subword time stamps, "word" for word level - time stamps and "all" (default), for both character level and word level time stamps. - - word_seperator: Str token representing the seperator between words. 
- - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `logprobs` is a torch.Tensors. - - confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence - scores. In order to obtain hypotheses with confidence scores, please utilize - `ctc_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True. - - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding. When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. 
- - batch_dim_index: Index of the batch dimension of ``targets`` and ``predictions`` parameters of - ``ctc_decoder_predictions_tensor`` methods. Can be either 0 or 1. - - The config may further contain the following sub-dictionaries: - "greedy": - preserve_alignments: Same as above, overrides above value. - compute_timestamps: Same as above, overrides above value. - preserve_frame_confidence: Same as above, overrides above value. - confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg. - - "beam": - beam_size: int, defining the beam size for beam search. Must be >= 1. - If beam_size == 1, will perform cached greedy search. This might be slightly different - results compared to the greedy search above. - - return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the - hypotheses after beam search has concluded. This flag is set by default. - - beam_alpha: float, the strength of the Language model on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - beam_beta: float, the strength of the sequence length penalty on the final score of a token. - final_score = acoustic_score + beam_alpha * lm_score + beam_beta * seq_length. - - kenlm_path: str, path to a KenLM ARPA or .binary file (depending on the strategy chosen). - If the path is invalid (file is not found at path), will raise a deferred error at the moment - of calculation of beam search, so that users may update / change the decoding strategy - to point to the correct file. - - tokenizer: NeMo tokenizer object, which inherits from TokenizerSpec. - """ - - def __init__(self, decoding_cfg, tokenizer: TokenizerSpec): - blank_id = tokenizer.tokenizer.vocab_size - self.tokenizer = tokenizer - - super().__init__(decoding_cfg=decoding_cfg, blank_id=blank_id) - - # Finalize Beam Search Decoding framework - if isinstance(self.decoding, ctc_beam_decoding.AbstractBeamCTCInfer): - if hasattr(self.tokenizer.tokenizer, 'get_vocab'): - vocab_dict = self.tokenizer.tokenizer.get_vocab() - if isinstance(self.tokenizer.tokenizer, DummyTokenizer): # AggregateTokenizer.DummyTokenizer - vocab = vocab_dict - else: - vocab = list(vocab_dict.keys()) - self.decoding.set_vocabulary(vocab) - self.decoding.set_tokenizer(tokenizer) - else: - logging.warning("Could not resolve the vocabulary of the tokenizer !") - - self.decoding.set_decoding_type('subword') - - def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: - """ - Implemented by subclass in order to aggregate token confidence to a word-level confidence. - - **Note**: Only supports Sentencepiece based tokenizers! - - Args: - hypothesis: Hypothesis - - Returns: - A list of word-level confidence scores. - """ - return self._aggregate_token_confidence_subwords_sentencepiece( - self.decode_tokens_to_str(hypothesis.text[0]).split(), hypothesis.token_confidence, hypothesis.text[0] - ) - - def decode_tokens_to_str(self, tokens: List[int]) -> str: - """ - Implemented by subclass in order to decoder a token list into a string. - - Args: - tokens: List of int representing the token ids. - - Returns: - A decoded string. - """ - hypothesis = self.tokenizer.ids_to_text(tokens) - return hypothesis - - def decode_ids_to_tokens(self, tokens: List[int]) -> List[str]: - """ - Implemented by subclass in order to decode a token id list into a token list. - A token list is the string representation of each token id. - - Args: - tokens: List of int representing the token ids. 
- - Returns: - A list of decoded tokens. - """ - token_list = self.tokenizer.ids_to_tokens(tokens) - return token_list - - -class WERBPE(Metric): - """ - This metric computes numerator and denominator for Overall Word Error Rate for BPE tokens (WER-BPE) between - prediction and reference texts. When doing distributed training/evaluation the result of - ``res=WERBPE(predictions, targets, target_lengths)`` calls will be all-reduced between all workers using SUM - operations. Here ``res`` contains three numbers ``res=[wer, total_levenstein_distance, total_number_of_words]``. - - If used with PytorchLightning LightningModule, include wer_numerator and wer_denominators inside validation_step - results. Then aggregate (sum) then at the end of validation epoch to correctly compute validation WER. - - Example: - def validation_step(self, batch, batch_idx): - ... - wer_num, wer_denom = self.__wer(predictions, transcript, transcript_len) - self.val_outputs = {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom} - return self.val_outputs - - def on_validation_epoch_end(self): - ... - wer_num = torch.stack([x['val_wer_num'] for x in self.val_outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in self.val_outputs]).sum() - tensorboard_logs = {'validation_loss': val_loss_mean, 'validation_avg_wer': wer_num / wer_denom} - self.val_outputs.clear() # free memory - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} - - Args: - decoding: An instance of CTCBPEDecoding. - use_cer: Whether to compute word-error-rate or character-error-rate. - log_prediction: Whether to log a single decoded sample per call. - fold_consecutive: Whether repeated consecutive tokens should be folded into one when decoding. - - Returns: - res: a tuple of 3 zero dimensional float32 ``torch.Tensor` objects: a WER score, a sum of Levenstein's - distances for all prediction - reference pairs, total number of words in all references. - """ - - full_state_update: bool = True - - def __init__( - self, - decoding: CTCBPEDecoding, - use_cer=False, - log_prediction=True, - fold_consecutive=True, - dist_sync_on_step=False, - ): - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.decoding = decoding - self.tokenizer = self.decoding.tokenizer - self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size - self.use_cer = use_cer - self.log_prediction = log_prediction - self.fold_consecutive = fold_consecutive - - self.add_state("scores", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - self.add_state("words", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) - - def update( - self, - predictions: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - predictions_lengths: torch.Tensor = None, - ): - """ - Updates metric state. 
- Args: - predictions: an integer torch.Tensor of shape ``[Batch, Time, {Vocabulary}]`` (if ``batch_dim_index == 0``) or - ``[Time, Batch]`` (if ``batch_dim_index == 1``) - targets: an integer torch.Tensor of shape ``[Batch, Time]`` (if ``batch_dim_index == 0``) or - ``[Time, Batch]`` (if ``batch_dim_index == 1``) - target_lengths: an integer torch.Tensor of shape ``[Batch]`` - predictions_lengths: an integer torch.Tensor of shape ``[Batch]`` - """ - words = 0 - scores = 0 - references = [] - with torch.no_grad(): - targets_cpu_tensor = targets.long().cpu() - tgt_lenths_cpu_tensor = target_lengths.long().cpu() - - # iterate over batch - for ind in range(targets_cpu_tensor.shape[0]): - tgt_len = tgt_lenths_cpu_tensor[ind].item() - target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() - reference = self.decoding.decode_tokens_to_str(target) - references.append(reference) - - hypotheses, _ = self.decoding.ctc_decoder_predictions_tensor( - predictions, predictions_lengths, fold_consecutive=self.fold_consecutive - ) - - if self.log_prediction: - logging.info(f"\n") - logging.info(f"reference:{references[0]}") - logging.info(f"predicted:{hypotheses[0]}") - - for h, r in zip(hypotheses, references): - if self.use_cer: - h_list = list(h) - r_list = list(r) - else: - h_list = h.split() - r_list = r.split() - words += len(r_list) - # Compute Levenstein's distance - scores += editdistance.eval(h_list, r_list) - - self.scores = torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) - self.words = torch.tensor(words, device=self.words.device, dtype=self.words.dtype) - # return torch.tensor([scores, words]).to(predictions.device) - - def compute(self): - scores = self.scores.detach().float() - words = self.words.detach().float() - return scores / words, scores, words - - -@dataclass -class CTCBPEDecodingConfig(CTCDecodingConfig): - pass diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/__init__.py deleted file mode 100644 index 34f2c4f62e29afa7a52a1c7a5189595c6f14e97d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
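The WER and WERBPE metrics above share the same core bookkeeping: per hypothesis/reference pair, accumulate the Levenshtein distance and the reference length, then divide at the end. A minimal standalone sketch of that pattern, outside the torchmetrics machinery (the function name and return tuple are illustrative, not NeMo API):

```
# Accumulation pattern used by WER / WERBPE above. `use_cer` switches the
# comparison unit from words to characters, as in the original classes.
import editdistance

def accumulate_wer(hypotheses, references, use_cer=False):
    scores, words = 0, 0
    for h, r in zip(hypotheses, references):
        h_units = list(h) if use_cer else h.split()
        r_units = list(r) if use_cer else r.split()
        words += len(r_units)
        scores += editdistance.eval(h_units, r_units)  # Levenshtein distance
    return scores / max(words, 1), scores, words

# e.g. accumulate_wer(["the cat sat"], ["the cat sat down"]) -> (0.25, 1, 4)
```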
- -from nemo.collections.asr.models.asr_model import ASRModel -from nemo.collections.asr.models.audio_to_audio_model import AudioToAudioModel -from nemo.collections.asr.models.classification_models import EncDecClassificationModel, EncDecFrameClassificationModel -from nemo.collections.asr.models.clustering_diarizer import ClusteringDiarizer -from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE -from nemo.collections.asr.models.ctc_models import EncDecCTCModel -from nemo.collections.asr.models.enhancement_models import EncMaskDecAudioToAudioModel -from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel -from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel -from nemo.collections.asr.models.k2_sequence_models import ( - EncDecK2RnntSeqModel, - EncDecK2RnntSeqModelBPE, - EncDecK2SeqModel, - EncDecK2SeqModelBPE, -) -from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel -from nemo.collections.asr.models.msdd_models import EncDecDiarLabelModel, NeuralDiarizer -from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel -from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel -from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel -from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel -from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index f6813d34bd515a914267b2b6239c8c584c646391..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/asr_model.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/asr_model.cpython-310.pyc deleted file mode 100644 index 53b43380fac6f18c51adef9b79aca9bd229705fc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/asr_model.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/audio_to_audio_model.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/audio_to_audio_model.cpython-310.pyc deleted file mode 100644 index b66508bb08dd9817af8c6e53e38a57eed3434151..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/audio_to_audio_model.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/classification_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/classification_models.cpython-310.pyc deleted file mode 100644 index c79f5292d5d358fb1c2ed1b903d3faf765f7fa02..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/classification_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/clustering_diarizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/clustering_diarizer.cpython-310.pyc deleted file mode 100644 index 
5d8b8a2acae419480097afc6387aa039a57c6687..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/clustering_diarizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_bpe_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_bpe_models.cpython-310.pyc deleted file mode 100644 index cedf70cd59da56a862f8bd45a667a2577582bcd1..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_bpe_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_models.cpython-310.pyc deleted file mode 100644 index d5054cfddfe0a08563ea4e745911cbff5bee5089..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ctc_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/enhancement_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/enhancement_models.cpython-310.pyc deleted file mode 100644 index 5b3d1dfd99c6e2e1e672858c9e1b1cd9b3c32651..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/enhancement_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_bpe_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_bpe_models.cpython-310.pyc deleted file mode 100644 index 611bd970d1940fcf7261d1172eb15162076c468b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_bpe_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_models.cpython-310.pyc deleted file mode 100644 index 42bd6eca51c05a2d5384546ae3534b6771645d8c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/hybrid_rnnt_ctc_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/k2_sequence_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/k2_sequence_models.cpython-310.pyc deleted file mode 100644 index 3ae615aea3756a944130fe1292783fab698fa561..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/k2_sequence_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/label_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/label_models.cpython-310.pyc deleted file mode 100644 index f52b8de958e13b225c8aef7b3ad20441b273589a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/label_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/msdd_models.cpython-310.pyc 
b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/msdd_models.cpython-310.pyc deleted file mode 100644 index ad066f71a7c90a63de4c26079baea800282c69da..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/msdd_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_bpe_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_bpe_models.cpython-310.pyc deleted file mode 100644 index a9f270635f860b8eced234b00610901a001ecdbb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_bpe_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_models.cpython-310.pyc deleted file mode 100644 index 09b32504126db7fa7ecc960d5a935207ff12478b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/rnnt_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/slu_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/slu_models.cpython-310.pyc deleted file mode 100644 index ef1202e5bb7bdbc76abbda8b31cf22898bfb06c9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/slu_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ssl_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ssl_models.cpython-310.pyc deleted file mode 100644 index df5acb0ad67c6ae41b937a2b132f6da31b7b1f6d..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/ssl_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/transformer_bpe_models.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/transformer_bpe_models.cpython-310.pyc deleted file mode 100644 index 2bb88fc7cfffea7e75bc2be473142f13a8ff6e7f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/__pycache__/transformer_bpe_models.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/asr_model.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/asr_model.py deleted file mode 100644 index 7e03d587139f1d85c9e9edf2169cb267a524e248..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/asr_model.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
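The `models/__init__.py` removed above is the public import surface of the ASR collection, including the speaker-recognition model relevant to speaker ID. A minimal usage sketch, assuming a working NeMo installation; the checkpoint name and file paths below are placeholders, not values taken from this repository:

```
# Illustrative use of the speaker-label model exported by the package above.
# "titanet_large" is an example NGC pretrained-model name; availability depends
# on the installed NeMo version. Use restore_from() for a local .nemo file.
from nemo.collections.asr.models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
# speaker_model = EncDecSpeakerLabelModel.restore_from("path/to/speaker_model.nemo")

# get_embedding() returns a speaker embedding for one audio file
# (provided by EncDecSpeakerLabelModel in recent NeMo releases).
embedding = speaker_model.get_embedding("enrollment_sample.wav")
```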
-import logging -from abc import ABC, abstractmethod -from typing import List - -import torch - -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.mixins import AccessMixin -from nemo.core.utils.neural_type_utils import get_io_names -from nemo.utils import logging, model_utils -from nemo.utils.cast_utils import cast_all - -__all__ = ['ASRModel'] - - -class ASRModel(ModelPT, ABC): - @abstractmethod - def transcribe(self, paths2audio_files: List[str], batch_size: int = 4, verbose: bool = True) -> List[str]: - """ - Takes paths to audio files and returns text transcription - Args: - paths2audio_files: paths to audio fragment to be transcribed - verbose: (bool) whether to display tqdm progress bar - - Returns: - transcription texts - """ - pass - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {'val_loss': val_loss_mean, 'val_wer': wer_num / wer_denom} - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - wer_num = torch.stack([x['test_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {'test_loss': val_loss_mean, 'test_wer': wer_num / wer_denom} - return {'test_loss': val_loss_mean, 'log': tensorboard_logs} - - @classmethod - def list_available_models(cls) -> 'List[PretrainedModelInfo]': - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - Returns: - List of available pre-trained models. - """ - # recursively walk the subclasses to generate pretrained model info - list_of_models = model_utils.resolve_subclass_pretrained_model_info(cls) - return list_of_models - - def add_auxiliary_losses(self, loss: torch.Tensor, reset_registry: bool = False) -> torch.Tensor: - """ - Utility method to enable calculation of auxiliary losses for ASR training. - - Args: - loss: The output loss value prior to addition with auxiliary losses. - reset_registry: Bool, whether to reset the AccessMixin registry after adding auxiliary losses. - - Returns: - Loss tensor used for back propagation. - """ - # Add adapter auxiliary losses, if registered - if AccessMixin.is_access_enabled(): - registry = AccessMixin.get_module_registry(self) - log_dict = {} - - for loss_key, loss_registry in registry.items(): - # Add auxiliary loss to total loss - if 'adapter_loss' in loss_registry: - loss_list = loss_registry['adapter_loss'] - loss_value = sum(loss_list) - loss += loss_value - - # Log current loss name and value - keys = loss_key.split(".") - key = "/".join(keys) - key = "adapter_loss/" + key - log_dict[key] = loss_value.detach() - - if len(log_dict) > 0: - self.log_dict(log_dict) - - if reset_registry: - AccessMixin.reset_registry(self) - - # return total loss - return loss - - def setup_optimization_flags(self): - """ - Utility method that must be explicitly called by the subclass in order to support optional optimization flags. - This method is the only valid place to access self.cfg prior to DDP training occurs. 
- - The subclass may chose not to support this method, therefore all variables here must be checked via hasattr() - """ - # Skip update if nan/inf grads appear on any rank. - self._skip_nan_grad = False - if "skip_nan_grad" in self._cfg and self._cfg["skip_nan_grad"]: - self._skip_nan_grad = self._cfg["skip_nan_grad"] - - def on_after_backward(self): - """ - zero-out the gradients which any of them is NAN or INF - """ - super().on_after_backward() - - if hasattr(self, '_skip_nan_grad') and self._skip_nan_grad: - device = next(self.parameters()).device - valid_gradients = torch.tensor([1], device=device, dtype=torch.float32) - - # valid_gradients = True - for param_name, param in self.named_parameters(): - if param.grad is not None: - is_not_nan_or_inf = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()) - if not is_not_nan_or_inf: - valid_gradients = valid_gradients * 0 - break - - if torch.distributed.is_initialized(): - torch.distributed.all_reduce(valid_gradients, op=torch.distributed.ReduceOp.MIN) - - if valid_gradients < 1: - logging.warning(f'detected inf or nan values in gradients! Setting gradients to zero.') - self.zero_grad() - - -class ExportableEncDecModel(Exportable): - """ - Simple utiliy mix-in to export models that consist of encoder/decoder pair - plus pre/post processor, but have to be exported as encoder/decoder pair only - (covers most ASR classes) - """ - - @property - def input_module(self): - return self.encoder - - @property - def output_module(self): - return self.decoder - - @property - def output_names(self): - otypes = self.output_module.output_types - if getattr(self.input_module, 'export_cache_support', False): - in_types = self.input_module.output_types - otypes = {n: t for (n, t) in list(otypes.items())[:1]} - for (n, t) in list(in_types.items())[1:]: - otypes[n] = t - return get_io_names(otypes, self.disabled_deployment_output_names) - - def forward_for_export( - self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ): - """ - This forward is used when we need to export the model to ONNX format. - Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. - Args: - input: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps. - length: Vector of length B, that contains the individual lengths of the audio sequences. 
- cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers - cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers - N is the number of such layers which need caching, B is batch size, H is the hidden size of activations, - and T is the length of the cache - - Returns: - the output of the model - """ - enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward) - if cache_last_channel is None: - encoder_output = enc_fun(audio_signal=input, length=length) - if isinstance(encoder_output, tuple): - encoder_output = encoder_output[0] - else: - encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun( - audio_signal=input, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward) - ret = dec_fun(encoder_output=encoder_output) - if isinstance(ret, tuple): - ret = ret[0] - if cache_last_channel is not None: - ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len) - return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32) - - @property - def disabled_deployment_input_names(self): - return self.encoder.disabled_deployment_input_names - - @property - def disabled_deployment_output_names(self): - return self.encoder.disabled_deployment_output_names - - def set_export_config(self, args): - if 'cache_support' in args: - enable = bool(args['cache_support']) - self.encoder.export_cache_support = enable - logging.info(f"Caching support enabled: {enable}") - self.encoder.setup_streaming_params() - super().set_export_config(args) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/audio_to_audio_model.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/audio_to_audio_model.py deleted file mode 100644 index 49364843e8b800f86797fa520b7aaa8969d97ae8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/audio_to_audio_model.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod -from typing import List, Union - -import hydra -import torch -from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer - -from nemo.collections.asr.metrics.audio import AudioMetricWrapper -from nemo.core.classes import ModelPT -from nemo.utils import logging, model_utils - -__all__ = ['AudioToAudioModel'] - - -class AudioToAudioModel(ModelPT, ABC): - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - super().__init__(cfg=cfg, trainer=trainer) - - self._setup_loss() - - def _setup_loss(self): - """Setup loss for this model. 
- """ - self.loss = AudioToAudioModel.from_config_dict(self._cfg.loss) - - def _get_num_dataloaders(self, tag: str = 'val'): - if tag == 'val': - num_dataloaders = len(self._validation_dl) if isinstance(self._validation_dl, List) else 1 - elif tag == 'test': - num_dataloaders = len(self._test_dl) if isinstance(self._test_dl, List) else 1 - else: - raise ValueError(f'Unexpected tag {tag}.') - - return num_dataloaders - - def _setup_metrics(self, tag: str = 'val'): - """Setup metrics for this model for all available dataloaders. - - When using multiple DataLoaders, it is recommended to initialize separate modular - metric instances for each DataLoader and use them separately. - - Reference: - - https://torchmetrics.readthedocs.io/en/stable/pages/lightning.html#common-pitfalls - """ - # Number of currently configured dataloaders - num_dataloaders = self._get_num_dataloaders(tag) - logging.debug('Found %d dataloaders for %s', num_dataloaders, tag) - - if hasattr(self, 'metrics'): - if tag in self.metrics and len(self.metrics[tag]) == num_dataloaders: - # Exact number of metrics have already been configured, nothing else to do - logging.debug('Found %d metrics for tag %s, not necesary to initialize again', num_dataloaders, tag) - return - - if self.cfg.get('metrics') is None: - # Metrics are not available in the configuration, nothing to do - logging.debug('No metrics configured in model.metrics') - return - - if (metrics_cfg := self.cfg['metrics'].get(tag)) is None: - # Metrics configuration is not available in the configuration, nothing to do - logging.debug('No metrics configured for %s in model.metrics', tag) - return - - if 'loss' in metrics_cfg: - raise ValueError( - f'Loss is automatically included in the metrics, it should not be specified in model.metrics.{tag}.' 
- ) - - # Initialize metrics - if not hasattr(self, 'metrics'): - self.metrics = torch.nn.ModuleDict() - - # Setup metrics for each dataloader - self.metrics[tag] = torch.nn.ModuleList() - for dataloader_idx in range(num_dataloaders): - metrics_dataloader_idx = {} - for name, cfg in metrics_cfg.items(): - logging.debug('Initialize %s for dataloader_idx %s', name, dataloader_idx) - cfg_dict = OmegaConf.to_container(cfg) - cfg_channel = cfg_dict.pop('channel', None) - cfg_batch_averaging = cfg_dict.pop('metric_using_batch_averaging', None) - metrics_dataloader_idx[name] = AudioMetricWrapper( - metric=hydra.utils.instantiate(cfg_dict), - channel=cfg_channel, - metric_using_batch_averaging=cfg_batch_averaging, - ) - - metrics_dataloader_idx = torch.nn.ModuleDict(metrics_dataloader_idx) - self.metrics[tag].append(metrics_dataloader_idx.to(self.device)) - - logging.info( - 'Setup metrics for %s, dataloader %d: %s', tag, dataloader_idx, ', '.join(metrics_dataloader_idx) - ) - - @abstractmethod - def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - pass - - def on_validation_start(self): - self._setup_metrics('val') - return super().on_validation_start() - - def on_test_start(self): - self._setup_metrics('test') - return super().on_test_start() - - def validation_step(self, batch, batch_idx, dataloader_idx: int = 0): - output_dict = self.evaluation_step(batch, batch_idx, dataloader_idx, 'val') - if isinstance(self.trainer.val_dataloaders, (list, tuple)) and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(output_dict) - else: - self.validation_step_outputs.append(output_dict) - return output_dict - - def test_step(self, batch, batch_idx, dataloader_idx=0): - output_dict = self.evaluation_step(batch, batch_idx, dataloader_idx, 'test') - if isinstance(self.trainer.test_dataloaders, (list, tuple)) and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(output_dict) - else: - self.test_step_outputs.append(output_dict) - return output_dict - - def multi_evaluation_epoch_end(self, outputs, dataloader_idx: int = 0, tag: str = 'val'): - # Handle loss - loss_mean = torch.stack([x[f'{tag}_loss'] for x in outputs]).mean() - tensorboard_logs = {f'{tag}_loss': loss_mean} - - # Handle metrics for this tag and dataloader_idx - if hasattr(self, 'metrics') and tag in self.metrics: - for name, metric in self.metrics[tag][dataloader_idx].items(): - # Compute & reset the metric - value = metric.compute() - metric.reset() - # Store for logs - tensorboard_logs[f'{tag}_{name}'] = value - - return {f'{tag}_loss': loss_mean, 'log': tensorboard_logs} - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'val') - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'test') - - @abstractmethod - def process( - self, paths2audio_files: List[str], output_dir: str, batch_size: int = 4 - ) -> List[Union[str, List[str]]]: - """ - Takes paths to audio files and returns a list of paths to processed - audios. - - Args: - paths2audio_files: paths to audio files to be processed - output_dir: directory to save processed files - batch_size: batch size for inference - - Returns: - Paths to processed audio signals. 
- """ - pass - - @classmethod - def list_available_models(cls) -> 'List[PretrainedModelInfo]': - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - Returns: - List of available pre-trained models. - """ - # recursively walk the subclasses to generate pretrained model info - list_of_models = model_utils.resolve_subclass_pretrained_model_info(cls) - return list_of_models - - def setup_optimization_flags(self): - """ - Utility method that must be explicitly called by the subclass in order to support optional optimization flags. - This method is the only valid place to access self.cfg prior to DDP training occurs. - - The subclass may chose not to support this method, therefore all variables here must be checked via hasattr() - """ - # Skip update if nan/inf grads appear on any rank. - self._skip_nan_grad = False - if "skip_nan_grad" in self._cfg and self._cfg["skip_nan_grad"]: - self._skip_nan_grad = self._cfg["skip_nan_grad"] - - def on_after_backward(self): - """ - zero-out the gradients which any of them is NAN or INF - """ - super().on_after_backward() - - if hasattr(self, '_skip_nan_grad') and self._skip_nan_grad: - device = next(self.parameters()).device - valid_gradients = torch.tensor([1], device=device, dtype=torch.float32) - - # valid_gradients = True - for param_name, param in self.named_parameters(): - if param.grad is not None: - is_not_nan_or_inf = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()) - if not is_not_nan_or_inf: - valid_gradients = valid_gradients * 0 - break - - if torch.distributed.is_initialized(): - torch.distributed.all_reduce(valid_gradients, op=torch.distributed.ReduceOp.MIN) - - if valid_gradients < 1: - logging.warning(f'detected inf or nan values in gradients! Setting gradients to zero.') - self.zero_grad() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/classification_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/classification_models.py deleted file mode 100644 index 6f0b67e7a7ee0ec0c965e21646ad24b2f3e46a09..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/classification_models.py +++ /dev/null @@ -1,1203 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import os -import tempfile -from abc import abstractmethod -from math import ceil, floor -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf -from pytorch_lightning import Trainer -from torchmetrics import Accuracy -from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError - -from nemo.collections.asr.data import audio_to_label_dataset, feature_to_label_dataset -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.losses import CrossEntropyLoss, MSELoss -from nemo.collections.common.metrics import TopKClassificationAccuracy -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import * -from nemo.utils import logging, model_utils -from nemo.utils.cast_utils import cast_all - -__all__ = ['EncDecClassificationModel', 'EncDecRegressionModel'] - - -class _EncDecBaseModel(ASRModel, ExportableEncDecModel): - """Encoder decoder Classification models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_devices - - # Convert config to a DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - - # Convert config to support Hydra 1.0+ instantiation - cfg = model_utils.maybe_update_config_version(cfg) - - self.is_regression_task = cfg.get('is_regression_task', False) - # Change labels if needed - self._update_decoder_config(cfg.labels, cfg.decoder) - super().__init__(cfg=cfg, trainer=trainer) - - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = ASRModel.from_config_dict(self._cfg.spec_augment) - else: - self.spec_augmentation = None - if hasattr(self._cfg, 'crop_or_pad_augment') and self._cfg.crop_or_pad_augment is not None: - self.crop_or_pad = ASRModel.from_config_dict(self._cfg.crop_or_pad_augment) - else: - self.crop_or_pad = None - - self.preprocessor = self._setup_preprocessor() - self.encoder = self._setup_encoder() - self.decoder = self._setup_decoder() - self.loss = self._setup_loss() - self._setup_metrics() - - @abstractmethod - def _setup_preprocessor(self): - """ - Setup preprocessor for audio data - Returns: Preprocessor - - """ - pass - - @abstractmethod - def _setup_encoder(self): - """ - Setup encoder for the Encoder-Decoder network - Returns: Encoder - """ - pass - - @abstractmethod - def _setup_decoder(self): - """ - Setup decoder for the Encoder-Decoder network - Returns: Decoder - """ - pass - - @abstractmethod - def _setup_loss(self): - """ - Setup loss function for training - Returns: Loss function - - """ - pass - - @abstractmethod - def _setup_metrics(self): - """ - Setup metrics to be tracked in addition to loss - Returns: void - - """ - pass - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - audio_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - audio_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), audio_eltype, 
optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - @abstractmethod - def output_types(self) -> Optional[Dict[str, NeuralType]]: - pass - - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_length`` arguments." - ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - # Crop or pad is always applied - if self.crop_or_pad is not None: - processed_signal, processed_signal_length = self.crop_or_pad( - input_signal=processed_signal, length=processed_signal_length - ) - # Spec augment is not applied during evaluation/testing - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - logits = self.decoder(encoder_output=encoded) - return logits - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=DictConfig(train_data_config)) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
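# --- Editorial sketch (not part of the original diff) ---------------------------------
# Worked example (values assumed) of the rescaling performed in the branch that follows:
# with an IterableDataset, Lightning sees the dataloader length as a *sample* count, so a
# float limit_train_batches is converted into a whole number of batches per worker.
from math import ceil
limit_train_batches, num_samples, world_size, batch_size = 1.0, 100_000, 4, 32
steps = int(limit_train_batches * ceil((num_samples / world_size) / batch_size))
assert steps == 782   # 100_000 samples / 4 workers / 32 samples per batch, rounded up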
- if isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=DictConfig(val_data_config)) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]], use_feat: bool = False): - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - if use_feat and hasattr(self, '_setup_feature_label_dataloader'): - self._test_dl = self._setup_feature_label_dataloader(config=DictConfig(test_data_config)) - else: - self._test_dl = self._setup_dataloader_from_config(config=DictConfig(test_data_config)) - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - def _setup_dataloader_from_config(self, config: DictConfig): - - OmegaConf.set_struct(config, False) - config.is_regression_task = self.is_regression_task - OmegaConf.set_struct(config, True) - - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - - featurizer = WaveformFeaturizer( - sample_rate=config['sample_rate'], int_values=config.get('int_values', False), augmentor=augmentor - ) - shuffle = config['shuffle'] - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` is None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - if 'vad_stream' in config and config['vad_stream']: - logging.warning("VAD inference does not support tarred dataset now") - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = audio_to_label_dataset.get_tarred_classification_label_dataset( - featurizer=featurizer, - config=config, - shuffle_n=shuffle_n, - global_rank=self.global_rank, - world_size=self.world_size, - ) - shuffle = False - batch_size = config['batch_size'] - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` is None. 
Provided config : {config}") - return None - - if 'vad_stream' in config and config['vad_stream']: - logging.info("Perform streaming frame-level VAD") - dataset = audio_to_label_dataset.get_speech_label_dataset(featurizer=featurizer, config=config) - batch_size = 1 - collate_fn = dataset.vad_frame_seq_collate_fn - else: - dataset = audio_to_label_dataset.get_classification_label_dataset(featurizer=featurizer, config=config) - batch_size = config['batch_size'] - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_feature_label_dataloader(self, config: DictConfig) -> torch.utils.data.DataLoader: - """ - setup dataloader for VAD inference with audio features as input - """ - - OmegaConf.set_struct(config, False) - config.is_regression_task = self.is_regression_task - OmegaConf.set_struct(config, True) - - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` is None. Provided config : {config}") - return None - - dataset = feature_to_label_dataset.get_feature_label_dataset(config=config, augmentor=augmentor) - if 'vad_stream' in config and config['vad_stream']: - collate_func = dataset._vad_segment_collate_fn - batch_size = 1 - shuffle = False - else: - collate_func = dataset._collate_fn - batch_size = config['batch_size'] - shuffle = config['shuffle'] - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=collate_func, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - @torch.no_grad() - def transcribe(self, paths2audio_files: List[str], batch_size: int = 4, logprobs=False) -> List[str]: - """ - Generate class labels for provided audio files. Use this method for debugging and prototyping. - - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is approximately 1 second. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of class labels. 
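# --- Editorial sketch (not part of the original diff) ---------------------------------
# Hypothetical usage of transcribe() documented here. "vad_marblenet" is one of the
# pretrained names listed further below; the wav paths are placeholders.
#
#     model = EncDecClassificationModel.from_pretrained("vad_marblenet")
#     labels = model.transcribe(["clip_000.wav", "clip_001.wav"], batch_size=4)
#     logits = model.transcribe(["clip_000.wav"], batch_size=1, logprobs=True)
#     # `labels` holds predicted class labels; `logits` holds raw per-file log-probability arrays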
- - Returns: - - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return [] - # We will store transcriptions here - labels = [] - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - if hasattr(self.preprocessor.featurizer, 'dither'): - dither_value = self.preprocessor.featurizer.dither - if hasattr(self.preprocessor.featurizer, 'pad_to'): - pad_to_value = self.preprocessor.featurizer.pad_to - - try: - if hasattr(self.preprocessor.featurizer, 'dither'): - self.preprocessor.featurizer.dither = 0.0 - if hasattr(self.preprocessor.featurizer, 'pad_to'): - self.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - self.eval() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - label = 0.0 if self.is_regression_task else self.cfg.labels[0] - entry = {'audio_filepath': audio_file, 'duration': 100000.0, 'label': label} - fp.write(json.dumps(entry) + '\n') - - config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in temporary_datalayer: - logits = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - if logprobs: - # dump log probs per file - for idx in range(logits.shape[0]): - lg = logits[idx] - labels.append(lg.cpu().numpy()) - else: - labels_k = [] - top_ks = self._accuracy.top_k - for top_k_i in top_ks: - # replace top k value with current top k - self._accuracy.top_k = top_k_i - labels_k_i = self._accuracy.top_k_predicted_labels(logits) - labels_k_i = labels_k_i.cpu() - labels_k.append(labels_k_i) - - # convenience: if only one top_k, pop out the nested list - if len(top_ks) == 1: - labels_k = labels_k[0] - - labels += labels_k - # reset top k to orignal value - self._accuracy.top_k = top_ks - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - - if hasattr(self.preprocessor.featurizer, 'dither'): - self.preprocessor.featurizer.dither = dither_value - if hasattr(self.preprocessor.featurizer, 'pad_to'): - self.preprocessor.featurizer.pad_to = pad_to_value - logging.set_verbosity(logging_level) - return labels - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - - Returns: - A pytorch DataLoader for the given audio file(s). 
- """ - dl_config = { - 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), - 'sample_rate': self.preprocessor._sample_rate, - 'labels': self.cfg.labels, - 'batch_size': min(config['batch_size'], len(config['paths2audio_files'])), - 'trim_silence': False, - 'shuffle': False, - } - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - @abstractmethod - def _update_decoder_config(self, labels, cfg): - pass - - -class EncDecClassificationModel(_EncDecBaseModel): - """Encoder decoder Classification models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - if cfg.get("is_regression_task", False): - raise ValueError(f"EndDecClassificationModel requires the flag is_regression_task to be set as false") - - super().__init__(cfg=cfg, trainer=trainer) - - def _setup_preprocessor(self): - return EncDecClassificationModel.from_config_dict(self._cfg.preprocessor) - - def _setup_encoder(self): - return EncDecClassificationModel.from_config_dict(self._cfg.encoder) - - def _setup_decoder(self): - return EncDecClassificationModel.from_config_dict(self._cfg.decoder) - - def _setup_loss(self): - return CrossEntropyLoss() - - def _setup_metrics(self): - self._accuracy = TopKClassificationAccuracy(dist_sync_on_step=True) - - @classmethod - def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="vad_multilingual_marblenet", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_marblenet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="vad_telephony_marblenet", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_telephony_marblenet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_telephony_marblenet/versions/1.0.0rc1/files/vad_telephony_marblenet.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="vad_marblenet", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:vad_marblenet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_marblenet/versions/1.0.0rc1/files/vad_marblenet.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x1x64_v1", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v1", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v1/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v1.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x2x64_v1", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v1", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v1/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v1.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x1x64_v2", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v2/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v2.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x2x64_v2", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v2/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v2.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x1x64_v2_subset_task", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x1x64_v2_subset_task", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x1x64_v2_subset_task/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x1x64_v2_subset_task.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="commandrecognition_en_matchboxnet3x2x64_v2_subset_task", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:commandrecognition_en_matchboxnet3x2x64_v2_subset_task", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/commandrecognition_en_matchboxnet3x2x64_v2_subset_task/versions/1.0.0rc1/files/commandrecognition_en_matchboxnet3x2x64_v2_subset_task.nemo", - ) - results.append(model) - return results - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"outputs": NeuralType(('B', 'D'), LogitsType())} - - # PTL-specific methods - def training_step(self, batch, batch_nb): - audio_signal, audio_signal_len, labels, labels_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss_value = self.loss(logits=logits, labels=labels) - - self.log('train_loss', loss_value) - self.log('learning_rate', self._optimizer.param_groups[0]['lr']) - self.log('global_step', self.trainer.global_step) - - self._accuracy(logits=logits, labels=labels) - topk_scores = self._accuracy.compute() - self._accuracy.reset() - - for top_k, score in zip(self._accuracy.top_k, topk_scores): - self.log('training_batch_accuracy_top_{}'.format(top_k), score) - - return { - 'loss': loss_value, - } - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - audio_signal, audio_signal_len, labels, labels_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss_value = self.loss(logits=logits, labels=labels) - acc = self._accuracy(logits=logits, labels=labels) - correct_counts, total_counts = self._accuracy.correct_counts_k, self._accuracy.total_counts_k - loss = { - 'val_loss': loss_value, - 'val_correct_counts': correct_counts, - 'val_total_counts': total_counts, - 
'val_acc': acc, - } - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(loss) - else: - self.validation_step_outputs.append(loss) - return loss - - def test_step(self, batch, batch_idx, dataloader_idx=0): - audio_signal, audio_signal_len, labels, labels_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss_value = self.loss(logits=logits, labels=labels) - acc = self._accuracy(logits=logits, labels=labels) - correct_counts, total_counts = self._accuracy.correct_counts_k, self._accuracy.total_counts_k - loss = { - 'test_loss': loss_value, - 'test_correct_counts': correct_counts, - 'test_total_counts': total_counts, - 'test_acc': acc, - } - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(loss) - else: - self.test_step_outputs.append(loss) - return loss - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - correct_counts = torch.stack([x['val_correct_counts'] for x in outputs]).sum(axis=0) - total_counts = torch.stack([x['val_total_counts'] for x in outputs]).sum(axis=0) - - self._accuracy.correct_counts_k = correct_counts - self._accuracy.total_counts_k = total_counts - topk_scores = self._accuracy.compute() - self._accuracy.reset() - - tensorboard_log = {'val_loss': val_loss_mean} - for top_k, score in zip(self._accuracy.top_k, topk_scores): - tensorboard_log['val_epoch_top@{}'.format(top_k)] = score - - return {'log': tensorboard_log} - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - correct_counts = torch.stack([x['test_correct_counts'].unsqueeze(0) for x in outputs]).sum(axis=0) - total_counts = torch.stack([x['test_total_counts'].unsqueeze(0) for x in outputs]).sum(axis=0) - - self._accuracy.correct_counts_k = correct_counts - self._accuracy.total_counts_k = total_counts - topk_scores = self._accuracy.compute() - self._accuracy.reset() - - tensorboard_log = {'test_loss': test_loss_mean} - for top_k, score in zip(self._accuracy.top_k, topk_scores): - tensorboard_log['test_epoch_top@{}'.format(top_k)] = score - - return {'log': tensorboard_log} - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - logits = super().forward( - input_signal=input_signal, - input_signal_length=input_signal_length, - processed_signal=processed_signal, - processed_signal_length=processed_signal_length, - ) - return logits - - def change_labels(self, new_labels: List[str]): - """ - Changes labels used by the decoder model. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another dataset. - - If new_labels == self.decoder.vocabulary then nothing will be changed. - - Args: - - new_labels: list with new labels. Must contain at least 2 elements. Typically, \ - this is set of labels for the dataset. 
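# --- Editorial sketch (not part of the original diff) ---------------------------------
# Hypothetical fine-tuning flow for change_labels(). The checkpoint name is one of the
# pretrained models listed above; the label set and `train_ds_cfg` are assumptions made
# for illustration.
#
#     model = EncDecClassificationModel.from_pretrained("commandrecognition_en_matchboxnet3x1x64_v1")
#     model.change_labels(["yes", "no", "stop", "go"])   # rebuilds only the decoder head
#     model.setup_training_data(train_ds_cfg)            # train config should reference the new labels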
- - Returns: None - - """ - if new_labels is not None and not isinstance(new_labels, ListConfig): - new_labels = ListConfig(new_labels) - - if self._cfg.labels == new_labels: - logging.warning( - f"Old labels ({self._cfg.labels}) and new labels ({new_labels}) match. Not changing anything" - ) - else: - if new_labels is None or len(new_labels) == 0: - raise ValueError(f'New labels must be non-empty list of labels. But I got: {new_labels}') - - # Update config - self._cfg.labels = new_labels - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - self._update_decoder_config(new_labels, new_decoder_config) - del self.decoder - self.decoder = EncDecClassificationModel.from_config_dict(new_decoder_config) - - OmegaConf.set_struct(self._cfg.decoder, False) - self._cfg.decoder = new_decoder_config - OmegaConf.set_struct(self._cfg.decoder, True) - - if 'train_ds' in self._cfg and self._cfg.train_ds is not None: - self._cfg.train_ds.labels = new_labels - - if 'validation_ds' in self._cfg and self._cfg.validation_ds is not None: - self._cfg.validation_ds.labels = new_labels - - if 'test_ds' in self._cfg and self._cfg.test_ds is not None: - self._cfg.test_ds.labels = new_labels - - logging.info(f"Changed decoder output to {self.decoder.num_classes} labels.") - - def _update_decoder_config(self, labels, cfg): - """ - Update the number of classes in the decoder based on labels provided. - - Args: - labels: The current labels of the model - cfg: The config of the decoder which will be updated. - """ - OmegaConf.set_struct(cfg, False) - - if 'params' in cfg: - cfg.params.num_classes = len(labels) - else: - cfg.num_classes = len(labels) - - OmegaConf.set_struct(cfg, True) - - -class EncDecRegressionModel(_EncDecBaseModel): - """Encoder decoder class for speech regression models. - Model class creates training, validation methods for setting up data - performing model forward pass. - """ - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - Returns: - List of available pre-trained models. 
- """ - result = [] - - return result - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - if not cfg.get('is_regression_task', False): - raise ValueError(f"EndDecRegressionModel requires the flag is_regression_task to be set as true") - super().__init__(cfg=cfg, trainer=trainer) - - def _setup_preprocessor(self): - return EncDecRegressionModel.from_config_dict(self._cfg.preprocessor) - - def _setup_encoder(self): - return EncDecRegressionModel.from_config_dict(self._cfg.encoder) - - def _setup_decoder(self): - return EncDecRegressionModel.from_config_dict(self._cfg.decoder) - - def _setup_loss(self): - return MSELoss() - - def _setup_metrics(self): - self._mse = MeanSquaredError() - self._mae = MeanAbsoluteError() - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"preds": NeuralType(tuple('B'), RegressionValuesType())} - - @typecheck() - def forward(self, input_signal, input_signal_length): - logits = super().forward(input_signal=input_signal, input_signal_length=input_signal_length) - return logits.view(-1) - - # PTL-specific methods - def training_step(self, batch, batch_idx): - audio_signal, audio_signal_len, targets, targets_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss = self.loss(preds=logits, labels=targets) - train_mse = self._mse(preds=logits, target=targets) - train_mae = self._mae(preds=logits, target=targets) - - self.log_dict( - { - 'train_loss': loss, - 'train_mse': train_mse, - 'train_mae': train_mae, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - }, - ) - - return {'loss': loss} - - def validation_step(self, batch, batch_idx, dataloader_idx: int = 0): - audio_signal, audio_signal_len, targets, targets_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss_value = self.loss(preds=logits, labels=targets) - val_mse = self._mse(preds=logits, target=targets) - val_mae = self._mae(preds=logits, target=targets) - - return {'val_loss': loss_value, 'val_mse': val_mse, 'val_mae': val_mae} - - def test_step(self, batch, batch_idx, dataloader_idx: int = 0): - logs = self.validation_step(batch, batch_idx, dataloader_idx) - - return {'test_loss': logs['val_loss'], 'test_mse': logs['test_mse'], 'test_mae': logs['val_mae']} - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_mse = self._mse.compute() - self._mse.reset() - val_mae = self._mae.compute() - self._mae.reset() - - tensorboard_logs = {'val_loss': val_loss_mean, 'val_mse': val_mse, 'val_mae': val_mae} - - return {'val_loss': val_loss_mean, 'val_mse': val_mse, 'val_mae': val_mae, 'log': tensorboard_logs} - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - test_mse = self._mse.compute() - self._mse.reset() - test_mae = self._mae.compute() - self._mae.reset() - - tensorboard_logs = {'test_loss': test_loss_mean, 'test_mse': test_mse, 'test_mae': test_mae} - - return {'test_loss': test_loss_mean, 'test_mse': test_mse, 'test_mae': test_mae, 'log': tensorboard_logs} - - @torch.no_grad() - def transcribe(self, paths2audio_files: List[str], batch_size: int = 4) -> List[float]: - """ - Generate class labels for provided audio files. Use this method for debugging and prototyping. - - Args: - paths2audio_files: (a list) of paths to audio files. 
\ - Recommended length per file is approximately 1 second. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - - Returns: - - A list of predictions in the same order as paths2audio_files - """ - predictions = super().transcribe(paths2audio_files, batch_size, logprobs=True) - return [float(pred) for pred in predictions] - - def _update_decoder_config(self, labels, cfg): - - OmegaConf.set_struct(cfg, False) - - if 'params' in cfg: - cfg.params.num_classes = 1 - else: - cfg.num_classes = 1 - - OmegaConf.set_struct(cfg, True) - - -class EncDecFrameClassificationModel(EncDecClassificationModel): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"outputs": NeuralType(('B', 'T', 'C'), LogitsType())} - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - self.num_classes = len(cfg.labels) - self.eval_loop_cnt = 0 - self.ratio_threshold = cfg.get('ratio_threshold', 0.2) - super().__init__(cfg=cfg, trainer=trainer) - self.decoder.output_types = self.output_types - self.decoder.output_types_for_export = self.output_types - - @classmethod - def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: - results = [] - model = PretrainedModelInfo( - pretrained_model_name="vad_multilingual_frame_marblenet", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/vad_multilingual_frame_marblenet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_frame_marblenet/versions/1.20.0/files/vad_multilingual_frame_marblenet.nemo", - ) - results.append(model) - return results - - def _setup_metrics(self): - self._accuracy = TopKClassificationAccuracy(dist_sync_on_step=True) - self._macro_accuracy = Accuracy(num_classes=self.num_classes, average='macro', task="multiclass") - - def _setup_loss(self): - if "loss" in self.cfg: - weight = self.cfg.loss.get("weight", None) - if weight in [None, "none", "None"]: - weight = [1.0] * self.num_classes - logging.info(f"Using cross-entropy with weights: {weight}") - else: - weight = [1.0] * self.num_classes - return CrossEntropyLoss(logits_ndim=3, weight=weight) - - def _setup_dataloader_from_config(self, config: DictConfig): - OmegaConf.set_struct(config, False) - config.is_regression_task = self.is_regression_task - OmegaConf.set_struct(config, True) - shuffle = config.get('shuffle', False) - - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - raise ValueError( - "Could not load dataset as `manifest_filepath` is None or " - f"`tarred_audio_filepaths` is None. Provided cfg : {config}" - ) - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = audio_to_label_dataset.get_tarred_audio_multi_label_dataset( - cfg=config, shuffle_n=shuffle_n, global_rank=self.global_rank, world_size=self.world_size, - ) - shuffle = False - if hasattr(dataset, 'collate_fn'): - collate_func = dataset.collate_fn - else: - collate_func = dataset.datasets[0].collate_fn - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - raise ValueError(f"Could not load dataset as `manifest_filepath` is None. 
Provided cfg : {config}") - dataset = audio_to_label_dataset.get_audio_multi_label_dataset(config) - collate_func = dataset.collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config.get("batch_size", 1), - collate_fn=collate_func, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_feature_label_dataloader(self, config: DictConfig) -> torch.utils.data.DataLoader: - """ - setup dataloader for VAD inference with audio features as input - """ - - OmegaConf.set_struct(config, False) - config.is_regression_task = self.is_regression_task - OmegaConf.set_struct(config, True) - - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` is None. Provided config : {config}") - return None - - dataset = feature_to_label_dataset.get_feature_multi_label_dataset(config=config, augmentor=augmentor) - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config.get("batch_size", 1), - collate_fn=dataset.collate_fn, - drop_last=config.get('drop_last', False), - shuffle=config.get('shuffle', False), - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def get_label_masks(self, labels, labels_len): - mask = torch.arange(labels.size(1))[None, :].to(labels.device) < labels_len[:, None] - return mask.to(labels.device, dtype=bool) - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_length`` arguments." 
- ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - # Crop or pad is always applied - if self.crop_or_pad is not None: - processed_signal, processed_signal_length = self.crop_or_pad( - input_signal=processed_signal, length=processed_signal_length - ) - # Spec augment is not applied during evaluation/testing - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - logits = self.decoder(encoded.transpose(1, 2)) - return logits - - # PTL-specific methods - def training_step(self, batch, batch_idx): - audio_signal, audio_signal_len, labels, labels_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - labels, labels_len = self.reshape_labels(logits, labels, audio_signal_len, labels_len) - masks = self.get_label_masks(labels, labels_len) - - loss_value = self.loss(logits=logits, labels=labels, loss_mask=masks) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - metric_logits, metric_labels = self.get_metric_logits_labels(logits, labels, masks) - self._accuracy(logits=metric_logits, labels=metric_labels) - topk_scores = self._accuracy.compute() - self._accuracy.reset() - - for top_k, score in zip(self._accuracy.top_k, topk_scores): - tensorboard_logs[f'training_batch_accuracy_top@{top_k}'] = score - - return {'loss': loss_value, 'log': tensorboard_logs} - - def validation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - audio_signal, audio_signal_len, labels, labels_len = batch - logits = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - labels, labels_len = self.reshape_labels(logits, labels, audio_signal_len, labels_len) - masks = self.get_label_masks(labels, labels_len) - - loss_value = self.loss(logits=logits, labels=labels, loss_mask=masks) - - metric_logits, metric_labels = self.get_metric_logits_labels(logits, labels, masks) - - acc = self._accuracy(logits=metric_logits, labels=metric_labels) - correct_counts, total_counts = self._accuracy.correct_counts_k, self._accuracy.total_counts_k - - self._macro_accuracy.update(preds=metric_logits, target=metric_labels) - stats = self._macro_accuracy._final_state() - - return { - f'{tag}_loss': loss_value, - f'{tag}_correct_counts': correct_counts, - f'{tag}_total_counts': total_counts, - f'{tag}_acc_micro': acc, - f'{tag}_acc_stats': stats, - } - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, tag: str = 'val'): - val_loss_mean = torch.stack([x[f'{tag}_loss'] for x in outputs]).mean() - correct_counts = torch.stack([x[f'{tag}_correct_counts'] for x in outputs]).sum(axis=0) - total_counts = torch.stack([x[f'{tag}_total_counts'] for x in outputs]).sum(axis=0) - - self._accuracy.correct_counts_k = correct_counts - self._accuracy.total_counts_k = total_counts - topk_scores = self._accuracy.compute() - - self._macro_accuracy.tp = torch.stack([x[f'{tag}_acc_stats'][0] for x in outputs]).sum(axis=0) - self._macro_accuracy.fp = torch.stack([x[f'{tag}_acc_stats'][1] for x in outputs]).sum(axis=0) - self._macro_accuracy.tn = torch.stack([x[f'{tag}_acc_stats'][2] for x 
in outputs]).sum(axis=0) - self._macro_accuracy.fn = torch.stack([x[f'{tag}_acc_stats'][3] for x in outputs]).sum(axis=0) - macro_accuracy_score = self._macro_accuracy.compute() - - self._accuracy.reset() - self._macro_accuracy.reset() - - tensorboard_log = { - f'{tag}_loss': val_loss_mean, - f'{tag}_acc_macro': macro_accuracy_score, - } - - for top_k, score in zip(self._accuracy.top_k, topk_scores): - tensorboard_log[f'{tag}_acc_micro_top@{top_k}'] = score - - self.log_dict(tensorboard_log, sync_dist=True) - return tensorboard_log - - def test_step(self, batch, batch_idx, dataloader_idx=0): - return self.validation_step(batch, batch_idx, dataloader_idx, tag='test') - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_validation_epoch_end(outputs, dataloader_idx, tag='test') - - def reshape_labels(self, logits, labels, logits_len, labels_len): - """ - Reshape labels to match logits shape. For example, each label is expected to cover a 40ms frame, while each frme prediction from the - model covers 20ms. If labels are shorter than logits, labels are repeated, otherwise labels are folded and argmax is applied to obtain - the label of each frame. When lengths of labels and logits are not factors of each other, labels are truncated or padded with zeros. - The ratio_threshold=0.2 is used to determine whether to pad or truncate labels, where the value 0.2 is not important as in real cases the ratio - is very close to either ceil(ratio) or floor(ratio). We use 0.2 here for easier unit-testing. This implementation does not allow frame length - and label length that are not multiples of each other. - Args: - logits: logits tensor with shape [B, T1, C] - labels: labels tensor with shape [B, T2] - logits_len: logits length tensor with shape [B] - labels_len: labels length tensor with shape [B] - Returns: - labels: labels tensor with shape [B, T1] - labels_len: labels length tensor with shape [B] - """ - logits_max_len = logits.size(1) - labels_max_len = labels.size(1) - batch_size = logits.size(0) - if logits_max_len < labels_max_len: - ratio = labels_max_len // logits_max_len - res = labels_max_len % logits_max_len - if ceil(ratio) - ratio < self.ratio_threshold: # e.g., ratio is 1.99 - # pad labels with zeros until labels_max_len is a multiple of logits_max_len - labels = labels.cpu().tolist() - if len(labels) % ceil(ratio) != 0: - labels += [0] * (ceil(ratio) - len(labels) % ceil(ratio)) - labels = torch.tensor(labels).long().to(logits.device) - labels = labels.view(-1, ceil(ratio)).amax(1) - return self.reshape_labels(logits, labels, logits_len, labels_len) - else: - # truncate additional labels until labels_max_len is a multiple of logits_max_len - if res > 0: - labels = labels[:, :-res] - mask = labels_len > (labels_max_len - res) - labels_len = labels_len - mask * (labels_len - (labels_max_len - res)) - labels = labels.view(batch_size, ratio, -1).amax(1) - labels_len = torch.div(labels_len, ratio, rounding_mode="floor") - labels_len = torch.min(torch.cat([logits_len[:, None], labels_len[:, None]], dim=1), dim=1)[0] - return labels.contiguous(), labels_len.contiguous() - elif logits_max_len > labels_max_len: - ratio = logits_max_len / labels_max_len - res = logits_max_len % labels_max_len - if ceil(ratio) - ratio < self.ratio_threshold: # e.g., ratio is 1.99 - # repeat labels for ceil(ratio) times, and DROP additional labels based on logits_max_len - labels = labels.repeat_interleave(ceil(ratio), dim=1).long() - labels = labels[:, :logits_max_len] - 
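# --- Editorial note (not part of the original diff) ------------------------------------
# Worked example of this branch, with assumed shapes: logits_max_len=10 and
# labels_max_len=5 give ratio=2.0, so ceil(ratio) - ratio = 0 < ratio_threshold and each
# 40 ms label is repeat_interleave'd to cover two 20 ms frames:
#     labels [a, b, c, d, e]  ->  [a, a, b, b, c, c, d, d, e, e]
# The result is then truncated to logits_max_len (a no-op here) and, on the next line,
# labels_len is scaled by the same factor before being capped at logits_len.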
labels_len = labels_len * ceil(ratio) - mask = labels_len > logits_max_len - labels_len = labels_len - mask * (labels_len - logits_max_len) - else: # e.g., ratio is 2.01 - # repeat labels for floor(ratio) times, and ADD padding labels based on logits_max_len - labels = labels.repeat_interleave(floor(ratio), dim=1).long() - labels_len = labels_len * floor(ratio) - if res > 0: - labels = torch.cat([labels, labels[:, -res:]], dim=1) - # no need to update `labels_len` since we ignore additional "res" padded labels - labels_len = torch.min(torch.cat([logits_len[:, None], labels_len[:, None]], dim=1), dim=1)[0] - return labels.contiguous(), labels_len.contiguous() - else: - labels_len = torch.min(torch.cat([logits_len[:, None], labels_len[:, None]], dim=1), dim=1)[0] - return labels, labels_len - - def get_metric_logits_labels(self, logits, labels, masks): - """ - Computes valid logits and labels for metric computation. - Args: - logits: tensor of shape [B, T, C] - labels: tensor of shape [B, T] - masks: tensor of shape [B, T] - Returns: - logits of shape [N, C] - labels of shape [N,] - """ - C = logits.size(2) - logits = logits.view(-1, C) # [BxT, C] - labels = labels.view(-1).contiguous() # [BxT,] - masks = masks.view(-1) # [BxT,] - idx = masks.nonzero() # [BxT, 1] - - logits = logits.gather(dim=0, index=idx.repeat(1, 2)) - labels = labels.gather(dim=0, index=idx.view(-1)) - - return logits, labels - - def forward_for_export( - self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ): - """ - This forward is used when we need to export the model to ONNX format. - Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. - Args: - input: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps. - length: Vector of length B, that contains the individual lengths of the audio sequences. 
- cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers - cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers - N is the number of such layers which need caching, B is batch size, H is the hidden size of activations, - and T is the length of the cache - - Returns: - the output of the model - """ - enc_fun = getattr(self.input_module, 'forward_for_export', self.input_module.forward) - if cache_last_channel is None: - encoder_output = enc_fun(audio_signal=input, length=length) - if isinstance(encoder_output, tuple): - encoder_output = encoder_output[0] - else: - encoder_output, length, cache_last_channel, cache_last_time, cache_last_channel_len = enc_fun( - audio_signal=input, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - dec_fun = getattr(self.output_module, 'forward_for_export', self.output_module.forward) - ret = dec_fun(hidden_states=encoder_output.transpose(1, 2)) - if isinstance(ret, tuple): - ret = ret[0] - if cache_last_channel is not None: - ret = (ret, length, cache_last_channel, cache_last_time, cache_last_channel_len) - return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/clustering_diarizer.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/clustering_diarizer.py deleted file mode 100644 index 08f4255deac0d972b685bfd0dc7e2c90bac7ef62..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/clustering_diarizer.py +++ /dev/null @@ -1,559 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
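# --- Editorial sketch (not part of the original diff) ---------------------------------
# Hypothetical offline-diarization usage of the ClusteringDiarizer defined in this file.
# The config path is a placeholder; the config must supply the keys this class reads
# (cfg.diarizer.*, cfg.sample_rate, cfg.num_workers, cfg.batch_size, cfg.device, ...).
#
#     from omegaconf import OmegaConf
#     cfg = OmegaConf.load("diar_infer.yaml")
#     diarizer = ClusteringDiarizer(cfg=cfg)
#     diarizer.diarize()   # VAD -> segmentation -> speaker embeddings -> clustering -> RTTM/scoring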
- -import json -import os -import pickle as pkl -import shutil -import tarfile -import tempfile -from copy import deepcopy -from typing import Any, List, Optional, Union - -import torch -from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.utilities import rank_zero_only -from tqdm import tqdm - -from nemo.collections.asr.metrics.der import score_labels -from nemo.collections.asr.models.classification_models import EncDecClassificationModel -from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel -from nemo.collections.asr.parts.mixins.mixins import DiarizationMixin -from nemo.collections.asr.parts.utils.speaker_utils import ( - audio_rttm_map, - get_embs_and_timestamps, - get_uniqname_from_filepath, - parse_scale_configs, - perform_clustering, - segments_manifest_to_subsegments_manifest, - validate_vad_manifest, - write_rttm2manifest, -) -from nemo.collections.asr.parts.utils.vad_utils import ( - generate_overlap_vad_seq, - generate_vad_segment_table, - get_vad_stream_status, - prepare_manifest, -) -from nemo.core.classes import Model -from nemo.utils import logging, model_utils - -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - -__all__ = ['ClusteringDiarizer'] - -_MODEL_CONFIG_YAML = "model_config.yaml" -_VAD_MODEL = "vad_model.nemo" -_SPEAKER_MODEL = "speaker_model.nemo" - - -def get_available_model_names(class_name): - "lists available pretrained model names from NGC" - available_models = class_name.list_available_models() - return list(map(lambda x: x.pretrained_model_name, available_models)) - - -class ClusteringDiarizer(torch.nn.Module, Model, DiarizationMixin): - """ - Inference model Class for offline speaker diarization. - This class handles required functionality for diarization : Speech Activity Detection, Segmentation, - Extract Embeddings, Clustering, Resegmentation and Scoring. 
- All the parameters are passed through config file - """ - - def __init__(self, cfg: Union[DictConfig, Any], speaker_model=None): - super().__init__() - if isinstance(cfg, DictConfig): - cfg = model_utils.convert_model_config_to_dict_config(cfg) - # Convert config to support Hydra 1.0+ instantiation - cfg = model_utils.maybe_update_config_version(cfg) - self._cfg = cfg - - # Diarizer set up - self._diarizer_params = self._cfg.diarizer - - # init vad model - self.has_vad_model = False - if not self._diarizer_params.oracle_vad: - if self._cfg.diarizer.vad.model_path is not None: - self._vad_params = self._cfg.diarizer.vad.parameters - self._init_vad_model() - - # init speaker model - self.multiscale_embeddings_and_timestamps = {} - self._init_speaker_model(speaker_model) - self._speaker_params = self._cfg.diarizer.speaker_embeddings.parameters - - # Clustering params - self._cluster_params = self._diarizer_params.clustering.parameters - - @classmethod - def list_available_models(cls): - pass - - def _init_vad_model(self): - """ - Initialize VAD model with model name or path passed through config - """ - model_path = self._cfg.diarizer.vad.model_path - if model_path.endswith('.nemo'): - self._vad_model = EncDecClassificationModel.restore_from(model_path, map_location=self._cfg.device) - logging.info("VAD model loaded locally from {}".format(model_path)) - else: - if model_path not in get_available_model_names(EncDecClassificationModel): - logging.warning( - "requested {} model name not available in pretrained models, instead".format(model_path) - ) - model_path = "vad_telephony_marblenet" - logging.info("Loading pretrained {} model from NGC".format(model_path)) - self._vad_model = EncDecClassificationModel.from_pretrained( - model_name=model_path, map_location=self._cfg.device - ) - self._vad_window_length_in_sec = self._vad_params.window_length_in_sec - self._vad_shift_length_in_sec = self._vad_params.shift_length_in_sec - self.has_vad_model = True - - def _init_speaker_model(self, speaker_model=None): - """ - Initialize speaker embedding model with model name or path passed through config - """ - if speaker_model is not None: - self._speaker_model = speaker_model - else: - model_path = self._cfg.diarizer.speaker_embeddings.model_path - if model_path is not None and model_path.endswith('.nemo'): - self._speaker_model = EncDecSpeakerLabelModel.restore_from(model_path, map_location=self._cfg.device) - logging.info("Speaker Model restored locally from {}".format(model_path)) - elif model_path.endswith('.ckpt'): - self._speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint( - model_path, map_location=self._cfg.device - ) - logging.info("Speaker Model restored locally from {}".format(model_path)) - else: - if model_path not in get_available_model_names(EncDecSpeakerLabelModel): - logging.warning( - "requested {} model name not available in pretrained models, instead".format(model_path) - ) - model_path = "ecapa_tdnn" - logging.info("Loading pretrained {} model from NGC".format(model_path)) - self._speaker_model = EncDecSpeakerLabelModel.from_pretrained( - model_name=model_path, map_location=self._cfg.device - ) - - self.multiscale_args_dict = parse_scale_configs( - self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec, - self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec, - self._diarizer_params.speaker_embeddings.parameters.multiscale_weights, - ) - - def _setup_vad_test_data(self, manifest_vad_input): - vad_dl_config = { - 'manifest_filepath': 
manifest_vad_input, - 'sample_rate': self._cfg.sample_rate, - 'batch_size': self._cfg.get('batch_size'), - 'vad_stream': True, - 'labels': ['infer',], - 'window_length_in_sec': self._vad_window_length_in_sec, - 'shift_length_in_sec': self._vad_shift_length_in_sec, - 'trim_silence': False, - 'num_workers': self._cfg.num_workers, - } - self._vad_model.setup_test_data(test_data_config=vad_dl_config) - - def _setup_spkr_test_data(self, manifest_file): - spk_dl_config = { - 'manifest_filepath': manifest_file, - 'sample_rate': self._cfg.sample_rate, - 'batch_size': self._cfg.get('batch_size'), - 'trim_silence': False, - 'labels': None, - 'num_workers': self._cfg.num_workers, - } - self._speaker_model.setup_test_data(spk_dl_config) - - def _run_vad(self, manifest_file): - """ - Run voice activity detection. - Get log probability of voice activity detection and smoothes using the post processing parameters. - Using generated frame level predictions generated manifest file for later speaker embedding extraction. - input: - manifest_file (str) : Manifest file containing path to audio file and label as infer - - """ - - shutil.rmtree(self._vad_dir, ignore_errors=True) - os.makedirs(self._vad_dir) - - self._vad_model.eval() - - time_unit = int(self._vad_window_length_in_sec / self._vad_shift_length_in_sec) - trunc = int(time_unit / 2) - trunc_l = time_unit - trunc - all_len = 0 - data = [] - for line in open(manifest_file, 'r', encoding='utf-8'): - file = json.loads(line)['audio_filepath'] - data.append(get_uniqname_from_filepath(file)) - - status = get_vad_stream_status(data) - for i, test_batch in enumerate( - tqdm(self._vad_model.test_dataloader(), desc='vad', leave=True, disable=not self.verbose) - ): - test_batch = [x.to(self._vad_model.device) for x in test_batch] - with autocast(): - log_probs = self._vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) - probs = torch.softmax(log_probs, dim=-1) - pred = probs[:, 1] - if status[i] == 'start': - to_save = pred[:-trunc] - elif status[i] == 'next': - to_save = pred[trunc:-trunc_l] - elif status[i] == 'end': - to_save = pred[trunc_l:] - else: - to_save = pred - all_len += len(to_save) - outpath = os.path.join(self._vad_dir, data[i] + ".frame") - with open(outpath, "a", encoding='utf-8') as fout: - for f in range(len(to_save)): - fout.write('{0:0.4f}\n'.format(to_save[f])) - del test_batch - if status[i] == 'end' or status[i] == 'single': - all_len = 0 - - if not self._vad_params.smoothing: - # Shift the window by 10ms to generate the frame and use the prediction of the window to represent the label for the frame; - self.vad_pred_dir = self._vad_dir - frame_length_in_sec = self._vad_shift_length_in_sec - else: - # Generate predictions with overlapping input segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple segments. 
- # smoothing_method would be either in majority vote (median) or average (mean) - logging.info("Generating predictions with overlapping input segments") - smoothing_pred_dir = generate_overlap_vad_seq( - frame_pred_dir=self._vad_dir, - smoothing_method=self._vad_params.smoothing, - overlap=self._vad_params.overlap, - window_length_in_sec=self._vad_window_length_in_sec, - shift_length_in_sec=self._vad_shift_length_in_sec, - num_workers=self._cfg.num_workers, - ) - self.vad_pred_dir = smoothing_pred_dir - frame_length_in_sec = 0.01 - - logging.info("Converting frame level prediction to speech/no-speech segment in start and end times format.") - - vad_params = self._vad_params if isinstance(self._vad_params, (DictConfig, dict)) else self._vad_params.dict() - table_out_dir = generate_vad_segment_table( - vad_pred_dir=self.vad_pred_dir, - postprocessing_params=vad_params, - frame_length_in_sec=frame_length_in_sec, - num_workers=self._cfg.num_workers, - out_dir=self._vad_dir, - ) - - AUDIO_VAD_RTTM_MAP = {} - for key in self.AUDIO_RTTM_MAP: - if os.path.exists(os.path.join(table_out_dir, key + ".txt")): - AUDIO_VAD_RTTM_MAP[key] = deepcopy(self.AUDIO_RTTM_MAP[key]) - AUDIO_VAD_RTTM_MAP[key]['rttm_filepath'] = os.path.join(table_out_dir, key + ".txt") - else: - logging.warning(f"no vad file found for {key} due to zero or negative duration") - - write_rttm2manifest(AUDIO_VAD_RTTM_MAP, self._vad_out_file) - self._speaker_manifest_path = self._vad_out_file - - def _run_segmentation(self, window: float, shift: float, scale_tag: str = ''): - - self.subsegments_manifest_path = os.path.join(self._speaker_dir, f'subsegments{scale_tag}.json') - logging.info( - f"Subsegmentation for embedding extraction:{scale_tag.replace('_',' ')}, {self.subsegments_manifest_path}" - ) - self.subsegments_manifest_path = segments_manifest_to_subsegments_manifest( - segments_manifest_file=self._speaker_manifest_path, - subsegments_manifest_file=self.subsegments_manifest_path, - window=window, - shift=shift, - ) - return None - - def _perform_speech_activity_detection(self): - """ - Checks for type of speech activity detection from config. Choices are NeMo VAD, - external vad manifest and oracle VAD (generates speech activity labels from provided RTTM files) - """ - if self.has_vad_model: - self._auto_split = True - self._split_duration = 50 - manifest_vad_input = self._diarizer_params.manifest_filepath - - if self._auto_split: - logging.info("Split long audio file to avoid CUDA memory issue") - logging.debug("Try smaller split_duration if you still have CUDA memory issue") - config = { - 'input': manifest_vad_input, - 'window_length_in_sec': self._vad_window_length_in_sec, - 'split_duration': self._split_duration, - 'num_workers': self._cfg.num_workers, - 'out_dir': self._diarizer_params.out_dir, - } - manifest_vad_input = prepare_manifest(config) - else: - logging.warning( - "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it." 
- ) - - self._setup_vad_test_data(manifest_vad_input) - self._run_vad(manifest_vad_input) - - elif self._diarizer_params.vad.external_vad_manifest is not None: - self._speaker_manifest_path = self._diarizer_params.vad.external_vad_manifest - elif self._diarizer_params.oracle_vad: - self._speaker_manifest_path = os.path.join(self._speaker_dir, 'oracle_vad_manifest.json') - self._speaker_manifest_path = write_rttm2manifest(self.AUDIO_RTTM_MAP, self._speaker_manifest_path) - else: - raise ValueError( - "Only one of diarizer.oracle_vad, vad.model_path or vad.external_vad_manifest must be passed from config" - ) - validate_vad_manifest(self.AUDIO_RTTM_MAP, vad_manifest=self._speaker_manifest_path) - - def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int): - """ - This method extracts speaker embeddings from segments passed through manifest_file - Optionally you may save the intermediate speaker embeddings for debugging or any use. - """ - logging.info("Extracting embeddings for Diarization") - self._setup_spkr_test_data(manifest_file) - self.embeddings = {} - self._speaker_model.eval() - self.time_stamps = {} - - all_embs = torch.empty([0]) - for test_batch in tqdm( - self._speaker_model.test_dataloader(), - desc=f'[{scale_idx+1}/{num_scales}] extract embeddings', - leave=True, - disable=not self.verbose, - ): - test_batch = [x.to(self._speaker_model.device) for x in test_batch] - audio_signal, audio_signal_len, labels, slices = test_batch - with autocast(): - _, embs = self._speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - emb_shape = embs.shape[-1] - embs = embs.view(-1, emb_shape) - all_embs = torch.cat((all_embs, embs.cpu().detach()), dim=0) - del test_batch - - with open(manifest_file, 'r', encoding='utf-8') as manifest: - for i, line in enumerate(manifest.readlines()): - line = line.strip() - dic = json.loads(line) - uniq_name = get_uniqname_from_filepath(dic['audio_filepath']) - if uniq_name in self.embeddings: - self.embeddings[uniq_name] = torch.cat((self.embeddings[uniq_name], all_embs[i].view(1, -1))) - else: - self.embeddings[uniq_name] = all_embs[i].view(1, -1) - if uniq_name not in self.time_stamps: - self.time_stamps[uniq_name] = [] - start = dic['offset'] - end = start + dic['duration'] - self.time_stamps[uniq_name].append([start, end]) - - if self._speaker_params.save_embeddings: - embedding_dir = os.path.join(self._speaker_dir, 'embeddings') - if not os.path.exists(embedding_dir): - os.makedirs(embedding_dir, exist_ok=True) - - prefix = get_uniqname_from_filepath(manifest_file) - name = os.path.join(embedding_dir, prefix) - self._embeddings_file = name + f'_embeddings.pkl' - pkl.dump(self.embeddings, open(self._embeddings_file, 'wb')) - logging.info("Saved embedding files to {}".format(embedding_dir)) - - def path2audio_files_to_manifest(self, paths2audio_files, manifest_filepath): - with open(manifest_filepath, 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - audio_file = audio_file.strip() - entry = {'audio_filepath': audio_file, 'offset': 0.0, 'duration': None, 'text': '-', 'label': 'infer'} - fp.write(json.dumps(entry) + '\n') - - def diarize(self, paths2audio_files: List[str] = None, batch_size: int = 0): - """ - Diarize files provided thorugh paths2audio_files or manifest file - input: - paths2audio_files (List[str]): list of paths to file containing audio file - batch_size (int): batch_size considered for extraction of speaker embeddings and VAD computation - """ - - self._out_dir 
= self._diarizer_params.out_dir - - self._speaker_dir = os.path.join(self._diarizer_params.out_dir, 'speaker_outputs') - - if os.path.exists(self._speaker_dir): - logging.warning("Deleting previous clustering diarizer outputs.") - shutil.rmtree(self._speaker_dir, ignore_errors=True) - os.makedirs(self._speaker_dir) - - if not os.path.exists(self._out_dir): - os.mkdir(self._out_dir) - - self._vad_dir = os.path.join(self._out_dir, 'vad_outputs') - self._vad_out_file = os.path.join(self._vad_dir, "vad_out.json") - - if batch_size: - self._cfg.batch_size = batch_size - - if paths2audio_files: - if type(paths2audio_files) is list: - self._diarizer_params.manifest_filepath = os.path.join(self._out_dir, 'paths2audio_filepath.json') - self.path2audio_files_to_manifest(paths2audio_files, self._diarizer_params.manifest_filepath) - else: - raise ValueError("paths2audio_files must be of type list of paths to file containing audio file") - - self.AUDIO_RTTM_MAP = audio_rttm_map(self._diarizer_params.manifest_filepath) - - out_rttm_dir = os.path.join(self._out_dir, 'pred_rttms') - os.makedirs(out_rttm_dir, exist_ok=True) - - # Speech Activity Detection - self._perform_speech_activity_detection() - - # Segmentation - scales = self.multiscale_args_dict['scale_dict'].items() - for scale_idx, (window, shift) in scales: - - # Segmentation for the current scale (scale_idx) - self._run_segmentation(window, shift, scale_tag=f'_scale{scale_idx}') - - # Embedding Extraction for the current scale (scale_idx) - self._extract_embeddings(self.subsegments_manifest_path, scale_idx, len(scales)) - - self.multiscale_embeddings_and_timestamps[scale_idx] = [self.embeddings, self.time_stamps] - - embs_and_timestamps = get_embs_and_timestamps( - self.multiscale_embeddings_and_timestamps, self.multiscale_args_dict - ) - - # Clustering - all_reference, all_hypothesis = perform_clustering( - embs_and_timestamps=embs_and_timestamps, - AUDIO_RTTM_MAP=self.AUDIO_RTTM_MAP, - out_rttm_dir=out_rttm_dir, - clustering_params=self._cluster_params, - device=self._speaker_model.device, - verbose=self.verbose, - ) - logging.info("Outputs are saved in {} directory".format(os.path.abspath(self._diarizer_params.out_dir))) - - # Scoring - return score_labels( - self.AUDIO_RTTM_MAP, - all_reference, - all_hypothesis, - collar=self._diarizer_params.collar, - ignore_overlap=self._diarizer_params.ignore_overlap, - verbose=self.verbose, - ) - - @staticmethod - def __make_nemo_file_from_folder(filename, source_dir): - with tarfile.open(filename, "w:gz") as tar: - tar.add(source_dir, arcname="./") - - @rank_zero_only - def save_to(self, save_path: str): - """ - Saves model instance (weights and configuration) into EFF archive or . - You can use "restore_from" method to fully restore instance from .nemo file. - - .nemo file is an archive (tar.gz) with the following: - model_config.yaml - model configuration in .yaml format. You can deserialize this into cfg argument for model's constructor - model_wights.chpt - model checkpoint - - Args: - save_path: Path to .nemo file where model instance should be saved - """ - - # TODO: Why does this override the main save_to? 
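The `diarize()` method shown above follows a fixed multiscale recipe: speech activity detection first, then one segmentation plus embedding-extraction pass per (window, shift) scale, and finally a single clustering and scoring step over all scales. The sketch below is a minimal, NeMo-free restatement of that loop; the helper callables (`run_segmentation`, `extract_embeddings`, `cluster`) are hypothetical stand-ins for the private methods and utilities used in the deleted class, not real NeMo functions.

```python
from typing import Callable, Dict, Tuple


def multiscale_diarize(
    scale_dict: Dict[int, Tuple[float, float]],               # scale_idx -> (window_sec, shift_sec)
    run_segmentation: Callable[[float, float, str], str],     # returns a subsegment manifest path
    extract_embeddings: Callable[[str], Tuple[dict, dict]],   # returns (embeddings, time_stamps)
    cluster: Callable[[Dict[int, list]], dict],               # consumes all scales at once
) -> dict:
    """One segmentation + embedding pass per scale, then a single clustering step."""
    multiscale_embs_and_timestamps: Dict[int, list] = {}
    for scale_idx, (window, shift) in scale_dict.items():
        manifest = run_segmentation(window, shift, f"_scale{scale_idx}")
        embeddings, time_stamps = extract_embeddings(manifest)
        multiscale_embs_and_timestamps[scale_idx] = [embeddings, time_stamps]
    # Clustering sees every scale together, as perform_clustering() does in the code above.
    return cluster(multiscale_embs_and_timestamps)
```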
- - with tempfile.TemporaryDirectory() as tmpdir: - config_yaml = os.path.join(tmpdir, _MODEL_CONFIG_YAML) - spkr_model = os.path.join(tmpdir, _SPEAKER_MODEL) - - self.to_config_file(path2yaml_file=config_yaml) - if self.has_vad_model: - vad_model = os.path.join(tmpdir, _VAD_MODEL) - self._vad_model.save_to(vad_model) - self._speaker_model.save_to(spkr_model) - self.__make_nemo_file_from_folder(filename=save_path, source_dir=tmpdir) - - @staticmethod - def __unpack_nemo_file(path2file: str, out_folder: str) -> str: - if not os.path.exists(path2file): - raise FileNotFoundError(f"{path2file} does not exist") - tar = tarfile.open(path2file, "r:gz") - tar.extractall(path=out_folder) - tar.close() - return out_folder - - @classmethod - def restore_from( - cls, - restore_path: str, - override_config_path: Optional[str] = None, - map_location: Optional[torch.device] = None, - strict: bool = False, - ): - # Get path where the command is executed - the artifacts will be "retrieved" there - # (original .nemo behavior) - cwd = os.getcwd() - - with tempfile.TemporaryDirectory() as tmpdir: - try: - cls.__unpack_nemo_file(path2file=restore_path, out_folder=tmpdir) - os.chdir(tmpdir) - if override_config_path is None: - config_yaml = os.path.join(tmpdir, _MODEL_CONFIG_YAML) - else: - config_yaml = override_config_path - conf = OmegaConf.load(config_yaml) - if os.path.exists(os.path.join(tmpdir, _VAD_MODEL)): - conf.diarizer.vad.model_path = os.path.join(tmpdir, _VAD_MODEL) - else: - logging.info( - f'Model {cls.__name__} does not contain a VAD model. A VAD model or manifest file with' - f'speech segments need for diarization with this model' - ) - - conf.diarizer.speaker_embeddings.model_path = os.path.join(tmpdir, _SPEAKER_MODEL) - conf.restore_map_location = map_location - OmegaConf.set_struct(conf, True) - instance = cls(cfg=conf) - - logging.info(f'Model {cls.__name__} was successfully restored from {restore_path}.') - finally: - os.chdir(cwd) - - return instance - - @property - def verbose(self) -> bool: - return self._cfg.verbose diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/confidence_ensemble.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/confidence_ensemble.py deleted file mode 100644 index dcbb0a05976cc427fe1a4a1ae81f02a8835842ce..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/confidence_ensemble.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
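As the `save_to()` docstring and `restore_from()` above describe, a `.nemo` file is simply a gzipped tar archive holding `model_config.yaml` plus checkpoint files. Below is a minimal sketch of that round trip using only the standard library; the `speaker_model.nemo` member name is an assumption for illustration, not the exact constant from the deleted module.

```python
import tarfile
import tempfile
from pathlib import Path

MODEL_CONFIG_YAML = "model_config.yaml"
SPEAKER_MODEL = "speaker_model.nemo"   # assumed member name for the checkpoint blob


def pack_nemo_archive(out_path: str, config_text: str, checkpoints: dict) -> None:
    """Write a YAML config plus checkpoint blobs into a tar.gz, mimicking save_to()."""
    with tempfile.TemporaryDirectory() as tmpdir:
        (Path(tmpdir) / MODEL_CONFIG_YAML).write_text(config_text)
        for name, blob in checkpoints.items():
            (Path(tmpdir) / name).write_bytes(blob)
        with tarfile.open(out_path, "w:gz") as tar:
            tar.add(tmpdir, arcname="./")        # same arcname convention as the deleted code


def unpack_nemo_archive(path: str, out_folder: str) -> str:
    """Extract the archive into out_folder, mimicking __unpack_nemo_file()."""
    with tarfile.open(path, "r:gz") as tar:
        tar.extractall(path=out_folder)
    return out_folder


if __name__ == "__main__":
    pack_nemo_archive("toy.nemo", "sample_rate: 16000\n", {SPEAKER_MODEL: b"\x00"})
    print(unpack_nemo_archive("toy.nemo", tempfile.mkdtemp()))
```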
- -from dataclasses import dataclass -from typing import Dict, List, Optional, Union - -import joblib -import numpy as np -import torch -from omegaconf import DictConfig, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.asr.models.asr_model import ASRModel -from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel -from nemo.collections.asr.parts.utils.asr_confidence_utils import ( - ConfidenceConfig, - ConfidenceMethodConfig, - get_confidence_aggregation_bank, - get_confidence_measure_bank, -) -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.core.classes import ModelPT -from nemo.utils import model_utils - - -# frozen is required to allow hashing of this class and use it -# as a dictionary key when running confidence tuning -@dataclass(frozen=True) -class ConfidenceSpec: - exclude_blank: bool - aggregation: str - confidence_type: str - alpha: float - - def to_confidence_config(self) -> ConfidenceConfig: - """Converts confidence spec to the confidence config. - - Internally, the tuning procedure uses this "spec" objects as they - are more aligned with how things are implemented. But when it's time - to save the models or call transcribe, we need to use the proper - object of type ``ConfidenceConfig``. - """ - if self.confidence_type == 'max_prob': - name = 'max_prob' - entropy_type = 'tsallis' # can be any - entropy_norm = 'lin' # can be any - else: - name, entropy_type, entropy_norm = self.confidence_type.split("_") - return ConfidenceConfig( - exclude_blank=self.exclude_blank, - aggregation=self.aggregation, - method_cfg=ConfidenceMethodConfig( - name=name, entropy_type=entropy_type, alpha=self.alpha, entropy_norm=entropy_norm, - ), - ) - - -def get_filtered_logprobs(hypothesis: Hypothesis, exclude_blank: bool) -> torch.Tensor: - """Returns logprobs from the hypothesis object with optional blanks filter. - - This function supports both CTC and Transducer hypotheses. Will place the - logprobs on GPU if it's available. - - Args: - hypothesis: generated hypothesis as returned from the transcribe - method of the ASR model. - exclude_blank: whether to filter out all ```` tokens. - - Returns: - torch.Tensor: of shape [S, V], where S is (filtered) sequence length and - V is the vocabulary size. 
- """ - if isinstance(hypothesis.alignments, list): # Transducer - filtered_logprobs = [] - for alignment in hypothesis.alignments: - for align_elem in alignment: - if not exclude_blank: - filtered_logprobs.append(align_elem[0]) - elif align_elem[1].item() != align_elem[0].shape[-1] - 1: - filtered_logprobs.append(align_elem[0]) - if not filtered_logprobs: # for the edge-case of all blanks - filtered_logprobs.append(align_elem[0]) - filtered_logprobs = torch.stack(filtered_logprobs) - if torch.cuda.is_available(): # by default logprobs are placed on cpu in nemo - filtered_logprobs = filtered_logprobs.cuda() - else: # CTC - logprobs = hypothesis.y_sequence - if torch.cuda.is_available(): # by default logprobs are placed on cpu in nemo - logprobs = logprobs.cuda() - if exclude_blank: # filtering blanks - labels = logprobs.argmax(dim=-1) - filtered_logprobs = logprobs[labels != logprobs.shape[1] - 1] - if filtered_logprobs.shape[0] == 0: # for the edge-case of all blanks - filtered_logprobs = logprobs[:1] - else: - filtered_logprobs = logprobs - - # need to make sure logprobs are always normalized, so checking if they sum up to 1 - if not torch.allclose(filtered_logprobs[0].exp().sum(), torch.tensor(1.0)): - filtered_logprobs = torch.log_softmax(filtered_logprobs, dim=1) - - return filtered_logprobs - - -def compute_confidence(hypothesis: Hypothesis, confidence_cfg: ConfidenceConfig) -> float: - """Computes confidence score of the full utterance from a given hypothesis. - - This is essentially a re-implementation of the built-in confidence - computation in NeMo. The difference is that we aggregate full-utterance - scores, while core functionality only supports word and token level - aggregations. - - Args: - hypothesis: generated hypothesis as returned from the transcribe - method of the ASR model. - confidence_cfg: confidence config specifying what kind of - method/aggregation should be used. - - Returns: - float: confidence score. - - """ - filtered_logprobs = get_filtered_logprobs(hypothesis, confidence_cfg.exclude_blank) - vocab_size = filtered_logprobs.shape[1] - aggr_func = get_confidence_aggregation_bank()[confidence_cfg.aggregation] - if confidence_cfg.method_cfg.name == "max_prob": - conf_type = "max_prob" - alpha = 1.0 - else: - conf_type = f"entropy_{confidence_cfg.method_cfg.entropy_type}_{confidence_cfg.method_cfg.entropy_norm}" - alpha = confidence_cfg.method_cfg.alpha - conf_func = get_confidence_measure_bank()[conf_type] - - conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() - - return conf_value - - -class ConfidenceEnsembleModel(ModelPT): - """Implementation of the confidence ensemble model. - - See https://arxiv.org/abs/2306.15824 for details. - - .. note:: - Currently this class only support `transcribe` method as it requires - full-utterance confidence scores to operate. 
- """ - - def __init__( - self, cfg: DictConfig, trainer: 'Trainer' = None, - ): - super().__init__(cfg=cfg, trainer=trainer) - - # either we load all models from ``load_models`` cfg parameter - # or all of them are specified in the config as modelX alongside the num_models key - # - # ideally, we'd like to directly store all models in a list, but that - # is not currently supported by the submodule logic - # so to access all the models, we do something like - # - # for model_idx in range(self.num_models): - # model = getattr(self, f"model{model_idx}") - - if 'num_models' in self.cfg: - self.num_models = self.cfg.num_models - for idx in range(self.num_models): - cfg_field = f"model{idx}" - model_cfg = self.cfg[cfg_field] - model_class = model_utils.import_class_by_path(model_cfg['target']) - self.register_nemo_submodule( - name=cfg_field, config_field=cfg_field, model=model_class(model_cfg, trainer=trainer), - ) - else: - self.num_models = len(cfg.load_models) - with open_dict(self.cfg): - self.cfg.num_models = self.num_models - for idx, model in enumerate(cfg.load_models): - cfg_field = f"model{idx}" - if model.endswith(".nemo"): - self.register_nemo_submodule( - name=cfg_field, - config_field=cfg_field, - model=ASRModel.restore_from(model, trainer=trainer, map_location="cpu"), - ) - else: - self.register_nemo_submodule( - cfg_field, config_field=cfg_field, model=ASRModel.from_pretrained(model, map_location="cpu"), - ) - - # registering model selection block - this is expected to be a joblib-saved - # pretrained sklearn pipeline containing standardization + logistic regression - # trained to predict "most-confident" model index from the confidence scores of all models - model_selection_block_path = self.register_artifact("model_selection_block", cfg.model_selection_block) - self.model_selection_block = joblib.load(model_selection_block_path) - self.confidence_cfg = ConfidenceConfig(**self.cfg.confidence) - - # making sure each model has correct temperature setting in the decoder strategy - for model_idx in range(self.num_models): - model = getattr(self, f"model{model_idx}") - # for now we assume users are direclty responsible for matching - # decoder type when building ensemble with inference type - # TODO: add automatic checks for errors - if isinstance(model, EncDecHybridRNNTCTCModel): - self.update_decoding_parameters(model.cfg.decoding) - model.change_decoding_strategy(model.cfg.decoding, decoder_type="rnnt") - self.update_decoding_parameters(model.cfg.aux_ctc.decoding) - model.change_decoding_strategy(model.cfg.aux_ctc.decoding, decoder_type="ctc") - else: - self.update_decoding_parameters(model.cfg.decoding) - model.change_decoding_strategy(model.cfg.decoding) - - def update_decoding_parameters(self, decoding_cfg: DictConfig): - """Updating temperature/preserve_alignment parameters of the config.""" - with open_dict(decoding_cfg): - decoding_cfg.temperature = self.cfg.temperature - decoding_cfg.preserve_alignments = True - - def setup_training_data(self, train_data_config: Union[DictConfig, Dict]): - """Pass-through to the ensemble models. - - Note that training is not actually supported for this class! 
- """ - for model_idx in range(self.num_models): - getattr(self, f"model{model_idx}").setup_training_data(train_data_config) - - def setup_validation_data(self, val_data_config: Union[DictConfig, Dict]): - """Pass-through to the ensemble models.""" - for model_idx in range(self.num_models): - getattr(self, f"model{model_idx}").setup_validation_data(val_data_config) - - def change_attention_model( - self, self_attention_model: str = None, att_context_size: List[int] = None, update_config: bool = True - ): - """Pass-through to the ensemble models.""" - for model_idx in range(self.num_models): - getattr(self, f"model{model_idx}").change_attention_model( - self_attention_model, att_context_size, update_config - ) - - def change_decoding_strategy(self, decoding_cfg: Optional[DictConfig] = None, decoder_type: str = None): - """Pass-through to the ensemble models. - - The only change here is that we always require expected temperature - to be set as well as ``decoding_cfg.preserve_alignments = True`` - """ - self.update_decoding_parameters(decoding_cfg) - for model_idx in range(self.num_models): - model = getattr(self, f"model{model_idx}") - if isinstance(model, EncDecHybridRNNTCTCModel): - model.change_decoding_strategy(decoding_cfg, decoder_type=decoder_type) - else: - model.change_decoding_strategy(decoding_cfg) - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - return_hypotheses: bool = False, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - verbose: bool = True, - **kwargs, # any other model specific parameters are passed directly - ) -> List[str]: - """Confidence-ensemble transcribe method. - - Consists of the following steps: - - 1. Run all models (TODO: in parallel) - 2. Compute confidence for each model - 3. Use logistic regression to pick the "most confident" model - 4. 
Return the output of that model - """ - confidences = [] - all_transcriptions = [] - # always requiring to return hypothesis - # TODO: make sure to return text only if was False originally - return_hypotheses = True - for model_idx in range(self.num_models): - model = getattr(self, f"model{model_idx}") - transcriptions = model.transcribe( - paths2audio_files=paths2audio_files, - batch_size=batch_size, - return_hypotheses=return_hypotheses, - num_workers=num_workers, - channel_selector=channel_selector, - augmentor=augmentor, - verbose=verbose, - **kwargs, - ) - if isinstance(transcriptions, tuple): # transducers return a tuple - transcriptions = transcriptions[0] - - model_confidences = [] - for transcription in transcriptions: - model_confidences.append(compute_confidence(transcription, self.confidence_cfg)) - confidences.append(model_confidences) - all_transcriptions.append(transcriptions) - - # transposing with zip(*list) - features = np.array(list(zip(*confidences))) - model_indices = self.model_selection_block.predict(features) - final_transcriptions = [] - for transcrption_idx in range(len(all_transcriptions[0])): - final_transcriptions.append(all_transcriptions[model_indices[transcrption_idx]][transcrption_idx]) - - return final_transcriptions - - def list_available_models(self): - return [] diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__init__.py deleted file mode 100644 index b25f1119adcffddd2def23f32d0cbaf9e5046a49..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
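The `transcribe()` method above picks, per utterance, the model whose confidence the pretrained "model selection block" (standardization plus logistic regression, per the comments in `__init__`) deems most trustworthy. The following self-contained sketch reproduces only that selection step on synthetic data; it illustrates the technique rather than the NeMo class, and the training features and labels here are made up.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
num_models, num_train_utts = 2, 200

# Synthetic training data: one confidence score per model per utterance, and the index
# of the model that happened to score highest. Real training would derive labels from WER.
train_conf = rng.random((num_train_utts, num_models))
train_best = train_conf.argmax(axis=1)

# The "model selection block": standardization + logistic regression. The deleted class
# loads this with joblib; here it is simply fit in place.
selection_block = make_pipeline(StandardScaler(), LogisticRegression())
selection_block.fit(train_conf, train_best)

# Inference mirrors the deleted transcribe(): confidences is [num_models][num_utts];
# zip(*confidences) transposes it into one feature row per utterance.
confidences = [list(rng.random(3)) for _ in range(num_models)]
transcriptions = [[f"model{m} hypothesis {u}" for u in range(3)] for m in range(num_models)]

features = np.array(list(zip(*confidences)))
model_indices = selection_block.predict(features)
final_transcriptions = [transcriptions[m][u] for u, m in enumerate(model_indices)]
print(final_transcriptions)
```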
- -from nemo.collections.asr.models.configs.asr_models_config import ( - ASRDatasetConfig, - CacheAwareStreamingConfig, - EncDecCTCConfig, - EncDecCTCModelConfig, -) -from nemo.collections.asr.models.configs.classification_models_config import ( - EncDecClassificationConfig, - EncDecClassificationDatasetConfig, - EncDecClassificationModelConfig, -) -from nemo.collections.asr.models.configs.diarizer_config import NeuralDiarizerInferenceConfig -from nemo.collections.asr.models.configs.matchboxnet_config import ( - EncDecClassificationModelConfigBuilder, - MatchboxNetModelConfig, - MatchboxNetVADModelConfig, -) -from nemo.collections.asr.models.configs.quartznet_config import ( - EncDecCTCModelConfigBuilder, - JasperModelConfig, - QuartzNetModelConfig, -) -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMelSpectrogramPreprocessorConfig, - AudioToMFCCPreprocessorConfig, - CropOrPadSpectrogramAugmentationConfig, - SpectrogramAugmentationConfig, -) -from nemo.collections.asr.modules.conv_asr import ( - ConvASRDecoderClassificationConfig, - ConvASRDecoderConfig, - ConvASREncoderConfig, - JasperEncoderConfig, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8bf5fd5292393b978d5176643ec24c89bc9e50e7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/asr_models_config.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/asr_models_config.cpython-310.pyc deleted file mode 100644 index 161ef75c4b82bac3758d1b28541d7fe7d90aea3e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/asr_models_config.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/classification_models_config.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/classification_models_config.cpython-310.pyc deleted file mode 100644 index 9dc7d0d91b1b23568fc611a03ff8142cd191ceef..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/classification_models_config.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/diarizer_config.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/diarizer_config.cpython-310.pyc deleted file mode 100644 index 45191a7df82f433f94e3007f965c93d4b6f3ccda..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/diarizer_config.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/matchboxnet_config.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/matchboxnet_config.cpython-310.pyc deleted file mode 100644 index 75b731be59f5ff3385163d1eb45cdc539f7895b5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/matchboxnet_config.cpython-310.pyc and /dev/null differ diff --git 
a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/quartznet_config.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/quartznet_config.cpython-310.pyc deleted file mode 100644 index b2db910a5d4e7cc7a58c6f5d7e6c5c6b5c273b4b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/__pycache__/quartznet_config.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/aligner_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/aligner_config.py deleted file mode 100644 index cf2cdd176719d3eb1b28c0f9104bd6959d3cd30a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/aligner_config.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field - -from nemo.collections.asr.parts.k2.classes import GraphModuleConfig - - -@dataclass -class AlignerCTCConfig: - prob_suppress_index: int = -1 - prob_suppress_value: float = 1.0 - - -@dataclass -class AlignerRNNTConfig: - predictor_window_size: int = 0 - predictor_step_size: int = 1 - - -@dataclass -class AlignerWrapperModelConfig: - alignment_type: str = "forced" - word_output: bool = True - cpu_decoding: bool = False - decode_batch_size: int = 0 - ctc_cfg: AlignerCTCConfig = field(default_factory=lambda: AlignerCTCConfig()) - rnnt_cfg: AlignerRNNTConfig = field(default_factory=lambda: AlignerRNNTConfig()) - - -@dataclass -class K2AlignerWrapperModelConfig(AlignerWrapperModelConfig): - decoder_module_cfg: GraphModuleConfig = field(default_factory=lambda: GraphModuleConfig()) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/asr_models_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/asr_models_config.py deleted file mode 100644 index ce480cac842870fe2eee4f4b163b0906eae51b12..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/asr_models_config.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
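The exports listed above are all dataclass-based structured configs. As a generic illustration of how such configs are typically consumed (converted to OmegaConf, merged with overrides, read via attribute access), here is a sketch using a toy dataclass rather than the NeMo ones:

```python
from dataclasses import dataclass, field
from typing import List

from omegaconf import MISSING, OmegaConf


@dataclass
class ToyDatasetConfig:
    manifest_filepath: str = MISSING   # required; must be supplied before use
    sample_rate: int = 16000
    labels: List[str] = field(default_factory=list)


base = OmegaConf.structured(ToyDatasetConfig)
overrides = OmegaConf.create({"manifest_filepath": "train_manifest.json", "sample_rate": 8000})
cfg = OmegaConf.merge(base, overrides)    # merge is validated against the dataclass schema

print(OmegaConf.to_yaml(cfg))
print(cfg.sample_rate)                    # attribute-style access, as in the model code
```

One benefit of merging against a structured config, rather than a free-form dict, is that typos and type mismatches in overrides fail at merge time instead of deep inside training code.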
- -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from omegaconf import MISSING - -import nemo.core.classes.dataset -from nemo.collections.asr.metrics.wer import CTCDecodingConfig -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMelSpectrogramPreprocessorConfig, - SpectrogramAugmentationConfig, -) -from nemo.collections.asr.modules.conv_asr import ConvASRDecoderConfig, ConvASREncoderConfig -from nemo.core.config import modelPT as model_cfg - - -@dataclass -class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig): - manifest_filepath: Optional[Any] = None - sample_rate: int = MISSING - labels: List[str] = MISSING - trim_silence: bool = False - - # Tarred dataset support - is_tarred: bool = False - tarred_audio_filepaths: Optional[Any] = None - tarred_shard_strategy: str = "scatter" - shard_manifests: bool = False - shuffle_n: int = 0 - - # Optional - int_values: Optional[int] = None - augmentor: Optional[Dict[str, Any]] = None - max_duration: Optional[float] = None - min_duration: Optional[float] = None - max_utts: int = 0 - blank_index: int = -1 - unk_index: int = -1 - normalize: bool = False - trim: bool = True - parser: Optional[str] = 'en' - eos_id: Optional[int] = None - bos_id: Optional[int] = None - pad_id: int = 0 - use_start_end_token: bool = False - return_sample_id: Optional[bool] = False - - # bucketing params - bucketing_strategy: str = "synced_randomized" - bucketing_batch_size: Optional[Any] = None - bucketing_weights: Optional[List[int]] = None - - -@dataclass -class EncDecCTCConfig(model_cfg.ModelConfig): - # Model global arguments - sample_rate: int = 16000 - repeat: int = 1 - dropout: float = 0.0 - separable: bool = False - labels: List[str] = MISSING - - # Dataset configs - train_ds: ASRDatasetConfig = field(default_factory=lambda: ASRDatasetConfig(manifest_filepath=None, shuffle=True)) - validation_ds: ASRDatasetConfig = field( - default_factory=lambda: ASRDatasetConfig(manifest_filepath=None, shuffle=False) - ) - test_ds: ASRDatasetConfig = field(default_factory=lambda: ASRDatasetConfig(manifest_filepath=None, shuffle=False)) - - # Optimizer / Scheduler config - optim: Optional[model_cfg.OptimConfig] = field( - default_factory=lambda: model_cfg.OptimConfig(sched=model_cfg.SchedConfig()) - ) - - # Model component configs - preprocessor: AudioToMelSpectrogramPreprocessorConfig = field( - default_factory=lambda: AudioToMelSpectrogramPreprocessorConfig() - ) - spec_augment: Optional[SpectrogramAugmentationConfig] = field( - default_factory=lambda: SpectrogramAugmentationConfig() - ) - encoder: ConvASREncoderConfig = field(default_factory=lambda: ConvASREncoderConfig()) - decoder: ConvASRDecoderConfig = field(default_factory=lambda: ConvASRDecoderConfig()) - decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig()) - - -@dataclass -class EncDecCTCModelConfig(model_cfg.NemoConfig): - model: EncDecCTCConfig = field(default_factory=lambda: EncDecCTCConfig()) - - -@dataclass -class CacheAwareStreamingConfig: - chunk_size: int = 0 # the size of each chunk at each step, it can be a list of two integers to specify different chunk sizes for the first step and others - shift_size: int = 0 # the size of the shift in each step, it can be a list of two integers to specify different shift sizes for the first step and others - - cache_drop_size: int = 0 # the number of steps to drop from the cache - last_channel_cache_size: int = 0 # the size of the needed cache for last channel layers - - 
valid_out_len: int = 0 # the number of the steps in the final output which are valid (have the same value as in the offline mode) - - pre_encode_cache_size: int = 0 # the size of the needed cache for the pre-encoding part of the model to avoid caching inside the pre-encoding layers - drop_extra_pre_encoded: int = 0 # the number of steps to get dropped after the pre-encoding layer - - last_channel_num: int = 0 # number of the last channel layers (like MHA layers) which need caching in the model - last_time_num: int = 0 # number of the last time layers (like convolutions) which need caching in the model diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/classification_models_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/classification_models_config.py deleted file mode 100644 index 33408f591c8e7f186fe7427eaf9755a1f3895937..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/classification_models_config.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional - -from omegaconf import MISSING - -import nemo.core.classes.dataset -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMFCCPreprocessorConfig, - CropOrPadSpectrogramAugmentationConfig, - SpectrogramAugmentationConfig, -) -from nemo.collections.asr.modules.conv_asr import ConvASRDecoderClassificationConfig, ConvASREncoderConfig -from nemo.core.config import modelPT as model_cfg - - -@dataclass -class EncDecClassificationDatasetConfig(nemo.core.classes.dataset.DatasetConfig): - manifest_filepath: Optional[str] = None - sample_rate: int = MISSING - labels: List[str] = MISSING - trim_silence: bool = False - - # Tarred dataset support - is_tarred: bool = False - tarred_audio_filepaths: Optional[str] = None - tarred_shard_strategy: str = "scatter" - shuffle_n: int = 0 - - # Optional - int_values: Optional[int] = None - augmentor: Optional[Dict[str, Any]] = None - max_duration: Optional[float] = None - min_duration: Optional[float] = None - cal_labels_occurrence: Optional[bool] = False - - # VAD Optional - vad_stream: Optional[bool] = None - window_length_in_sec: float = 0.31 - shift_length_in_sec: float = 0.01 - normalize_audio: bool = False - is_regression_task: bool = False - - # bucketing params - bucketing_strategy: str = "synced_randomized" - bucketing_batch_size: Optional[Any] = None - bucketing_weights: Optional[List[int]] = None - - -@dataclass -class EncDecClassificationConfig(model_cfg.ModelConfig): - # Model global arguments - sample_rate: int = 16000 - repeat: int = 1 - dropout: float = 0.0 - separable: bool = True - kernel_size_factor: float = 1.0 - labels: List[str] = MISSING - timesteps: int = MISSING - - # Dataset configs - train_ds: EncDecClassificationDatasetConfig = field( - default_factory=lambda: EncDecClassificationDatasetConfig( - 
manifest_filepath=None, shuffle=True, trim_silence=False - ) - ) - validation_ds: EncDecClassificationDatasetConfig = field( - default_factory=lambda: EncDecClassificationDatasetConfig(manifest_filepath=None, shuffle=False) - ) - test_ds: EncDecClassificationDatasetConfig = field( - default_factory=lambda: EncDecClassificationDatasetConfig(manifest_filepath=None, shuffle=False) - ) - - # Optimizer / Scheduler config - optim: Optional[model_cfg.OptimConfig] = field( - default_factory=lambda: model_cfg.OptimConfig(sched=model_cfg.SchedConfig()) - ) - - # Model component configs - preprocessor: AudioToMFCCPreprocessorConfig = field(default_factory=lambda: AudioToMFCCPreprocessorConfig()) - spec_augment: Optional[SpectrogramAugmentationConfig] = field( - default_factory=lambda: SpectrogramAugmentationConfig() - ) - crop_or_pad_augment: Optional[CropOrPadSpectrogramAugmentationConfig] = field( - default_factory=lambda: CropOrPadSpectrogramAugmentationConfig(audio_length=-1) - ) - - encoder: ConvASREncoderConfig = field(default_factory=lambda: ConvASREncoderConfig()) - decoder: ConvASRDecoderClassificationConfig = field(default_factory=lambda: ConvASRDecoderClassificationConfig()) - - def __post_init__(self): - if self.crop_or_pad_augment is not None: - self.crop_or_pad_augment.audio_length = self.timesteps - - -@dataclass -class EncDecClassificationModelConfig(model_cfg.NemoConfig): - model: EncDecClassificationConfig = field(default_factory=lambda: EncDecClassificationConfig()) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/diarizer_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/diarizer_config.py deleted file mode 100644 index 0745a6f2a4511d5545c1d23a421b05139f5743f8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/diarizer_config.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import asdict, dataclass, field -from typing import Any, Dict, Optional, Tuple, Union - - -@dataclass -class DiarizerComponentConfig: - """Dataclass to imitate HydraConfig dict when accessing parameters.""" - - def get(self, name: str, default: Optional[Any] = None): - return getattr(self, name, default) - - def __iter__(self): - for key in asdict(self): - yield key - - def dict(self) -> Dict: - return asdict(self) - - -@dataclass -class ASRDiarizerCTCDecoderParams: - pretrained_language_model: Optional[str] = None # KenLM model file: .arpa model file or .bin binary file. - beam_width: int = 32 - alpha: float = 0.5 - beta: float = 2.5 - - -@dataclass -class ASRRealigningLMParams: - # Provide a KenLM language model in .arpa format. - arpa_language_model: Optional[str] = None - # Min number of words for the left context. - min_number_of_words: int = 3 - # Max number of words for the right context. 
- max_number_of_words: int = 10 - # The threshold for the difference between two log probability values from two hypotheses. - logprob_diff_threshold: float = 1.2 - - -@dataclass -class ASRDiarizerParams(DiarizerComponentConfig): - # if True, speech segmentation for diarization is based on word-timestamps from ASR inference. - asr_based_vad: bool = False - # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR based VAD. - asr_based_vad_threshold: float = 1.0 - # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null. - asr_batch_size: Optional[int] = None - # Native decoder delay. null is recommended to use the default values for each ASR model. - decoder_delay_in_sec: Optional[float] = None - # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2]. - word_ts_anchor_offset: Optional[float] = None - # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'. - word_ts_anchor_pos: str = "start" - # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature. - fix_word_ts_with_VAD: bool = False - # If True, use colored text to distinguish speakers in the output transcript. - colored_text: bool = False - # If True, the start and end time of each speaker turn is printed in the output transcript. - print_time: bool = True - # If True, the output transcript breaks the line to fix the line width (default is 90 chars) - break_lines: bool = False - - -@dataclass -class ASRDiarizerConfig(DiarizerComponentConfig): - model_path: Optional[str] = "stt_en_conformer_ctc_large" - parameters: ASRDiarizerParams = field(default_factory=lambda: ASRDiarizerParams()) - ctc_decoder_parameters: ASRDiarizerCTCDecoderParams = field(default_factory=lambda: ASRDiarizerCTCDecoderParams()) - realigning_lm_parameters: ASRRealigningLMParams = field(default_factory=lambda: ASRRealigningLMParams()) - - -@dataclass -class VADParams(DiarizerComponentConfig): - window_length_in_sec: float = 0.15 # Window length in sec for VAD context input - shift_length_in_sec: float = 0.01 # Shift length in sec for generate frame level VAD prediction - smoothing: Union[str, bool] = "median" # False or type of smoothing method (eg: median) - overlap: float = 0.5 # Overlap ratio for overlapped mean/median smoothing filter - onset: float = 0.1 # Onset threshold for detecting the beginning and end of a speech - offset: float = 0.1 # Offset threshold for detecting the end of a speech - pad_onset: float = 0.1 # Adding durations before each speech segment - pad_offset: float = 0 # Adding durations after each speech segment - min_duration_on: float = 0 # Threshold for small non_speech deletion - min_duration_off: float = 0.2 # Threshold for short speech segment deletion - filter_speech_first: bool = True - - -@dataclass -class VADConfig(DiarizerComponentConfig): - model_path: str = "vad_multilingual_marblenet" # .nemo local model path or pretrained VAD model name - external_vad_manifest: Optional[str] = None - parameters: VADParams = field(default_factory=lambda: VADParams()) - - -@dataclass -class SpeakerEmbeddingsParams(DiarizerComponentConfig): - # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5] - window_length_in_sec: Tuple[float] = (1.5, 1.25, 1.0, 0.75, 0.5) - # Shift length(s) in sec (floating-point number). either a number or a list. 
ex) 0.75 or [0.75,0.5,0.25] - shift_length_in_sec: Tuple[float] = (0.75, 0.625, 0.5, 0.375, 0.25) - # Weight for each scale. None (for single scale) or list with window/shift scale count. ex) [0.33,0.33,0.33] - multiscale_weights: Tuple[float] = (1, 1, 1, 1, 1) - # save speaker embeddings in pickle format. True if clustering result is used for other models, such as MSDD. - save_embeddings: bool = True - - -@dataclass -class SpeakerEmbeddingsConfig(DiarizerComponentConfig): - # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet) - model_path: Optional[str] = None - parameters: SpeakerEmbeddingsParams = field(default_factory=lambda: SpeakerEmbeddingsParams()) - - -@dataclass -class ClusteringParams(DiarizerComponentConfig): - # If True, use num of speakers value provided in manifest file. - oracle_num_speakers: bool = False - # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored. - max_num_speakers: int = 8 - # If the number of segments is lower than this number, enhanced speaker counting is activated. - enhanced_count_thres: int = 80 - # Determines the range of p-value search: 0 < p <= max_rp_threshold. - max_rp_threshold: float = 0.25 - # The higher the number, the more values will be examined with more time. - sparse_search_volume: int = 30 - # If True, take a majority vote on multiple p-values to estimate the number of speakers. - maj_vote_spk_count: bool = False - - -@dataclass -class ClusteringConfig(DiarizerComponentConfig): - parameters: ClusteringParams = field(default_factory=lambda: ClusteringParams()) - - -@dataclass -class MSDDParams(DiarizerComponentConfig): - # If True, use speaker embedding model in checkpoint, else provided speaker embedding model in config will be used. - use_speaker_model_from_ckpt: bool = True - # Batch size for MSDD inference. - infer_batch_size: int = 25 - # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps. - sigmoid_threshold: Tuple[float] = (0.7,) - # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False. - seq_eval_mode: bool = False - # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference. - split_infer: bool = True - # The length of split short sequence when split_infer is True. - diar_window_length: int = 50 - # If the estimated number of speakers are larger than this number, overlap speech is not estimated. 
- overlap_infer_spk_limit: int = 5 - - -@dataclass -class MSDDConfig(DiarizerComponentConfig): - model_path: Optional[str] = "diar_msdd_telephonic" - parameters: MSDDParams = field(default_factory=lambda: MSDDParams()) - - -@dataclass -class DiarizerConfig(DiarizerComponentConfig): - manifest_filepath: Optional[str] = None - out_dir: Optional[str] = None - oracle_vad: bool = False # If True, uses RTTM files provided in the manifest file to get VAD timestamps - collar: float = 0.25 # Collar value for scoring - ignore_overlap: bool = True # Consider or ignore overlap segments while scoring - vad: VADConfig = field(default_factory=lambda: VADConfig()) - speaker_embeddings: SpeakerEmbeddingsConfig = field(default_factory=lambda: SpeakerEmbeddingsConfig()) - clustering: ClusteringConfig = field(default_factory=lambda: ClusteringConfig()) - msdd_model: MSDDConfig = field(default_factory=lambda: MSDDConfig()) - asr: ASRDiarizerConfig = field(default_factory=lambda: ASRDiarizerConfig()) - - -@dataclass -class NeuralDiarizerInferenceConfig(DiarizerComponentConfig): - diarizer: DiarizerConfig = field(default_factory=lambda: DiarizerConfig()) - device: str = "cpu" - verbose: bool = False - batch_size: int = 64 - num_workers: int = 1 - sample_rate: int = 16000 - name: str = "" - - @classmethod - def init_config(cls, diar_model_path: str, vad_model_path: str, map_location: str, verbose: bool): - return NeuralDiarizerInferenceConfig( - DiarizerConfig( - vad=VADConfig(model_path=vad_model_path), msdd_model=MSDDConfig(model_path=diar_model_path), - ), - device=map_location, - verbose=verbose, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/k2_sequence_models_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/k2_sequence_models_config.py deleted file mode 100644 index 53ed3e1377fe9e27253f1d07cc9d8a0686102cf7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/k2_sequence_models_config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
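`DiarizerComponentConfig` above exists so a plain dataclass can be passed wherever the surrounding code expects DictConfig-style access (`.get()`, key iteration, `.dict()`). A toy re-creation of that pattern, with made-up fields rather than the full `VADParams`, shows how little machinery is involved:

```python
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Optional


@dataclass
class ComponentConfig:
    """Dataclass base mimicking DictConfig access, like DiarizerComponentConfig."""

    def get(self, name: str, default: Optional[Any] = None):
        return getattr(self, name, default)

    def __iter__(self):
        yield from asdict(self)          # iterate field names, like DictConfig keys

    def dict(self) -> Dict:
        return asdict(self)


@dataclass
class ToyVADParams(ComponentConfig):     # made-up subset of fields
    onset: float = 0.1
    offset: float = 0.1


@dataclass
class ToyVADConfig(ComponentConfig):
    model_path: str = "vad_multilingual_marblenet"
    parameters: ToyVADParams = field(default_factory=ToyVADParams)


cfg = ToyVADConfig()
print(cfg.get("model_path"))     # dict-style access with a default
print(list(cfg))                 # ['model_path', 'parameters']
print(cfg.parameters.dict())     # nested dataclass as a plain dict
```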
- -from dataclasses import dataclass, field - -from nemo.collections.asr.models.configs.asr_models_config import EncDecCTCConfig -from nemo.collections.asr.parts.k2.classes import GraphModuleConfig as BackendConfig -from nemo.core.config.modelPT import NemoConfig - - -@dataclass -class GraphModuleConfig: - criterion_type: str = "ml" - loss_type: str = "ctc" - split_batch_size: int = 0 - dec_type: str = "topo" - transcribe_training: bool = True - backend_cfg: BackendConfig = field(default_factory=lambda: BackendConfig()) - - -@dataclass -class EncDecK2SeqConfig(EncDecCTCConfig): - graph_module_cfg: GraphModuleConfig = field(default_factory=lambda: GraphModuleConfig()) - - -@dataclass -class EncDecK2SeqModelConfig(NemoConfig): - model: EncDecK2SeqConfig = field(default_factory=lambda: EncDecK2SeqConfig()) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/matchboxnet_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/matchboxnet_config.py deleted file mode 100644 index 52ec4c35d9e853fac6143d89158400c45fd1aceb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/matchboxnet_config.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
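`EncDecK2SeqConfig` above extends `EncDecCTCConfig` purely through dataclass inheritance, adding a `graph_module_cfg` field while keeping every base field and default. A toy version of the same pattern (the names here are invented, not the NeMo classes):

```python
from dataclasses import dataclass, field


@dataclass
class ToyGraphModuleConfig:
    criterion_type: str = "ml"     # defaults mirror the deleted GraphModuleConfig
    loss_type: str = "ctc"


@dataclass
class ToyCTCConfig:                # stand-in for EncDecCTCConfig
    sample_rate: int = 16000
    dropout: float = 0.0


@dataclass
class ToyK2SeqConfig(ToyCTCConfig):
    # Inherits every base field and default; only the graph module config is new.
    graph_module_cfg: ToyGraphModuleConfig = field(default_factory=ToyGraphModuleConfig)


cfg = ToyK2SeqConfig(dropout=0.1)
print(cfg.sample_rate, cfg.graph_module_cfg.loss_type)   # 16000 ctc
```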
- -from dataclasses import dataclass, field -from typing import Any, Callable, List, Optional - -from omegaconf import MISSING - -from nemo.collections.asr.models.configs import classification_models_config as clf_cfg -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMFCCPreprocessorConfig, - CropOrPadSpectrogramAugmentationConfig, - SpectrogramAugmentationConfig, -) -from nemo.collections.asr.modules.conv_asr import ( - ConvASRDecoderClassificationConfig, - ConvASREncoderConfig, - JasperEncoderConfig, -) -from nemo.core.config import modelPT as model_cfg - - -# fmt: off -def matchboxnet_3x1x64(): - config = [ - JasperEncoderConfig(filters=128, repeat=1, kernel=[11], stride=[1], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[13], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[15], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[17], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=128, repeat=1, kernel=[29], stride=[1], dilation=[2], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=128, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False) - ] - return config - - -def matchboxnet_3x1x64_vad(): - config = [ - JasperEncoderConfig(filters=128, repeat=1, kernel=[11], stride=[1], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[13], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[15], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, 
se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=64, repeat=1, kernel=[17], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=128, repeat=1, kernel=[29], stride=[1], dilation=[2], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=128, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False) - ] - return config - - -# fmt: on - - -@dataclass -class MatchboxNetModelConfig(clf_cfg.EncDecClassificationConfig): - # Model global arguments - sample_rate: int = 16000 - repeat: int = 1 - dropout: float = 0.0 - separable: bool = True - kernel_size_factor: float = 1.0 - timesteps: int = 128 - labels: List[str] = MISSING - - # Dataset configs - train_ds: clf_cfg.EncDecClassificationDatasetConfig = field( - default_factory=lambda: clf_cfg.EncDecClassificationDatasetConfig( - manifest_filepath=None, shuffle=True, trim_silence=False - ) - ) - validation_ds: clf_cfg.EncDecClassificationDatasetConfig = field( - default_factory=lambda: clf_cfg.EncDecClassificationDatasetConfig(manifest_filepath=None, shuffle=False) - ) - test_ds: clf_cfg.EncDecClassificationDatasetConfig = field( - default_factory=lambda: clf_cfg.EncDecClassificationDatasetConfig(manifest_filepath=None, shuffle=False) - ) - - # Optimizer / Scheduler config - optim: Optional[model_cfg.OptimConfig] = field( - default_factory=lambda: model_cfg.OptimConfig(sched=model_cfg.SchedConfig()) - ) - - # Model general component configs - preprocessor: AudioToMFCCPreprocessorConfig = field( - default_factory=lambda: AudioToMFCCPreprocessorConfig(window_size=0.025) - ) - spec_augment: Optional[SpectrogramAugmentationConfig] = field( - default_factory=lambda: SpectrogramAugmentationConfig( - freq_masks=2, time_masks=2, freq_width=15, time_width=25, rect_masks=5, rect_time=25, rect_freq=15 - ) - ) - crop_or_pad_augment: Optional[CropOrPadSpectrogramAugmentationConfig] = field( - default_factory=lambda: CropOrPadSpectrogramAugmentationConfig(audio_length=128) - ) - - encoder: ConvASREncoderConfig = field(default_factory=lambda: ConvASREncoderConfig(activation="relu")) - decoder: ConvASRDecoderClassificationConfig = field(default_factory=lambda: ConvASRDecoderClassificationConfig()) - - -@dataclass -class MatchboxNetVADModelConfig(MatchboxNetModelConfig): - timesteps: int = 64 - labels: List[str] = field(default_factory=lambda: ['background', 'speech']) - - crop_or_pad_augment: Optional[CropOrPadSpectrogramAugmentationConfig] = None - - -class EncDecClassificationModelConfigBuilder(model_cfg.ModelConfigBuilder): - VALID_CONFIGS = ['matchboxnet_3x1x64', 'matchboxnet_3x1x64_vad'] - - def __init__(self, name: str = 'matchboxnet_3x1x64', encoder_cfg_func: Optional[Callable[[], List[Any]]] = None): - if name not in EncDecClassificationModelConfigBuilder.VALID_CONFIGS: - raise 
ValueError("`name` must be one of : \n" f"{EncDecClassificationModelConfigBuilder.VALID_CONFIGS}") - - self.name = name - - if 'matchboxnet_3x1x64_vad' in name: - if encoder_cfg_func is None: - encoder_cfg_func = matchboxnet_3x1x64_vad - - model_cfg = MatchboxNetVADModelConfig( - repeat=1, - separable=True, - encoder=ConvASREncoderConfig(jasper=encoder_cfg_func(), activation="relu"), - decoder=ConvASRDecoderClassificationConfig(), - ) - - elif 'matchboxnet_3x1x64' in name: - if encoder_cfg_func is None: - encoder_cfg_func = matchboxnet_3x1x64 - - model_cfg = MatchboxNetModelConfig( - repeat=1, - separable=False, - spec_augment=SpectrogramAugmentationConfig(rect_masks=5, rect_freq=50, rect_time=120), - encoder=ConvASREncoderConfig(jasper=encoder_cfg_func(), activation="relu"), - decoder=ConvASRDecoderClassificationConfig(), - ) - - else: - raise ValueError(f"Invalid config name submitted to {self.__class__.__name__}") - - super(EncDecClassificationModelConfigBuilder, self).__init__(model_cfg) - self.model_cfg: clf_cfg.EncDecClassificationConfig = model_cfg # enable type hinting - - def set_labels(self, labels: List[str]): - self.model_cfg.labels = labels - - def set_separable(self, separable: bool): - self.model_cfg.separable = separable - - def set_repeat(self, repeat: int): - self.model_cfg.repeat = repeat - - def set_sample_rate(self, sample_rate: int): - self.model_cfg.sample_rate = sample_rate - - def set_dropout(self, dropout: float = 0.0): - self.model_cfg.dropout = dropout - - def set_timesteps(self, timesteps: int): - self.model_cfg.timesteps = timesteps - - def set_is_regression_task(self, is_regression_task: bool): - self.model_cfg.is_regression_task = is_regression_task - - # Note: Autocomplete for users wont work without these overrides - # But practically it is not needed since python will infer at runtime - - # def set_train_ds(self, cfg: Optional[clf_cfg.EncDecClassificationDatasetConfig] = None): - # super().set_train_ds(cfg) - # - # def set_validation_ds(self, cfg: Optional[clf_cfg.EncDecClassificationDatasetConfig] = None): - # super().set_validation_ds(cfg) - # - # def set_test_ds(self, cfg: Optional[clf_cfg.EncDecClassificationDatasetConfig] = None): - # super().set_test_ds(cfg) - - def _finalize_cfg(self): - # propagate labels - self.model_cfg.train_ds.labels = self.model_cfg.labels - self.model_cfg.validation_ds.labels = self.model_cfg.labels - self.model_cfg.test_ds.labels = self.model_cfg.labels - self.model_cfg.decoder.vocabulary = self.model_cfg.labels - - # propagate num classes - self.model_cfg.decoder.num_classes = len(self.model_cfg.labels) - - # propagate sample rate - self.model_cfg.sample_rate = self.model_cfg.sample_rate - self.model_cfg.preprocessor.sample_rate = self.model_cfg.sample_rate - self.model_cfg.train_ds.sample_rate = self.model_cfg.sample_rate - self.model_cfg.validation_ds.sample_rate = self.model_cfg.sample_rate - self.model_cfg.test_ds.sample_rate = self.model_cfg.sample_rate - - # propagate filters - self.model_cfg.encoder.feat_in = self.model_cfg.preprocessor.features - self.model_cfg.decoder.feat_in = self.model_cfg.encoder.jasper[-1].filters - - # propagate timeteps - if self.model_cfg.crop_or_pad_augment is not None: - self.model_cfg.crop_or_pad_augment.audio_length = self.model_cfg.timesteps - - # propagate separable - for layer in self.model_cfg.encoder.jasper[:-1]: # type: JasperEncoderConfig - layer.separable = self.model_cfg.separable - - # propagate repeat - for layer in self.model_cfg.encoder.jasper[1:-2]: # type: 
JasperEncoderConfig - layer.repeat = self.model_cfg.repeat - - # propagate dropout - for layer in self.model_cfg.encoder.jasper: # type: JasperEncoderConfig - layer.dropout = self.model_cfg.dropout - - def build(self) -> clf_cfg.EncDecClassificationConfig: - return super().build() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/quartznet_config.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/quartznet_config.py deleted file mode 100644 index 93412b0053bf17296615f3d4f738c29f9f689d2b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/configs/quartznet_config.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Any, Callable, List, Optional - -from omegaconf import MISSING - -from nemo.collections.asr.models.configs import asr_models_config as ctc_cfg -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMelSpectrogramPreprocessorConfig, - SpectrogramAugmentationConfig, -) -from nemo.collections.asr.modules.conv_asr import ConvASRDecoderConfig, ConvASREncoderConfig, JasperEncoderConfig -from nemo.core.config import modelPT as model_cfg - - -# fmt: off -def qn_15x5(): - config = [ - JasperEncoderConfig(filters=256, repeat=1, kernel=[33], stride=[2], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[33], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[33], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[33], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[39], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[39], 
stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[39], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[51], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[51], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[51], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[63], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[63], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[63], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[75], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[75], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[75], stride=[1], dilation=[1], dropout=0.0, - residual=True, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - 
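The `qn_15x5()` preset enumerates every encoder block explicitly, and only `filters` and `kernel` change across the fifteen repeated body blocks. A minimal sketch of generating an equivalent list from those pairs is shown below; the helper name is illustrative and not part of NeMo, and the import matches the one already used in this file.
```
# Sketch only: rebuild the 15 repeated QuartzNet-15x5 body blocks from (filters, kernel) pairs.
from nemo.collections.asr.modules.conv_asr import JasperEncoderConfig


def qn_15x5_body_blocks():
    """Hypothetical helper; mirrors the hand-written body blocks in qn_15x5()."""
    specs = [(256, 33)] * 3 + [(256, 39)] * 3 + [(512, 51)] * 3 + [(512, 63)] * 3 + [(512, 75)] * 3
    return [
        JasperEncoderConfig(
            filters=filters, repeat=5, kernel=[kernel], stride=[1], dilation=[1], dropout=0.0,
            residual=True, groups=1, separable=True, heads=-1, residual_mode='add',
            residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1,
            se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False,
        )
        for filters, kernel in specs
    ]
```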
JasperEncoderConfig(filters=512, repeat=1, kernel=[87], stride=[1], dilation=[2], dropout=0.0, - residual=False, groups=1, separable=True, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=1024, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=0.0, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False) - ] - return config - - -def jasper_10x5_dr(): - config = [ - JasperEncoderConfig(filters=256, repeat=1, kernel=[11], stride=[2], dilation=[1], dropout=0.2, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[11], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=256, repeat=5, kernel=[11], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=384, repeat=5, kernel=[13], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=384, repeat=5, kernel=[13], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[17], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=512, repeat=5, kernel=[17], stride=[1], dilation=[1], dropout=0.2, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=640, repeat=5, kernel=[21], stride=[1], dilation=[1], dropout=0.3, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=640, repeat=5, kernel=[21], stride=[1], dilation=[1], dropout=0.3, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, 
se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=768, repeat=5, kernel=[25], stride=[1], dilation=[1], dropout=0.3, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=768, repeat=5, kernel=[25], stride=[1], dilation=[1], dropout=0.3, - residual=True, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=True, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=896, repeat=1, kernel=[29], stride=[1], dilation=[2], dropout=0.4, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False), - JasperEncoderConfig(filters=1024, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=0.4, - residual=False, groups=1, separable=False, heads=-1, residual_mode='add', - residual_dense=False, se=False, se_reduction_ratio=8, se_context_size=-1, - se_interpolation_mode='nearest', kernel_size_factor=1.0, stride_last=False) - ] - return config -# fmt: on - - -@dataclass -class JasperModelConfig(ctc_cfg.EncDecCTCConfig): - # Model global arguments - sample_rate: int = 16000 - repeat: int = 1 - dropout: float = 0.0 - separable: bool = False - labels: List[str] = MISSING - - # Dataset configs - train_ds: ctc_cfg.ASRDatasetConfig = field( - default_factory=lambda: ctc_cfg.ASRDatasetConfig(manifest_filepath=None, shuffle=True, trim_silence=True) - ) - validation_ds: ctc_cfg.ASRDatasetConfig = field( - default_factory=lambda: ctc_cfg.ASRDatasetConfig(manifest_filepath=None, shuffle=False) - ) - test_ds: ctc_cfg.ASRDatasetConfig = field( - default_factory=lambda: ctc_cfg.ASRDatasetConfig(manifest_filepath=None, shuffle=False) - ) - - # Optimizer / Scheduler config - optim: Optional[model_cfg.OptimConfig] = field( - default_factory=lambda: model_cfg.OptimConfig(sched=model_cfg.SchedConfig()) - ) - - # Model general component configs - preprocessor: AudioToMelSpectrogramPreprocessorConfig = field( - default_factory=lambda: AudioToMelSpectrogramPreprocessorConfig() - ) - spec_augment: Optional[SpectrogramAugmentationConfig] = field( - default_factory=lambda: SpectrogramAugmentationConfig() - ) - encoder: ConvASREncoderConfig = field(default_factory=lambda: ConvASREncoderConfig(activation="relu")) - decoder: ConvASRDecoderConfig = field(default_factory=lambda: ConvASRDecoderConfig()) - - -@dataclass -class QuartzNetModelConfig(JasperModelConfig): - separable: bool = True - - -class EncDecCTCModelConfigBuilder(model_cfg.ModelConfigBuilder): - VALID_CONFIGS = ['quartznet_15x5', 'quartznet_15x5_zh', 'jasper_10x5dr'] - - def __init__(self, name: str = 'quartznet_15x5', encoder_cfg_func: Optional[Callable[[], List[Any]]] = None): - if name not in EncDecCTCModelConfigBuilder.VALID_CONFIGS: - raise ValueError("`name` must be one of : \n" f"{EncDecCTCModelConfigBuilder.VALID_CONFIGS}") - - self.name = name - - if 'quartznet_15x5' in name: - if encoder_cfg_func is None: - encoder_cfg_func = qn_15x5 - - model_cfg = QuartzNetModelConfig( - repeat=5, - separable=True, - spec_augment=SpectrogramAugmentationConfig(rect_masks=5, rect_freq=50, 
rect_time=120), - encoder=ConvASREncoderConfig(jasper=encoder_cfg_func(), activation="relu"), - decoder=ConvASRDecoderConfig(), - ) - - elif 'jasper_10x5' in name: - if encoder_cfg_func is None: - encoder_cfg_func = jasper_10x5_dr - - model_cfg = JasperModelConfig( - repeat=5, - separable=False, - spec_augment=SpectrogramAugmentationConfig(rect_masks=5, rect_freq=50, rect_time=120), - encoder=ConvASREncoderConfig(jasper=encoder_cfg_func(), activation="relu"), - decoder=ConvASRDecoderConfig(), - ) - - else: - raise ValueError(f"Invalid config name submitted to {self.__class__.__name__}") - - super(EncDecCTCModelConfigBuilder, self).__init__(model_cfg) - self.model_cfg: ctc_cfg.EncDecCTCConfig = model_cfg # enable type hinting - - if 'zh' in name: - self.set_dataset_normalize(normalize=False) - - def set_labels(self, labels: List[str]): - self.model_cfg.labels = labels - - def set_separable(self, separable: bool): - self.model_cfg.separable = separable - - def set_repeat(self, repeat: int): - self.model_cfg.repeat = repeat - - def set_sample_rate(self, sample_rate: int): - self.model_cfg.sample_rate = sample_rate - - def set_dropout(self, dropout: float = 0.0): - self.model_cfg.dropout = dropout - - def set_dataset_normalize(self, normalize: bool): - self.model_cfg.train_ds.normalize = normalize - self.model_cfg.validation_ds.normalize = normalize - self.model_cfg.test_ds.normalize = normalize - - # Note: Autocomplete for users wont work without these overrides - # But practically it is not needed since python will infer at runtime - - # def set_train_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None): - # super().set_train_ds(cfg) - # - # def set_validation_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None): - # super().set_validation_ds(cfg) - # - # def set_test_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None): - # super().set_test_ds(cfg) - - def _finalize_cfg(self): - # propagate labels - self.model_cfg.train_ds.labels = self.model_cfg.labels - self.model_cfg.validation_ds.labels = self.model_cfg.labels - self.model_cfg.test_ds.labels = self.model_cfg.labels - self.model_cfg.decoder.vocabulary = self.model_cfg.labels - - # propagate num classes - self.model_cfg.decoder.num_classes = len(self.model_cfg.labels) - - # propagate sample rate - self.model_cfg.sample_rate = self.model_cfg.sample_rate - self.model_cfg.preprocessor.sample_rate = self.model_cfg.sample_rate - self.model_cfg.train_ds.sample_rate = self.model_cfg.sample_rate - self.model_cfg.validation_ds.sample_rate = self.model_cfg.sample_rate - self.model_cfg.test_ds.sample_rate = self.model_cfg.sample_rate - - # propagate filters - self.model_cfg.encoder.feat_in = self.model_cfg.preprocessor.features - self.model_cfg.decoder.feat_in = self.model_cfg.encoder.jasper[-1].filters - - # propagate separable - for layer in self.model_cfg.encoder.jasper[:-1]: # type: JasperEncoderConfig - layer.separable = self.model_cfg.separable - - # propagate repeat - for layer in self.model_cfg.encoder.jasper[1:-2]: # type: JasperEncoderConfig - layer.repeat = self.model_cfg.repeat - - # propagate dropout - for layer in self.model_cfg.encoder.jasper: # type: JasperEncoderConfig - layer.dropout = self.model_cfg.dropout - - def build(self) -> ctc_cfg.EncDecCTCConfig: - return super().build() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_bpe_models.py deleted file mode 100644 index 
7b17f7918e2075aed257761749294a7e677467f6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_bpe_models.py +++ /dev/null @@ -1,630 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.models.ctc_models import EncDecCTCModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - -__all__ = ['EncDecCTCModelBPE'] - - -class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin): - """Encoder decoder CTC-based models with Byte Pair Encoding.""" - - def __init__(self, cfg: DictConfig, trainer=None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - cfg.decoder.vocabulary = ListConfig(vocabulary) - else: - cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys())) - - # Override number of classes if placeholder provided - num_classes = cfg.decoder["num_classes"] - - if num_classes < 1: - logging.info( - "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( - num_classes, len(vocabulary) - ) - ) - cfg.decoder["num_classes"] = len(vocabulary) - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding objects - decoding_cfg = self.cfg.get('decoding', None) - - # In case decoding config not found, use default config - if decoding_cfg is None: - decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) - with open_dict(self.cfg): - self.cfg.decoding = decoding_cfg - - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) - - # Setup metric with decoding strategy - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( - config=config, - 
local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToBPEDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. - num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given audio file(s). - """ - - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary of the tokenizer used during CTC decoding process. - Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. 
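A usage sketch for the `change_vocabulary` method documented here (its full argument reference follows below). The checkpoint name is taken from the pretrained-model list later in this file; the tokenizer directory is a placeholder, not a real path.
```
import nemo.collections.asr as nemo_asr

# Restore a pretrained BPE CTC model, then swap its tokenizer and decoder head for a new domain.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("stt_en_conformer_ctc_small")

asr_model.change_vocabulary(
    new_tokenizer_dir="/path/to/new_tokenizer_dir",  # placeholder: directory containing the new tokenizer
    new_tokenizer_type="bpe",                        # 'bpe' (SentencePiece) or 'wpe' (WordPiece)
)
```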
- - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers, - whereas `wpe` is used for `BertTokenizer`. - new_tokenizer_cfg: A config for the new tokenizer. if provided, pre-empts the dir and type - - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}' - f"New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}" - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - decoder_config = copy.deepcopy(self.decoder.to_config_dict()) - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - decoder_config.vocabulary = ListConfig(vocabulary) - else: - decoder_config.vocabulary = ListConfig(list(vocabulary.keys())) - - decoder_num_classes = decoder_config['num_classes'] - - # Override number of classes if placeholder provided - logging.info( - "\nReplacing old number of classes ({}) with new number of classes - {}".format( - decoder_num_classes, len(vocabulary) - ) - ) - - decoder_config['num_classes'] = len(vocabulary) - - del self.decoder - self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config) - del self.loss - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) - - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get("log_prediction", False), - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.decoder): - self._cfg.decoder = decoder_config - - with open_dict(self.cfg.decoding): - self._cfg.decoding = decoding_cfg - - logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during CTC decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) - - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._wer.use_cer, - log_prediction=self._wer.log_prediction, - dist_sync_on_step=True, - ) - - self.decoder.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_256", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256/versions/1.0.0rc1/files/stt_en_citrinet_256.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512/versions/1.0.0rc1/files/stt_en_citrinet_512.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_256_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_512_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo", - ) - - 
results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_512/versions/1.0.0/files/stt_es_citrinet_512.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_citrinet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.5.0/files/stt_de_citrinet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_no_hyphen_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_no_hyphen_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_1024_gamma_0_25/versions/1.8.0/files/stt_es_citrinet_1024_gamma_0_25.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_small", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.6.0/files/stt_en_conformer_ctc_small.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.6.0/files/stt_en_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.10.0/files/stt_en_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_xlarge/versions/1.10.0/files/stt_en_conformer_ctc_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_xsmall_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_xsmall_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_xsmall_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_xsmall_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_small_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_small_medium_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_medium_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_medium_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_medium_large_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_large_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_small_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small_ls/versions/1.0.0/files/stt_en_conformer_ctc_small_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_medium_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium_ls/versions/1.0.0/files/stt_en_conformer_ctc_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large_ls/versions/1.0.0/files/stt_en_conformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_conformer_ctc_large", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_no_hyphen_conformer_ctc_large", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_no_hyphen_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_ctc_large/versions/1.5.0/files/stt_de_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_conformer_ctc_large/versions/1.8.0/files/stt_es_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hi_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hi_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hi_conformer_ctc_medium/versions/1.6.0/files/stt_hi_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_mr_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_mr_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_mr_conformer_ctc_medium/versions/1.6.0/files/stt_mr_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large/versions/1.0.0/files/stt_enes_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ca_conformer_ctc_large", - description="For details about this model, please visit 
https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_conformer_ctc_large/versions/1.11.0/files/stt_ca_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_rw_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_rw_conformer_ctc_large/versions/1.11.0/files/stt_rw_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_ctc_large_codesw", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large_codesw", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large_codesw/versions/1.0.0/files/stt_enes_conformer_ctc_large_codesw.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_be_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_be_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_be_conformer_ctc_large/versions/1.12.0/files/stt_be_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hr_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_ctc_large/versions/1.11.0/files/stt_hr_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_conformer_ctc_large/versions/1.13.0/files/stt_it_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_conformer_ctc_large/versions/1.13.0/files/stt_ru_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_eo_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_eo_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_eo_conformer_ctc_large/versions/1.14.0/files/stt_eo_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large/versions/1.0.0/files/stt_en_fastconformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - 
pretrained_model_name="stt_en_fastconformer_ctc_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large_ls/versions/1.0.0/files/stt_en_fastconformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xlarge/versions/1.20.0/files/stt_en_fastconformer_ctc_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_xxlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xxlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xxlarge/versions/1.20.1/files/stt_en_fastconformer_ctc_xxlarge.nemo", - ) - results.append(model) - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_models.py deleted file mode 100644 index cb6eb9a7e7e7f60b5de5aca538f7adafc13149af..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/ctc_models.py +++ /dev/null @@ -1,845 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import copy -import json -import os -import tempfile -from math import ceil -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.mixins import ASRModuleMixin, InterCTCMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType -from nemo.utils import logging - -__all__ = ['EncDecCTCModel'] - - -class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, InterCTCMixin): - """Base class for encoder decoder CTC-based models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - super().__init__(cfg=cfg, trainer=trainer) - self.preprocessor = EncDecCTCModel.from_config_dict(self._cfg.preprocessor) - self.encoder = EncDecCTCModel.from_config_dict(self._cfg.encoder) - - with open_dict(self._cfg): - if "feat_in" not in self._cfg.decoder or ( - not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out') - ): - self._cfg.decoder.feat_in = self.encoder._feat_out - if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in: - raise ValueError("param feat_in of the decoder's config is not set!") - - if self.cfg.decoder.num_classes < 1 and self.cfg.decoder.vocabulary is not None: - logging.info( - "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( - self.cfg.decoder.num_classes, len(self.cfg.decoder.vocabulary) - ) - ) - cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) - - self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder) - - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = EncDecCTCModel.from_config_dict(self._cfg.spec_augment) - else: - self.spec_augmentation = None - - # Setup decoding objects - decoding_cfg = self.cfg.get('decoding', None) - - # In case decoding config not found, use default config - if decoding_cfg is None: - decoding_cfg = OmegaConf.structured(CTCDecodingConfig) - with open_dict(self.cfg): - self.cfg.decoding = decoding_cfg - - self.decoding = CTCDecoding(self.cfg.decoding, vocabulary=OmegaConf.to_container(self.decoder.vocabulary)) - - # Setup metric with decoding strategy - self._wer = WER( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - # Setup optional Optimization flags - 
self.setup_optimization_flags() - - # setting up interCTC loss (from InterCTCMixin) - self.setup_interctc(decoder_name='decoder', loss_name='loss', wer_name='_wer') - - # Adapter modules setup (from ASRAdapterModelMixin) - self.setup_adapters() - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - verbose: bool = True, - ) -> List[str]: - """ - If modify this function, please remember update transcribe_partial_audio() in - nemo/collections/asr/parts/utils/trancribe_utils.py - - Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. - - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of transcripts. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. - augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. - verbose: (bool) whether to display tqdm progress bar - Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." 
- ) - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - dither_value = self.preprocessor.featurizer.dither - pad_to_value = self.preprocessor.featurizer.pad_to - - try: - self.preprocessor.featurizer.dither = 0.0 - self.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - self.eval() - # Freeze the encoder and decoder modules - self.encoder.freeze() - self.decoder.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2audio_files': paths2audio_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): - logits, logits_len, greedy_predictions = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - if logprobs: - # dump log probs per file - for idx in range(logits.shape[0]): - lg = logits[idx][: logits_len[idx]] - hypotheses.append(lg.cpu().numpy()) - else: - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( - logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, - ) - logits = logits.cpu() - - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] - if current_hypotheses[idx].alignments is None: - current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence - - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp - - del greedy_predictions - del logits - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - self.preprocessor.featurizer.dither = dither_value - self.preprocessor.featurizer.pad_to = pad_to_value - if mode is True: - self.encoder.unfreeze() - self.decoder.unfreeze() - logging.set_verbosity(logging_level) - - return hypotheses - - def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): - """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - If new_vocabulary == self.decoder.vocabulary then nothing will be changed. - - Args: - - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - - Returns: None - - """ - if self.decoder.vocabulary == new_vocabulary: - logging.warning(f"Old {self.decoder.vocabulary} and new {new_vocabulary} match. 
Not changing anything.") - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. But I got: {new_vocabulary}') - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config['vocabulary'] = new_vocabulary - new_decoder_config['num_classes'] = len(new_vocabulary) - - del self.decoder - self.decoder = EncDecCTCModel.from_config_dict(new_decoder_config) - del self.loss - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCDecoding( - decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary) - ) - - self._wer = WER( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - # Update config - with open_dict(self.cfg.decoder): - self._cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary) - - logging.info(f"Changed decoder to output to {self.decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during CTC decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
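            A minimal usage sketch, assuming `CTCDecodingConfig` exposes a `strategy` string and a
            `beam.beam_size` field (verify against the installed NeMo version) and reusing the
            `QuartzNet15x5Base-En` checkpoint name listed by this class:

                from omegaconf import OmegaConf

                from nemo.collections.asr.metrics.wer import CTCDecodingConfig
                from nemo.collections.asr.models import EncDecCTCModel

                # Load any CTC checkpoint; QuartzNet15x5Base-En appears in list_available_models().
                model = EncDecCTCModel.from_pretrained("QuartzNet15x5Base-En")

                # Build a structured decoding config and switch from greedy to beam search.
                decoding_cfg = OmegaConf.structured(CTCDecodingConfig)
                decoding_cfg.strategy = "beam"      # assumed field name
                decoding_cfg.beam.beam_size = 4     # assumed beam sub-config
                model.change_decoding_strategy(decoding_cfg)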
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCDecoding( - decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary) - ) - - self._wer = WER( - decoding=self.decoding, - use_cer=self._wer.use_cer, - log_prediction=self._wer.log_prediction, - dist_sync_on_step=True, - ) - - self.decoder.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - # Automatically inject args from model config to dataloader config - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels') - dataset = audio_to_text_dataset.get_audio_to_text_char_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToCharDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
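            A minimal config sketch, where `model` is an instantiated `EncDecCTCModel` and the
            manifest path is a placeholder; only keys consumed by `_setup_dataloader_from_config`
            above are shown, and `sample_rate`/`labels` are normally injected from the model
            config, so they can usually be omitted:

                train_config = {
                    'manifest_filepath': '/data/train_manifest.json',  # placeholder path
                    'batch_size': 32,
                    'shuffle': True,
                    'num_workers': 4,
                    'pin_memory': True,
                }
                model.setup_training_data(train_data_config=train_config)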
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - input_signal_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "outputs": NeuralType(('B', 'T', 'D'), LogprobsType()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "greedy_predictions": NeuralType(('B', 'T'), LabelsType()), - } - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - """ - Forward pass of the model. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. - - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. - 3) The greedy token predictions of the model of shape [B, T] (via argmax) - """ - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." 
- ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoder_output = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - encoded = encoder_output[0] - encoded_len = encoder_output[1] - log_probs = self.decoder(encoder_output=encoded) - greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) - - return ( - log_probs, - encoded_len, - greedy_predictions, - ) - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - log_probs, encoded_len, predictions = self.forward( - processed_signal=signal, processed_signal_length=signal_len - ) - else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - else: - log_every_n_steps = 1 - - loss_value = self.loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - # only computing WER when requested in the logs (same as done for final-layer WER below) - loss_value, tensorboard_logs = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0) - ) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs.update( - { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - ) - - if (batch_nb + 1) % log_every_n_steps == 0: - self._wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - wer, _, _ = self._wer.compute() - self._wer.reset() - tensorboard_logs.update({'training_batch_wer': wer}) - - return {'loss': loss_value, 'log': tensorboard_logs} - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len, sample_id = batch - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - log_probs, encoded_len, predictions = self.forward( - processed_signal=signal, processed_signal_length=signal_len - ) - else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - - transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, transcribed_texts)) - - def validation_pass(self, batch, batch_idx, dataloader_idx=0): - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: 
- log_probs, encoded_len, predictions = self.forward( - processed_signal=signal, processed_signal_length=signal_len - ) - else: - log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len) - - loss_value = self.loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - loss_value, metrics = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_", - ) - - self._wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len - ) - wer, wer_num, wer_denom = self._wer.compute() - self._wer.reset() - metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer}) - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - return metrics - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - metrics = self.validation_pass(batch, batch_idx, dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(metrics) - else: - self.validation_step_outputs.append(metrics) - return metrics - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - metrics = super().multi_validation_epoch_end(outputs, dataloader_idx) - self.finalize_interctc_metrics(metrics, outputs, prefix="val_") - return metrics - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - metrics = super().multi_test_epoch_end(outputs, dataloader_idx) - self.finalize_interctc_metrics(metrics, outputs, prefix="test_") - return metrics - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. - num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given audio file(s). 
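            A sketch of a typical config, with placeholder paths; `temp_dir` is assumed to
            already contain the `manifest.json` written by `transcribe()`:

                config = {
                    'paths2audio_files': ['/data/a.wav', '/data/b.wav'],  # placeholder files
                    'batch_size': 4,
                    'temp_dir': '/tmp/nemo_transcribe',  # must already hold manifest.json
                    'num_workers': 2,
                }
                loader = model._setup_transcribe_dataloader(config)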
- """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'labels': OmegaConf.to_container(self.decoder.vocabulary), - 'batch_size': batch_size, - 'trim_silence': False, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - } - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="QuartzNet15x5Base-En", - description="QuartzNet15x5 model trained on six datasets: LibriSpeech, Mozilla Common Voice (validated clips from en_1488h_2019-12-10), WSJ, Fisher, Switchboard, and NSC Singapore English. It was trained with Apex/Amp optimization level O1 for 600 epochs. The model achieves a WER of 3.79% on LibriSpeech dev-clean, and a WER of 10.05% on dev-other. Please visit https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels for further details.", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_quartznet15x5/versions/1.0.0rc1/files/stt_en_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_jasper10x5dr", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_jasper10x5dr/versions/1.0.0rc1/files/stt_en_jasper10x5dr.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ca_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_quartznet15x5/versions/1.0.0rc1/files/stt_ca_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_quartznet15x5/versions/1.0.0rc1/files/stt_it_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_quartznet15x5/versions/1.0.0rc1/files/stt_fr_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_quartznet15x5/versions/1.0.0rc1/files/stt_es_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_quartznet15x5/versions/1.0.0rc1/files/stt_de_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_pl_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_quartznet15x5/versions/1.0.0rc1/files/stt_pl_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_quartznet15x5/versions/1.0.0rc1/files/stt_ru_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_512/versions/1.0.0rc1/files/stt_zh_citrinet_512.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="asr_talknet_aligner", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:asr_talknet_aligner", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/asr_talknet_aligner/versions/1.0.0rc1/files/qn5x5_libri_tts_phonemes.nemo", - ) - results.append(model) - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/enhancement_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/enhancement_models.py deleted file mode 100644 index 7cc5c3d8459fad77fb34999962494ced04ad3e09..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/nemo/collections/asr/models/enhancement_models.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json -import os -import tempfile -from typing import Dict, List, Optional, Union - -import librosa -import soundfile as sf -import torch -from omegaconf import DictConfig -from pytorch_lightning import Trainer -from tqdm import tqdm - -from nemo.collections.asr.data import audio_to_audio_dataset -from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config -from nemo.collections.asr.models.audio_to_audio_model import AudioToAudioModel -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import AudioSignal, LengthsType, NeuralType -from nemo.utils import logging - -__all__ = ['EncMaskDecAudioToAudioModel'] - - -class EncMaskDecAudioToAudioModel(AudioToAudioModel): - """Class for encoder-mask-decoder audio processing models. - - The model consists of the following blocks: - - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform) - - mask_estimator: estimates a mask used by signal processor - - mask_processor: mask-based signal processor, combines the encoded input and the estimated mask - - decoder: transforms processor output into the time domain (synthesis transform) - """ - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - super().__init__(cfg=cfg, trainer=trainer) - self.sample_rate = self._cfg.sample_rate - - # Setup processing modules - self.encoder = EncMaskDecAudioToAudioModel.from_config_dict(self._cfg.encoder) - self.mask_estimator = EncMaskDecAudioToAudioModel.from_config_dict(self._cfg.mask_estimator) - self.mask_processor = EncMaskDecAudioToAudioModel.from_config_dict(self._cfg.mask_processor) - self.decoder = EncMaskDecAudioToAudioModel.from_config_dict(self._cfg.decoder) - - if 'mixture_consistency' in self._cfg: - logging.debug('Using mixture consistency') - self.mixture_consistency = EncMaskDecAudioToAudioModel.from_config_dict(self._cfg.mixture_consistency) - else: - logging.debug('Mixture consistency not used') - self.mixture_consistency = None - - # Future enhancement: - # If subclasses need to modify the config before calling super() - # Check ASRBPE* classes do with their mixin - - # Setup augmentation - if hasattr(self.cfg, 'channel_augment') and self.cfg.channel_augment is not None: - logging.debug('Using channel augmentation') - self.channel_augmentation = EncMaskDecAudioToAudioModel.from_config_dict(self.cfg.channel_augment) - else: - logging.debug('Channel 
augmentation not used') - self.channel_augmentation = None - - # Setup optional Optimization flags - self.setup_optimization_flags() - - @torch.no_grad() - def process( - self, - paths2audio_files: List[str], - output_dir: str, - batch_size: int = 1, - num_workers: Optional[int] = None, - input_channel_selector: Optional[ChannelSelectorType] = None, - ) -> List[str]: - """ - Process audio files provided in paths2audio_files. - Processed signals will be saved in output_dir. - - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - output_dir: - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - num_workers: Number of workers for the dataloader - input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. - - Returns: - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # Output - paths2processed_files = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - try: - # Switch model to evaluation mode - self.eval() - # Freeze weights - self.freeze() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - - # Processing - with tempfile.TemporaryDirectory() as tmpdir: - # Save temporary manifest - temporary_manifest_filepath = os.path.join(tmpdir, 'manifest.json') - with open(temporary_manifest_filepath, 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'input_filepath': audio_file, 'duration': librosa.get_duration(path=audio_file)} - fp.write(json.dumps(entry) + '\n') - - config = { - 'manifest_filepath': temporary_manifest_filepath, - 'input_key': 'input_filepath', - 'input_channel_selector': input_channel_selector, - 'batch_size': min(batch_size, len(paths2audio_files)), - 'num_workers': num_workers, - } - - # Create output dir if necessary - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - # DataLoader for the input files - temporary_dataloader = self._setup_process_dataloader(config) - - # Indexing of the original files, used to form the output file name - file_idx = 0 - - # Process batches - for test_batch in tqdm(temporary_dataloader, desc="Processing"): - input_signal = test_batch[0] - input_length = test_batch[1] - - # Expand channel dimension, if necessary - # For consistency, the model uses multi-channel format, even if the channel dimension is 1 - if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) - - processed_batch, _ = self.forward( - input_signal=input_signal.to(device), input_length=input_length.to(device) - ) - - for example_idx in range(processed_batch.size(0)): - # This assumes the data loader is not shuffling files - file_name = os.path.basename(paths2audio_files[file_idx]) - # Prepare output file - output_file = os.path.join(output_dir, f'processed_{file_name}') - # Crop the output signal to the actual length - output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy() - # Write audio - sf.write(output_file, output_signal.T, self.sample_rate, 'float') - # Update the file counter - 
file_idx += 1 - # Save processed file - paths2processed_files.append(output_file) - - del test_batch - del processed_batch - - finally: - # set mode back to its original value - self.train(mode=mode) - if mode is True: - self.unfreeze() - logging.set_verbosity(logging_level) - - return paths2processed_files - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - - is_concat = config.get('is_concat', False) - if is_concat: - raise NotImplementedError('Concat not implemented') - - # TODO: Consider moving `inject` from `audio_to_text_dataset` to a utility module? - # Automatically inject args from model config to dataloader config - inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - raise NotImplementedError('Tarred datasets not supported') - - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - - dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config=config) - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=config['shuffle'], - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of a training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: - raise NotImplementedError('Tarred datasets not supported') - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of a validation dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of a test dataset. 
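            A minimal config sketch, where `model` is an instantiated `EncMaskDecAudioToAudioModel`
            and the manifest path is a placeholder; the `input_key`/`target_key` values must match
            the field names used in your manifest (the names below are illustrative only):

                test_config = {
                    'manifest_filepath': '/data/test_manifest.json',  # placeholder path
                    'input_key': 'input_filepath',    # field holding noisy/input audio paths
                    'target_key': 'target_filepath',  # field holding clean/target audio paths
                    'batch_size': 8,
                    'shuffle': False,
                    'num_workers': 4,
                }
                model.setup_test_data(test_data_config=test_config)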
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - def _setup_process_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """Prepare a dataloader for processing files. - - Args: - config: A python dictionary which contains the following keys: - manifest_filepath: path to a manifest file - input_key: key with audio filepaths in the manifest - input_channel_selector: Optional, used to select a subset of channels from input audio files - batch_size: batch size for the dataloader - num_workers: number of workers for the dataloader - - Returns: - A pytorch DataLoader for the given manifest filepath. - """ - dl_config = { - 'manifest_filepath': config['manifest_filepath'], - 'sample_rate': self.sample_rate, - 'input_key': config['input_key'], - 'input_channel_selector': config.get('input_channel_selector', None), - 'target_key': None, - 'target_channel_selector': None, - 'batch_size': config['batch_size'], - 'shuffle': False, - 'num_workers': config.get('num_workers', min(config['batch_size'], os.cpu_count() - 1)), - 'pin_memory': True, - } - - temporary_dataloader = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_dataloader - - @property - def input_types(self) -> Dict[str, NeuralType]: - return { - "input_signal": NeuralType( - ('B', 'C', 'T'), AudioSignal(freq=self.sample_rate) - ), # multi-channel format, channel dimension can be 1 for single-channel audio - "input_length": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - return { - "output_signal": NeuralType( - ('B', 'C', 'T'), AudioSignal(freq=self.sample_rate) - ), # multi-channel format, channel dimension can be 1 for single-channel audio - "output_length": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def match_batch_length(self, input: torch.Tensor, batch_length: int): - """Trim or pad the output to match the batch length. - - Args: - input: tensor with shape (B, C, T) - batch_length: int - - Returns: - Tensor with shape (B, C, T), where T matches the - batch length. - """ - input_length = input.size(-1) - pad_length = batch_length - input_length - pad = (0, pad_length) - # pad with zeros or crop - return torch.nn.functional.pad(input, pad, 'constant', 0) - - @typecheck() - def forward(self, input_signal, input_length=None): - """ - Forward pass of the model. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. 
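            A shape sketch, assuming a 16 kHz model and an already-instantiated
            `EncMaskDecAudioToAudioModel` named `model`; the output channel count depends on the
            configured mask processor:

                import torch

                # Two recordings, 2 channels each, 4 s and 3 s at 16 kHz (zero-padded to 64000).
                input_signal = torch.randn(2, 2, 64000)
                input_length = torch.tensor([64000, 48000])

                output_signal, output_length = model(input_signal=input_signal, input_length=input_length)
                # output_signal is trimmed/padded back to the input batch length (T == 64000).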
- - Returns: - """ - batch_length = input_signal.size(-1) - - # Encoder - encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) - - # Mask estimator - mask, _ = self.mask_estimator(input=encoded, input_length=encoded_length) - - # Mask-based processor in the encoded domain - processed, processed_length = self.mask_processor(input=encoded, input_length=encoded_length, mask=mask) - - # Mixture consistency - if self.mixture_consistency is not None: - processed = self.mixture_consistency(mixture=encoded, estimate=processed) - - # Decoder - processed, processed_length = self.decoder(input=processed, input_length=processed_length) - - # Trim or pad the estimated signal to match input length - processed = self.match_batch_length(input=processed, batch_length=batch_length) - return processed, processed_length - - # PTL-specific methods - def training_step(self, batch, batch_idx): - input_signal, input_length, target_signal, target_length = batch - - # Expand channel dimension, if necessary - # For consistency, the model uses multi-channel format, even if the channel dimension is 1 - if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) - if target_signal.ndim == 2: - target_signal = target_signal.unsqueeze(1) - - # Apply channel augmentation - if self.training and self.channel_augmentation is not None: - input_signal = self.channel_augmentation(input=input_signal) - - # Process input - processed_signal, _ = self.forward(input_signal=input_signal, input_length=input_length) - - # Calculate the loss - loss = self.loss(estimate=processed_signal, target=target_signal, input_length=input_length) - - # Logs - self.log('train_loss', loss) - self.log('learning_rate', self._optimizer.param_groups[0]['lr']) - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - # Return loss - return loss - - def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - input_signal, input_length, target_signal, target_length = batch - - # Expand channel dimension, if necessary - # For consistency, the model uses multi-channel format, even if the channel dimension is 1 - if input_signal.ndim == 2: - input_signal = input_signal.unsqueeze(1) - if target_signal.ndim == 2: - target_signal = target_signal.unsqueeze(1) - - # Process input - processed_signal, _ = self.forward(input_signal=input_signal, input_length=input_length) - - # Calculate the loss - loss = self.loss(estimate=processed_signal, target=target_signal, input_length=input_length) - - # Update metrics - if hasattr(self, 'metrics') and tag in self.metrics: - # Update metrics for this (tag, dataloader_idx) - for name, metric in self.metrics[tag][dataloader_idx].items(): - metric.update(preds=processed_signal, target=target_signal, input_length=input_length) - - # Log global step - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - # Return loss - return {f'{tag}_loss': loss} - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
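            A short usage sketch; note that the method body below currently returns an empty
            list, so the loop may print nothing:

                for model_info in EncMaskDecAudioToAudioModel.list_available_models():
                    print(model_info.pretrained_model_name, model_info.location)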
- """ - results = [] - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_asr_tts_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_asr_tts_models.py deleted file mode 100644 index 9fea3c293e6435119d4358213df266575bd931c2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ /dev/null @@ -1,601 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import itertools -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast - -import torch -from omegaconf import MISSING, DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from torch.nn.utils.rnn import pad_sequence - -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.data.audio_to_text_dataset import get_audio_to_text_bpe_dataset_from_config -from nemo.collections.asr.data.text_to_text import ( - TextOrAudioToTextBatch, - TextToTextBatch, - TextToTextDataset, - TextToTextIterableDataset, -) -from nemo.collections.asr.models.asr_model import ASRModel -from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE -from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel -from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel -from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder -from nemo.collections.asr.parts.preprocessing.features import clean_spectrogram_batch, normalize_batch -from nemo.collections.asr.parts.submodules.batchnorm import replace_bn_with_fused_bn_all -from nemo.collections.common.data import ConcatDataset, ConcatMapDataset -from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel -from nemo.core.classes import Dataset, typecheck -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging -from nemo.utils.enum import PrettyStrEnum -from nemo.utils.exceptions import NeMoBaseException - - -def _fuse_bn_in_conformer(asr_model: ASRModel): - """ - Replace BatchNorm with Fused BatchNorm in Conformer and fixes model config inplace - Expected `encoder` model to exist and be of type ConformerEncoder - """ - logging.info("Replacing BatchNorm with Fused BatchNorm") - if not hasattr(asr_model, "encoder"): - raise NotImplementedError("No encoder found in ASR Model, replacement not supported") - if not isinstance(asr_model.encoder, ConformerEncoder): - raise NotImplementedError(f"Unsupported encoder type: {type(asr_model.encoder)}") - replace_bn_with_fused_bn_all(asr_model.encoder) - if "conv_norm_type" not in asr_model.cfg.encoder: - # old CTC models from NGC don't have such param - logging.warning("conv_norm_type not in encoder config, adding parameter") - with open_dict(asr_model.cfg): - asr_model.cfg.encoder.conv_norm_type = "fused_batch_norm" - 
else: - asr_model.cfg.encoder.conv_norm_type = "fused_batch_norm" - - -@dataclass -class TextDataConfig: - """ - Text dataset subconfig for text-only dataset - """ - - manifest_filepath: Any = MISSING # actual Union[str, List[str]], but this type is not supported by OmegaConf - speakers_filepath: Any = MISSING - min_words: int = 1 - max_words: int = 45 # 45 - recommended value, ~16.7 sec for LibriSpeech - tokenizer_workers: int = 1 - asr_tts_sampling_technique: Optional[str] = None - asr_tts_sampling_temperature: Optional[int] = None - asr_tts_sampling_probabilities: Optional[List[float]] = None - - -class ASRWithTTSModel(ASRModel): - """ - Hybrid ASR-TTS model: a transparent wrapper for ASR model - with frozen text-to-spectrogram pretrained model, which allows to use text-only data for training/finetuning - Text-only data can be mixed with audio-text pairs - """ - - asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel] - tts_model: FastPitchModel - enhancer_model: Optional[SpectrogramEnhancerModel] - - class ASRModelTypes(PrettyStrEnum): - """ - Supported ASR types, needed for training from scratch - """ - - RNNT_BPE = "rnnt_bpe" - CTC_BPE = "ctc_bpe" - HYBRID_RNNT_CTC_BPE = "hybrid_rnnt_ctc_bpe" - - @classmethod - def from_asr_model(cls, model: Any): - if isinstance(model, EncDecRNNTBPEModel): - return cls.RNNT_BPE - if isinstance(model, EncDecCTCModelBPE): - return cls.CTC_BPE - if isinstance(model, EncDecHybridRNNTCTCBPEModel): - return cls.HYBRID_RNNT_CTC_BPE - raise ValueError(f"Unsupported model type: {type(model)}") - - def get_asr_cls(self): - if self == self.RNNT_BPE: - return EncDecRNNTBPEModel - if self == self.CTC_BPE: - return EncDecCTCModelBPE - if self == self.HYBRID_RNNT_CTC_BPE: - return EncDecHybridRNNTCTCBPEModel - raise NotImplementedError(f"Not implemented for value {self.value}") - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - return [] - - @classmethod - def _check_config(cls, cfg: DictConfig): - """ - Check that all required fields are present in config - Structured configs are not compatible with model serialization, so we check fields manually - """ - expected_fields = [ - # asr - "asr_model", - "asr_model_path", - "asr_model_fuse_bn", - "asr_model_type", - # tts - "tts_model", - "tts_model_path", - # enhancer - "enhancer_model_path", - "enhancer_model", - ] - for field in expected_fields: - if field not in cfg: - raise NeMoBaseException(f"Field {field} is required in config (possibly should be None/null)") - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - self._full_init_guard = False - - self._check_config(cfg) # check all required keys are in config - - # setup datasets and optimizer after model is fully initialized - # since it's done automatically, remove options from config - cfg = copy.deepcopy(cfg) # copy to avoid modifying original config - with open_dict(cfg): - train_ds_cfg = cfg.pop("train_ds", None) - validation_ds_cfg = cfg.pop("validation_ds", None) - test_ds_cfg = cfg.pop("test_ds", None) - optim_cfg = cfg.pop("optim", None) - - super().__init__(cfg, trainer=trainer) - - # tts model - if cfg.tts_model is not None: - self.register_nemo_submodule("tts_model", config_field="tts_model", model=FastPitchModel(cfg.tts_model)) - else: - if cfg.tts_model_path is None: - raise NeMoBaseException("Either tts_model or tts_model_path should be provided") - self.register_nemo_submodule( - "tts_model", - config_field="tts_model", - 
model=FastPitchModel.restore_from(f"{cfg.tts_model_path}", map_location=torch.device("cpu")), - ) - self.tts_model.freeze() # tts model should be always frozen - - if cfg.asr_model is not None: - self.asr_model_type = self.ASRModelTypes(cfg.asr_model_type) # convert to enum - self.register_nemo_submodule( - "asr_model", config_field="asr_model", model=self.asr_model_type.get_asr_cls()(cfg.asr_model) - ) - else: - if cfg.asr_model_path is None: - raise NeMoBaseException("Either asr_model or asr_model_path should be provided") - self.register_nemo_submodule( - "asr_model", - config_field="asr_model", - model=ASRModel.restore_from(f"{cfg.asr_model_path}", map_location=torch.device("cpu")), - ) - self.asr_model_type = self.ASRModelTypes.from_asr_model(self.asr_model) - self.cfg.asr_model_type = f"{self.asr_model_type}" # save to config - - # replace BatchNorm with FusedBatchNorm - if cfg.asr_model_fuse_bn: - _fuse_bn_in_conformer(self.asr_model) - self.cfg.asr_model_fuse_bn = False # no need to fuse anymore - - if cfg.enhancer_model is not None: - self.register_nemo_submodule( - "enhancer_model", config_field="enhancer_model", model=SpectrogramEnhancerModel(cfg.enhancer_model) - ) - elif cfg.enhancer_model_path is not None: - self.register_nemo_submodule( - "enhancer_model", - config_field="enhancer_model", - model=SpectrogramEnhancerModel.restore_from(cfg.enhancer_model_path, map_location=torch.device("cpu")), - ) - else: - self.enhancer_model = None - - self._full_init_guard = True - - # initialize optimizer and datasets, asr/tts models are initialized here - if optim_cfg: - with open_dict(self.cfg): - self.cfg.optim = optim_cfg - self.setup_optimization(optim_config=optim_cfg) - if train_ds_cfg: - with open_dict(self.cfg): - self.cfg.train_ds = train_ds_cfg - self.setup_training_data(train_data_config=train_ds_cfg) - if validation_ds_cfg: - with open_dict(self.cfg): - self.cfg.validation_ds = validation_ds_cfg - self.setup_multiple_validation_data(val_data_config=validation_ds_cfg) - if test_ds_cfg: - with open_dict(self.cfg): - self.cfg.test_ds = test_ds_cfg - self.setup_test_data(test_data_config=test_ds_cfg) - - @classmethod - def from_asr_config( - cls, - asr_cfg: DictConfig, - asr_model_type: Union[str, ASRModelTypes], - tts_model_path: Union[str, Path], - enhancer_model_path: Optional[Union[str, Path]] = None, - trainer: Trainer = None, - ): - """ - Method to construct model from ASR config for training from scratch - """ - model_type = cls.ASRModelTypes(asr_model_type) - cfg = DictConfig( - dict( - asr_model_path=None, - asr_model=None, - asr_model_type=f"{model_type}", - asr_model_fuse_bn=False, # for training from scratch always should be False - tts_model_path=f"{tts_model_path}", - tts_model=None, - enhancer_model_path=f"{enhancer_model_path}" if enhancer_model_path is not None else None, - enhancer_model=None, - train_ds=None, - validation_ds=None, - test_ds=None, - optim=None, - ) - ) - - asr_cfg = copy.deepcopy(asr_cfg) # copy not to affect original config - with open_dict(asr_cfg): - for subconfig_path in ["train_ds", "validation_ds", "test_ds", "optim"]: - if subconfig_path in asr_cfg: - cfg[subconfig_path] = asr_cfg.pop(subconfig_path) - cfg.asr_model = asr_cfg - return cls(cfg=cfg, trainer=trainer) - - @classmethod - def from_pretrained_models( - cls, - asr_model_path: Union[str, Path], - tts_model_path: Union[str, Path], - enhancer_model_path: Optional[Union[str, Path]] = None, - asr_model_fuse_bn: bool = False, - cfg: Optional[DictConfig] = None, - trainer: 
Optional[Trainer] = None, - ): - """ - Load model from pretrained ASR and TTS models - Args: - asr_model_path: path to .nemo ASR model checkpoint - tts_model_path: path to .nemo TTS model checkpoint - enhancer_model_path: path to .nemo enhancer model checkpoint - asr_model_fuse_bn: automatically fuse batchnorm layers in ASR model - cfg: optional config for hybrid model - trainer: Pytorch-Lightning trainer - - Returns: - ASRWithTTSModel instance - """ - if cfg is None: - cfg = DictConfig( - dict( - asr_model_path=f"{asr_model_path}", - asr_model=None, - tts_model_path=f"{tts_model_path}", - tts_model=None, - enhancer_model_path=f"{enhancer_model_path}" if enhancer_model_path is not None else None, - enhancer_model=None, - asr_model_type=None, - asr_model_fuse_bn=asr_model_fuse_bn, - train_ds=None, - validation_ds=None, - test_ds=None, - optim=None, - ) - ) - else: - cfg = copy.deepcopy(cfg) # copy to avoid modifying original config - cfg.tts_model_path = f"{tts_model_path}" - cfg.asr_model_path = f"{asr_model_path}" - cfg.enhancer_model_path = f"{enhancer_model_path}" if enhancer_model_path is not None else None - return ASRWithTTSModel(cfg, trainer=trainer) - - def __setattr__(self, name, value): - # pytorch-lightning magic, allows to call *_step on asr_model - if name == "_current_fx_name" and self._full_init_guard: - self.asr_model._current_fx_name = value # need to make logging inside asr_model work - return super().__setattr__(name, value) - - def setup_optimization( - self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, - ): - """ - Setup optimizer and scheduler. Ensure tts model is frozen. - Add optimizer and scheduler to asr model, to allow `train_step` on ASR model - """ - self.tts_model.freeze() - optimizer, scheduler = super().setup_optimization(optim_config=optim_config, optim_kwargs=optim_kwargs) - # set ASR model optimizer/scheduler to allow training_step on asr_model - self.asr_model._optimizer = optimizer - self.asr_model._scheduler = scheduler - return optimizer, scheduler - - def setup_validation_data(self, val_data_config: Union[DictConfig, Dict]): - """Setup validation data for ASR model""" - return self.asr_model.setup_validation_data(val_data_config) - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - """Validation epoch end hook for ASR model""" - return self.asr_model.multi_validation_epoch_end(outputs=outputs, dataloader_idx=dataloader_idx) - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - """Test epoch end hook for ASR model""" - return self.asr_model.multi_test_epoch_end(outputs=outputs, dataloader_idx=dataloader_idx) - - def transcribe(self, paths2audio_files: List[str], batch_size: int = 4, verbose: bool = True) -> List[str]: - """Transcribe audio data using ASR model""" - return self.asr_model.transcribe(paths2audio_files=paths2audio_files, batch_size=batch_size, verbose=verbose) - - def setup_multiple_validation_data(self, val_data_config: Union[DictConfig, Dict]): - """Setup multiple validation data for ASR model""" - self.asr_model.setup_multiple_validation_data(val_data_config) - - def setup_test_data(self, test_data_config: Union[DictConfig, Dict]): - """Setup test data for ASR model""" - self.asr_model.setup_test_data(test_data_config) - - def setup_multiple_test_data(self, test_data_config: Union[DictConfig, Dict]): - """Setup multiple test data for ASR Model""" - return self.asr_model.setup_multiple_test_data(test_data_config) - - def 
save_asr_model_to(self, save_path: str): - """Save ASR model separately""" - return self.asr_model.save_to(save_path=save_path) - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - """Validation step, forward to ASR model""" - loss = self.asr_model.validation_step(batch=batch, batch_idx=batch_idx, dataloader_idx=dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(loss) - else: - self.validation_step_outputs.append(loss) - return loss - - def on_validation_epoch_end(self) -> Optional[Dict[str, Dict[str, torch.Tensor]]]: - """Validation epoch end hook, forward to ASR model""" - return self.asr_model.on_validation_epoch_end() - - def on_test_epoch_end(self) -> Optional[Dict[str, Dict[str, torch.Tensor]]]: - """Test epoch end hook, forward to ASR model""" - return self.asr_model.on_test_epoch_end() - - def val_dataloader(self): - """Get valudation dataloader from ASR model""" - return self.asr_model.val_dataloader() - - def unfreeze(self) -> None: - """Unfreeze the ASR model, keep TTS model frozen.""" - super().unfreeze() - self.tts_model.freeze() # tts model should be always frozen - - def on_fit_start(self): - """Call asr_model on_fit_start hook, ensure TTS model is frozen""" - self.asr_model.on_fit_start() - self.tts_model.freeze() - - def train(self, mode: bool = True): - """Train mode, ensure TTS model is frozen""" - super().train(mode) - self.tts_model.eval() - return self - - def _get_tts_spectrogram( - self, tts_texts: torch.Tensor, speakers: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Get TTS spectrogram from text and speaker ids""" - with torch.no_grad(): - spectrogram, spectrogram_len, *_ = self.tts_model(text=tts_texts, durs=None, pitch=None, speaker=speakers) - if self.enhancer_model is not None: - # apply enhancer - with typecheck.disable_checks(): - # spectrogram_len are of TokenDurationType, enhancer requires LengthsType - # TODO: fix FastPitch model to return LengthsType - spectrogram = self.enhancer_model.forward(input_spectrograms=spectrogram, lengths=spectrogram_len) - spectrogram, *_ = normalize_batch(spectrogram, spectrogram_len, self.asr_model.cfg.preprocessor.normalize) - return spectrogram, spectrogram_len - - def _get_batch_spect(self, batch: Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]): - """Get batch with spectrograms from text-only, audio-text or mixed batch data""" - if isinstance(batch, TextToTextBatch): - spectrogram, spectrogram_len = self._get_tts_spectrogram(batch.tts_texts, batch.speakers) - transcript = batch.transcripts - transcript_len = batch.transcript_lengths - elif isinstance(batch, TextOrAudioToTextBatch): - tts_spectrogram, tts_spectrogram_len = self._get_tts_spectrogram(batch.tts_texts, batch.speakers) - asr_spectrogram, asr_spectrogram_len = self.asr_model.preprocessor( - input_signal=batch.audio_signals, length=batch.audio_signal_lengths, - ) - - spectrogram = pad_sequence( - [ - x.squeeze(0) - for x in itertools.chain( - torch.tensor_split(tts_spectrogram.transpose(1, 2), tts_spectrogram.size(0)), - torch.tensor_split(asr_spectrogram.transpose(1, 2), asr_spectrogram.size(0)), - ) - ], - batch_first=True, - padding_value=0.0, - ).transpose(1, 2) - spectrogram_len = torch.cat([tts_spectrogram_len, asr_spectrogram_len], dim=0) - - transcript = batch.transcripts - transcript_len = batch.transcript_lengths - else: - audio_signal, audio_signal_len, transcript, transcript_len, *_ = batch # audio batch: 4 
or 5 elements - spectrogram, spectrogram_len = self.asr_model.preprocessor( - input_signal=audio_signal, length=audio_signal_len - ) - spectrogram = clean_spectrogram_batch(spectrogram, spectrogram_len) - return spectrogram.detach(), spectrogram_len.detach(), transcript, transcript_len - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Setup training data from config: text-only, audio-text or mixed data. - """ - if train_data_config is None: - logging.warning("No training data") - return - - self._update_dataset_config(dataset_name='train', config=train_data_config) - asr_dataset = get_audio_to_text_bpe_dataset_from_config( - train_data_config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.asr_model.tokenizer, - preprocessor_cfg=self.asr_model.cfg.get("preprocessor", None), - ) - - dataset_iterable = True - if asr_dataset is not None and isinstance(asr_dataset, Dataset): - # asr_dataset is map-style, for mixing datasets use map-style text-to-text dataset - dataset_iterable = False - if train_data_config.get("text_data") is not None: - tts_dataset = self._setup_text_dataset_from_config(train_data_config, iterable=dataset_iterable) - else: - tts_dataset = None - - if tts_dataset and asr_dataset: - text_data_config: TextDataConfig = cast( - TextDataConfig, OmegaConf.merge(OmegaConf.structured(TextDataConfig), train_data_config.text_data) - ) - concat_kwargs = dict() - if text_data_config.asr_tts_sampling_technique is not None: - concat_kwargs["sampling_technique"] = text_data_config.asr_tts_sampling_technique - if text_data_config.asr_tts_sampling_temperature is not None: - concat_kwargs["sampling_temperature"] = text_data_config.asr_tts_sampling_temperature - if text_data_config.asr_tts_sampling_probabilities: - concat_kwargs["sampling_probabilities"] = text_data_config.asr_tts_sampling_probabilities - - if dataset_iterable: - dataset = ConcatDataset(datasets=[asr_dataset, tts_dataset], **concat_kwargs) - else: - dataset = ConcatMapDataset(datasets=[asr_dataset, tts_dataset], **concat_kwargs) - else: - dataset = tts_dataset or asr_dataset - - if dataset is None: - return - - if tts_dataset: - collate_fn = tts_dataset.collate_fn - else: - if hasattr(asr_dataset, 'collate_fn'): - collate_fn = asr_dataset.collate_fn - elif hasattr(asr_dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = asr_dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = asr_dataset.datasets[0].datasets[0].collate_fn - - shuffle = train_data_config.get("shuffle", True) and not dataset_iterable - self._train_dl = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=train_data_config['batch_size'], - collate_fn=collate_fn, - drop_last=train_data_config.get('drop_last', False), - shuffle=shuffle, - num_workers=train_data_config.get('num_workers', 0), - pin_memory=train_data_config.get('pin_memory', False), - ) - - def _setup_text_dataset_from_config( - self, train_data_config: DictConfig, iterable=True - ) -> Union[TextToTextDataset, TextToTextIterableDataset]: - """ - Construct text-to-text (text-only) dataset from config. 
- - Args: - train_data_config: config - iterable: construct iterable-style datasset if True, otherwise map-style - - Returns: - text-to-text dataset of TextToTextDataset or TextToTextIterableDataset type - """ - text_data_config: TextDataConfig = cast( - TextDataConfig, OmegaConf.merge(OmegaConf.structured(TextDataConfig), train_data_config.text_data) - ) - if iterable: - textonly_ds = TextToTextIterableDataset( - manifest_filepath=text_data_config.manifest_filepath, - speakers_filepath=text_data_config.speakers_filepath, - asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.get("use_start_end_token", False), - tts_parser=self.tts_model.parser, - tts_text_pad_id=self.tts_model.vocab.pad, - tts_text_normalizer=self.tts_model.normalizer, - tts_text_normalizer_call_kwargs=self.tts_model.text_normalizer_call_kwargs, - min_words=text_data_config.min_words, - max_words=text_data_config.max_words, - tokenizer_workers=text_data_config.tokenizer_workers, - num_parts=self.world_size, - current_part_index=self.global_rank, - ) - else: - textonly_ds = TextToTextDataset( - manifest_filepath=text_data_config.manifest_filepath, - speakers_filepath=text_data_config.speakers_filepath, - asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.get("use_start_end_token", False), - tts_parser=self.tts_model.parser, - tts_text_pad_id=self.tts_model.vocab.pad, - tts_text_normalizer=self.tts_model.normalizer, - tts_text_normalizer_call_kwargs=self.tts_model.text_normalizer_call_kwargs, - min_words=text_data_config.min_words, - max_words=text_data_config.max_words, - tokenizer_workers=text_data_config.tokenizer_workers, - ) - return textonly_ds - - def training_step(self, batch: Union[TextOrAudioToTextBatch, TextToTextBatch, DALIOutputs, tuple], batch_nb: int): - """ - Training step for ASR-TTS model. - - construct spectrogram for the batch (from text - using TTS model, from audio - using ASR preprocessor) - - call training_step on ASR model - """ - assert not self.tts_model.training - if isinstance(batch, DALIOutputs): - return self.asr_model.training_step(batch=batch, batch_nb=batch_nb) - with torch.no_grad(): - spectrogram, spectrogram_len, transcript, transcript_len = self._get_batch_spect(batch) - # TODO: maybe support precomputed without DALIOutputs - return self.asr_model.training_step( - batch=DALIOutputs( - dict( - processed_signal=spectrogram, - processed_signal_len=spectrogram_len, - transcript=transcript, - transcript_len=transcript_len, - ) - ), - batch_nb=batch_nb, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py deleted file mode 100644 index 789173bcca2aa8af68911b7f5f8a20258f3a3b84..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ /dev/null @@ -1,582 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEWER, RNNTBPEDecoding, RNNTBPEDecodingConfig -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - - -class EncDecHybridRNNTCTCBPEModel(EncDecHybridRNNTCTCModel, ASRBPEMixin): - """Base class for encoder decoder RNNT-based models with auxiliary CTC decoder/loss and subword tokenization.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - # Tokenizer is necessary for this model - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - if not isinstance(cfg, DictConfig): - cfg = OmegaConf.create(cfg) - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - cfg.labels = ListConfig(list(vocabulary)) - - with open_dict(cfg.decoder): - cfg.decoder.vocab_size = len(vocabulary) - - with open_dict(cfg.joint): - cfg.joint.num_classes = len(vocabulary) - cfg.joint.vocabulary = ListConfig(list(vocabulary)) - cfg.joint.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden - cfg.joint.jointnet.pred_hidden = cfg.model_defaults.pred_hidden - - # setup auxiliary CTC decoder - if 'aux_ctc' not in cfg: - raise ValueError( - "The config need to have a section for the CTC decoder named as aux_ctc for Hybrid models." 
- ) - - with open_dict(cfg): - if self.tokenizer_type == "agg": - cfg.aux_ctc.decoder.vocabulary = ListConfig(vocabulary) - else: - cfg.aux_ctc.decoder.vocabulary = ListConfig(list(vocabulary.keys())) - - if cfg.aux_ctc.decoder["num_classes"] < 1: - logging.info( - "\nReplacing placholder number of classes ({}) with actual number of classes - {}".format( - cfg.aux_ctc.decoder["num_classes"], len(vocabulary) - ) - ) - cfg.aux_ctc.decoder["num_classes"] = len(vocabulary) - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding object - self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - # Setup wer object - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self.cfg.get('use_cer', False), - log_prediction=self.cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer: - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Setup CTC decoding - ctc_decoding_cfg = self.cfg.aux_ctc.get('decoding', None) - if ctc_decoding_cfg is None: - ctc_decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) - - # Setup CTC WER - self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self.cfg.get("log_prediction", False), - ) - - # setting the RNNT decoder as the default one - self.cur_decoder = "rnnt" - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToBPEDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. 
- num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given audio file(s). - """ - - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ctc_decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - ctc_decoding_cfg: A config for auxiliary CTC decoding, which is optional and can be used to change the decoding type. - - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. 
But I got: {new_tokenizer_dir}' - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - if self.tokenizer_type == "agg": - new_joint_config["vocabulary"] = ListConfig(vocabulary) - else: - new_joint_config["vocabulary"] = ListConfig(list(vocabulary.keys())) - - new_joint_config['num_classes'] = len(vocabulary) - del self.joint - self.joint = EncDecHybridRNNTCTCBPEModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(vocabulary) - del self.decoder - self.decoder = EncDecHybridRNNTCTCBPEModel.from_config_dict(new_decoder_config) - - del self.loss - self.loss = RNNTLoss(num_classes=self.joint.num_classes_with_blank - 1) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed tokenizer of the RNNT decoder to {self.joint.vocabulary} vocabulary.") - - # set up the new tokenizer for the CTC decoder - if hasattr(self, 'ctc_decoder'): - ctc_decoder_config = copy.deepcopy(self.ctc_decoder.to_config_dict()) - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - ctc_decoder_config.vocabulary = ListConfig(vocabulary) - else: - ctc_decoder_config.vocabulary = ListConfig(list(vocabulary.keys())) - - decoder_num_classes = ctc_decoder_config['num_classes'] - # Override number of classes if placeholder provided - logging.info( - "\nReplacing old number of classes ({}) with new number of classes - {}".format( - decoder_num_classes, len(vocabulary) - ) - ) - ctc_decoder_config['num_classes'] = len(vocabulary) - - del self.ctc_decoder - self.ctc_decoder = EncDecHybridRNNTCTCBPEModel.from_config_dict(ctc_decoder_config) - del self.ctc_loss - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - if ctc_decoding_cfg is None: - # Assume same decoding config as before - 
ctc_decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - ctc_decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) - ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=ctc_decoding_cfg, tokenizer=self.tokenizer) - - self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - log_prediction=self.cfg.get("log_prediction", False), - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoder = ctc_decoder_config - - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - logging.info(f"Changed tokenizer of the CTC decoder to {self.ctc_decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type: str = None): - """ - Changes decoding strategy used during RNNT decoding process. - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - decoder_type: (str) Can be set to 'rnnt' or 'ctc' to switch between appropriate decoder in a - model having both RNN-T and CTC decoders. Defaults to None, in which case RNN-T decoder is - used. If set to 'ctc', it raises error if 'ctc_decoder' is not an attribute of the model. - """ - if decoder_type is None or decoder_type == 'rnnt': - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - self.joint.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy of the RNNT decoder to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - elif decoder_type == 'ctc': - if not hasattr(self, 'ctc_decoding'): - raise ValueError("The model does not have the ctc_decoding module and does not support ctc decoding.") - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.ctc_decoding = 
CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) - - self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - self.ctc_decoder.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.aux_ctc.decoding): - self.cfg.aux_ctc.decoding = decoding_cfg - - self.cur_decoder = "ctc" - logging.info( - f"Changed decoding strategy of the CTC decoder to \n{OmegaConf.to_yaml(self.cfg.aux_ctc.decoding)}" - ) - else: - raise ValueError(f"decoder_type={decoder_type} is not supported. Supported values: [ctc,rnnt]") - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_en_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_de_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_fastconformer_hybrid_large_pc/versions/1.20.0/files/stt_it_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_es_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hr_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_fastconformer_hybrid_large_pc/versions/1.21.0/files/FastConformer-Hybrid-Transducer-CTC-BPE-v256-averaged.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ua_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ua_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ua_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_ua_fastconformer_hybrid_large_pc.nemo", - ) - 
results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_pl_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_pl_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_by_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_by_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_by_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_by_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_ru_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_fr_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_multilingual_fastconformer_hybrid_large_pc_blend_eu", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_multilingual_fastconformer_hybrid_large_pc_blend_eu", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu/versions/1.21.0/files/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_multilingual_fastconformer_hybrid_large_pc", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_multilingual_fastconformer_hybrid_large_pc", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_multilingual_fastconformer_hybrid_large_pc/versions/1.21.0/files/stt_multilingual_fastconformer_hybrid_large_pc.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_80ms", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_80ms", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_80ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_80ms.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_480ms", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_480ms", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_480ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_480ms.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_1040ms", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_1040ms", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_1040ms/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_1040ms.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_hybrid_large_streaming_multi", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_streaming_multi", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_streaming_multi/versions/1.20.0/files/stt_en_fastconformer_hybrid_large_streaming_multi.nemo", - ) - results.append(model) - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py deleted file mode 100644 index c3f5e5803d1142fa9b25c972a8805fe583eb1664..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ /dev/null @@ -1,683 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import json -import os -import tempfile -from typing import List, Optional - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig -from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin, InterCTCMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.core.classes.common import PretrainedModelInfo -from nemo.core.classes.mixins import AccessMixin -from nemo.utils import logging, model_utils - - -class EncDecHybridRNNTCTCModel(EncDecRNNTModel, ASRBPEMixin, InterCTCMixin): - """Base class for hybrid RNNT/CTC models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - super().__init__(cfg=cfg, trainer=trainer) - - if 'aux_ctc' not in self.cfg: - raise ValueError( - "The config need to have a section for the CTC decoder named as aux_ctc for Hybrid models." 
- ) - with open_dict(self.cfg.aux_ctc): - if "feat_in" not in self.cfg.aux_ctc.decoder or ( - not self.cfg.aux_ctc.decoder.feat_in and hasattr(self.encoder, '_feat_out') - ): - self.cfg.aux_ctc.decoder.feat_in = self.encoder._feat_out - if "feat_in" not in self.cfg.aux_ctc.decoder or not self.cfg.aux_ctc.decoder.feat_in: - raise ValueError("param feat_in of the decoder's config is not set!") - - if self.cfg.aux_ctc.decoder.num_classes < 1 and self.cfg.aux_ctc.decoder.vocabulary is not None: - logging.info( - "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( - self.cfg.aux_ctc.decoder.num_classes, len(self.cfg.aux_ctc.decoder.vocabulary) - ) - ) - self.cfg.aux_ctc.decoder["num_classes"] = len(self.cfg.aux_ctc.decoder.vocabulary) - - self.ctc_decoder = EncDecRNNTModel.from_config_dict(self.cfg.aux_ctc.decoder) - self.ctc_loss_weight = self.cfg.aux_ctc.get("ctc_loss_weight", 0.5) - - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - ctc_decoding_cfg = self.cfg.aux_ctc.get('decoding', None) - if ctc_decoding_cfg is None: - ctc_decoding_cfg = OmegaConf.structured(CTCDecodingConfig) - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - self.ctc_decoding = CTCDecoding(self.cfg.aux_ctc.decoding, vocabulary=self.ctc_decoder.vocabulary) - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self.cfg.get("log_prediction", False), - ) - - # setting the RNNT decoder as the default one - self.cur_decoder = "rnnt" - - # setting up interCTC loss (from InterCTCMixin) - self.setup_interctc(decoder_name='ctc_decoder', loss_name='ctc_loss', wer_name='ctc_wer') - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - return_hypotheses: bool = False, - partial_hypothesis: Optional[List['Hypothesis']] = None, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - verbose: bool = True, - logprobs: bool = False, - ) -> (List[str], Optional[List['Hypothesis']]): - """ - Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. - - Args: - - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. 
- verbose: (bool) whether to display tqdm progress bar - logprobs: (bool) whether to return ctc logits insted of hypotheses - - Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. - """ - if self.cur_decoder not in ["ctc", "rnnt"]: - raise ValueError( - f"{self.cur_decoder} is not supported for cur_decoder. Supported values are ['ctc', 'rnnt']" - ) - if self.cur_decoder == "rnnt": - return super().transcribe( - paths2audio_files=paths2audio_files, - batch_size=batch_size, - return_hypotheses=return_hypotheses, - partial_hypothesis=partial_hypothesis, - num_workers=num_workers, - channel_selector=channel_selector, - augmentor=augmentor, - verbose=verbose, - ) - - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - dither_value = self.preprocessor.featurizer.dither - pad_to_value = self.preprocessor.featurizer.pad_to - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - try: - self.preprocessor.featurizer.dither = 0.0 - self.preprocessor.featurizer.pad_to = 0 - - # Switch model to evaluation mode - self.eval() - # Freeze the encoder and decoder modules - self.encoder.freeze() - self.decoder.freeze() - self.joint.freeze() - if hasattr(self, 'ctc_decoder'): - self.ctc_decoder.freeze() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2audio_files': paths2audio_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = self._setup_transcribe_dataloader(config) - logits_list = [] - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): - encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - logits = self.ctc_decoder(encoder_output=encoded) - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( - logits, encoded_len, return_hypotheses=return_hypotheses, - ) - logits = logits.cpu() - - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - best_hyp[idx].y_sequence = logits[idx][: encoded_len[idx]] - if best_hyp[idx].alignments is None: - best_hyp[idx].alignments = best_hyp[idx].y_sequence - if logprobs: - for logit, elen in zip(logits, encoded_len): - logits_list.append(logit[:elen]) - del logits - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - del encoded - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - self.preprocessor.featurizer.dither = dither_value - self.preprocessor.featurizer.pad_to = pad_to_value - - logging.set_verbosity(logging_level) - if mode is True: - self.encoder.unfreeze() - self.decoder.unfreeze() - self.joint.unfreeze() - if 
hasattr(self, 'ctc_decoder'): - self.ctc_decoder.unfreeze() - if logprobs: - return logits_list - else: - return hypotheses, all_hypotheses - - def change_vocabulary( - self, - new_vocabulary: List[str], - decoding_cfg: Optional[DictConfig] = None, - ctc_decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - ctc_decoding_cfg: A config for CTC decoding, which is optional and can be used to change decoding type. - - Returns: None - - """ - super().change_vocabulary(new_vocabulary=new_vocabulary, decoding_cfg=decoding_cfg) - - # set up the new tokenizer for the CTC decoder - if hasattr(self, 'ctc_decoder'): - if self.ctc_decoder.vocabulary == new_vocabulary: - logging.warning( - f"Old {self.ctc_decoder.vocabulary} and new {new_vocabulary} match. Not changing anything." - ) - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. But I got: {new_vocabulary}') - decoder_config = self.ctc_decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config['vocabulary'] = new_vocabulary - new_decoder_config['num_classes'] = len(new_vocabulary) - - del self.ctc_decoder - self.ctc_decoder = EncDecHybridRNNTCTCModel.from_config_dict(new_decoder_config) - del self.ctc_loss - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - if ctc_decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `ctc_decoding_cfg` passed when changing decoding strategy, using internal config") - ctc_decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - ctc_decoding_cls = OmegaConf.structured(CTCDecodingConfig) - ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) - ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - - self.ctc_decoding = CTCDecoding(decoding_cfg=ctc_decoding_cfg, vocabulary=self.ctc_decoder.vocabulary) - - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoder = new_decoder_config - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary) - - logging.info(f"Changed the tokenizer of the CTC decoder to {self.ctc_decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type: str = None): - """ - Changes 
decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - decoder_type: (str) Can be set to 'rnnt' or 'ctc' to switch between appropriate decoder in a - model having RNN-T and CTC decoders. Defaults to None, in which case RNN-T decoder is - used. If set to 'ctc', it raises error if 'ctc_decoder' is not an attribute of the model. - """ - if decoder_type is None or decoder_type == 'rnnt': - self.cur_decoder = "rnnt" - return super().change_decoding_strategy(decoding_cfg=decoding_cfg) - - assert decoder_type == 'ctc' and hasattr(self, 'ctc_decoder') - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.ctc_decoding = CTCDecoding(decoding_cfg=decoding_cfg, vocabulary=self.ctc_decoder.vocabulary) - - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - self.ctc_decoder.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = decoding_cfg - - self.cur_decoder = "ctc" - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.aux_ctc.decoding)}") - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - # During training, loss must be computed, so decoder forward is necessary - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - sample_id = self._trainer.global_step - else: - log_every_n_steps = 1 - sample_id = batch_nb - - if (sample_id + 1) % log_every_n_steps == 0: - compute_wer = True - else: - compute_wer = False - - # If fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - tensorboard_logs = { - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if compute_wer: - self.wer.update(encoded, encoded_len, transcript, transcript_len) - _, scores, 
words = self.wer.compute() - self.wer.reset() - tensorboard_logs.update({'training_batch_wer': scores.float() / words}) - - else: # If fused Joint-Loss-WER is used - # Fused joint step - loss_value, wer, _, _ = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoder, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=transcript_len, - compute_wer=compute_wer, - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - tensorboard_logs = { - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if compute_wer: - tensorboard_logs.update({'training_batch_wer': wer}) - - if self.ctc_loss_weight > 0: - log_probs = self.ctc_decoder(encoder_output=encoded) - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - tensorboard_logs['train_rnnt_loss'] = loss_value - tensorboard_logs['train_ctc_loss'] = ctc_loss - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - if compute_wer: - self.ctc_wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - ctc_wer, _, _ = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs.update({'training_batch_wer_ctc': ctc_wer}) - - # note that we want to apply interctc independent of whether main ctc - # loss is used or not (to allow rnnt + interctc training). - # assuming ``ctc_loss_weight=0.3`` and interctc is applied to a single - # layer with weight of ``0.1``, the total loss will be - # ``loss = 0.9 * (0.3 * ctc_loss + 0.7 * rnnt_loss) + 0.1 * interctc_loss`` - loss_value, additional_logs = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=compute_wer - ) - tensorboard_logs.update(additional_logs) - tensorboard_logs['train_loss'] = loss_value - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - # Log items - self.log_dict(tensorboard_logs) - - # Preserve batch acoustic model T and language model U parameters if normalizing - if self._optim_normalize_joint_txu: - self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] - - return {'loss': loss_value} - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - # TODO: add support for CTC decoding - signal, signal_len, transcript, transcript_len, sample_id = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, best_hyp_text)) - - def validation_pass(self, batch, batch_idx, dataloader_idx): - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: 
- encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - tensorboard_logs = {} - loss_value = None - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - if self.compute_eval_loss: - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - tensorboard_logs['val_loss'] = loss_value - - self.wer.update(encoded, encoded_len, transcript, transcript_len) - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - else: - # If experimental fused Joint-Loss-WER is used - compute_wer = True - - if self.compute_eval_loss: - decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) - else: - decoded = None - target_len = transcript_len - - # Fused joint step - loss_value, wer, wer_num, wer_denom = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoded, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=target_len, - compute_wer=compute_wer, - ) - if loss_value is not None: - tensorboard_logs['val_loss'] = loss_value - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - log_probs = self.ctc_decoder(encoder_output=encoded) - if self.compute_eval_loss: - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - tensorboard_logs['val_ctc_loss'] = ctc_loss - tensorboard_logs['val_rnnt_loss'] = loss_value - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - tensorboard_logs['val_loss'] = loss_value - self.ctc_wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, - ) - ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs['val_wer_num_ctc'] = ctc_wer_num - tensorboard_logs['val_wer_denom_ctc'] = ctc_wer_denom - tensorboard_logs['val_wer_ctc'] = ctc_wer - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - loss_value, additional_logs = self.add_interctc_losses( - loss_value, - transcript, - transcript_len, - compute_wer=True, - compute_loss=self.compute_eval_loss, - log_wer_num_denom=True, - log_prefix="val_", - ) - if self.compute_eval_loss: - # overriding total loss value. 
Note that the previous - # rnnt + ctc loss is available in metrics as "val_final_loss" now - tensorboard_logs['val_loss'] = loss_value - tensorboard_logs.update(additional_logs) - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - return tensorboard_logs - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - tensorboard_logs = self.validation_pass(batch, batch_idx, dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(tensorboard_logs) - else: - self.validation_step_outputs.append(tensorboard_logs) - - return tensorboard_logs - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_loss_log = {'val_loss': val_loss_mean} - else: - val_loss_log = {} - wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**val_loss_log, 'val_wer': wer_num.float() / wer_denom} - if self.ctc_loss_weight > 0: - ctc_wer_num = torch.stack([x['val_wer_num_ctc'] for x in outputs]).sum() - ctc_wer_denom = torch.stack([x['val_wer_denom_ctc'] for x in outputs]).sum() - tensorboard_logs['val_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - metrics = {**val_loss_log, 'log': tensorboard_logs} - self.finalize_interctc_metrics(metrics, outputs, prefix="val_") - return metrics - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - test_loss_log = {'test_loss': test_loss_mean} - else: - test_loss_log = {} - wer_num = torch.stack([x['test_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**test_loss_log, 'test_wer': wer_num.float() / wer_denom} - - if self.ctc_loss_weight > 0: - ctc_wer_num = torch.stack([x['test_wer_num_ctc'] for x in outputs]).sum() - ctc_wer_denom = torch.stack([x['test_wer_denom_ctc'] for x in outputs]).sum() - tensorboard_logs['test_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - - metrics = {**test_loss_log, 'log': tensorboard_logs} - self.finalize_interctc_metrics(metrics, outputs, prefix="test_") - return metrics - - # EncDecRNNTModel is exported in 2 parts - def list_export_subnets(self): - if self.cur_decoder == 'rnnt': - return ['encoder', 'decoder_joint'] - else: - return ['self'] - - @property - def output_module(self): - if self.cur_decoder == 'rnnt': - return self.decoder - else: - return self.ctc_decoder - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
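# The multi_validation_epoch_end / multi_test_epoch_end methods above pool WER by
# summing per-batch numerators and denominators before dividing, rather than averaging
# per-batch WER values. A tiny self-contained sketch with made-up batch statistics:
import torch

wer_num = torch.tensor([3.0, 5.0, 2.0])        # word errors counted in each batch
wer_denom = torch.tensor([40.0, 55.0, 30.0])   # reference words in each batch

pooled_wer = wer_num.sum() / wer_denom.sum()   # 10 / 125 = 0.080
per_batch_mean = (wer_num / wer_denom).mean()  # ~0.078, a differently weighted quantity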
- """ - results = [] - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_aligner_model.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_aligner_model.py deleted file mode 100644 index 54d342df40e42146f1b4b05e494cc1681e9366eb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_aligner_model.py +++ /dev/null @@ -1,616 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from tqdm.auto import tqdm - -from nemo.collections.asr.data.audio_to_ctm_dataset import FrameCtmUnit -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.models.asr_model import ASRModel -from nemo.utils import logging - - -class AlignerWrapperModel(ASRModel): - """ASR model wrapper to perform alignment building. - Functionality is limited to the components needed to build an alignment.""" - - def __init__(self, model: ASRModel, cfg: DictConfig): - model_cfg = model.cfg - for ds in ("train_ds", "validation_ds", "test_ds"): - if ds in model_cfg: - model_cfg[ds] = None - super().__init__(cfg=model_cfg, trainer=model.trainer) - self._model = model - self.alignment_type = cfg.get("alignment_type", "forced") - self.word_output = cfg.get("word_output", True) - self.cpu_decoding = cfg.get("cpu_decoding", False) - self.decode_batch_size = cfg.get("decode_batch_size", 0) - - # list possible alignment types here for future work - if self.alignment_type == "forced": - pass - elif self.alignment_type == "argmax": - pass - elif self.alignment_type == "loose": - raise NotImplementedError(f"alignment_type=`{self.alignment_type}` is not supported at the moment.") - elif self.alignment_type == "rnnt_decoding_aux": - raise NotImplementedError(f"alignment_type=`{self.alignment_type}` is not supported at the moment.") - else: - raise RuntimeError(f"Unsupported alignment type: {self.alignment_type}") - - self._init_model_specific(cfg) - - def _init_ctc_alignment_specific(self, cfg: DictConfig): - """Part of __init__ intended to initialize attributes specific to the alignment type for CTC models. - - This method is not supposed to be called outside of __init__. 
- """ - # do nothing for regular CTC with `argmax` alignment type - if self.alignment_type == "argmax" and not hasattr(self._model, "use_graph_lm"): - return - - from nemo.collections.asr.modules.graph_decoder import ViterbiDecoderWithGraph - - if self.alignment_type == "forced": - if hasattr(self._model, "use_graph_lm"): - if self._model.use_graph_lm: - self.graph_decoder = self._model.transcribe_decoder - self._model.use_graph_lm = False - else: - self.graph_decoder = ViterbiDecoderWithGraph( - num_classes=self.blank_id, backend="k2", dec_type="topo", return_type="1best" - ) - # override split_batch_size - self.graph_decoder.split_batch_size = self.decode_batch_size - else: - self.graph_decoder = ViterbiDecoderWithGraph( - num_classes=self.blank_id, split_batch_size=self.decode_batch_size, - ) - # override decoder args if a config is provided - decoder_module_cfg = cfg.get("decoder_module_cfg", None) - if decoder_module_cfg is not None: - self.graph_decoder._decoder.intersect_pruned = decoder_module_cfg.get("intersect_pruned") - self.graph_decoder._decoder.intersect_conf = decoder_module_cfg.get("intersect_conf") - return - - if self.alignment_type == "argmax": - # we use transcribe_decoder to get topology-independent output - if not self._model.use_graph_lm: - self._model.transcribe_decoder = ViterbiDecoderWithGraph( - num_classes=self.blank_id, backend="k2", dec_type="topo", return_type="1best" - ) - # override decoder args - self._model.transcribe_decoder.return_ilabels = False - self._model.transcribe_decoder.output_aligned = True - self._model.transcribe_decoder.split_batch_size = self.decode_batch_size - self._model.use_graph_lm = False - return - - def _init_rnnt_alignment_specific(self, cfg: DictConfig): - """Part of __init__ intended to initialize attributes specific to the alignment type for RNNT models. - - This method is not supposed to be called outside of __init__. 
- """ - if self.alignment_type == "argmax": - return - - from nemo.collections.asr.modules.graph_decoder import ViterbiDecoderWithGraph - - if self.alignment_type == "forced": - self.predictor_window_size = cfg.rnnt_cfg.get("predictor_window_size", 0) - self.predictor_step_size = cfg.rnnt_cfg.get("predictor_step_size", 0) - - from nemo.collections.asr.parts.k2.utils import apply_rnnt_prune_ranges, get_uniform_rnnt_prune_ranges - - self.prepare_pruned_outputs = lambda encoder_outputs, encoded_len, decoder_outputs, transcript_len: apply_rnnt_prune_ranges( - encoder_outputs, - decoder_outputs, - get_uniform_rnnt_prune_ranges( - encoded_len, - transcript_len, - self.predictor_window_size + 1, - self.predictor_step_size, - encoder_outputs.size(1), - ).to(device=encoder_outputs.device), - ) - - from nemo.collections.asr.parts.k2.classes import GraphModuleConfig - - self.graph_decoder = ViterbiDecoderWithGraph( - num_classes=self.blank_id, - backend="k2", - dec_type="topo_rnnt_ali", - split_batch_size=self.decode_batch_size, - graph_module_cfg=OmegaConf.structured( - GraphModuleConfig( - topo_type="minimal", - predictor_window_size=self.predictor_window_size, - predictor_step_size=self.predictor_step_size, - ) - ), - ) - # override decoder args if a config is provided - decoder_module_cfg = cfg.get("decoder_module_cfg", None) - if decoder_module_cfg is not None: - self.graph_decoder._decoder.intersect_pruned = decoder_module_cfg.get("intersect_pruned") - self.graph_decoder._decoder.intersect_conf = decoder_module_cfg.get("intersect_conf") - return - - def _init_model_specific(self, cfg: DictConfig): - """Part of __init__ intended to initialize attributes specific to the model type. - - This method is not supposed to be called outside of __init__. - """ - from nemo.collections.asr.models.ctc_models import EncDecCTCModel - - if isinstance(self._model, EncDecCTCModel): - self.model_type = "ctc" - self.blank_id = self._model.decoder.num_classes_with_blank - 1 - self._predict_impl = self._predict_impl_ctc - - prob_suppress_index = cfg.ctc_cfg.get("prob_suppress_index", -1) - prob_suppress_value = cfg.ctc_cfg.get("prob_suppress_value", 1.0) - if prob_suppress_value > 1 or prob_suppress_value <= 0: - raise ValueError(f"Suppression value has to be in (0,1]: {prob_suppress_value}") - if prob_suppress_index < -(self.blank_id + 1) or prob_suppress_index > self.blank_id: - raise ValueError( - f"Suppression index for the provided model has to be in [{-self.blank_id+1},{self.blank_id}]: {prob_suppress_index}" - ) - self.prob_suppress_index = ( - self._model.decoder.num_classes_with_blank + prob_suppress_index - if prob_suppress_index < 0 - else prob_suppress_index - ) - self.prob_suppress_value = prob_suppress_value - - self._init_ctc_alignment_specific(cfg) - return - - from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel - - if isinstance(self._model, EncDecRNNTModel): - self.model_type = "rnnt" - self.blank_id = self._model.joint.num_classes_with_blank - 1 - self.log_softmax = None if self._model.joint.log_softmax is None else not self._model.joint.log_softmax - self._predict_impl = self._predict_impl_rnnt - - decoding_config = copy.deepcopy(self._model.cfg.decoding) - decoding_config.strategy = "greedy_batch" - with open_dict(decoding_config): - decoding_config.preserve_alignments = True - decoding_config.fused_batch_size = -1 - self._model.change_decoding_strategy(decoding_config) - self._init_rnnt_alignment_specific(cfg) - return - - raise RuntimeError(f"Unsupported model type: 
{type(self._model)}") - - def _rnnt_joint_pruned( - self, - encoder_outputs: torch.Tensor, - encoded_len: torch.Tensor, - decoder_outputs: torch.Tensor, - transcript_len: torch.Tensor, - ) -> torch.Tensor: - """A variant of the RNNT Joiner tensor calculation with pruned Encoder and Predictor sum. - Only the uniform pruning is supported at the moment. - """ - encoder_outputs = self._model.joint.enc(encoder_outputs.transpose(1, 2)) # (B, T, H) - decoder_outputs = self._model.joint.pred(decoder_outputs.transpose(1, 2)) # (B, U, H) - - encoder_outputs_pruned, decoder_outputs_pruned = self.prepare_pruned_outputs( - encoder_outputs, encoded_len, decoder_outputs, transcript_len - ) - res = self._model.joint.joint_net(encoder_outputs_pruned + decoder_outputs_pruned) - # copied from model.joint.joint(...) - if self._model.joint.log_softmax is None: - if not res.is_cuda: - res = res.log_softmax(dim=-1) - else: - if self._model.joint.log_softmax: - res = res.log_softmax(dim=-1) - return res - - def _apply_prob_suppress(self, log_probs: torch.Tensor) -> torch.Tensor: - """Multiplies probability of an element with index self.prob_suppress_index by self.prob_suppress_value times - with stochasticity preservation of the log_probs tensor. - - Often used to suppress probability of the output of a CTC model. - - Example: - For - - log_probs = torch.log(torch.tensor([0.015, 0.085, 0.9])) - - self.prob_suppress_index = -1 - - self.prob_suppress_value = 0.5 - the result of _apply_prob_suppress(log_probs) is - - torch.log(torch.tensor([0.0825, 0.4675, 0.45])) - """ - exp_probs = (log_probs).exp() - x = exp_probs[:, :, self.prob_suppress_index] - # we cannot do y=1-x because exp_probs can be not stochastic due to numerical limitations - y = torch.cat( - [exp_probs[:, :, : self.prob_suppress_index], exp_probs[:, :, self.prob_suppress_index + 1 :]], 2 - ).sum(-1) - b1 = torch.full((exp_probs.shape[0], exp_probs.shape[1], 1), self.prob_suppress_value, device=log_probs.device) - b2 = ((1 - self.prob_suppress_value * x) / y).unsqueeze(2).repeat(1, 1, exp_probs.shape[-1] - 1) - return ( - exp_probs * torch.cat([b2[:, :, : self.prob_suppress_index], b1, b2[:, :, self.prob_suppress_index :]], 2) - ).log() - - def _prepare_ctc_argmax_predictions( - self, log_probs: torch.Tensor, encoded_len: torch.Tensor - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - """Obtains argmax predictions with corresponding probabilities. - Replaces consecutive repeated indices in the argmax predictions with the index. 
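# The suppression-and-renormalisation described in the _apply_prob_suppress docstring
# above can be reproduced on plain probabilities (the method itself works in log space);
# a small self-contained sketch using the same numbers as the docstring example:
import torch

probs = torch.tensor([[[0.015, 0.085, 0.9]]])   # (B, T, C), each row sums to 1
suppress_index, suppress_value = -1, 0.5

x = probs[..., suppress_index]                  # mass of the suppressed class
y = probs.sum(-1) - x                           # mass of the remaining classes
scale = (1.0 - suppress_value * x) / y          # rescales the rest so rows still sum to 1

out = probs * scale.unsqueeze(-1)
out[..., suppress_index] = x * suppress_value
print(out)  # tensor([[[0.0825, 0.4675, 0.4500]]]), matching the docstring example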
- """ - if hasattr(self._model, "transcribe_decoder"): - predictions, _, probs = self.transcribe_decoder.forward(log_probs=log_probs, log_probs_length=encoded_len) - else: - greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) - probs_tensor, _ = log_probs.exp().max(dim=-1, keepdim=False) - predictions, probs = [], [] - for i in range(log_probs.shape[0]): - utt_len = encoded_len[i] - probs.append(probs_tensor[i, :utt_len]) - pred_candidate = greedy_predictions[i, :utt_len].cpu() - # replace consecutive tokens with - previous = self.blank_id - for j in range(utt_len): - p = pred_candidate[j] - if p == previous and previous != self.blank_id: - pred_candidate[j] = self.blank_id - previous = p - predictions.append(pred_candidate.to(device=greedy_predictions.device)) - return predictions, probs - - def _predict_impl_rnnt_argmax( - self, - encoded: torch.Tensor, - encoded_len: torch.Tensor, - transcript: torch.Tensor, - transcript_len: torch.Tensor, - sample_id: torch.Tensor, - ) -> List[Tuple[int, 'FrameCtmUnit']]: - """Builds time alignment of an encoded sequence. - This method assumes that the RNNT model is used and the alignment type is `argmax`. - - It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit. - """ - hypotheses = self._model.decoding.rnnt_decoder_predictions_tensor( - encoded, encoded_len, return_hypotheses=True - )[0] - results = [] - for s_id, hypothesis in zip(sample_id, hypotheses): - pred_ids = hypothesis.y_sequence.tolist() - tokens = self._model.decoding.decode_ids_to_tokens(pred_ids) - token_begin = hypothesis.timestep - token_len = [j - i for i, j in zip(token_begin, token_begin[1:] + [len(hypothesis.alignments)])] - # we have no token probabilities for the argmax rnnt setup - token_prob = [1.0] * len(tokens) - if self.word_output: - words = [w for w in self._model.decoding.decode_tokens_to_str(pred_ids).split(" ") if w != ""] - words, word_begin, word_len, word_prob = ( - self._process_tokens_to_words(tokens, token_begin, token_len, token_prob, words) - if hasattr(self._model, "tokenizer") - else self._process_char_with_space_to_words(tokens, token_begin, token_len, token_prob, words) - ) - results.append( - (s_id, [FrameCtmUnit(t, b, l, p) for t, b, l, p in zip(words, word_begin, word_len, word_prob)]) - ) - else: - results.append( - ( - s_id, - [FrameCtmUnit(t, b, l, p) for t, b, l, p in zip(tokens, token_begin, token_len, token_prob)], - ) - ) - return results - - def _process_tokens_to_words( - self, - tokens: List[str], - token_begin: List[int], - token_len: List[int], - token_prob: List[float], - words: List[str], - ) -> Tuple[List[str], List[int], List[int], List[float]]: - """Transforms alignment information from token level to word level. - - Used when self._model.tokenizer is present. 
- """ - # suppose that there are no whitespaces - assert len(self._model.tokenizer.text_to_tokens(words[0])) == len( - self._model.tokenizer.text_to_tokens(words[0] + " ") - ) - word_begin, word_len, word_prob = [], [], [] - token_len_nonzero = [(t_l if t_l > 0 else 1) for t_l in token_len] - i = 0 - for word in words: - loc_tokens = self._model.tokenizer.text_to_tokens(word) - step = len(loc_tokens) - # we assume that an empty word consists of only one token - # drop current token - if step == 0: - token_begin[i + 1] = token_begin[i] - token_len[i + 1] += token_len[i] - token_len_nonzero[i + 1] += token_len_nonzero[i] - del tokens[i], token_begin[i], token_len[i], token_len_nonzero[i], token_prob[i] - continue - # fix tokenization - if step == 2 and loc_tokens[-1] == "??": - step -= 1 - j = i + step - word_begin.append(token_begin[i]) - word_len.append(sum(token_len[i:j])) - denominator = sum(token_len_nonzero[i:j]) - word_prob.append(sum(token_prob[k] * token_len_nonzero[k] for k in range(i, j)) / denominator) - i = j - return words, word_begin, word_len, word_prob - - def _process_char_with_space_to_words( - self, - tokens: List[str], - token_begin: List[int], - token_len: List[int], - token_prob: List[float], - words: List[str], - ) -> Tuple[List[str], List[int], List[int], List[float]]: - """Transforms alignment information from character level to word level. - This method includes separator (typically the space) information in the results. - - Used with character-based models (no self._model.tokenizer). - """ - # suppose that there are no whitespaces anywhere except between words - space_idx = (np.array(tokens) == " ").nonzero()[0].tolist() - assert len(words) == len(space_idx) + 1 - token_len_nonzero = [(t_l if t_l > 0 else 1) for t_l in token_len] - if len(space_idx) == 0: - word_begin = [token_begin[0]] - word_len = [sum(token_len)] - denominator = sum(token_len_nonzero) - word_prob = [sum(t_p * t_l for t_p, t_l in zip(token_prob, token_len_nonzero)) / denominator] - else: - space_word = "[SEP]" - word_begin = [token_begin[0]] - word_len = [sum(token_len[: space_idx[0]])] - denominator = sum(token_len_nonzero[: space_idx[0]]) - word_prob = [sum(token_prob[k] * token_len_nonzero[k] for k in range(space_idx[0])) / denominator] - words_with_space = [words[0]] - for word, i, j in zip(words[1:], space_idx, space_idx[1:] + [len(tokens)]): - # append space - word_begin.append(token_begin[i]) - word_len.append(token_len[i]) - word_prob.append(token_prob[i]) - words_with_space.append(space_word) - # append next word - word_begin.append(token_begin[i + 1]) - word_len.append(sum(token_len[i + 1 : j])) - denominator = sum(token_len_nonzero[i + 1 : j]) - word_prob.append(sum(token_prob[k] * token_len_nonzero[k] for k in range(i + 1, j)) / denominator) - words_with_space.append(word) - words = words_with_space - return words, word_begin, word_len, word_prob - - def _results_to_ctmUnits( - self, s_id: int, pred: torch.Tensor, prob: torch.Tensor - ) -> Tuple[int, List['FrameCtmUnit']]: - """Transforms predictions with probabilities to a list of FrameCtmUnit objects, - containing frame-level alignment information (label, start, duration, probability), for a given sample id. - - Alignment information can be either token-based (char, wordpiece, ...) or word-based. 
- """ - if len(pred) == 0: - return (s_id, []) - - non_blank_idx = (pred != self.blank_id).nonzero(as_tuple=True)[0].cpu() - pred_ids = pred[non_blank_idx].tolist() - prob_list = prob.tolist() - if self.model_type == "rnnt": - wer_module = self._model.decoding - # for rnnt forced alignment we always have num_blanks == num_frames, - # thus len(pred) == num_frames + num_non_blanks - token_begin = non_blank_idx - torch.arange(len(non_blank_idx)) - token_end = torch.cat((token_begin[1:], torch.tensor([len(pred) - len(non_blank_idx)]))) - else: - wer_module = self._model._wer - token_begin = non_blank_idx - token_end = torch.cat((token_begin[1:], torch.tensor([len(pred)]))) - tokens = wer_module.decode_ids_to_tokens(pred_ids) - token_len = (token_end - token_begin).tolist() - token_begin = token_begin.tolist() - token_prob = [ - sum(prob_list[i:j]) / (j - i) - for i, j in zip(non_blank_idx.tolist(), non_blank_idx[1:].tolist() + [len(pred)]) - ] - if self.word_output: - words = wer_module.decode_tokens_to_str(pred_ids).split(" ") - words, word_begin, word_len, word_prob = ( - self._process_tokens_to_words(tokens, token_begin, token_len, token_prob, words) - if hasattr(self._model, "tokenizer") - else self._process_char_with_space_to_words(tokens, token_begin, token_len, token_prob, words) - ) - return s_id, [FrameCtmUnit(t, b, l, p) for t, b, l, p in zip(words, word_begin, word_len, word_prob)] - return s_id, [FrameCtmUnit(t, b, l, p) for t, b, l, p in zip(tokens, token_begin, token_len, token_prob)] - - def _predict_impl_ctc( - self, - encoded: torch.Tensor, - encoded_len: torch.Tensor, - transcript: torch.Tensor, - transcript_len: torch.Tensor, - sample_id: torch.Tensor, - ) -> List[Tuple[int, 'FrameCtmUnit']]: - """Builds time alignment of an encoded sequence. - This method assumes that the CTC model is used. - - It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit. - """ - log_probs = encoded - - if self.prob_suppress_value != 1.0: - log_probs = self._apply_prob_suppress(log_probs) - - if self.alignment_type == "argmax": - predictions, probs = self._prepare_ctc_argmax_predictions(log_probs, encoded_len) - elif self.alignment_type == "forced": - if self.cpu_decoding: - log_probs, encoded_len, transcript, transcript_len = ( - log_probs.cpu(), - encoded_len.cpu(), - transcript.cpu(), - transcript_len.cpu(), - ) - predictions, probs = self.graph_decoder.align(log_probs, encoded_len, transcript, transcript_len) - else: - raise NotImplementedError() - - return [ - self._results_to_ctmUnits(s_id, pred, prob) - for s_id, pred, prob in zip(sample_id.tolist(), predictions, probs) - ] - - def _predict_impl_rnnt( - self, - encoded: torch.Tensor, - encoded_len: torch.Tensor, - transcript: torch.Tensor, - transcript_len: torch.Tensor, - sample_id: torch.Tensor, - ) -> List[Tuple[int, 'FrameCtmUnit']]: - """Builds time alignment of an encoded sequence. - This method assumes that the RNNT model is used. - - It produces a list of sample ids and fours: (label, start_frame, length, probability), called FrameCtmUnit. 
- """ - if self.alignment_type == "argmax": - return self._predict_impl_rnnt_argmax(encoded, encoded_len, transcript, transcript_len, sample_id) - elif self.alignment_type == "forced": - decoded = self._model.decoder(targets=transcript, target_length=transcript_len)[0] - log_probs = ( - self._rnnt_joint_pruned(encoded, encoded_len, decoded, transcript_len) - if self.predictor_window_size > 0 and self.predictor_window_size < transcript_len.max() - else self._model.joint(encoder_outputs=encoded, decoder_outputs=decoded) - ) - apply_log_softmax = True if self.log_softmax is None and encoded.is_cuda else self.log_softmax - if apply_log_softmax: - log_probs = log_probs.log_softmax(dim=-1) - if self.cpu_decoding: - log_probs, encoded_len, transcript, transcript_len = ( - log_probs.cpu(), - encoded_len.cpu(), - transcript.cpu(), - transcript_len.cpu(), - ) - predictions, probs = self.graph_decoder.align(log_probs, encoded_len, transcript, transcript_len) - return [ - self._results_to_ctmUnits(s_id, pred, prob) - for s_id, pred, prob in zip(sample_id.tolist(), predictions, probs) - ] - else: - raise NotImplementedError() - - @torch.no_grad() - def predict_step(self, batch, batch_idx, dataloader_idx=0) -> List[Tuple[int, 'FrameCtmUnit']]: - signal, signal_len, transcript, transcript_len, sample_id = batch - - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self._model.forward(processed_signal=signal, processed_signal_length=signal_len)[:2] - else: - encoded, encoded_len = self._model.forward(input_signal=signal, input_signal_length=signal_len)[:2] - - return self._predict_impl(encoded, encoded_len, transcript, transcript_len, sample_id) - - @torch.no_grad() - def transcribe( - self, manifest: List[str], batch_size: int = 4, num_workers: int = None, verbose: bool = True, - ) -> List['FrameCtmUnit']: - """ - Does alignment. Use this method for debugging and prototyping. - - Args: - - manifest: path to dataset JSON manifest file (in NeMo format). \ - Recommended length per audio file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - num_workers: (int) number of workers for DataLoader - verbose: (bool) whether to display tqdm progress bar - - Returns: - A list of four: (label, start_frame, length, probability), called FrameCtmUnit, \ - in the same order as in the manifest. 
- """ - hypotheses = [] - # Model's mode and device - mode = self._model.training - device = next(self._model.parameters()).device - dither_value = self._model.preprocessor.featurizer.dither - pad_to_value = self._model.preprocessor.featurizer.pad_to - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - try: - self._model.preprocessor.featurizer.dither = 0.0 - self._model.preprocessor.featurizer.pad_to = 0 - - # Switch model to evaluation mode - self._model.eval() - # Freeze the encoder and decoder modules - self._model.encoder.freeze() - self._model.decoder.freeze() - if hasattr(self._model, "joint"): - self._model.joint.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - - config = { - 'manifest_filepath': manifest, - 'batch_size': batch_size, - 'num_workers': num_workers, - } - temporary_datalayer = self._model._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Aligning", disable=not verbose): - test_batch[0] = test_batch[0].to(device) - test_batch[1] = test_batch[1].to(device) - hypotheses += [unit for i, unit in self.predict_step(test_batch, 0)] - del test_batch - finally: - # set mode back to its original value - self._model.train(mode=mode) - self._model.preprocessor.featurizer.dither = dither_value - self._model.preprocessor.featurizer.pad_to = pad_to_value - - logging.set_verbosity(logging_level) - if mode is True: - self._model.encoder.unfreeze() - self._model.decoder.unfreeze() - if hasattr(self._model, "joint"): - self._model.joint.unfreeze() - return hypotheses - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - raise RuntimeError("This module cannot be used in training.") - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - raise RuntimeError("This module cannot be used in validation.") - - def setup_test_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - raise RuntimeError("This module cannot be used in testing.") diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_sequence_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_sequence_models.py deleted file mode 100644 index 087e9e41b85dd8673ac6e2ff667bad355c5e747f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/k2_sequence_models.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional - -from omegaconf import DictConfig -from pytorch_lightning import Trainer - -from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE -from nemo.collections.asr.models.ctc_models import EncDecCTCModel -from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel -from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel -from nemo.collections.asr.parts.k2.classes import ASRK2Mixin -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.utils import logging - - -class EncDecK2SeqModel(EncDecCTCModel, ASRK2Mixin): - """Encoder decoder models with various lattice losses.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - loss_type = cfg.graph_module_cfg.get("loss_type", "ctc") - if loss_type != "ctc" and loss_type != "mmi": - raise ValueError(f"Class {self.__class__.__name__} does not support `loss_type`={loss_type}") - super().__init__(cfg=cfg, trainer=trainer) - self._init_k2() - - @classmethod - def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - pass - - def change_vocabulary(self, new_vocabulary: List[str]): - """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - If new_vocabulary == self.decoder.vocabulary then nothing will be changed. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - - Returns: None - - """ - super().change_vocabulary(new_vocabulary) - - if self.use_graph_lm: - self.token_lm = None - logging.warning( - f"""With .change_vocabulary() call for a model with criterion_type=`{self.loss.criterion_type}`, - a new token_lm has to be set manually: call .update_k2_modules(new_cfg) - or update .graph_module_cfg.backend_cfg.token_lm before calling this method.""" - ) - - self.update_k2_modules(self.graph_module_cfg) - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, - ): - """ - Forward pass of the model. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. - - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- 3) The greedy token predictions of the model of shape [B, T] (via argmax) - """ - log_probs, encoded_len, greedy_predictions = super().forward( - input_signal=input_signal, - input_signal_length=input_signal_length, - processed_signal=processed_signal, - processed_signal_length=processed_signal_length, - ) - return self._forward_k2_post_processing( - log_probs=log_probs, encoded_length=encoded_len, greedy_predictions=greedy_predictions - ) - - -class EncDecK2SeqModelBPE(EncDecCTCModelBPE, ASRK2Mixin): - """Encoder decoder models with Byte Pair Encoding and various lattice losses.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - loss_type = cfg.graph_module_cfg.get("loss_type", "ctc") - if loss_type != "ctc" and loss_type != "mmi": - raise ValueError(f"Class {self.__class__.__name__} does not support `loss_type`={loss_type}") - super().__init__(cfg=cfg, trainer=trainer) - self._init_k2() - - @classmethod - def list_available_models(cls) -> Optional[List[PretrainedModelInfo]]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - pass - - def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str): - """ - Changes vocabulary of the tokenizer used during CTC decoding process. - Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Path to the new tokenizer directory. - new_tokenizer_type: Either `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers, - whereas `wpe` is used for `BertTokenizer`. - - Returns: None - - """ - super().change_vocabulary(new_tokenizer_dir, new_tokenizer_type) - - if self.use_graph_lm: - self.token_lm = None - logging.warning( - f"""With .change_vocabulary() call for a model with criterion_type=`{self.loss.criterion_type}`, - a new token_lm has to be set manually: call .update_k2_modules(new_cfg) - or update .graph_module_cfg.backend_cfg.token_lm before calling this method.""" - ) - - self.update_k2_modules(self.graph_module_cfg) - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, - ): - """ - Forward pass of the model. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. - - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- 3) The greedy token predictions of the model of shape [B, T] (via argmax) - """ - log_probs, encoded_len, greedy_predictions = super().forward( - input_signal=input_signal, - input_signal_length=input_signal_length, - processed_signal=processed_signal, - processed_signal_length=processed_signal_length, - ) - return self._forward_k2_post_processing( - log_probs=log_probs, encoded_length=encoded_len, greedy_predictions=greedy_predictions - ) - - -class EncDecK2RnntSeqModel(EncDecRNNTModel, ASRK2Mixin): - """Encoder decoder models with various lattice losses.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - loss_type = cfg.graph_module_cfg.get("loss_type", "rnnt") - criterion_type = cfg.graph_module_cfg.get("criterion_type", "ml") - if loss_type != "rnnt" or criterion_type != "ml": - raise ValueError( - f"""Class {self.__class__.__name__} does not support - `criterion_type`={criterion_type} with `loss_type`={loss_type}""" - ) - super().__init__(cfg=cfg, trainer=trainer) - self._init_k2() - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - pass - - def change_vocabulary(self, new_vocabulary: List[str]): - """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - If new_vocabulary == self.decoder.vocabulary then nothing will be changed. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - - Returns: None - - """ - super().change_vocabulary(new_vocabulary) - - if self.use_graph_lm: - self.token_lm = None - logging.warning( - f"""With .change_vocabulary() call for a model with criterion_type=`{self.loss.criterion_type}`, - a new token_lm has to be set manually: call .update_k2_modules(new_cfg) - or update .graph_module_cfg.backend_cfg.token_lm before calling this method.""" - ) - - self.update_k2_modules(self.graph_module_cfg) - - -class EncDecK2RnntSeqModelBPE(EncDecRNNTBPEModel, ASRK2Mixin): - """Encoder decoder models with Byte Pair Encoding and various lattice losses.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - loss_type = cfg.graph_module_cfg.get("loss_type", "rnnt") - criterion_type = cfg.graph_module_cfg.get("criterion_type", "ml") - if loss_type != "rnnt" or criterion_type != "ml": - raise ValueError( - f"""Class {self.__class__.__name__} does not support - `criterion_type`={criterion_type} with `loss_type`={loss_type}""" - ) - super().__init__(cfg=cfg, trainer=trainer) - self._init_k2() - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - pass - - def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str): - """ - Changes vocabulary of the tokenizer used during CTC decoding process. - Use this method when fine-tuning on from pre-trained model. 
- This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Path to the new tokenizer directory. - new_tokenizer_type: Either `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers, - whereas `wpe` is used for `BertTokenizer`. - - Returns: None - - """ - super().change_vocabulary(new_tokenizer_dir, new_tokenizer_type) - - if self.use_graph_lm: - self.token_lm = None - logging.warning( - f"""With .change_vocabulary() call for a model with criterion_type=`{self.loss.criterion_type}`, - a new token_lm has to be set manually: call .update_k2_modules(new_cfg) - or update .graph_module_cfg.backend_cfg.token_lm before calling this method.""" - ) - - self.update_k2_modules(self.graph_module_cfg) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/label_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/label_models.py deleted file mode 100644 index 83e57ece59e366215addc99a22ff4538d0ffd6ef..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/label_models.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import copy -import itertools -from math import ceil -from typing import Dict, List, Optional, Union - -import librosa -import numpy as np -import torch -from hydra.utils import instantiate -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from torchmetrics import Accuracy -from tqdm import tqdm - -from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataset, cache_datastore_manifests -from nemo.collections.asr.data.audio_to_label_dataset import ( - get_concat_tarred_speech_label_dataset, - get_tarred_speech_label_dataset, -) -from nemo.collections.asr.data.audio_to_text_dataset import convert_to_config_list -from nemo.collections.asr.models.asr_model import ExportableEncDecModel -from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.metrics import TopKClassificationAccuracy -from nemo.collections.common.parts.preprocessing.collections import ASRSpeechLabel -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import * -from nemo.utils import logging - -__all__ = ['EncDecSpeakerLabelModel'] - - -class EncDecSpeakerLabelModel(ModelPT, ExportableEncDecModel): - """ - Encoder decoder class for speaker label models. - Model class creates training, validation methods for setting up data - performing model forward pass. 
- Expects config dict for - * preprocessor - * Jasper/Quartznet Encoder - * Speaker Decoder - """ - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - Returns: - List of available pre-trained models. - """ - result = [] - - model = PretrainedModelInfo( - pretrained_model_name="speakerverification_speakernet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/speakerverification_speakernet/versions/1.16.0/files/speakerverification_speakernet.nemo", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet", - ) - result.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="ecapa_tdnn", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ecapa_tdnn/versions/1.16.0/files/ecapa_tdnn.nemo", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn", - ) - result.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="titanet_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large", - ) - result.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="langid_ambernet", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/langid_ambernet/versions/1.12.0/files/ambernet.nemo", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/langid_ambernet", - ) - result.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="titanet_small", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_small/versions/1.19.0/files/titanet-s.nemo", - ) - result.append(model) - - return result - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - self.world_size = 1 - self.cal_labels_occurrence_train = False - self.labels_occurrence = None - self.labels = None - - num_classes = cfg.decoder.num_classes - - if 'loss' in cfg: - if 'weight' in cfg.loss: - if cfg.loss.weight == 'auto': - weight = num_classes * [1] - self.cal_labels_occurrence_train = True - else: - weight = cfg.loss.weight - else: - weight = None # weight is None for angular loss and CE loss if it's not specified. 
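# The checkpoints registered in list_available_models() above can normally be pulled by
# name through NeMo's standard from_pretrained mechanism; a hedged sketch, assuming NGC
# access and stock ModelPT behaviour in this repository.
from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel

speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
speaker_model.eval()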
- - if trainer is not None: - self.world_size = trainer.num_nodes * trainer.num_devices - - super().__init__(cfg=cfg, trainer=trainer) - - if self.labels_occurrence: - # Goal is to give more weight to the classes with less samples so as to match the ones with the higher frequencies - weight = [sum(self.labels_occurrence) / (len(self.labels_occurrence) * i) for i in self.labels_occurrence] - - if 'loss' in cfg: - cfg_eval_loss = copy.deepcopy(cfg.loss) - - if 'angular' in cfg.loss._target_: - OmegaConf.set_struct(cfg, True) - with open_dict(cfg): - cfg.decoder.angular = True - - if 'weight' in cfg.loss: - cfg.loss.weight = weight - cfg_eval_loss.weight = None - - # May need a general check for arguments of loss - self.loss = instantiate(cfg.loss) - self.eval_loss = instantiate(cfg_eval_loss) - - else: - tmp_loss_cfg = OmegaConf.create( - {"_target_": "nemo.collections.common.losses.cross_entropy.CrossEntropyLoss"} - ) - - self.loss = instantiate(tmp_loss_cfg) - self.eval_loss = instantiate(tmp_loss_cfg) - - self._accuracy = TopKClassificationAccuracy(top_k=[1]) - - self.preprocessor = EncDecSpeakerLabelModel.from_config_dict(cfg.preprocessor) - self.encoder = EncDecSpeakerLabelModel.from_config_dict(cfg.encoder) - self.decoder = EncDecSpeakerLabelModel.from_config_dict(cfg.decoder) - - self._macro_accuracy = Accuracy(num_classes=num_classes, top_k=1, average='macro', task='multiclass') - - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = EncDecSpeakerLabelModel.from_config_dict(self._cfg.spec_augment) - else: - self.spec_augmentation = None - - @staticmethod - def extract_labels(data_layer_config): - labels = set() - manifest_filepath = data_layer_config.get('manifest_filepath', None) - if manifest_filepath is None: - logging.warning("No manifest_filepath was provided, no labels got extracted!") - return None - manifest_filepaths = convert_to_config_list(data_layer_config['manifest_filepath']) - - for manifest_filepath in itertools.chain.from_iterable(manifest_filepaths): - cache_datastore_manifests(manifest_filepaths=manifest_filepath) - collection = ASRSpeechLabel( - manifests_files=manifest_filepath, - min_duration=data_layer_config.get("min_duration", None), - max_duration=data_layer_config.get("max_duration", None), - index_by_file_id=True, - ) - labels.update(collection.uniq_labels) - labels = list(sorted(labels)) - logging.warning(f"Total number of {len(labels)} found in all the manifest files.") - return labels - - def __setup_dataloader_from_config(self, config: Optional[Dict]): - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - - featurizer = WaveformFeaturizer( - sample_rate=config['sample_rate'], int_values=config.get('int_values', False), augmentor=augmentor - ) - shuffle = config.get('shuffle', False) - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. 
Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - if config.get("is_concat", False): - dataset = get_concat_tarred_speech_label_dataset( - featurizer=featurizer, - config=config, - shuffle_n=shuffle_n, - global_rank=self.global_rank, - world_size=self.world_size, - ) - else: - dataset = get_tarred_speech_label_dataset( - featurizer=featurizer, - config=config, - shuffle_n=shuffle_n, - global_rank=self.global_rank, - world_size=self.world_size, - ) - shuffle = False - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - - dataset = AudioToSpeechLabelDataset( - manifest_filepath=config['manifest_filepath'], - labels=config['labels'], - featurizer=featurizer, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - normalize_audio=config.get('normalize_audio', False), - cal_labels_occurrence=config.get('cal_labels_occurrence', False), - ) - if dataset.labels_occurrence: - self.labels_occurrence = dataset.labels_occurrence - - if hasattr(dataset, 'fixed_seq_collate_fn'): - collate_fn = dataset.fixed_seq_collate_fn - else: - collate_fn = dataset.datasets[0].fixed_seq_collate_fn - - batch_size = config['batch_size'] - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_layer_config: Optional[Union[DictConfig, Dict]]): - if self.cal_labels_occurrence_train: - # Calculate labels occurence for weighed CE loss for train set if weight equals 'auto' - # Note in this case, the cal_labels_occurrence in val_data_layer_config and test_data_layer_params need to be stay as False - OmegaConf.set_struct(train_data_layer_config, True) - with open_dict(train_data_layer_config): - train_data_layer_config['cal_labels_occurrence'] = True - - self.labels = self.extract_labels(train_data_layer_config) - train_data_layer_config['labels'] = self.labels - if 'shuffle' not in train_data_layer_config: - train_data_layer_config['shuffle'] = True - self._train_dl = self.__setup_dataloader_from_config(config=train_data_layer_config) - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
- if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_layer_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_layer_config: Optional[Union[DictConfig, Dict]]): - val_data_layer_config['labels'] = self.labels - self._validation_dl = self.__setup_dataloader_from_config(config=val_data_layer_config) - - def setup_test_data(self, test_data_layer_params: Optional[Union[DictConfig, Dict]]): - if hasattr(self, 'dataset'): - test_data_layer_params['labels'] = self.labels - - self.embedding_dir = test_data_layer_params.get('embedding_dir', './') - self._test_dl = self.__setup_dataloader_from_config(config=test_data_layer_params) - self.test_manifest = test_data_layer_params.get('manifest_filepath', None) - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - audio_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - audio_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), audio_eltype), - "input_signal_length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "logits": NeuralType(('B', 'D'), LogitsType()), - "embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), - } - - def forward_for_export(self, processed_signal, processed_signal_len): - encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len) - logits, embs = self.decoder(encoder_output=encoded, length=length) - return logits, embs - - @typecheck() - def forward(self, input_signal, input_signal_length): - processed_signal, processed_signal_len = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_len) - - encoded, length = self.encoder(audio_signal=processed_signal, length=processed_signal_len) - logits, embs = self.decoder(encoder_output=encoded, length=length) - return logits, embs - - # PTL-specific methods - def training_step(self, batch, batch_idx): - audio_signal, audio_signal_len, labels, _ = batch - logits, _ = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss = self.loss(logits=logits, labels=labels) - - self.log('loss', loss) - self.log('learning_rate', self._optimizer.param_groups[0]['lr']) - self.log('global_step', self.trainer.global_step) - - self._accuracy(logits=logits, labels=labels) - top_k = self._accuracy.compute() - self._accuracy.reset() - for i, top_i in enumerate(top_k): - self.log(f'training_batch_accuracy_top_{i}', top_i) - - return {'loss': loss} - - def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - audio_signal, audio_signal_len, labels, _ = batch - logits, _ = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - loss_value = self.eval_loss(logits=logits, labels=labels) - acc_top_k = 
self._accuracy(logits=logits, labels=labels) - correct_counts, total_counts = self._accuracy.correct_counts_k, self._accuracy.total_counts_k - self._macro_accuracy.update(preds=logits, target=labels) - stats = self._macro_accuracy._final_state() - - output = { - f'{tag}_loss': loss_value, - f'{tag}_correct_counts': correct_counts, - f'{tag}_total_counts': total_counts, - f'{tag}_acc_micro_top_k': acc_top_k, - f'{tag}_acc_macro_stats': stats, - } - if tag == 'val': - if isinstance(self.trainer.val_dataloaders, (list, tuple)) and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(output) - else: - self.validation_step_outputs.append(output) - else: - if isinstance(self.trainer.test_dataloaders, (list, tuple)) and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(output) - else: - self.test_step_outputs.append(output) - - return output - - def multi_evaluation_epoch_end(self, outputs, dataloader_idx: int = 0, tag: str = 'val'): - loss_mean = torch.stack([x[f'{tag}_loss'] for x in outputs]).mean() - correct_counts = torch.stack([x[f'{tag}_correct_counts'] for x in outputs]).sum(axis=0) - total_counts = torch.stack([x[f'{tag}_total_counts'] for x in outputs]).sum(axis=0) - - self._accuracy.correct_counts_k = correct_counts - self._accuracy.total_counts_k = total_counts - topk_scores = self._accuracy.compute() - - self._macro_accuracy.tp = torch.stack([x[f'{tag}_acc_macro_stats'][0] for x in outputs]).sum(axis=0) - self._macro_accuracy.fp = torch.stack([x[f'{tag}_acc_macro_stats'][1] for x in outputs]).sum(axis=0) - self._macro_accuracy.tn = torch.stack([x[f'{tag}_acc_macro_stats'][2] for x in outputs]).sum(axis=0) - self._macro_accuracy.fn = torch.stack([x[f'{tag}_acc_macro_stats'][3] for x in outputs]).sum(axis=0) - macro_accuracy_score = self._macro_accuracy.compute() - - self._accuracy.reset() - self._macro_accuracy.reset() - - self.log(f'{tag}_loss', loss_mean, sync_dist=True) - for top_k, score in zip(self._accuracy.top_k, topk_scores): - self.log(f'{tag}_acc_micro_top_{top_k}', score, sync_dist=True) - self.log(f'{tag}_acc_macro', macro_accuracy_score, sync_dist=True) - - return { - f'{tag}_loss': loss_mean, - f'{tag}_acc_micro_top_k': topk_scores, - f'{tag}_acc_macro': macro_accuracy_score, - } - - def validation_step(self, batch, batch_idx, dataloader_idx: int = 0): - return self.evaluation_step(batch, batch_idx, dataloader_idx, 'val') - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'val') - - def test_step(self, batch, batch_idx, dataloader_idx: int = 0): - return self.evaluation_step(batch, batch_idx, dataloader_idx, 'test') - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_evaluation_epoch_end(outputs, dataloader_idx, 'test') - - @torch.no_grad() - def infer_file(self, path2audio_file): - """ - Args: - path2audio_file: path to an audio wav file - - Returns: - emb: speaker embeddings (Audio representations) - logits: logits corresponding of final layer - """ - audio, sr = librosa.load(path2audio_file, sr=None) - target_sr = self._cfg.train_ds.get('sample_rate', 16000) - if sr != target_sr: - audio = librosa.core.resample(audio, orig_sr=sr, target_sr=target_sr) - audio_length = audio.shape[0] - device = self.device - audio = np.array([audio]) - audio_signal, audio_signal_len = ( - torch.tensor(audio, device=device), - torch.tensor([audio_length], device=device), - ) - mode = 
self.training
-        self.freeze()
-
-        logits, emb = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
-
-        self.train(mode=mode)
-        if mode is True:
-            self.unfreeze()
-        del audio_signal, audio_signal_len
-        return emb, logits
-
-    def get_label(self, path2audio_file):
-        """
-        Returns the label of path2audio_file from the classes the model was trained on.
-        Args:
-            path2audio_file: path to audio wav file
-
-        Returns:
-            label: label predicted by the trained model
-        """
-        _, logits = self.infer_file(path2audio_file=path2audio_file)
-
-        trained_labels = self._cfg['train_ds'].get('labels', None)
-        if trained_labels is not None:
-            trained_labels = list(trained_labels)
-            label_id = logits.argmax(axis=1)
-            label = trained_labels[int(label_id[0])]
-        else:
-            logging.info("labels are not saved to model, hence only outputting the label id index")
-            label = logits.argmax(axis=1)
-
-        return label
-
-    def get_embedding(self, path2audio_file):
-        """
-        Returns the speaker embeddings for a provided audio file.
-
-        Args:
-            path2audio_file: path to an audio wav file
-
-        Returns:
-            emb: speaker embeddings (Audio representations)
-        """
-
-        emb, _ = self.infer_file(path2audio_file=path2audio_file)
-
-        return emb
-
-    @torch.no_grad()
-    def verify_speakers(self, path2audio_file1, path2audio_file2, threshold=0.7):
-        """
-        Verify if two audio files are from the same speaker or not.
-
-        Args:
-            path2audio_file1: path to audio wav file of speaker 1
-            path2audio_file2: path to audio wav file of speaker 2
-            threshold: cosine similarity score used as a threshold to distinguish two embeddings (default = 0.7)
-
-        Returns:
-            True if both audio files are from the same speaker, False otherwise
-        """
-        embs1 = self.get_embedding(path2audio_file1).squeeze()
-        embs2 = self.get_embedding(path2audio_file2).squeeze()
-        # Length-normalize both embeddings
-        X = embs1 / torch.linalg.norm(embs1)
-        Y = embs2 / torch.linalg.norm(embs2)
-        # Cosine similarity score, rescaled to [0, 1]
-        similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
-        similarity_score = (similarity_score + 1) / 2
-        # Decision
-        if similarity_score >= threshold:
-            logging.info("two audio files are from the same speaker")
-            return True
-        else:
-            logging.info("two audio files are from different speakers")
-            return False
-
-    @torch.no_grad()
-    def batch_inference(self, manifest_filepath, batch_size=32, sample_rate=16000, device='cuda'):
-        """
-        Perform batch inference on EncDecSpeakerLabelModel.
-        To perform inference on a single audio file, one can use infer_file, get_label or get_embedding.
-
-        To map predicted labels, one can do
-            `arg_values = logits.argmax(axis=1)`
-            `pred_labels = list(map(lambda t : trained_labels[t], arg_values))`
-
-        Args:
-            manifest_filepath: Path to the manifest file
-            batch_size: batch size to perform batch inference
-            sample_rate: sample rate of audio files in the manifest file
-            device: compute device to perform operations.
-
-        Returns:
-            The variables below all follow the audio file order in the manifest file.
- embs: embeddings of files provided in manifest file - logits: logits of final layer of EncDecSpeakerLabel Model - gt_labels: labels from manifest file (needed for speaker enrollment and testing) - trained_labels: Classification labels sorted in the order that they are mapped by the trained model - - """ - mode = self.training - self.freeze() - self.eval() - self.to(device) - trained_labels = self._cfg['train_ds']['labels'] - if trained_labels is not None: - trained_labels = list(trained_labels) - - featurizer = WaveformFeaturizer(sample_rate=sample_rate) - - dataset = AudioToSpeechLabelDataset(manifest_filepath=manifest_filepath, labels=None, featurizer=featurizer) - - dataloader = torch.utils.data.DataLoader( - dataset=dataset, batch_size=batch_size, collate_fn=dataset.fixed_seq_collate_fn, - ) - - logits = [] - embs = [] - gt_labels = [] - - for test_batch in tqdm(dataloader): - if device == 'cuda': - test_batch = [x.to(device) for x in test_batch] - audio_signal, audio_signal_len, labels, _ = test_batch - logit, emb = self.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) - - logits.extend(logit.cpu().numpy()) - gt_labels.extend(labels.cpu().numpy()) - embs.extend(emb.cpu().numpy()) - - gt_labels = list(map(lambda t: dataset.id2label[t], gt_labels)) - - self.train(mode=mode) - if mode is True: - self.unfreeze() - - logits, embs, gt_labels = np.asarray(logits), np.asarray(embs), np.asarray(gt_labels) - - return embs, logits, gt_labels, trained_labels diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/msdd_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/msdd_models.py deleted file mode 100644 index c85a57af72b9556cd5463b897cc8ca913979c3af..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/msdd_models.py +++ /dev/null @@ -1,1547 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
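A minimal usage sketch of the EncDecSpeakerLabelModel inference helpers shown above (infer_file, verify_speakers, batch_inference), assuming a pretrained "titanet_large" checkpoint; the audio paths and manifest path are placeholders:

```python
from nemo.collections.asr.models import EncDecSpeakerLabelModel

# Load a pretrained speaker embedding model from NGC.
speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")

# Single-file inference: speaker embedding and classifier logits.
emb, logits = speaker_model.infer_file("speaker1.wav")

# Speaker verification between two recordings (cosine-similarity threshold, default 0.7).
same_speaker = speaker_model.verify_speakers("speaker1.wav", "speaker2.wav", threshold=0.7)

# Batch inference over a manifest file; outputs follow the manifest order.
embs, logits, gt_labels, trained_labels = speaker_model.batch_inference(
    "manifest.json", batch_size=32, sample_rate=16000, device="cuda"
)
```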
- -import copy -import json -import os -import pickle as pkl -import tempfile -from collections import OrderedDict -from pathlib import Path -from statistics import mode -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from hydra.utils import instantiate -from omegaconf import DictConfig, open_dict -from pyannote.core import Annotation -from pyannote.metrics.diarization import DiarizationErrorRate -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.utilities import rank_zero_only -from tqdm import tqdm - -from nemo.collections.asr.data.audio_to_diar_label import AudioToSpeechMSDDInferDataset, AudioToSpeechMSDDTrainDataset -from nemo.collections.asr.metrics.der import score_labels -from nemo.collections.asr.metrics.multi_binary_acc import MultiBinaryAccuracy -from nemo.collections.asr.models import ClusteringDiarizer -from nemo.collections.asr.models.asr_model import ExportableEncDecModel -from nemo.collections.asr.models.clustering_diarizer import ( - _MODEL_CONFIG_YAML, - _SPEAKER_MODEL, - _VAD_MODEL, - get_available_model_names, -) -from nemo.collections.asr.models.configs.diarizer_config import NeuralDiarizerInferenceConfig -from nemo.collections.asr.models.label_models import EncDecSpeakerLabelModel -from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer -from nemo.collections.asr.parts.utils.speaker_utils import ( - audio_rttm_map, - get_embs_and_timestamps, - get_id_tup_dict, - get_scale_mapping_argmat, - get_uniq_id_list_from_manifest, - labels_to_pyannote_object, - make_rttm_with_overlap, - parse_scale_configs, - rttm_to_labels, -) -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import AudioSignal, LengthsType, NeuralType -from nemo.core.neural_types.elements import ProbsType -from nemo.utils import logging - -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - -__all__ = ['EncDecDiarLabelModel', 'ClusterEmbedding', 'NeuralDiarizer'] - - -class EncDecDiarLabelModel(ModelPT, ExportableEncDecModel): - """ - Encoder decoder class for multiscale diarization decoder (MSDD). Model class creates training, validation methods for setting - up data performing model forward pass. - - This model class expects config dict for: - * preprocessor - * msdd_model - * speaker_model - """ - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - result = [] - - model = PretrainedModelInfo( - pretrained_model_name="diar_msdd_telephonic", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/diar_msdd_telephonic/versions/1.0.1/files/diar_msdd_telephonic.nemo", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:diar_msdd_telephonic", - ) - result.append(model) - return result - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ - Initialize an MSDD model and the specified speaker embedding model. In this init function, training and validation datasets are prepared. 
- """ - self._trainer = trainer if trainer else None - self.cfg_msdd_model = cfg - - if self._trainer: - self._init_segmentation_info() - self.world_size = trainer.num_nodes * trainer.num_devices - self.emb_batch_size = self.cfg_msdd_model.emb_batch_size - self.pairwise_infer = False - else: - self.world_size = 1 - self.pairwise_infer = True - super().__init__(cfg=self.cfg_msdd_model, trainer=trainer) - - window_length_in_sec = self.cfg_msdd_model.diarizer.speaker_embeddings.parameters.window_length_in_sec - if isinstance(window_length_in_sec, int) or len(window_length_in_sec) <= 1: - raise ValueError("window_length_in_sec should be a list containing multiple segment (window) lengths") - else: - self.cfg_msdd_model.scale_n = len(window_length_in_sec) - self.cfg_msdd_model.msdd_module.scale_n = self.cfg_msdd_model.scale_n - self.scale_n = self.cfg_msdd_model.scale_n - - self.preprocessor = EncDecSpeakerLabelModel.from_config_dict(self.cfg_msdd_model.preprocessor) - self.frame_per_sec = int(1 / self.preprocessor._cfg.window_stride) - self.msdd = EncDecDiarLabelModel.from_config_dict(self.cfg_msdd_model.msdd_module) - - if trainer is not None: - self._init_speaker_model() - self.add_speaker_model_config(cfg) - else: - self.msdd._speaker_model = EncDecSpeakerLabelModel.from_config_dict(cfg.speaker_model_cfg) - - # Call `self.save_hyperparameters` in modelPT.py again since cfg should contain speaker model's config. - self.save_hyperparameters("cfg") - - self.loss = instantiate(self.cfg_msdd_model.loss) - self._accuracy_test = MultiBinaryAccuracy() - self._accuracy_train = MultiBinaryAccuracy() - self._accuracy_valid = MultiBinaryAccuracy() - - def add_speaker_model_config(self, cfg): - """ - Add config dictionary of the speaker model to the model's config dictionary. This is required to - save and load speaker model with MSDD model. - - Args: - cfg (DictConfig): DictConfig type variable that conatains hyperparameters of MSDD model. - """ - with open_dict(cfg): - cfg_cp = copy.copy(self.msdd._speaker_model.cfg) - cfg.speaker_model_cfg = cfg_cp - del cfg.speaker_model_cfg.train_ds - del cfg.speaker_model_cfg.validation_ds - - def _init_segmentation_info(self): - """Initialize segmentation settings: window, shift and multiscale weights. - """ - self._diarizer_params = self.cfg_msdd_model.diarizer - self.multiscale_args_dict = parse_scale_configs( - self._diarizer_params.speaker_embeddings.parameters.window_length_in_sec, - self._diarizer_params.speaker_embeddings.parameters.shift_length_in_sec, - self._diarizer_params.speaker_embeddings.parameters.multiscale_weights, - ) - - def _init_speaker_model(self): - """ - Initialize speaker embedding model with model name or path passed through config. Note that speaker embedding model is loaded to - `self.msdd` to enable multi-gpu and multi-node training. In addition, speaker embedding model is also saved with msdd model when - `.ckpt` files are saved. 
- """ - model_path = self.cfg_msdd_model.diarizer.speaker_embeddings.model_path - self._diarizer_params = self.cfg_msdd_model.diarizer - - if not torch.cuda.is_available(): - rank_id = torch.device('cpu') - elif self._trainer: - rank_id = torch.device(self._trainer.global_rank) - else: - rank_id = None - - if model_path is not None and model_path.endswith('.nemo'): - self.msdd._speaker_model = EncDecSpeakerLabelModel.restore_from(model_path, map_location=rank_id) - logging.info("Speaker Model restored locally from {}".format(model_path)) - elif model_path.endswith('.ckpt'): - self._speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(model_path, map_location=rank_id) - logging.info("Speaker Model restored locally from {}".format(model_path)) - else: - if model_path not in get_available_model_names(EncDecSpeakerLabelModel): - logging.warning( - "requested {} model name not available in pretrained models, instead".format(model_path) - ) - model_path = "titanet_large" - logging.info("Loading pretrained {} model from NGC".format(model_path)) - self.msdd._speaker_model = EncDecSpeakerLabelModel.from_pretrained( - model_name=model_path, map_location=rank_id - ) - self._speaker_params = self.cfg_msdd_model.diarizer.speaker_embeddings.parameters - - def __setup_dataloader_from_config(self, config): - featurizer = WaveformFeaturizer( - sample_rate=config['sample_rate'], int_values=config.get('int_values', False), augmentor=None - ) - - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - dataset = AudioToSpeechMSDDTrainDataset( - manifest_filepath=config.manifest_filepath, - emb_dir=config.emb_dir, - multiscale_args_dict=self.multiscale_args_dict, - soft_label_thres=config.soft_label_thres, - featurizer=featurizer, - window_stride=self.cfg_msdd_model.preprocessor.window_stride, - emb_batch_size=config.emb_batch_size, - pairwise_infer=False, - global_rank=self._trainer.global_rank, - ) - - self.data_collection = dataset.collection - collate_ds = dataset - collate_fn = collate_ds.msdd_train_collate_fn - batch_size = config['batch_size'] - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=False, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def __setup_dataloader_from_config_infer( - self, config: DictConfig, emb_dict: dict, emb_seq: dict, clus_label_dict: dict, pairwise_infer=False - ): - shuffle = config.get('shuffle', False) - - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. 
Provided config : {config}") - return None - - dataset = AudioToSpeechMSDDInferDataset( - manifest_filepath=config['manifest_filepath'], - emb_dict=emb_dict, - clus_label_dict=clus_label_dict, - emb_seq=emb_seq, - soft_label_thres=config.soft_label_thres, - seq_eval_mode=config.seq_eval_mode, - window_stride=self._cfg.preprocessor.window_stride, - use_single_scale_clus=False, - pairwise_infer=pairwise_infer, - ) - self.data_collection = dataset.collection - collate_ds = dataset - collate_fn = collate_ds.msdd_infer_collate_fn - batch_size = config['batch_size'] - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - self._train_dl = self.__setup_dataloader_from_config(config=train_data_config,) - - def setup_validation_data(self, val_data_layer_config: Optional[Union[DictConfig, Dict]]): - self._validation_dl = self.__setup_dataloader_from_config(config=val_data_layer_config,) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - if self.pairwise_infer: - self._test_dl = self.__setup_dataloader_from_config_infer( - config=test_data_config, - emb_dict=self.emb_sess_test_dict, - emb_seq=self.emb_seq_test, - clus_label_dict=self.clus_test_label_dict, - pairwise_infer=self.pairwise_infer, - ) - - def setup_multiple_test_data(self, test_data_config): - """ - MSDD does not use multiple_test_data template. This function is a placeholder for preventing error. - """ - return None - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - audio_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - audio_eltype = AudioSignal() - return { - "features": NeuralType(('B', 'T'), audio_eltype), - "feature_length": NeuralType(('B',), LengthsType()), - "ms_seg_timestamps": NeuralType(('B', 'C', 'T', 'D'), LengthsType()), - "ms_seg_counts": NeuralType(('B', 'C'), LengthsType()), - "clus_label_index": NeuralType(('B', 'T'), LengthsType()), - "scale_mapping": NeuralType(('B', 'C', 'T'), LengthsType()), - "targets": NeuralType(('B', 'T', 'C'), ProbsType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - return OrderedDict( - { - "probs": NeuralType(('B', 'T', 'C'), ProbsType()), - "scale_weights": NeuralType(('B', 'T', 'C', 'D'), ProbsType()), - } - ) - - def get_ms_emb_seq( - self, embs: torch.Tensor, scale_mapping: torch.Tensor, ms_seg_counts: torch.Tensor - ) -> torch.Tensor: - """ - Reshape the given tensor and organize the embedding sequence based on the original sequence counts. - Repeat the embeddings according to the scale_mapping information so that the final embedding sequence has - the identical length for all scales. - - Args: - embs (Tensor): - Merged embeddings without zero-padding in the batch. See `ms_seg_counts` for details. - Shape: (Total number of segments in the batch, emb_dim) - scale_mapping (Tensor): - The element at the m-th row and the n-th column of the scale mapping matrix indicates the (m+1)-th scale - segment index which has the closest center distance with (n+1)-th segment in the base scale. 
- Example: - scale_mapping_argmat[2][101] = 85 - In the above example, it means that 86-th segment in the 3rd scale (python index is 2) is mapped with - 102-th segment in the base scale. Thus, the longer segments bound to have more repeating numbers since - multiple base scale segments (since the base scale has the shortest length) fall into the range of the - longer segments. At the same time, each row contains N numbers of indices where N is number of - segments in the base-scale (i.e., the finest scale). - Shape: (batch_size, scale_n, self.diar_window_length) - ms_seg_counts (Tensor): - Cumulative sum of the number of segments in each scale. This information is needed to reconstruct - the multi-scale input matrix during forward propagating. - - Example: `batch_size=3, scale_n=6, emb_dim=192` - ms_seg_counts = - [[8, 9, 12, 16, 25, 51], - [11, 13, 14, 17, 25, 51], - [ 9, 9, 11, 16, 23, 50]] - - In this function, `ms_seg_counts` is used to get the actual length of each embedding sequence without - zero-padding. - - Returns: - ms_emb_seq (Tensor): - Multi-scale embedding sequence that is mapped, matched and repeated. The longer scales are less repeated, - while shorter scales are more frequently repeated following the scale mapping tensor. - """ - scale_n, batch_size = scale_mapping[0].shape[0], scale_mapping.shape[0] - split_emb_tup = torch.split(embs, ms_seg_counts.view(-1).tolist(), dim=0) - batch_emb_list = [split_emb_tup[i : i + scale_n] for i in range(0, len(split_emb_tup), scale_n)] - ms_emb_seq_list = [] - for batch_idx in range(batch_size): - feats_list = [] - for scale_index in range(scale_n): - repeat_mat = scale_mapping[batch_idx][scale_index] - feats_list.append(batch_emb_list[batch_idx][scale_index][repeat_mat, :]) - repp = torch.stack(feats_list).permute(1, 0, 2) - ms_emb_seq_list.append(repp) - ms_emb_seq = torch.stack(ms_emb_seq_list) - return ms_emb_seq - - @torch.no_grad() - def get_cluster_avg_embs_model( - self, embs: torch.Tensor, clus_label_index: torch.Tensor, ms_seg_counts: torch.Tensor, scale_mapping - ) -> torch.Tensor: - """ - Calculate the cluster-average speaker embedding based on the ground-truth speaker labels (i.e., cluster labels). - - Args: - embs (Tensor): - Merged embeddings without zero-padding in the batch. See `ms_seg_counts` for details. - Shape: (Total number of segments in the batch, emb_dim) - clus_label_index (Tensor): - Merged ground-truth cluster labels from all scales with zero-padding. Each scale's index can be - retrieved by using segment index in `ms_seg_counts`. - Shape: (batch_size, maximum total segment count among the samples in the batch) - ms_seg_counts (Tensor): - Cumulative sum of the number of segments in each scale. This information is needed to reconstruct - multi-scale input tensors during forward propagating. - - Example: `batch_size=3, scale_n=6, emb_dim=192` - ms_seg_counts = - [[8, 9, 12, 16, 25, 51], - [11, 13, 14, 17, 25, 51], - [ 9, 9, 11, 16, 23, 50]] - Counts of merged segments: (121, 131, 118) - embs has shape of (370, 192) - clus_label_index has shape of (3, 131) - - Shape: (batch_size, scale_n) - - Returns: - ms_avg_embs (Tensor): - Multi-scale cluster-average speaker embedding vectors. These embedding vectors are used as reference for - each speaker to predict the speaker label for the given multi-scale embedding sequences. 
-                Shape: (batch_size, scale_n, emb_dim, self.num_spks_per_model)
-        """
-        scale_n, batch_size = scale_mapping[0].shape[0], scale_mapping.shape[0]
-        split_emb_tup = torch.split(embs, ms_seg_counts.view(-1).tolist(), dim=0)
-        batch_emb_list = [split_emb_tup[i : i + scale_n] for i in range(0, len(split_emb_tup), scale_n)]
-        ms_avg_embs_list = []
-        for batch_idx in range(batch_size):
-            oracle_clus_idx = clus_label_index[batch_idx]
-            max_seq_len = sum(ms_seg_counts[batch_idx])
-            clus_label_index_batch = torch.split(oracle_clus_idx[:max_seq_len], ms_seg_counts[batch_idx].tolist())
-            session_avg_emb_set_list = []
-            for scale_index in range(scale_n):
-                spk_set_list = []
-                for idx in range(self.cfg_msdd_model.max_num_of_spks):
-                    _where = (clus_label_index_batch[scale_index] == idx).clone().detach()
-                    if not torch.any(_where):
-                        avg_emb = torch.zeros(self.msdd._speaker_model._cfg.decoder.emb_sizes).to(embs.device)
-                    else:
-                        avg_emb = torch.mean(batch_emb_list[batch_idx][scale_index][_where], dim=0)
-                    spk_set_list.append(avg_emb)
-                session_avg_emb_set_list.append(torch.stack(spk_set_list))
-            session_avg_emb_set = torch.stack(session_avg_emb_set_list)
-            ms_avg_embs_list.append(session_avg_emb_set)
-
-        ms_avg_embs = torch.stack(ms_avg_embs_list).permute(0, 1, 3, 2)
-        ms_avg_embs = ms_avg_embs.float().detach().to(embs.device)
-        assert (
-            not ms_avg_embs.requires_grad
-        ), "ms_avg_embs.requires_grad = True. ms_avg_embs should be detached from the torch graph."
-        return ms_avg_embs
-
-    @torch.no_grad()
-    def get_ms_mel_feat(
-        self,
-        processed_signal: torch.Tensor,
-        processed_signal_len: torch.Tensor,
-        ms_seg_timestamps: torch.Tensor,
-        ms_seg_counts: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Load acoustic features from audio segments for each scale and save them into a torch.tensor matrix.
-        In addition, create variables containing the multiscale subsegmentation information.
-
-        Note: `self.emb_batch_size` determines the number of embedding tensors attached to the computational graph.
-        If `self.emb_batch_size` is greater than 0, speaker embedding models are simultaneously trained. Due to the
-        constraint of GPU memory size, only a subset of embedding tensors can be attached to the computational graph.
-        By default, the graph-attached embeddings are selected randomly by `torch.randperm`. Default value of
-        `self.emb_batch_size` is 0.
-
-        Args:
-            processed_signal (Tensor):
-                Zero-padded feature input.
-                Shape: (batch_size, feat_dim, the longest feature sequence length)
-            processed_signal_len (Tensor):
-                The actual length of the feature input without zero-padding.
-                Shape: (batch_size,)
-            ms_seg_timestamps (Tensor):
-                Timestamps of the base-scale segments.
-                Shape: (batch_size, scale_n, number of base-scale segments, self.num_spks_per_model)
-            ms_seg_counts (Tensor):
-                Cumulative sum of the number of segments in each scale. This information is needed to reconstruct
-                the multi-scale input matrix during forward propagation.
-                Shape: (batch_size, scale_n)
-
-        Returns:
-            ms_mel_feat (Tensor):
-                Feature input stream split into the same length.
-                Shape: (total number of segments, feat_dim, self.frame_per_sec * the-longest-scale-length)
-            ms_mel_feat_len (Tensor):
-                The actual length of each feature segment without zero-padding.
-                Shape: (total number of segments,)
-            seq_len (Tensor):
-                The length of the input embedding sequences.
- Shape: (total number of segments,) - detach_ids (tuple): - Tuple containing both detached embeding indices and attached embedding indices - """ - device = processed_signal.device - _emb_batch_size = min(self.emb_batch_size, ms_seg_counts.sum().item()) - feat_dim = self.preprocessor._cfg.features - max_sample_count = int(self.multiscale_args_dict["scale_dict"][0][0] * self.frame_per_sec) - ms_mel_feat_len_list, sequence_lengths_list, ms_mel_feat_list = [], [], [] - total_seg_count = torch.sum(ms_seg_counts) - - batch_size = processed_signal.shape[0] - for batch_idx in range(batch_size): - for scale_idx in range(self.scale_n): - scale_seg_num = ms_seg_counts[batch_idx][scale_idx] - for k, (stt, end) in enumerate(ms_seg_timestamps[batch_idx][scale_idx][:scale_seg_num]): - stt, end = int(stt.detach().item()), int(end.detach().item()) - end = min(end, stt + max_sample_count) - _features = torch.zeros(feat_dim, max_sample_count).to(torch.float32).to(device) - _features[:, : (end - stt)] = processed_signal[batch_idx][:, stt:end] - ms_mel_feat_list.append(_features) - ms_mel_feat_len_list.append(end - stt) - sequence_lengths_list.append(ms_seg_counts[batch_idx][-1]) - ms_mel_feat = torch.stack(ms_mel_feat_list).to(device) - ms_mel_feat_len = torch.tensor(ms_mel_feat_len_list).to(device) - seq_len = torch.tensor(sequence_lengths_list).to(device) - - if _emb_batch_size == 0: - attached, _emb_batch_size = torch.tensor([]), 0 - detached = torch.arange(total_seg_count) - else: - torch.manual_seed(self._trainer.current_epoch) - attached = torch.randperm(total_seg_count)[:_emb_batch_size] - detached = torch.randperm(total_seg_count)[_emb_batch_size:] - detach_ids = (attached, detached) - return ms_mel_feat, ms_mel_feat_len, seq_len, detach_ids - - def forward_infer(self, input_signal, input_signal_length, emb_vectors, targets): - """ - Wrapper function for inference case. 
- """ - preds, scale_weights = self.msdd( - ms_emb_seq=input_signal, length=input_signal_length, ms_avg_embs=emb_vectors, targets=targets - ) - return preds, scale_weights - - @typecheck() - def forward( - self, features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets - ): - processed_signal, processed_signal_len = self.msdd._speaker_model.preprocessor( - input_signal=features, length=feature_length - ) - audio_signal, audio_signal_len, sequence_lengths, detach_ids = self.get_ms_mel_feat( - processed_signal, processed_signal_len, ms_seg_timestamps, ms_seg_counts - ) - - # For detached embeddings - with torch.no_grad(): - self.msdd._speaker_model.eval() - logits, embs_d = self.msdd._speaker_model.forward_for_export( - processed_signal=audio_signal[detach_ids[1]], processed_signal_len=audio_signal_len[detach_ids[1]] - ) - embs = torch.zeros(audio_signal.shape[0], embs_d.shape[1]).to(embs_d.device) - embs[detach_ids[1], :] = embs_d.detach() - - # For attached embeddings - self.msdd._speaker_model.train() - if len(detach_ids[0]) > 1: - logits, embs_a = self.msdd._speaker_model.forward_for_export( - processed_signal=audio_signal[detach_ids[0]], processed_signal_len=audio_signal_len[detach_ids[0]] - ) - embs[detach_ids[0], :] = embs_a - - ms_emb_seq = self.get_ms_emb_seq(embs, scale_mapping, ms_seg_counts) - ms_avg_embs = self.get_cluster_avg_embs_model(embs, clus_label_index, ms_seg_counts, scale_mapping) - preds, scale_weights = self.msdd( - ms_emb_seq=ms_emb_seq, length=sequence_lengths, ms_avg_embs=ms_avg_embs, targets=targets - ) - return preds, scale_weights - - def training_step(self, batch: list, batch_idx: int): - features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets = batch - sequence_lengths = torch.tensor([x[-1] for x in ms_seg_counts.detach()]) - preds, _ = self.forward( - features=features, - feature_length=feature_length, - ms_seg_timestamps=ms_seg_timestamps, - ms_seg_counts=ms_seg_counts, - clus_label_index=clus_label_index, - scale_mapping=scale_mapping, - targets=targets, - ) - loss = self.loss(probs=preds, labels=targets, signal_lengths=sequence_lengths) - self._accuracy_train(preds, targets, sequence_lengths) - torch.cuda.empty_cache() - f1_acc = self._accuracy_train.compute() - self.log('loss', loss, sync_dist=True) - self.log('learning_rate', self._optimizer.param_groups[0]['lr'], sync_dist=True) - self.log('train_f1_acc', f1_acc, sync_dist=True) - self._accuracy_train.reset() - return {'loss': loss} - - def validation_step(self, batch: list, batch_idx: int, dataloader_idx: int = 0): - features, feature_length, ms_seg_timestamps, ms_seg_counts, clus_label_index, scale_mapping, targets = batch - sequence_lengths = torch.tensor([x[-1] for x in ms_seg_counts]) - preds, _ = self.forward( - features=features, - feature_length=feature_length, - ms_seg_timestamps=ms_seg_timestamps, - ms_seg_counts=ms_seg_counts, - clus_label_index=clus_label_index, - scale_mapping=scale_mapping, - targets=targets, - ) - loss = self.loss(probs=preds, labels=targets, signal_lengths=sequence_lengths) - self._accuracy_valid(preds, targets, sequence_lengths) - f1_acc = self._accuracy_valid.compute() - self.log('val_loss', loss, sync_dist=True) - self.log('val_f1_acc', f1_acc, sync_dist=True) - return { - 'val_loss': loss, - 'val_f1_acc': f1_acc, - } - - def multi_validation_epoch_end(self, outputs: list, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - f1_acc = 
self._accuracy_valid.compute() - self._accuracy_valid.reset() - - self.log('val_loss', val_loss_mean, sync_dist=True) - self.log('val_f1_acc', f1_acc, sync_dist=True) - return { - 'val_loss': val_loss_mean, - 'val_f1_acc': f1_acc, - } - - def multi_test_epoch_end(self, outputs: List[Dict[str, torch.Tensor]], dataloader_idx: int = 0): - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - f1_acc = self._accuracy_test.compute() - self._accuracy_test.reset() - self.log('test_f1_acc', f1_acc, sync_dist=True) - return { - 'test_loss': test_loss_mean, - 'test_f1_acc': f1_acc, - } - - def compute_accuracies(self): - """ - Calculate F1 score and accuracy of the predicted sigmoid values. - - Returns: - f1_score (float): - F1 score of the estimated diarized speaker label sequences. - simple_acc (float): - Accuracy of predicted speaker labels: (total # of correct labels)/(total # of sigmoid values) - """ - f1_score = self._accuracy_test.compute() - num_correct = torch.sum(self._accuracy_test.true.bool()) - total_count = torch.prod(torch.tensor(self._accuracy_test.targets.shape)) - simple_acc = num_correct / total_count - return f1_score, simple_acc - - -class ClusterEmbedding(torch.nn.Module): - """ - This class is built for calculating cluster-average embeddings, segmentation and load/save of the estimated cluster labels. - The methods in this class is used for the inference of MSDD models. - - Args: - cfg_diar_infer (DictConfig): - Config dictionary from diarization inference YAML file - cfg_msdd_model (DictConfig): - Config dictionary from MSDD model checkpoint file - - Class Variables: - self.cfg_diar_infer (DictConfig): - Config dictionary from diarization inference YAML file - cfg_msdd_model (DictConfig): - Config dictionary from MSDD model checkpoint file - self._speaker_model (class `EncDecSpeakerLabelModel`): - This is a placeholder for class instance of `EncDecSpeakerLabelModel` - self.scale_window_length_list (list): - List containing the window lengths (i.e., scale length) of each scale. - self.scale_n (int): - Number of scales for multi-scale clustering diarizer - self.base_scale_index (int): - The index of the base-scale which is the shortest scale among the given multiple scales - """ - - def __init__( - self, cfg_diar_infer: DictConfig, cfg_msdd_model: DictConfig, speaker_model: Optional[EncDecSpeakerLabelModel] - ): - super().__init__() - self.cfg_diar_infer = cfg_diar_infer - self._cfg_msdd = cfg_msdd_model - self._speaker_model = speaker_model - self.scale_window_length_list = list( - self.cfg_diar_infer.diarizer.speaker_embeddings.parameters.window_length_in_sec - ) - self.scale_n = len(self.scale_window_length_list) - self.base_scale_index = len(self.scale_window_length_list) - 1 - self.clus_diar_model = ClusteringDiarizer(cfg=self.cfg_diar_infer, speaker_model=self._speaker_model) - - def prepare_cluster_embs_infer(self): - """ - Launch clustering diarizer to prepare embedding vectors and clustering results. - """ - self.max_num_speakers = self.cfg_diar_infer.diarizer.clustering.parameters.max_num_speakers - self.emb_sess_test_dict, self.emb_seq_test, self.clus_test_label_dict, _ = self.run_clustering_diarizer( - self._cfg_msdd.test_ds.manifest_filepath, self._cfg_msdd.test_ds.emb_dir - ) - - def assign_labels_to_longer_segs(self, base_clus_label_dict: Dict, session_scale_mapping_dict: Dict): - """ - In multi-scale speaker diarization system, clustering result is solely based on the base-scale (the shortest scale). 
- To calculate cluster-average speaker embeddings for each scale that are longer than the base-scale, this function assigns - clustering results for the base-scale to the longer scales by measuring the distance between subsegment timestamps in the - base-scale and non-base-scales. - - Args: - base_clus_label_dict (dict): - Dictionary containing clustering results for base-scale segments. Indexed by `uniq_id` string. - session_scale_mapping_dict (dict): - Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string. - - Returns: - all_scale_clus_label_dict (dict): - Dictionary containing clustering labels of all scales. Indexed by scale_index in integer format. - - """ - all_scale_clus_label_dict = {scale_index: {} for scale_index in range(self.scale_n)} - for uniq_id, uniq_scale_mapping_dict in session_scale_mapping_dict.items(): - base_scale_clus_label = np.array([x[-1] for x in base_clus_label_dict[uniq_id]]) - all_scale_clus_label_dict[self.base_scale_index][uniq_id] = base_scale_clus_label - for scale_index in range(self.scale_n - 1): - new_clus_label = [] - assert ( - uniq_scale_mapping_dict[scale_index].shape[0] == base_scale_clus_label.shape[0] - ), "The number of base scale labels does not match the segment numbers in uniq_scale_mapping_dict" - max_index = max(uniq_scale_mapping_dict[scale_index]) - for seg_idx in range(max_index + 1): - if seg_idx in uniq_scale_mapping_dict[scale_index]: - seg_clus_label = mode(base_scale_clus_label[uniq_scale_mapping_dict[scale_index] == seg_idx]) - else: - seg_clus_label = 0 if len(new_clus_label) == 0 else new_clus_label[-1] - new_clus_label.append(seg_clus_label) - all_scale_clus_label_dict[scale_index][uniq_id] = new_clus_label - return all_scale_clus_label_dict - - def get_base_clus_label_dict(self, clus_labels: List[str], emb_scale_seq_dict: Dict[int, dict]): - """ - Retrieve base scale clustering labels from `emb_scale_seq_dict`. - - Args: - clus_labels (list): - List containing cluster results generated by clustering diarizer. - emb_scale_seq_dict (dict): - Dictionary containing multiscale embedding input sequences. - Returns: - base_clus_label_dict (dict): - Dictionary containing start and end of base scale segments and its cluster label. Indexed by `uniq_id`. - emb_dim (int): - Embedding dimension in integer. - """ - base_clus_label_dict = {key: [] for key in emb_scale_seq_dict[self.base_scale_index].keys()} - for line in clus_labels: - uniq_id = line.split()[0] - label = int(line.split()[-1].split('_')[-1]) - stt, end = [round(float(x), 2) for x in line.split()[1:3]] - base_clus_label_dict[uniq_id].append([stt, end, label]) - emb_dim = emb_scale_seq_dict[0][uniq_id][0].shape[0] - return base_clus_label_dict, emb_dim - - def get_cluster_avg_embs( - self, emb_scale_seq_dict: Dict, clus_labels: List, speaker_mapping_dict: Dict, session_scale_mapping_dict: Dict - ): - """ - MSDD requires cluster-average speaker embedding vectors for each scale. This function calculates an average embedding vector for each cluster (speaker) - and each scale. - - Args: - emb_scale_seq_dict (dict): - Dictionary containing embedding sequence for each scale. Keys are scale index in integer. - clus_labels (list): - Clustering results from clustering diarizer including all the sessions provided in input manifest files. - speaker_mapping_dict (dict): - Speaker mapping dictionary in case RTTM files are provided. This is mapping between integer based speaker index and - speaker ID tokens in RTTM files. 
- Example: - {'en_0638': {'speaker_0': 'en_0638_A', 'speaker_1': 'en_0638_B'}, - 'en_4065': {'speaker_0': 'en_4065_B', 'speaker_1': 'en_4065_A'}, ...,} - session_scale_mapping_dict (dict): - Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string. - - Returns: - emb_sess_avg_dict (dict): - Dictionary containing speaker mapping information and cluster-average speaker embedding vector. - Each session-level dictionary is indexed by scale index in integer. - output_clus_label_dict (dict): - Subegmentation timestamps in float type and Clustering result in integer type. Indexed by `uniq_id` keys. - """ - self.scale_n = len(emb_scale_seq_dict.keys()) - emb_sess_avg_dict = { - scale_index: {key: [] for key in emb_scale_seq_dict[self.scale_n - 1].keys()} - for scale_index in emb_scale_seq_dict.keys() - } - output_clus_label_dict, emb_dim = self.get_base_clus_label_dict(clus_labels, emb_scale_seq_dict) - all_scale_clus_label_dict = self.assign_labels_to_longer_segs( - output_clus_label_dict, session_scale_mapping_dict - ) - for scale_index in emb_scale_seq_dict.keys(): - for uniq_id, _emb_tensor in emb_scale_seq_dict[scale_index].items(): - if type(_emb_tensor) == list: - emb_tensor = torch.tensor(np.array(_emb_tensor)) - else: - emb_tensor = _emb_tensor - clus_label_list = all_scale_clus_label_dict[scale_index][uniq_id] - spk_set = set(clus_label_list) - - # Create a label array which identifies clustering result for each segment. - label_array = torch.Tensor(clus_label_list) - avg_embs = torch.zeros(emb_dim, self.max_num_speakers) - for spk_idx in spk_set: - selected_embs = emb_tensor[label_array == spk_idx] - avg_embs[:, spk_idx] = torch.mean(selected_embs, dim=0) - - if speaker_mapping_dict is not None: - inv_map = {clus_key: rttm_key for rttm_key, clus_key in speaker_mapping_dict[uniq_id].items()} - else: - inv_map = None - - emb_sess_avg_dict[scale_index][uniq_id] = {'mapping': inv_map, 'avg_embs': avg_embs} - return emb_sess_avg_dict, output_clus_label_dict - - def run_clustering_diarizer(self, manifest_filepath: str, emb_dir: str): - """ - If no pre-existing data is provided, run clustering diarizer from scratch. This will create scale-wise speaker embedding - sequence, cluster-average embeddings, scale mapping and base scale clustering labels. Note that speaker embedding `state_dict` - is loaded from the `state_dict` in the provided MSDD checkpoint. - - Args: - manifest_filepath (str): - Input manifest file for creating audio-to-RTTM mapping. - emb_dir (str): - Output directory where embedding files and timestamp files are saved. - - Returns: - emb_sess_avg_dict (dict): - Dictionary containing cluster-average embeddings for each session. - emb_scale_seq_dict (dict): - Dictionary containing embedding tensors which are indexed by scale numbers. - base_clus_label_dict (dict): - Dictionary containing clustering results. Clustering results are cluster labels for the base scale segments. - """ - self.cfg_diar_infer.diarizer.manifest_filepath = manifest_filepath - self.cfg_diar_infer.diarizer.out_dir = emb_dir - - # Run ClusteringDiarizer which includes system VAD or oracle VAD. 
- self._out_dir = self.clus_diar_model._diarizer_params.out_dir - self.out_rttm_dir = os.path.join(self._out_dir, 'pred_rttms') - os.makedirs(self.out_rttm_dir, exist_ok=True) - - self.clus_diar_model._cluster_params = self.cfg_diar_infer.diarizer.clustering.parameters - self.clus_diar_model.multiscale_args_dict[ - "multiscale_weights" - ] = self.cfg_diar_infer.diarizer.speaker_embeddings.parameters.multiscale_weights - self.clus_diar_model._diarizer_params.speaker_embeddings.parameters = ( - self.cfg_diar_infer.diarizer.speaker_embeddings.parameters - ) - cluster_params = self.clus_diar_model._cluster_params - cluster_params = dict(cluster_params) if isinstance(cluster_params, DictConfig) else cluster_params.dict() - clustering_params_str = json.dumps(cluster_params, indent=4) - - logging.info(f"Multiscale Weights: {self.clus_diar_model.multiscale_args_dict['multiscale_weights']}") - logging.info(f"Clustering Parameters: {clustering_params_str}") - scores = self.clus_diar_model.diarize(batch_size=self.cfg_diar_infer.batch_size) - - # If RTTM (ground-truth diarization annotation) files do not exist, scores is None. - if scores is not None: - metric, speaker_mapping_dict, _ = scores - else: - metric, speaker_mapping_dict = None, None - - # Get the mapping between segments in different scales. - self._embs_and_timestamps = get_embs_and_timestamps( - self.clus_diar_model.multiscale_embeddings_and_timestamps, self.clus_diar_model.multiscale_args_dict - ) - session_scale_mapping_dict = self.get_scale_map(self._embs_and_timestamps) - emb_scale_seq_dict = self.load_emb_scale_seq_dict(emb_dir) - clus_labels = self.load_clustering_labels(emb_dir) - emb_sess_avg_dict, base_clus_label_dict = self.get_cluster_avg_embs( - emb_scale_seq_dict, clus_labels, speaker_mapping_dict, session_scale_mapping_dict - ) - emb_scale_seq_dict['session_scale_mapping'] = session_scale_mapping_dict - return emb_sess_avg_dict, emb_scale_seq_dict, base_clus_label_dict, metric - - def get_scale_map(self, embs_and_timestamps): - """ - Save multiscale mapping data into dictionary format. - - Args: - embs_and_timestamps (dict): - Dictionary containing embedding tensors and timestamp tensors. Indexed by `uniq_id` string. - Returns: - session_scale_mapping_dict (dict): - Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string. - """ - session_scale_mapping_dict = {} - for uniq_id, uniq_embs_and_timestamps in embs_and_timestamps.items(): - scale_mapping_dict = get_scale_mapping_argmat(uniq_embs_and_timestamps) - session_scale_mapping_dict[uniq_id] = scale_mapping_dict - return session_scale_mapping_dict - - def check_clustering_labels(self, out_dir): - """ - Check whether the laoded clustering label file is including clustering results for all sessions. - This function is used for inference mode of MSDD. - - Args: - out_dir (str): - Path to the directory where clustering result files are saved. - Returns: - file_exists (bool): - Boolean that indicates whether clustering result file exists. - clus_label_path (str): - Path to the clustering label output file. - """ - clus_label_path = os.path.join( - out_dir, 'speaker_outputs', f'subsegments_scale{self.base_scale_index}_cluster.label' - ) - file_exists = os.path.exists(clus_label_path) - if not file_exists: - logging.info(f"Clustering label file {clus_label_path} does not exist.") - return file_exists, clus_label_path - - def load_clustering_labels(self, out_dir): - """ - Load clustering labels generated by clustering diarizer. 
This function is used for inference mode of MSDD. - - Args: - out_dir (str): - Path to the directory where clustering result files are saved. - Returns: - emb_scale_seq_dict (dict): - List containing clustering results in string format. - """ - file_exists, clus_label_path = self.check_clustering_labels(out_dir) - logging.info(f"Loading cluster label file from {clus_label_path}") - with open(clus_label_path) as f: - clus_labels = f.readlines() - return clus_labels - - def load_emb_scale_seq_dict(self, out_dir): - """ - Load saved embeddings generated by clustering diarizer. This function is used for inference mode of MSDD. - - Args: - out_dir (str): - Path to the directory where embedding pickle files are saved. - Returns: - emb_scale_seq_dict (dict): - Dictionary containing embedding tensors which are indexed by scale numbers. - """ - window_len_list = list(self.cfg_diar_infer.diarizer.speaker_embeddings.parameters.window_length_in_sec) - emb_scale_seq_dict = {scale_index: None for scale_index in range(len(window_len_list))} - for scale_index in range(len(window_len_list)): - pickle_path = os.path.join( - out_dir, 'speaker_outputs', 'embeddings', f'subsegments_scale{scale_index}_embeddings.pkl' - ) - logging.info(f"Loading embedding pickle file of scale:{scale_index} at {pickle_path}") - with open(pickle_path, "rb") as input_file: - emb_dict = pkl.load(input_file) - for key, val in emb_dict.items(): - emb_dict[key] = val - emb_scale_seq_dict[scale_index] = emb_dict - return emb_scale_seq_dict - - -class NeuralDiarizer(LightningModule): - """ - Class for inference based on multiscale diarization decoder (MSDD). MSDD requires initializing clustering results from - clustering diarizer. Overlap-aware diarizer requires separate RTTM generation and evaluation modules to check the effect of - overlap detection in speaker diarization. - """ - - def __init__(self, cfg: Union[DictConfig, NeuralDiarizerInferenceConfig]): - super().__init__() - self._cfg = cfg - - # Parameter settings for MSDD model - self.use_speaker_model_from_ckpt = cfg.diarizer.msdd_model.parameters.get('use_speaker_model_from_ckpt', True) - self.use_clus_as_main = cfg.diarizer.msdd_model.parameters.get('use_clus_as_main', False) - self.max_overlap_spks = cfg.diarizer.msdd_model.parameters.get('max_overlap_spks', 2) - self.num_spks_per_model = cfg.diarizer.msdd_model.parameters.get('num_spks_per_model', 2) - self.use_adaptive_thres = cfg.diarizer.msdd_model.parameters.get('use_adaptive_thres', True) - self.max_pred_length = cfg.diarizer.msdd_model.parameters.get('max_pred_length', 0) - self.diar_eval_settings = cfg.diarizer.msdd_model.parameters.get( - 'diar_eval_settings', [(0.25, True), (0.25, False), (0.0, False)] - ) - - self._init_msdd_model(cfg) - self.diar_window_length = cfg.diarizer.msdd_model.parameters.diar_window_length - self.msdd_model.cfg = self.transfer_diar_params_to_model_params(self.msdd_model, cfg) - - # Initialize clustering and embedding preparation instance (as a diarization encoder). - self.clustering_embedding = ClusterEmbedding( - cfg_diar_infer=cfg, cfg_msdd_model=self.msdd_model.cfg, speaker_model=self._speaker_model - ) - - # Parameters for creating diarization results from MSDD outputs. 
- self.clustering_max_spks = self.msdd_model._cfg.max_num_of_spks - self.overlap_infer_spk_limit = cfg.diarizer.msdd_model.parameters.get( - 'overlap_infer_spk_limit', self.clustering_max_spks - ) - - def transfer_diar_params_to_model_params(self, msdd_model, cfg): - """ - Transfer the parameters that are needed for MSDD inference from the diarization inference config files - to MSDD model config `msdd_model.cfg`. - """ - msdd_model.cfg.diarizer.out_dir = cfg.diarizer.out_dir - msdd_model.cfg.test_ds.manifest_filepath = cfg.diarizer.manifest_filepath - msdd_model.cfg.test_ds.emb_dir = cfg.diarizer.out_dir - msdd_model.cfg.test_ds.batch_size = cfg.diarizer.msdd_model.parameters.infer_batch_size - msdd_model.cfg.test_ds.seq_eval_mode = cfg.diarizer.msdd_model.parameters.seq_eval_mode - msdd_model._cfg.max_num_of_spks = cfg.diarizer.clustering.parameters.max_num_speakers - return msdd_model.cfg - - @rank_zero_only - def save_to(self, save_path: str): - """ - Saves model instances (weights and configuration) into EFF archive. - You can use "restore_from" method to fully restore instance from .nemo file. - - .nemo file is an archive (tar.gz) with the following: - model_config.yaml - model configuration in .yaml format. You can deserialize this into cfg argument for model's constructor - model_wights.chpt - model checkpoint - - Args: - save_path: Path to .nemo file where model instance should be saved - """ - self.clus_diar = self.clustering_embedding.clus_diar_model - _NEURAL_DIAR_MODEL = "msdd_model.nemo" - - with tempfile.TemporaryDirectory() as tmpdir: - config_yaml = os.path.join(tmpdir, _MODEL_CONFIG_YAML) - spkr_model = os.path.join(tmpdir, _SPEAKER_MODEL) - neural_diar_model = os.path.join(tmpdir, _NEURAL_DIAR_MODEL) - - self.clus_diar.to_config_file(path2yaml_file=config_yaml) - if self.clus_diar.has_vad_model: - vad_model = os.path.join(tmpdir, _VAD_MODEL) - self.clus_diar._vad_model.save_to(vad_model) - self.clus_diar._speaker_model.save_to(spkr_model) - self.msdd_model.save_to(neural_diar_model) - self.clus_diar.__make_nemo_file_from_folder(filename=save_path, source_dir=tmpdir) - - def extract_standalone_speaker_model(self, prefix: str = 'msdd._speaker_model.') -> EncDecSpeakerLabelModel: - """ - MSDD model file contains speaker embedding model and MSDD model. This function extracts standalone speaker model and save it to - `self.spk_emb_state_dict` to be loaded separately for clustering diarizer. - - Args: - ext (str): - File-name extension of the provided model path. - Returns: - standalone_model_path (str): - Path to the extracted standalone model without speaker embedding extractor model. - """ - model_state_dict = self.msdd_model.state_dict() - spk_emb_module_names = [] - for name in model_state_dict.keys(): - if prefix in name: - spk_emb_module_names.append(name) - - spk_emb_state_dict = {} - for name in spk_emb_module_names: - org_name = name.replace(prefix, '') - spk_emb_state_dict[org_name] = model_state_dict[name] - - _speaker_model = EncDecSpeakerLabelModel.from_config_dict(self.msdd_model.cfg.speaker_model_cfg) - _speaker_model.load_state_dict(spk_emb_state_dict) - return _speaker_model - - def _init_msdd_model(self, cfg: Union[DictConfig, NeuralDiarizerInferenceConfig]): - - """ - Initialized MSDD model with the provided config. Load either from `.nemo` file or `.ckpt` checkpoint files. 
-        """
-        model_path = cfg.diarizer.msdd_model.model_path
-        if model_path.endswith('.nemo'):
-            logging.info(f"Using local nemo file from {model_path}")
-            self.msdd_model = EncDecDiarLabelModel.restore_from(restore_path=model_path, map_location=cfg.device)
-        elif model_path.endswith('.ckpt'):
-            logging.info(f"Using local checkpoint from {model_path}")
-            self.msdd_model = EncDecDiarLabelModel.load_from_checkpoint(
-                checkpoint_path=model_path, map_location=cfg.device
-            )
-        else:
-            if model_path not in get_available_model_names(EncDecDiarLabelModel):
-                logging.warning(f"requested {model_path} model name not available in pretrained models")
-            logging.info("Loading pretrained {} model from NGC".format(model_path))
-            self.msdd_model = EncDecDiarLabelModel.from_pretrained(model_name=model_path, map_location=cfg.device)
-        # Load speaker embedding model state_dict which is loaded from the MSDD checkpoint.
-        if self.use_speaker_model_from_ckpt:
-            self._speaker_model = self.extract_standalone_speaker_model()
-        else:
-            self._speaker_model = None
-
-    def get_pred_mat(self, data_list: List[Union[Tuple[int], List[torch.Tensor]]]) -> torch.Tensor:
-        """
-        This function puts together the pairwise, two-speaker predicted results to form a finalized matrix that has a dimension of
-        `(total_len, n_est_spks)`. The pairwise results are eventually averaged. For example, in a 4-speaker case (speakers 1, 2, 3, 4),
-        the sum of the pairwise results (1, 2), (1, 3), (1, 4) is divided by 3 to take the average of the sigmoid values.
-
-        Args:
-            data_list (list):
-                List containing data points from the `test_data_collection` variable. `data_list` has sublists `data` as follows:
-                data[0]: `target_spks` tuple
-                    Examples: (0, 1, 2)
-                data[1]: Tensor containing estimated sigmoid values.
-                   [[0.0264, 0.9995],
-                    [0.0112, 1.0000],
-                    ...,
-                    [1.0000, 0.0512]]
-
-        Returns:
-            sum_pred (Tensor):
-                Tensor containing the averaged sigmoid values for each speaker.
-        """
-        all_tups = tuple()
-        for data in data_list:
-            all_tups += data[0]
-        n_est_spks = len(set(all_tups))
-        digit_map = dict(zip(sorted(set(all_tups)), range(n_est_spks)))
-        total_len = max([sess[1].shape[1] for sess in data_list])
-        sum_pred = torch.zeros(total_len, n_est_spks)
-        for (_dim_tup, pred_mat) in data_list:
-            dim_tup = [digit_map[x] for x in _dim_tup]
-            if len(pred_mat.shape) == 3:
-                pred_mat = pred_mat.squeeze(0)
-            if n_est_spks <= self.num_spks_per_model:
-                sum_pred = pred_mat
-            else:
-                _end = pred_mat.shape[0]
-                sum_pred[:_end, dim_tup] += pred_mat.cpu().float()
-        sum_pred = sum_pred / (n_est_spks - 1)
-        return sum_pred
-
-    def get_integrated_preds_list(
-        self, uniq_id_list: List[str], test_data_collection: List[Any], preds_list: List[torch.Tensor]
-    ) -> List[torch.Tensor]:
-        """
-        Merge multiple sequence inference outputs into a session-level result.
-
-        Args:
-            uniq_id_list (list):
-                List containing `uniq_id` values.
-            test_data_collection (collections.DiarizationLabelEntity):
-                Class instance containing session information such as targeted speaker indices, audio filepaths and RTTM filepaths.
-            preds_list (list):
-                List containing tensors filled with sigmoid values.
-
-        Returns:
-            output_list (list):
-                List containing session-level estimated prediction matrices.
-        """
-        session_dict = get_id_tup_dict(uniq_id_list, test_data_collection, preds_list)
-        output_dict = {uniq_id: [] for uniq_id in uniq_id_list}
-        for uniq_id, data_list in session_dict.items():
-            sum_pred = self.get_pred_mat(data_list)
-            output_dict[uniq_id] = sum_pred.unsqueeze(0)
-        output_list = [output_dict[uniq_id] for uniq_id in uniq_id_list]
-        return output_list
-
-    def get_emb_clus_infer(self, cluster_embeddings):
-        """Assign dictionaries containing the clustering results from the class instance `cluster_embeddings`.
-        """
-        self.msdd_model.emb_sess_test_dict = cluster_embeddings.emb_sess_test_dict
-        self.msdd_model.clus_test_label_dict = cluster_embeddings.clus_test_label_dict
-        self.msdd_model.emb_seq_test = cluster_embeddings.emb_seq_test
-
-    @torch.no_grad()
-    def diarize(self) -> Optional[List[Optional[List[Tuple[DiarizationErrorRate, Dict]]]]]:
-        """
-        Launch the diarization pipeline, which starts from VAD (or oracle VAD timestamp generation), followed by initial clustering and the multiscale diarization decoder (MSDD).
-        Note that the result of MSDD can include multiple speakers at the same time. Therefore, the RTTM output of MSDD needs to be based on the `make_rttm_with_overlap()`
-        function, which can generate overlapping timestamps. The `self.run_overlap_aware_eval()` function performs DER evaluation.
-        """
-        self.clustering_embedding.prepare_cluster_embs_infer()
-        self.msdd_model.pairwise_infer = True
-        self.get_emb_clus_infer(self.clustering_embedding)
-        preds_list, targets_list, signal_lengths_list = self.run_pairwise_diarization()
-        thresholds = list(self._cfg.diarizer.msdd_model.parameters.sigmoid_threshold)
-        return [self.run_overlap_aware_eval(preds_list, threshold) for threshold in thresholds]
-
-    def get_range_average(
-        self, signals: torch.Tensor, emb_vectors: torch.Tensor, diar_window_index: int, test_data_collection: List[Any]
-    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
-        """
-        This function is only used when `split_infer=True`. It calculates cluster-average embeddings for the given short range.
-        The range length is set by `self.diar_window_length`, and each cluster-average is only calculated for the specified range.
-        Note that if the specified range does not contain some speakers (e.g. the range contains speakers 1, 3) compared to the global speaker set
-        (e.g. speakers 1, 2, 3, 4), then the missing speakers (e.g. speakers 2, 4) are assigned a zero-filled cluster-average speaker embedding.
-
-        Args:
-            signals (Tensor):
-                Zero-padded input multi-scale embedding sequences.
-                Shape: (length, scale_n, emb_vectors, emb_dim)
-            emb_vectors (Tensor):
-                Cluster-average multi-scale embedding vectors.
-                Shape: (length, scale_n, emb_vectors, emb_dim)
-            diar_window_index (int):
-                Index of the split diarization windows.
-            test_data_collection (collections.DiarizationLabelEntity):
-                Class instance containing session information such as targeted speaker indices, audio filepath and RTTM filepath.
-
-        Returns:
-            emb_vectors_split (Tensor):
-                Cluster-average speaker embedding vectors for each scale.
-            emb_seq (Tensor):
-                Zero-padded multi-scale embedding sequences.
-            seq_len (int):
-                Length of the sequence determined by the `self.diar_window_length` variable.
- """ - emb_vectors_split = torch.zeros_like(emb_vectors) - uniq_id = os.path.splitext(os.path.basename(test_data_collection.audio_file))[0] - clus_label_tensor = torch.tensor([x[-1] for x in self.msdd_model.clus_test_label_dict[uniq_id]]) - for spk_idx in range(len(test_data_collection.target_spks)): - stt, end = ( - diar_window_index * self.diar_window_length, - min((diar_window_index + 1) * self.diar_window_length, clus_label_tensor.shape[0]), - ) - seq_len = end - stt - if stt < clus_label_tensor.shape[0]: - target_clus_label_tensor = clus_label_tensor[stt:end] - emb_seq, seg_length = ( - signals[stt:end, :, :], - min( - self.diar_window_length, - clus_label_tensor.shape[0] - diar_window_index * self.diar_window_length, - ), - ) - target_clus_label_bool = target_clus_label_tensor == test_data_collection.target_spks[spk_idx] - - # There are cases where there is no corresponding speaker in split range, so any(target_clus_label_bool) could be False. - if any(target_clus_label_bool): - emb_vectors_split[:, :, spk_idx] = torch.mean(emb_seq[target_clus_label_bool], dim=0) - - # In case when the loop reaches the end of the sequence - if seq_len < self.diar_window_length: - emb_seq = torch.cat( - [ - emb_seq, - torch.zeros(self.diar_window_length - seq_len, emb_seq.shape[1], emb_seq.shape[2]).to( - signals.device - ), - ], - dim=0, - ) - else: - emb_seq = torch.zeros(self.diar_window_length, emb_vectors.shape[0], emb_vectors.shape[1]).to( - signals.device - ) - seq_len = 0 - return emb_vectors_split, emb_seq, seq_len - - def get_range_clus_avg_emb( - self, test_batch: List[torch.Tensor], _test_data_collection: List[Any], device: torch.device('cpu') - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - This function is only used when `get_range_average` function is called. This module calculates cluster-average embeddings for - the given short range. The range length is set by `self.diar_window_length`, and each cluster-average is only calculated for the specified range. - - Args: - test_batch: (list) - List containing embedding sequences, length of embedding sequences, ground truth labels (if exists) and initializing embedding vectors. - test_data_collection: (list) - List containing test-set dataloader contents. test_data_collection includes wav file path, RTTM file path, clustered speaker indices. - - Returns: - sess_emb_vectors (Tensor): - Tensor of cluster-average speaker embedding vectors. - Shape: (batch_size, scale_n, emb_dim, 2*num_of_spks) - sess_emb_seq (Tensor): - Tensor of input multi-scale embedding sequences. - Shape: (batch_size, length, scale_n, emb_dim) - sess_sig_lengths (Tensor): - Tensor of the actucal sequence length without zero-padding. 
- Shape: (batch_size) - """ - _signals, signal_lengths, _targets, _emb_vectors = test_batch - sess_emb_vectors, sess_emb_seq, sess_sig_lengths = [], [], [] - split_count = torch.ceil(torch.tensor(_signals.shape[1] / self.diar_window_length)).int() - self.max_pred_length = max(self.max_pred_length, self.diar_window_length * split_count) - for k in range(_signals.shape[0]): - signals, emb_vectors, test_data_collection = _signals[k], _emb_vectors[k], _test_data_collection[k] - for diar_window_index in range(split_count): - emb_vectors_split, emb_seq, seq_len = self.get_range_average( - signals, emb_vectors, diar_window_index, test_data_collection - ) - sess_emb_vectors.append(emb_vectors_split) - sess_emb_seq.append(emb_seq) - sess_sig_lengths.append(seq_len) - sess_emb_vectors = torch.stack(sess_emb_vectors).to(device) - sess_emb_seq = torch.stack(sess_emb_seq).to(device) - sess_sig_lengths = torch.tensor(sess_sig_lengths).to(device) - return sess_emb_vectors, sess_emb_seq, sess_sig_lengths - - def diar_infer( - self, test_batch: List[torch.Tensor], test_data_collection: List[Any] - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Launch forward_infer() function by feeding the session-wise embedding sequences to get pairwise speaker prediction values. - If split_infer is True, the input audio clips are broken into short sequences then cluster average embeddings are calculated - for inference. Split-infer might result in an improved results if calculating clustering average on the shorter tim-espan can - help speaker assignment. - - Args: - test_batch: (list) - List containing embedding sequences, length of embedding sequences, ground truth labels (if exists) and initializing embedding vectors. - test_data_collection: (list) - List containing test-set dataloader contents. test_data_collection includes wav file path, RTTM file path, clustered speaker indices. - - Returns: - preds (Tensor): - Tensor containing predicted values which are generated from MSDD model. - targets (Tensor): - Tensor containing binary ground-truth values. - signal_lengths (Tensor): - The actual Session length (number of steps = number of base-scale segments) without zero padding. 
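Editor's note: the split-inference bookkeeping used by `get_range_clus_avg_emb` and `diar_infer` is a ceiling division over `diar_window_length`. A minimal sketch with invented sizes, showing how the split count and each window's start/end are derived (the last window may be shorter and is later zero-padded):

```
import torch

seq_len, diar_window_length = 50, 16              # invented sequence length and window size
split_count = int(torch.ceil(torch.tensor(seq_len / diar_window_length)))

for diar_window_index in range(split_count):
    stt = diar_window_index * diar_window_length
    end = min((diar_window_index + 1) * diar_window_length, seq_len)
    print(diar_window_index, stt, end, end - stt)  # windows: 0-16, 16-32, 32-48, 48-50
```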
- """ - signals, signal_lengths, _targets, emb_vectors = test_batch - if self._cfg.diarizer.msdd_model.parameters.split_infer: - split_count = torch.ceil(torch.tensor(signals.shape[1] / self.diar_window_length)).int() - sess_emb_vectors, sess_emb_seq, sess_sig_lengths = self.get_range_clus_avg_emb( - test_batch, test_data_collection, device=self.msdd_model.device - ) - with autocast(): - _preds, scale_weights = self.msdd_model.forward_infer( - input_signal=sess_emb_seq, - input_signal_length=sess_sig_lengths, - emb_vectors=sess_emb_vectors, - targets=None, - ) - _preds = _preds.reshape(len(signal_lengths), split_count * self.diar_window_length, -1) - _preds = _preds[:, : signals.shape[1], :] - else: - with autocast(): - _preds, scale_weights = self.msdd_model.forward_infer( - input_signal=signals, input_signal_length=signal_lengths, emb_vectors=emb_vectors, targets=None - ) - self.max_pred_length = max(_preds.shape[1], self.max_pred_length) - preds = torch.zeros(_preds.shape[0], self.max_pred_length, _preds.shape[2]) - targets = torch.zeros(_preds.shape[0], self.max_pred_length, _preds.shape[2]) - preds[:, : _preds.shape[1], :] = _preds - return preds, targets, signal_lengths - - @torch.no_grad() - def run_pairwise_diarization(self) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: - """ - Setup the parameters needed for batch inference and run batch inference. Note that each sample is pairwise speaker input. - The pairwise inference results are reconstructed to make session-wise prediction results. - - Returns: - integrated_preds_list: (list) - List containing the session-wise speaker predictions in torch.tensor format. - targets_list: (list) - List containing the ground-truth labels in matrix format filled with 0 or 1. - signal_lengths_list: (list) - List containing the actual length of each sequence in session. - """ - self.out_rttm_dir = self.clustering_embedding.out_rttm_dir - self.msdd_model.setup_test_data(self.msdd_model.cfg.test_ds) - self.msdd_model.eval() - cumul_sample_count = [0] - preds_list, targets_list, signal_lengths_list = [], [], [] - uniq_id_list = get_uniq_id_list_from_manifest(self.msdd_model.cfg.test_ds.manifest_filepath) - test_data_collection = [d for d in self.msdd_model.data_collection] - for sidx, test_batch in enumerate(tqdm(self.msdd_model.test_dataloader())): - signals, signal_lengths, _targets, emb_vectors = test_batch - cumul_sample_count.append(cumul_sample_count[-1] + signal_lengths.shape[0]) - preds, targets, signal_lengths = self.diar_infer( - test_batch, test_data_collection[cumul_sample_count[-2] : cumul_sample_count[-1]] - ) - if self._cfg.diarizer.msdd_model.parameters.seq_eval_mode: - self.msdd_model._accuracy_test(preds, targets, signal_lengths) - - preds_list.extend(list(torch.split(preds, 1))) - targets_list.extend(list(torch.split(targets, 1))) - signal_lengths_list.extend(list(torch.split(signal_lengths, 1))) - - if self._cfg.diarizer.msdd_model.parameters.seq_eval_mode: - f1_score, simple_acc = self.msdd_model.compute_accuracies() - logging.info(f"Test Inference F1 score. {f1_score:.4f}, simple Acc. 
{simple_acc:.4f}") - integrated_preds_list = self.get_integrated_preds_list(uniq_id_list, test_data_collection, preds_list) - return integrated_preds_list, targets_list, signal_lengths_list - - def run_overlap_aware_eval( - self, preds_list: List[torch.Tensor], threshold: float - ) -> List[Optional[Tuple[DiarizationErrorRate, Dict]]]: - """ - Based on the predicted sigmoid values, render RTTM files then evaluate the overlap-aware diarization results. - - Args: - preds_list: (list) - List containing predicted pairwise speaker labels. - threshold: (float) - A floating-point threshold value that determines overlapped speech detection. - - If threshold is 1.0, no overlap speech is detected and only detect major speaker. - - If threshold is 0.0, all speakers are considered active at any time step. - """ - logging.info( - f" [Threshold: {threshold:.4f}] [use_clus_as_main={self.use_clus_as_main}] [diar_window={self.diar_window_length}]" - ) - outputs = [] - manifest_filepath = self.msdd_model.cfg.test_ds.manifest_filepath - rttm_map = audio_rttm_map(manifest_filepath) - for k, (collar, ignore_overlap) in enumerate(self.diar_eval_settings): - all_reference, all_hypothesis = make_rttm_with_overlap( - manifest_filepath, - self.msdd_model.clus_test_label_dict, - preds_list, - threshold=threshold, - infer_overlap=True, - use_clus_as_main=self.use_clus_as_main, - overlap_infer_spk_limit=self.overlap_infer_spk_limit, - use_adaptive_thres=self.use_adaptive_thres, - max_overlap_spks=self.max_overlap_spks, - out_rttm_dir=self.out_rttm_dir, - ) - output = score_labels( - rttm_map, - all_reference, - all_hypothesis, - collar=collar, - ignore_overlap=ignore_overlap, - verbose=self._cfg.verbose, - ) - outputs.append(output) - logging.info(f" \n") - return outputs - - @classmethod - def from_pretrained( - cls, - model_name: str, - vad_model_name: str = 'vad_multilingual_marblenet', - map_location: Optional[str] = None, - verbose: bool = False, - ): - """ - Instantiate a `NeuralDiarizer` to run Speaker Diarization. - - Args: - model_name (str): Path/Name of the neural diarization model to load. - vad_model_name (str): Path/Name of the voice activity detection (VAD) model to load. - map_location (str): Optional str to map the instantiated model to a device (cpu, cuda). - By default, (None), it will select a GPU if available, falling back to CPU otherwise. - verbose (bool): Enable verbose logging when loading models/running diarization. - Returns: - `NeuralDiarizer` - """ - logging.setLevel(logging.INFO if verbose else logging.WARNING) - cfg = NeuralDiarizerInferenceConfig.init_config( - diar_model_path=model_name, vad_model_path=vad_model_name, map_location=map_location, verbose=verbose, - ) - return cls(cfg) - - def __call__( - self, - audio_filepath: str, - batch_size: int = 64, - num_workers: int = 1, - max_speakers: Optional[int] = None, - num_speakers: Optional[int] = None, - out_dir: Optional[str] = None, - verbose: bool = False, - ) -> Union[Annotation, List[Annotation]]: - """ - Run the `NeuralDiarizer` inference pipeline. - - Args: - audio_filepath (str, list): Audio path to run speaker diarization on. - max_speakers (int): If known, the max number of speakers in the file(s). - num_speakers (int): If known, the exact number of speakers in the file(s). - batch_size (int): Batch size when running inference. - num_workers (int): Number of workers to use in data-loading. - out_dir (str): Path to store intermediate files during inference (default temp directory). 
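Editor's note: the threshold behaviour described in `run_overlap_aware_eval` (1.0 keeps only the dominant speaker, 0.0 treats every speaker as active) can be approximated with a small thresholding sketch. This is a conceptual illustration only; the actual overlap handling lives in `make_rttm_with_overlap`, and the scores below are invented.

```
import torch

preds = torch.tensor([[0.9, 0.2],
                      [0.8, 0.7],
                      [0.1, 0.6]])    # per-step sigmoid scores for two speakers (invented)

def active_speakers(preds, threshold):
    active = preds > threshold
    # Keep the argmax speaker so every step has at least one active (the "major") speaker.
    active[torch.arange(preds.shape[0]), preds.argmax(dim=1)] = True
    return active

print(active_speakers(preds, 1.0))    # only the dominant speaker per step, no overlap
print(active_speakers(preds, 0.0))    # every speaker above zero is marked active
```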
- Returns: - `pyannote.Annotation` for each audio path, containing speaker labels and segment timestamps. - """ - if out_dir: - os.makedirs(out_dir, exist_ok=True) - with tempfile.TemporaryDirectory(dir=out_dir) as tmpdir: - manifest_path = os.path.join(tmpdir, 'manifest.json') - meta = [ - { - 'audio_filepath': audio_filepath, - 'offset': 0, - 'duration': None, - 'label': 'infer', - 'text': '-', - 'num_speakers': num_speakers, - 'rttm_filepath': None, - 'uem_filepath': None, - } - ] - - with open(manifest_path, 'w') as f: - f.write('\n'.join(json.dumps(x) for x in meta)) - - self._initialize_configs( - manifest_path=manifest_path, - max_speakers=max_speakers, - num_speakers=num_speakers, - tmpdir=tmpdir, - batch_size=batch_size, - num_workers=num_workers, - verbose=verbose, - ) - - self.msdd_model.cfg.test_ds.manifest_filepath = manifest_path - self.diarize() - - pred_labels_clus = rttm_to_labels(f'{tmpdir}/pred_rttms/{Path(audio_filepath).stem}.rttm') - return labels_to_pyannote_object(pred_labels_clus) - - def _initialize_configs( - self, - manifest_path: str, - max_speakers: Optional[int], - num_speakers: Optional[int], - tmpdir: tempfile.TemporaryDirectory, - batch_size: int, - num_workers: int, - verbose: bool, - ) -> None: - self._cfg.batch_size = batch_size - self._cfg.num_workers = num_workers - self._cfg.diarizer.manifest_filepath = manifest_path - self._cfg.diarizer.out_dir = tmpdir - self._cfg.verbose = verbose - self._cfg.diarizer.clustering.parameters.oracle_num_speakers = num_speakers is not None - if max_speakers: - self._cfg.diarizer.clustering.parameters.max_num_speakers = max_speakers - self.transfer_diar_params_to_model_params(self.msdd_model, self._cfg) - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - return EncDecDiarLabelModel.list_available_models() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/online_diarizer.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/online_diarizer.py deleted file mode 100644 index 7074b92f4c041b486419024302398280314bfacf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/online_diarizer.py +++ /dev/null @@ -1,579 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
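Editor's note: for reference, the one-line manifest that `NeuralDiarizer.__call__` (shown above, just before this deleted `online_diarizer.py` header) writes ahead of inference looks like the following. The audio path is a placeholder and each manifest line is a single JSON object.

```
import json

# Placeholder path; the dictionary mirrors the `meta` entry built in NeuralDiarizer.__call__.
entry = {
    'audio_filepath': '/path/to/audio.wav',
    'offset': 0,
    'duration': None,
    'label': 'infer',
    'text': '-',
    'num_speakers': None,        # set to an int if the speaker count is known
    'rttm_filepath': None,
    'uem_filepath': None,
}

with open('manifest.json', 'w') as f:
    f.write(json.dumps(entry) + '\n')   # one JSON object per line
```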
- -import os -import time -from copy import deepcopy -from typing import Dict - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.models import ClusteringDiarizer -from nemo.collections.asr.parts.utils.offline_clustering import get_scale_interpolated_embs, split_input_data -from nemo.collections.asr.parts.utils.online_clustering import OnlineSpeakerClustering -from nemo.collections.asr.parts.utils.speaker_utils import ( - OnlineSegmentor, - audio_rttm_map, - generate_cluster_labels, - get_embs_and_timestamps, -) -from nemo.utils import logging, model_utils - -__all__ = ['OnlineClusteringDiarizer'] - - -def timeit(method): - """ - Monitor elapsed time of the corresponding function displaying the method name. - - Args: - method: function that is being measured - - Return: - `timed` function for measuring the elapsed time - """ - - def timed(*args, **kwargs): - ts = time.time() - result = method(*args, **kwargs) - te = time.time() - if 'log_time' in kwargs: - name = kwargs.get('log_name', method.__name__.upper()) - kwargs['log_time'][name] = int((te - ts) * 1000) - else: - logging.info('%2.2fms %r' % ((te - ts) * 1000, method.__name__)) - return result - - return timed - - -class OnlineClusteringDiarizer(ClusteringDiarizer): - """ - A class that enables online (streaming) clustering based diarization. - - - The instance created from `OnlineClusteringDiarizer` sets aside a certain amount of memory - to provide the upcoming inference with history information - - - There are two major modules involved: `OnlineSegmentor` and `OnlineSpeakerClustering`. - OnlineSegmentor: Take the VAD-timestamps and generate segments for each scale - OnlineSpeakerClustering: Update the entire speaker labels of the given online session - while updating the speaker labels of the streaming inputs. - - - The overall diarization process is done by calling `diarize_step` function. 
- `diarize_step` function goes through the following steps: - (1) Segmentation (`OnlineSegmentor` class) - (2) Embedding extraction (`_extract_online_embeddings` function call) - (3) Online speaker counting and speaker clustering (`OnlineClusteringDiarizer` class) - (4) Label generation (`generate_cluster_labels` function call) - """ - - def __init__(self, cfg: DictConfig): - super().__init__(cfg) - self.cfg = model_utils.convert_model_config_to_dict_config(cfg) - self._cfg_diarizer = self.cfg.diarizer - self.base_scale_index = max(self.multiscale_args_dict['scale_dict'].keys()) - - self.uniq_id = self._cfg_diarizer.get('uniq_id', None) - self.decimals = self._cfg_diarizer.get('decimals', 2) - self.AUDIO_RTTM_MAP = audio_rttm_map(self.cfg.diarizer.manifest_filepath) - self.sample_rate = self.cfg.sample_rate - torch.manual_seed(0) - - self._out_dir = self._cfg_diarizer.out_dir - if not os.path.exists(self._out_dir): - os.mkdir(self._out_dir) - - if torch.cuda.is_available(): - self.cuda = True - self.device = torch.device("cuda") - else: - self.cuda = False - self.device = torch.device("cpu") - - self.reset() - - # Set speaker embedding model in eval mode - self._speaker_model.eval() - - def _init_online_clustering_module(self, clustering_params): - """ - Initialize online speaker clustering module - - Attributes: - online_clus (OnlineSpeakerClustering): - Online clustering diarizer class instance - history_n (int): - History buffer size for saving history of speaker label inference - Total number of embedding vectors saved in the buffer that is kept till the end of the session - current_n (int): - Current buffer (FIFO queue) size for calculating the speaker label inference - Total number of embedding vectors saved in the FIFO queue for clustering inference - """ - self.online_clus = OnlineSpeakerClustering( - max_num_speakers=clustering_params.max_num_speakers, - max_rp_threshold=clustering_params.max_rp_threshold, - sparse_search_volume=clustering_params.sparse_search_volume, - history_buffer_size=clustering_params.history_buffer_size, - current_buffer_size=clustering_params.current_buffer_size, - cuda=self.cuda, - ) - self.history_n = clustering_params.history_buffer_size - self.current_n = clustering_params.current_buffer_size - - self.max_num_speakers = self.online_clus.max_num_speakers - - def _init_online_segmentor_module(self, sample_rate): - """ - Initialize an online segmentor module - - Attributes: - online_segmentor (OnlineSegmentor): - online segmentation module that generates short speech segments from the VAD input - """ - self.online_segmentor = OnlineSegmentor(sample_rate) - - def _init_memory_buffer(self): - """ - Variables are kept in memory for future updates - - Attributes: - memory_margin (int): - The number of embeddings saved in the memory buffer. 
- This memory margin is dependent on the base scale length: margin = (buffer_length)/(base scale shift) - memory margin is automatically calculated to have minimal memory usage - memory_segment_ranges (dict): - The segment range information kept in the memory buffer - memory_segment_indexes (dict): - The segment indexes kept in the memory buffer - memory_cluster_labels (Tensor): - The cluster labels inferred in the previous diarization steps - """ - self.memory_margin = 0 - self.memory_segment_ranges = {key: [] for key in self.multiscale_args_dict['scale_dict'].keys()} - self.memory_segment_indexes = {key: [] for key in self.multiscale_args_dict['scale_dict'].keys()} - self.memory_cluster_labels = torch.tensor([]) - - def _init_temporal_major_voting_module(self, clustering_params): - """ - Variables needed for taking majority votes for speaker labels - - Attributes: - use_temporal_label_major_vote (bool): - Boolean for whether to use temporal majority voting - temporal_label_major_vote_buffer_size (int): - buffer size for majority voting - base_scale_label_dict (dict): - Dictionary containing multiple speaker labels for major voting - Speaker labels from multiple steps are saved for each segment index. - """ - self.use_temporal_label_major_vote = clustering_params.get('use_temporal_label_major_vote', False) - self.temporal_label_major_vote_buffer_size = clustering_params.get('temporal_label_major_vote_buffer_size', 1) - self.base_scale_label_dict = {} - - def _init_segment_variables(self): - """ - Initialize segment variables for each scale. - Note that we have `uniq_id` variable in case where multiple sessions are handled. - """ - self.emb_vectors = {} - self.time_stamps = {} - self.segment_range_ts = {} - self.segment_raw_audio = {} - self.segment_indexes = {} - - for scale_idx in self.multiscale_args_dict['scale_dict'].keys(): - self.multiscale_embeddings_and_timestamps[scale_idx] = [None, None] - self.emb_vectors[scale_idx] = torch.tensor([]) - self.time_stamps[scale_idx] = [] - self.segment_range_ts[scale_idx] = [] - self.segment_raw_audio[scale_idx] = [] - self.segment_indexes[scale_idx] = [] - - def _init_buffer_frame_timestamps(self): - """ - Timing variables transferred from OnlineDiarWithASR class. - Buffer is window region where input signal is kept for ASR. - Frame is window region where the actual inference ASR decoded results are updated - - Example: - buffer_len = 5.0 - frame_len = 1.0 - - |___Buffer___[___________]____________| - |____________[ Frame ]____________| - - | <- buffer_start - |____________| <- frame_start - |_____________________________________| <- buffer_end - - buffer_start = 12.0 - buffer_end = 17.0 - frame_start = 14.0 - - These timestamps and index variables are updated by OnlineDiarWithASR. - - Attributes: - frame_index (int): - Integer index of frame window - frame_start (float): - The start of the frame window - buffer_start (float): - The start of the buffer window - buffer_end (float): - The end of the buffer - """ - self.frame_index = 0 - self.frame_start = 0.0 - self.buffer_start = 0.0 - self.buffer_end = 0.0 - - def _transfer_timestamps_to_segmentor(self): - """ - Pass the timing information from streaming ASR buffers. - """ - self.online_segmentor.frame_start = self.frame_start - self.online_segmentor.buffer_start = self.buffer_start - self.online_segmentor.buffer_end = self.buffer_end - - def reset(self): - """ - Reset all the necessary variables and initialize classes. 
- - Attributes: - n_embed_seg_len (int): - Number of segments needed for 1 second of input time-series signal - """ - self.n_embed_seg_len = int( - self.sample_rate * self.multiscale_args_dict['scale_dict'][self.base_scale_index][0] - ) - self._init_segment_variables() - self._init_online_clustering_module(self._cfg_diarizer.clustering.parameters) - self._init_online_segmentor_module(self.cfg.sample_rate) - self._init_memory_buffer() - self._init_temporal_major_voting_module(self._cfg_diarizer.clustering.parameters) - self._init_buffer_frame_timestamps() - - def _clear_memory(self, scale_idx: int): - """ - Calculate how many segments should be removed from memory (`memory_margin`) and - save the necessary information. - `keep_range` determines how many segments and their corresponding embedding, raw audio, - timestamps in the memory of the online diarizer instance. - - Args: - scale_idx (int): - Scale index in integer type - """ - base_scale_shift = self.multiscale_args_dict['scale_dict'][self.base_scale_index][1] - self.memory_margin = int((self.buffer_end - self.buffer_start) / base_scale_shift) - - scale_buffer_size = int( - len(set(self.scale_mapping_dict[scale_idx].tolist())) - / len(set(self.scale_mapping_dict[self.base_scale_index].tolist())) - * (self.history_n + self.current_n) - ) - keep_range = scale_buffer_size + self.memory_margin - self.emb_vectors[scale_idx] = self.emb_vectors[scale_idx][-keep_range:] - self.segment_raw_audio[scale_idx] = self.segment_raw_audio[scale_idx][-keep_range:] - self.segment_range_ts[scale_idx] = self.segment_range_ts[scale_idx][-keep_range:] - self.segment_indexes[scale_idx] = self.segment_indexes[scale_idx][-keep_range:] - - @timeit - def _temporal_label_major_vote(self) -> torch.Tensor: - """ - Take a majority voting for every segment on temporal steps. This feature significantly reduces the error coming - from unstable speaker counting in the beginning of sessions. - - Returns: - maj_vote_labels (list): - List containing the major-voted speaker labels on temporal domain - """ - maj_vote_labels = [] - for seg_idx in self.memory_segment_indexes[self.base_scale_index]: - if seg_idx not in self.base_scale_label_dict: - self.base_scale_label_dict[seg_idx] = [self.memory_cluster_labels[seg_idx]] - else: - while len(self.base_scale_label_dict[seg_idx]) > self.temporal_label_major_vote_buffer_size: - self.base_scale_label_dict[seg_idx].pop(0) - self.base_scale_label_dict[seg_idx].append(self.memory_cluster_labels[seg_idx]) - - maj_vote_labels.append(torch.mode(torch.tensor(self.base_scale_label_dict[seg_idx]))[0].item()) - return maj_vote_labels - - def save_history_data(self, scale_idx: int, total_cluster_labels: torch.Tensor, is_online: bool) -> torch.Tensor: - """ - Save the temporary input to the class memory buffer. - - - Clustering is done for (hist_N + curr_N) number of embeddings. - - Thus, we need to remove the clustering results on the embedding memory. - - If self.diar.history_buffer_seg_end is not None, that indicates streaming diarization system - is starting to save embeddings to its memory. Thus, the new incoming clustering label should be separated. - - If `is_online = True`, old embeddings outside the window are removed to save GPU memory. 
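Editor's note: the temporal majority vote in `_temporal_label_major_vote` is a `torch.mode` over the last few label estimates kept per segment. A toy sketch with invented labels and a buffer size of 3:

```
import torch

temporal_label_major_vote_buffer_size = 3
label_history = [0, 1, 1, 1]            # labels assigned to one segment over successive steps

# Keep only the most recent estimates, then take the mode as the final label.
while len(label_history) > temporal_label_major_vote_buffer_size:
    label_history.pop(0)
major_label = torch.mode(torch.tensor(label_history))[0].item()
print(major_label)                      # 1
```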
- - Args: - scale_idx (int): - Scale index in integer - total_cluster_labels (Tensor): - The speaker labels from the beginning of the session to the current position - is_online (bool) - Boolean variable that indicates whether the system is currently in online mode or not - - Returns: - cluster_label_hyp (Tensor): - Majority voted speaker labels over multiple inferences - """ - total_cluster_labels = total_cluster_labels.tolist() - - if not is_online: - self.memory_segment_ranges[scale_idx] = deepcopy(self.segment_range_ts[scale_idx]) - self.memory_segment_indexes[scale_idx] = deepcopy(self.segment_indexes[scale_idx]) - if scale_idx == self.base_scale_index: - self.memory_cluster_labels = deepcopy(total_cluster_labels) - - # Only if there are newly obtained embeddings, update ranges and embeddings. - elif self.segment_indexes[scale_idx][-1] > self.memory_segment_indexes[scale_idx][-1]: - # Get the global index of the first segment we want to keep in the buffer - global_stt_idx = max(max(self.memory_segment_indexes[scale_idx]) - self.memory_margin, 0) - - # Convert global index global_stt_idx to buffer index buffer_stt_idx - segment_indexes_mat = torch.tensor(self.segment_indexes[scale_idx]) - buffer_stt_idx = torch.where(segment_indexes_mat == global_stt_idx)[0][0] - self.memory_segment_ranges[scale_idx][global_stt_idx:] = deepcopy( - self.segment_range_ts[scale_idx][buffer_stt_idx:] - ) - self.memory_segment_indexes[scale_idx][global_stt_idx:] = deepcopy( - self.segment_indexes[scale_idx][buffer_stt_idx:] - ) - if scale_idx == self.base_scale_index: - self.memory_cluster_labels[global_stt_idx:] = deepcopy(total_cluster_labels[global_stt_idx:]) - if len(self.memory_cluster_labels) != len(self.memory_segment_ranges[scale_idx]): - raise ValueError( - "self.memory_cluster_labels and self.memory_segment_ranges should always have the same length, " - f"but they have {len(self.memory_cluster_labels)} and {len(self.memory_segment_ranges[scale_idx])}." - ) - - # Remove unnecessary old values - self._clear_memory(scale_idx) - - if not ( - len(self.emb_vectors[scale_idx]) - == len(self.segment_raw_audio[scale_idx]) - == len(self.segment_indexes[scale_idx]) - == len(self.segment_range_ts[scale_idx]) - ): - raise ValueError( - "self.emb_vectors, self.segment_raw_audio, self.segment_indexes, and self.segment_range_ts " - "should always have the same length, " - f"but they have {len(self.emb_vectors[scale_idx])}, {len(self.segment_raw_audio[scale_idx])}, " - f"{len(self.segment_indexes[scale_idx])}, and {len(self.segment_range_ts[scale_idx])}, respectively." - ) - - if self.use_temporal_label_major_vote: - cluster_label_hyp = self._temporal_label_major_vote() - else: - cluster_label_hyp = self.memory_cluster_labels - return cluster_label_hyp - - @timeit - @torch.no_grad() - def _run_embedding_extractor(self, audio_signal: torch.Tensor) -> torch.Tensor: - """ - Call `forward` function of the speaker embedding model. - - Args: - audio_signal (Tensor): - Torch tensor containing time-series signal - - Returns: - Speaker embedding vectors for the given time-series input `audio_signal`. 
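Editor's note: the index arithmetic in `save_history_data` maps a global segment index back to its position inside the rolling buffer with `torch.where`. A sketch with invented indexes and margin:

```
import torch

memory_segment_indexes = [5, 6, 7, 8, 9, 10, 11, 12]     # global indexes already in memory (invented)
segment_indexes = torch.tensor([9, 10, 11, 12, 13, 14])  # global indexes currently in the buffer
memory_margin = 3

# First global index to refresh, then its position inside the buffer.
global_stt_idx = max(max(memory_segment_indexes) - memory_margin, 0)    # 9
buffer_stt_idx = torch.where(segment_indexes == global_stt_idx)[0][0]   # buffer position 0
print(global_stt_idx, int(buffer_stt_idx))
```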
- """ - audio_signal = torch.stack(audio_signal).float().to(self.device) - audio_signal_lens = torch.tensor([self.n_embed_seg_len for k in range(audio_signal.shape[0])]).to(self.device) - _, torch_embs = self._speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_lens) - return torch_embs - - @timeit - def _extract_online_embeddings( - self, audio_signal: torch.Tensor, segment_ranges: torch.Tensor, embeddings - ) -> torch.Tensor: - """ - Incrementally extract speaker embeddings based on `audio_signal` and `segment_ranges` variables. - Unlike offline speaker diarization, speaker embedding and subsegment ranges are not saved to disk. - Measures the mismatch between `segment_ranges` and `embeddings` then extract the necessary amount of - speaker embeddings. - - Args: - audio_signal (Tensor): - Torch tensor containing time-series audio signal - embeddings (Tensor): - Previously existing Torch tensor containing speaker embedding vector - segment_ranges(Tensor): - Torch tensor containing the start and end of each segment - - Returns: - embeddings (Tensor): - Concatenated speaker embedding vectors that match segment range information in `segment_ranges`. - """ - stt_idx = 0 if embeddings is None else embeddings.shape[0] - end_idx = len(segment_ranges) - - if end_idx > stt_idx: - torch_embs = self._run_embedding_extractor(audio_signal[stt_idx:end_idx]) - if embeddings is None or embeddings.shape[0] == 0: - embeddings = torch_embs - else: - embeddings = torch.vstack((embeddings[:stt_idx, :], torch_embs)) - - elif end_idx < stt_idx: - embeddings = embeddings[: len(segment_ranges)] - - if len(segment_ranges) != embeddings.shape[0]: - raise ValueError("Segment ranges and embeddings shapes do not match.") - return embeddings - - @timeit - def _perform_online_clustering( - self, uniq_embs_and_timestamps: Dict[str, torch.Tensor], cuda=False, - ) -> torch.Tensor: - """ - Launch online clustering for `uniq_embs_and_timestamps` input variable. - - Args: - uniq_embs_and_timestamps (dict): - Dictionary containing embeddings, timestamps and multiscale weights. - If uniq_embs_and_timestamps contains only one scale, single scale diarization - is performed. - cuda (bool): - Boolean indicator for cuda usages - """ - device = torch.device("cuda") if cuda else torch.device("cpu") - - # Get base-scale (the highest index) information from uniq_embs_and_timestamps. 
- embeddings_in_scales, timestamps_in_scales = split_input_data( - embeddings_in_scales=uniq_embs_and_timestamps['embeddings'], - timestamps_in_scales=uniq_embs_and_timestamps['timestamps'], - multiscale_segment_counts=uniq_embs_and_timestamps['multiscale_segment_counts'], - ) - - curr_emb, self.scale_mapping_dict = get_scale_interpolated_embs( - multiscale_weights=uniq_embs_and_timestamps['multiscale_weights'], - embeddings_in_scales=embeddings_in_scales, - timestamps_in_scales=timestamps_in_scales, - device=device, - ) - - base_segment_indexes = torch.tensor(self.segment_indexes[self.base_scale_index]).to(curr_emb.device) - merged_clus_labels = self.online_clus.forward_infer( - curr_emb=curr_emb, base_segment_indexes=base_segment_indexes, frame_index=self.frame_index, cuda=cuda, - ) - # Update history data - for scale_idx, (window, shift) in self.multiscale_args_dict['scale_dict'].items(): - cluster_label_hyp = self.save_history_data(scale_idx, merged_clus_labels, self.online_clus.is_online) - - return cluster_label_hyp - - def _get_interim_output(self) -> torch.Tensor: - """ - In case buffer is not filled or there is no speech activity in the input, generate temporary output. - - Returns: - diar_hyp (Tensor): Speaker labels based on the previously saved segments and speaker labels - """ - if len(self.memory_cluster_labels) == 0 or self.buffer_start < 0: - diar_hyp, _ = generate_cluster_labels([[0.0, self.total_buffer_in_secs]], [0]) - else: - diar_hyp, _ = generate_cluster_labels( - self.memory_segment_ranges[self.base_scale_index], self.memory_cluster_labels - ) - return diar_hyp - - @timeit - def diarize_step(self, audio_buffer: torch.Tensor, vad_timestamps: torch.Tensor) -> torch.Tensor: - """ - A function for a unit diarization step. Each diarization step goes through the following steps: - - 1. Segmentation: - Using `OnlineSegmentor` class, call `run_online_segmentation` method to get the segments. - 2. Embedding Extraction: - Extract multiscale embeddings from the extracted speech segments. - 3. Online Clustering & Counting - Perform online speaker clustering by using `OnlineSpeakerClustering` class. - 4. Generate speaker labels: - Generate start and end timestamps of speaker labels based on the diarization results. - - c.f.) Also see method `diarize` in `ClusteringDiarizer` class. - - Args: - audio_buffer (Tensor): - Tensor variable containing the time series signal at the current frame - Dimensions: (Number of audio time-series samples) x 1 - vad_timestamps (Tensor): - List containing VAD timestamps. - Dimensions: (Number of segments) x 2 - Example: - >>> vad_timestamps = torch.Tensor([[0.05, 2.52], [3.12, 6.85]]) - - Returns: - diar_hyp (Tensor): - Speaker label hypothesis from the start of the session to the current position - """ - self._transfer_timestamps_to_segmentor() - - # In case buffer is not filled or there is no speech activity in the input - if self.buffer_start < 0 or len(vad_timestamps) == 0: - return self._get_interim_output() - - # Segmentation: (c.f. see `diarize` function in ClusteringDiarizer class) - for scale_idx, (window, shift) in self.multiscale_args_dict['scale_dict'].items(): - - # Step 1: Get subsegments for embedding extraction. 
- audio_sigs, segment_ranges, range_inds = self.online_segmentor.run_online_segmentation( - audio_buffer=audio_buffer, - vad_timestamps=vad_timestamps, - segment_raw_audio=self.segment_raw_audio[scale_idx], - segment_range_ts=self.segment_range_ts[scale_idx], - segment_indexes=self.segment_indexes[scale_idx], - window=window, - shift=shift, - ) - self.segment_raw_audio[scale_idx] = audio_sigs - self.segment_range_ts[scale_idx] = segment_ranges - self.segment_indexes[scale_idx] = range_inds - - # Step 2-1: Extract speaker embeddings from the extracted subsegment timestamps. - embeddings = self._extract_online_embeddings( - audio_signal=self.segment_raw_audio[scale_idx], - segment_ranges=self.segment_range_ts[scale_idx], - embeddings=self.emb_vectors[scale_idx], - ) - - # Step 2-2:Save the embeddings and segmentation timestamps in memory - self.emb_vectors[scale_idx] = embeddings - - self.multiscale_embeddings_and_timestamps[scale_idx] = [ - {self.uniq_id: embeddings}, - {self.uniq_id: segment_ranges}, - ] - - embs_and_timestamps = get_embs_and_timestamps( - self.multiscale_embeddings_and_timestamps, self.multiscale_args_dict - ) - - # Step 3 - Clustering: Perform an online version of clustering algorithm - cluster_label_hyp = self._perform_online_clustering(embs_and_timestamps[self.uniq_id], cuda=self.cuda,) - - # Step 4: Generate RTTM style diarization labels from segment ranges and cluster labels - diar_hyp, _ = generate_cluster_labels(self.memory_segment_ranges[self.base_scale_index], cluster_label_hyp) - return diar_hyp diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_bpe_models.py deleted file mode 100644 index 2b8ed315903c863222dbea4b06b44190a55f353a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_bpe_models.py +++ /dev/null @@ -1,564 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
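Editor's note: to make the calling pattern of `diarize_step` (end of the deleted `online_diarizer.py` above) concrete, here is a schematic, self-contained loop. The diarizer below is a stub with the same surface; in practice `OnlineDiarWithASR` owns the buffer/frame timestamps and supplies real audio and VAD output, and all timing values here are invented.

```
import torch

class StubOnlineDiarizer:
    """Stand-in exposing the attributes and diarize_step signature used above."""

    def __init__(self):
        self.frame_index, self.frame_start = 0, 0.0
        self.buffer_start, self.buffer_end = 0.0, 0.0

    def diarize_step(self, audio_buffer, vad_timestamps):
        # Real pipeline: segmentation -> embedding extraction -> online clustering -> labels.
        return [f"{stt:.2f} {end:.2f} speaker_0" for stt, end in vad_timestamps.tolist()]

diarizer = StubOnlineDiarizer()
sample_rate, frame_len, buffer_len = 16000, 1.0, 5.0
for frame_index in range(3):
    audio_buffer = torch.zeros(int(buffer_len * sample_rate))   # placeholder audio buffer
    vad_timestamps = torch.tensor([[0.05, 0.80]])               # placeholder VAD output
    diarizer.frame_index = frame_index
    diarizer.frame_start = frame_index * frame_len
    diarizer.buffer_start = max(0.0, diarizer.frame_start - (buffer_len - frame_len))
    diarizer.buffer_end = diarizer.buffer_start + buffer_len
    diar_hyp = diarizer.diarize_step(audio_buffer, vad_timestamps)

print(diar_hyp)   # cumulative speaker-label hypothesis for the session
```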
- -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEWER, RNNTBPEDecoding, RNNTBPEDecodingConfig -from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - - -class EncDecRNNTBPEModel(EncDecRNNTModel, ASRBPEMixin): - """Base class for encoder decoder RNNT-based models with subword tokenization.""" - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_256", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_256/versions/1.6.0/files/stt_en_contextnet_256.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_512/versions/1.6.0/files/stt_en_contextnet_512.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_1024/versions/1.9.0/files/stt_en_contextnet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_256_mls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_256_mls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_256_mls/versions/1.0.0/files/stt_en_contextnet_256_mls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_512_mls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_512_mls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_512_mls/versions/1.0.0/files/stt_en_contextnet_512_mls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_contextnet_1024_mls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_contextnet_1024_mls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_contextnet_1024_mls/versions/1.0.0/files/stt_en_contextnet_1024_mls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_small", - 
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_small/versions/1.6.0/files/stt_en_conformer_transducer_small.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_medium/versions/1.6.0/files/stt_en_conformer_transducer_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_large/versions/1.10.0/files/stt_en_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_large_ls/versions/1.8.0/files/stt_en_conformer_transducer_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_xlarge/versions/1.10.0/files/stt_en_conformer_transducer_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_transducer_xxlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_transducer_xxlarge/versions/1.8.0/files/stt_en_conformer_transducer_xxlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_contextnet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_contextnet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_contextnet_1024/versions/1.4.0/files/stt_de_contextnet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_contextnet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_contextnet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_contextnet_1024/versions/1.5/files/stt_fr_contextnet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_contextnet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_contextnet_1024", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_contextnet_1024/versions/1.8.0/files/stt_es_contextnet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_transducer_large/versions/1.5.0/files/stt_de_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_conformer_transducer_large", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_transducer_large/versions/1.5/files/stt_fr_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_conformer_transducer_large/versions/1.8.0/files/stt_es_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_transducer_large/versions/1.0.0/files/stt_enes_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_contextnet_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_contextnet_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_contextnet_large/versions/1.0.0/files/stt_enes_contextnet_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ca_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_conformer_transducer_large/versions/1.11.0/files/stt_ca_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_rw_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_rw_conformer_transducer_large/versions/1.11.0/files/stt_rw_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_transducer_large_codesw", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_transducer_large_codesw", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_transducer_large_codesw/versions/1.0.0/files/stt_enes_conformer_transducer_large_codesw.nemo", - ) - 
results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_kab_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_kab_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_kab_conformer_transducer_large/versions/1.12.0/files/stt_kab_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_be_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_be_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_be_conformer_transducer_large/versions/1.12.0/files/stt_be_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hr_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_transducer_large/versions/1.11.0/files/stt_hr_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_conformer_transducer_large/versions/1.13.0/files/stt_it_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_conformer_transducer_large/versions/1.13.0/files/stt_ru_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_eo_conformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_eo_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_eo_conformer_transducer_large/versions/1.14.0/files/stt_eo_conformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_transducer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_large/versions/1.0.0/files/stt_en_fastconformer_transducer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_transducer_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_large_ls/versions/1.0.0/files/stt_en_fastconformer_transducer_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - 
pretrained_model_name="stt_en_fastconformer_transducer_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xlarge/versions/1.20.1/files/stt_en_fastconformer_transducer_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_transducer_xxlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xxlarge/versions/1.20.1/files/stt_en_fastconformer_transducer_xxlarge.nemo", - ) - results.append(model) - - return results - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - # Tokenizer is necessary for this model - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - if not isinstance(cfg, DictConfig): - cfg = OmegaConf.create(cfg) - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - cfg.labels = ListConfig(list(vocabulary)) - - with open_dict(cfg.decoder): - cfg.decoder.vocab_size = len(vocabulary) - - with open_dict(cfg.joint): - cfg.joint.num_classes = len(vocabulary) - cfg.joint.vocabulary = ListConfig(list(vocabulary)) - cfg.joint.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden - cfg.joint.jointnet.pred_hidden = cfg.model_defaults.pred_hidden - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding object - self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - # Setup wer object - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer: - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
- - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}' - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - if self.tokenizer_type == "agg": - new_joint_config["vocabulary"] = ListConfig(vocabulary) - else: - new_joint_config["vocabulary"] = ListConfig(list(vocabulary.keys())) - - new_joint_config['num_classes'] = len(vocabulary) - del self.joint - self.joint = EncDecRNNTBPEModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(vocabulary) - del self.decoder - self.decoder = EncDecRNNTBPEModel.from_config_dict(new_decoder_config) - - del self.loss - self.loss = RNNTLoss(num_classes=self.joint.num_classes_with_blank - 1) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoder to output to {self.joint.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
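Editor's note: a hedged sketch of the fine-tuning workflow `change_vocabulary` supports. The tokenizer directory is a placeholder produced offline (for example by NeMo's tokenizer-training scripts), and the model is loaded as in the earlier sketch.

```
from nemo.collections.asr.models import EncDecRNNTBPEModel

asr_model = EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_contextnet_256")

# Swap the decoder/joint vocabulary for a new subword tokenizer; encoder weights are kept.
asr_model.change_vocabulary(
    new_tokenizer_dir="/path/to/new_tokenizer_dir",   # placeholder path
    new_tokenizer_type="bpe",                         # 'agg', 'bpe' or 'wpe', per the docstring
)
```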
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - self.joint.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToBPEDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. - - Returns: - A pytorch DataLoader for the given audio file(s). 
- """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_models.py deleted file mode 100644 index 8b5798d343561dbd28bae67bf704de18dc9663ad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/rnnt_models.py +++ /dev/null @@ -1,994 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import os -import tempfile -from math import ceil, isclose -from typing import Dict, List, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs -from nemo.collections.asr.losses.rnnt import RNNTLoss, resolve_rnnt_default_loss_name -from nemo.collections.asr.metrics.rnnt_wer import RNNTWER, RNNTDecoding, RNNTDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.modules.rnnt import RNNTDecoderJoint -from nemo.collections.asr.parts.mixins import ASRModuleMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.core.classes import Exportable -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType -from nemo.utils import logging - - -class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecModel): - """Base class for encoder decoder RNNT-based models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - super().__init__(cfg=cfg, trainer=trainer) - - # Initialize components - self.preprocessor = EncDecRNNTModel.from_config_dict(self.cfg.preprocessor) - self.encoder = EncDecRNNTModel.from_config_dict(self.cfg.encoder) - - # Update config values required by components dynamically - with open_dict(self.cfg.decoder): - self.cfg.decoder.vocab_size = len(self.cfg.labels) - - with open_dict(self.cfg.joint): - self.cfg.joint.num_classes = len(self.cfg.labels) - self.cfg.joint.vocabulary = self.cfg.labels - self.cfg.joint.jointnet.encoder_hidden = self.cfg.model_defaults.enc_hidden - self.cfg.joint.jointnet.pred_hidden = self.cfg.model_defaults.pred_hidden - - self.decoder = EncDecRNNTModel.from_config_dict(self.cfg.decoder) - self.joint = EncDecRNNTModel.from_config_dict(self.cfg.joint) - - # Setup RNNT Loss - loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get("loss", None)) - - num_classes = self.joint.num_classes_with_blank - 1 # for standard RNNT and multi-blank - - if loss_name == 'tdt': - num_classes = num_classes - self.joint.num_extra_outputs - - self.loss = RNNTLoss( - num_classes=num_classes, - loss_name=loss_name, - loss_kwargs=loss_kwargs, - reduction=self.cfg.get("rnnt_reduction", "mean_batch"), - ) - - if hasattr(self.cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = EncDecRNNTModel.from_config_dict(self.cfg.spec_augment) - else: - self.spec_augmentation = None - - # Setup decoding objects - self.decoding = RNNTDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - # Setup WER calculation - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Whether to compute 
loss during evaluation - if 'compute_eval_loss' in self.cfg: - self.compute_eval_loss = self.cfg.compute_eval_loss - else: - self.compute_eval_loss = True - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Setup optimization normalization (if provided in config) - self.setup_optim_normalization() - - # Setup optional Optimization flags - self.setup_optimization_flags() - - # Setup encoder adapters (from ASRAdapterModelMixin) - self.setup_adapters() - - def setup_optim_normalization(self): - """ - Helper method to setup normalization of certain parts of the model prior to the optimization step. - - Supported pre-optimization normalizations are as follows: - - .. code-block:: yaml - - # Variation Noise injection - model: - variational_noise: - std: 0.0 - start_step: 0 - - # Joint - Length normalization - model: - normalize_joint_txu: false - - # Encoder Network - gradient normalization - model: - normalize_encoder_norm: false - - # Decoder / Prediction Network - gradient normalization - model: - normalize_decoder_norm: false - - # Joint - gradient normalization - model: - normalize_joint_norm: false - """ - # setting up the variational noise for the decoder - if hasattr(self.cfg, 'variational_noise'): - self._optim_variational_noise_std = self.cfg['variational_noise'].get('std', 0) - self._optim_variational_noise_start = self.cfg['variational_noise'].get('start_step', 0) - else: - self._optim_variational_noise_std = 0 - self._optim_variational_noise_start = 0 - - # Setup normalized gradients for model joint by T x U scaling factor (joint length normalization) - self._optim_normalize_joint_txu = self.cfg.get('normalize_joint_txu', False) - self._optim_normalize_txu = None - - # Setup normalized encoder norm for model - self._optim_normalize_encoder_norm = self.cfg.get('normalize_encoder_norm', False) - - # Setup normalized decoder norm for model - self._optim_normalize_decoder_norm = self.cfg.get('normalize_decoder_norm', False) - - # Setup normalized joint norm for model - self._optim_normalize_joint_norm = self.cfg.get('normalize_joint_norm', False) - - def extract_rnnt_loss_cfg(self, cfg: Optional[DictConfig]): - """ - Helper method to extract the rnnt loss name, and potentially its kwargs - to be passed. - - Args: - cfg: Should contain `loss_name` as a string which is resolved to a RNNT loss name. - If the default should be used, then `default` can be used. - Optionally, one can pass additional kwargs to the loss function. The subdict - should have a keyname as follows : `{loss_name}_kwargs`. - - Note that whichever loss_name is selected, that corresponding kwargs will be - selected. For the "default" case, the "{resolved_default}_kwargs" will be used. - - Examples: - .. code-block:: yaml - - loss_name: "default" - warprnnt_numba_kwargs: - kwargs2: some_other_val - - Returns: - A tuple, the resolved loss name as well as its kwargs (if found). 
- """ - if cfg is None: - cfg = DictConfig({}) - - loss_name = cfg.get("loss_name", "default") - - if loss_name == "default": - loss_name = resolve_rnnt_default_loss_name() - - loss_kwargs = cfg.get(f"{loss_name}_kwargs", None) - - logging.info(f"Using RNNT Loss : {loss_name}\n" f"Loss {loss_name}_kwargs: {loss_kwargs}") - - return loss_name, loss_kwargs - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - return_hypotheses: bool = False, - partial_hypothesis: Optional[List['Hypothesis']] = None, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - verbose: bool = True, - ) -> Tuple[List[str], Optional[List['Hypothesis']]]: - """ - Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. - - Args: - - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. - verbose: (bool) whether to display tqdm progress bar - Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
- """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - dither_value = self.preprocessor.featurizer.dither - pad_to_value = self.preprocessor.featurizer.pad_to - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - try: - self.preprocessor.featurizer.dither = 0.0 - self.preprocessor.featurizer.pad_to = 0 - - # Switch model to evaluation mode - self.eval() - # Freeze the encoder and decoder modules - self.encoder.freeze() - self.decoder.freeze() - self.joint.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2audio_files': paths2audio_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=(not verbose)): - encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( - encoded, - encoded_len, - return_hypotheses=return_hypotheses, - partial_hypotheses=partial_hypothesis, - ) - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - del encoded - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - self.preprocessor.featurizer.dither = dither_value - self.preprocessor.featurizer.pad_to = pad_to_value - - logging.set_verbosity(logging_level) - if mode is True: - self.encoder.unfreeze() - self.decoder.unfreeze() - self.joint.unfreeze() - return hypotheses, all_hypotheses - - def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - - Returns: None - - """ - if self.joint.vocabulary == new_vocabulary: - logging.warning(f"Old {self.joint.vocabulary} and new {new_vocabulary} match. Not changing anything.") - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. 
But I got: {new_vocabulary}') - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - new_joint_config['vocabulary'] = new_vocabulary - new_joint_config['num_classes'] = len(new_vocabulary) - del self.joint - self.joint = EncDecRNNTModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(new_vocabulary) - del self.decoder - self.decoder = EncDecRNNTModel.from_config_dict(new_decoder_config) - - del self.loss - loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get('loss', None)) - self.loss = RNNTLoss( - num_classes=self.joint.num_classes_with_blank - 1, loss_name=loss_name, loss_kwargs=loss_kwargs - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary) - - logging.info(f"Changed decoder to output to {self.joint.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - self.joint.temperature = decoding_cfg.get('temperature', 1.0) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - # Automatically inject args from model config to dataloader config - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels') - dataset = audio_to_text_dataset.get_audio_to_text_char_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToCharDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - input_signal_eltype = AudioSignal() - - return { - "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - """ - Forward pass of the model. Note that for RNNT Models, the forward pass of the model is a 3 step process, - and this method only performs the first step - forward of the acoustic model. - - Please refer to the `training_step` in order to see the full `forward` step for training - which - performs the forward of the acoustic model, the prediction network and then the joint network. - Finally, it computes the loss and possibly compute the detokenized text via the `decoding` step. - - Please refer to the `validation_step` in order to see the full `forward` step for inference - which - performs the forward of the acoustic model, the prediction network and then the joint network. - Finally, it computes the decoded tokens via the `decoding` step and possibly compute the batch metrics. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. - - Returns: - A tuple of 2 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- """ - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) is False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." - ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - # Spec augment is not applied during evaluation/testing - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - return encoded, encoded_len - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - # During training, loss must be computed, so decoder forward is necessary - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - sample_id = self._trainer.global_step - else: - log_every_n_steps = 1 - sample_id = batch_nb - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if (sample_id + 1) % log_every_n_steps == 0: - self.wer.update(encoded, encoded_len, transcript, transcript_len) - _, scores, words = self.wer.compute() - self.wer.reset() - tensorboard_logs.update({'training_batch_wer': scores.float() / words}) - - else: - # If experimental fused Joint-Loss-WER is used - if (sample_id + 1) % log_every_n_steps == 0: - compute_wer = True - else: - compute_wer = False - - # Fused joint step - loss_value, wer, _, _ = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoder, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=transcript_len, - compute_wer=compute_wer, - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 
'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if compute_wer: - tensorboard_logs.update({'training_batch_wer': wer}) - - # Log items - self.log_dict(tensorboard_logs) - - # Preserve batch acoustic model T and language model U parameters if normalizing - if self._optim_normalize_joint_txu: - self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] - - return {'loss': loss_value} - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len, sample_id = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, best_hyp_text)) - - def validation_pass(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len) - else: - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - tensorboard_logs = {} - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - if self.compute_eval_loss: - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - tensorboard_logs['val_loss'] = loss_value - - self.wer.update(encoded, encoded_len, transcript, transcript_len) - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - else: - # If experimental fused Joint-Loss-WER is used - compute_wer = True - - if self.compute_eval_loss: - decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) - else: - decoded = None - target_len = transcript_len - - # Fused joint step - loss_value, wer, wer_num, wer_denom = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoded, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=target_len, - compute_wer=compute_wer, - ) - - if loss_value is not None: - tensorboard_logs['val_loss'] = loss_value - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - return tensorboard_logs - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - metrics = self.validation_pass(batch, batch_idx, dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(metrics) - else: - self.validation_step_outputs.append(metrics) - return metrics - - def 
test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_loss_log = {'val_loss': val_loss_mean} - else: - val_loss_log = {} - wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**val_loss_log, 'val_wer': wer_num.float() / wer_denom} - return {**val_loss_log, 'log': tensorboard_logs} - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - test_loss_log = {'test_loss': test_loss_mean} - else: - test_loss_log = {} - wer_num = torch.stack([x['test_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**test_loss_log, 'test_wer': wer_num.float() / wer_denom} - return {**test_loss_log, 'log': tensorboard_logs} - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. - - Returns: - A pytorch DataLoader for the given audio file(s). 
- """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'labels': self.joint.vocabulary, - 'batch_size': batch_size, - 'trim_silence': False, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def on_after_backward(self): - super().on_after_backward() - if self._optim_variational_noise_std > 0 and self.global_step >= self._optim_variational_noise_start: - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - noise = torch.normal( - mean=0.0, - std=self._optim_variational_noise_std, - size=param.size(), - device=param.device, - dtype=param.dtype, - ) - param.grad.data.add_(noise) - - if self._optim_normalize_joint_txu: - T, U = self._optim_normalize_txu - if T is not None and U is not None: - for param_name, param in self.encoder.named_parameters(): - if param.grad is not None: - param.grad.data.div_(U) - - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - param.grad.data.div_(T) - - if self._optim_normalize_encoder_norm: - for param_name, param in self.encoder.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - if self._optim_normalize_decoder_norm: - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - if self._optim_normalize_joint_norm: - for param_name, param in self.joint.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - # EncDecRNNTModel is exported in 2 parts - def list_export_subnets(self): - return ['encoder', 'decoder_joint'] - - # for export - @property - def decoder_joint(self): - return RNNTDecoderJoint(self.decoder, self.joint) - - def set_export_config(self, args): - if 'decoder_type' in args: - if hasattr(self, 'change_decoding_strategy'): - self.change_decoding_strategy(decoder_type=args['decoder_type']) - else: - raise Exception("Model does not have decoder type option") - super().set_export_config(args) - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_conformer_transducer_large", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_zh_conformer_transducer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_conformer_transducer_large/versions/1.8.0/files/stt_zh_conformer_transducer_large.nemo", - ) - results.append(model) - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/slu_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/slu_models.py deleted file mode 100644 index 59323cfbfffefb23c66acc4d78062b05c6a3a372..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/slu_models.py +++ /dev/null @@ -1,659 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import tempfile -from math import ceil -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin, ASRModuleMixin -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.asr.parts.utils.slu_utils import SequenceGenerator, SequenceGeneratorConfig, get_seq_mask -from nemo.collections.common.losses import SmoothedNLLLoss -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType -from nemo.utils import logging, model_utils - -__all__ = ["SLUIntentSlotBPEModel"] - - -class SLUIntentSlotBPEModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, ASRBPEMixin): - """Model for end-to-end speech intent classification and slot filling, which is formulated as a speech-to-sequence task""" - - def __init__(self, cfg: DictConfig, trainer=None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - super().__init__(cfg=cfg, trainer=trainer) - - self.preprocessor = self.from_config_dict(self.cfg.preprocessor) - self.encoder = self.from_config_dict(self.cfg.encoder) - self.decoder = self.from_config_dict(self.cfg.decoder) - - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = 
self.from_config_dict(self._cfg.spec_augment) - else: - self.spec_augmentation = None - - # Setup optional Optimization flags - self.setup_optimization_flags() - - # Adapter modules setup (from ASRAdapterModelMixin) - self.setup_adapters() - - self.vocabulary = self.tokenizer.tokenizer.get_vocab() - vocab_size = len(self.vocabulary) - - # Create embedding layer - self.cfg.embedding["vocab_size"] = vocab_size - self.embedding = self.from_config_dict(self.cfg.embedding) - - # Create token classifier - self.cfg.classifier["num_classes"] = vocab_size - self.classifier = self.from_config_dict(self.cfg.classifier) - - self.loss = SmoothedNLLLoss(label_smoothing=self.cfg.loss.label_smoothing) - - self.sequence_generator = SequenceGenerator( - cfg=self.cfg.sequence_generator, - embedding=self.embedding, - decoder=self.decoder, - log_softmax=self.classifier, - tokenizer=self.tokenizer, - ) - - # Setup decoding objects - decoding_cfg = self.cfg.get('decoding', None) - - # In case decoding config not found, use default config - if decoding_cfg is None: - decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) - with open_dict(self.cfg): - self.cfg.decoding = decoding_cfg - - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) - - # Setup metric with decoding strategy - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - fold_consecutive=False, - ) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - input_signal_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "target_semantics": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "target_semantics_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType(), optional=True), - "lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "greedy_predictions": NeuralType(('B', 'T'), LabelsType(), optional=True), - } - - def set_decoding_strategy(self, cfg: SequenceGeneratorConfig): - cfg.max_sequence_length = self.sequence_generator.generator.max_seq_length - self.sequence_generator = SequenceGenerator(cfg, self.embedding, self.decoder, self.classifier, self.tokenizer) - - @typecheck() - def forward( - self, - input_signal=None, - input_signal_length=None, - target_semantics=None, - target_semantics_length=None, - processed_signal=None, - processed_signal_length=None, - ): - """ - Forward pass of the model. - - Params: - input_signal: Tensor that represents a batch of raw audio signals, of shape [B, T]. T here represents - timesteps, with 1 second of audio represented as `self.sample_rate` number of floating point values. - - input_signal_length: Vector of length B, that contains the individual lengths of the audio sequences. - - target_semantics: Tensor that represents a batch of semantic tokens, of shape [B, L]. 
- - target_semantics_length: Vector of length B, that contains the individual lengths of the semantic sequences. - - processed_signal: Tensor that represents a batch of processed audio signals, of shape (B, D, T) that has - undergone processing via some DALI preprocessor. - - processed_signal_length: Vector of length B, that contains the individual lengths of the processed audio - sequences. - - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the output sequence after decoder, of shape [B]. - 3) The token predictions of the model of shape [B, T]. - """ - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." - ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - encoded = encoded.transpose(1, 2) # BxDxT -> BxTxD - encoded_mask = get_seq_mask(encoded, encoded_len) - - if target_semantics is None: # in inference-only mode - predictions = self.sequence_generator(encoded, encoded_mask) - return None, None, predictions - - bos_semantics_tokens = target_semantics[:, :-1] - bos_semantics = self.embedding(bos_semantics_tokens) - bos_semantics_mask = get_seq_mask(bos_semantics, target_semantics_length - 1) - - decoded = self.decoder( - encoder_states=encoded, - encoder_mask=encoded_mask, - decoder_states=bos_semantics, - decoder_mask=bos_semantics_mask, - ) - log_probs = self.classifier(decoded) - - predictions = log_probs.argmax(dim=-1, keepdim=False) - - pred_len = self.sequence_generator.get_seq_length(predictions) - return log_probs, pred_len, predictions - - # PTL-specific methods - def training_step(self, batch, batch_nb): - if len(batch) == 4: - signal, signal_len, semantics, semantics_len = batch - else: - signal, signal_len, semantics, semantics_len, sample_id = batch - - log_probs, pred_len, predictions = self.forward( - input_signal=signal, - input_signal_length=signal_len, - target_semantics=semantics, - target_semantics_length=semantics_len, - ) - - eos_semantics = semantics[:, 1:] - eos_semantics_len = semantics_len - 1 # subtract 1 for eos tokens - - loss_value = self.loss(log_probs=log_probs, labels=eos_semantics, lengths=eos_semantics_len) - - tensorboard_logs = {'train_loss': loss_value.item()} - if len(self._optimizer.param_groups) == 1: - tensorboard_logs['learning_rate'] = self._optimizer.param_groups[0]['lr'] - else: - for i, group in enumerate(self._optimizer.param_groups): - tensorboard_logs[f'learning_rate_g{i}'] = group['lr'] - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - else: - log_every_n_steps = 1 - - if (batch_nb + 1) % log_every_n_steps == 0: - self._wer.update( - predictions=predictions, - targets=eos_semantics, - predictions_lengths=pred_len, - target_lengths=eos_semantics_len, - ) - wer, _, _ = 
self._wer.compute() - self._wer.reset() - tensorboard_logs.update({'training_batch_wer': wer}) - - return {'loss': loss_value, 'log': tensorboard_logs} - - def predict( - self, input_signal, input_signal_length, processed_signal=None, processed_signal_length=None, dataloader_idx=0 - ) -> List[str]: - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." - ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - encoded = encoded.transpose(1, 2) # BxDxT -> BxTxD - encoded_mask = get_seq_mask(encoded, encoded_len) - - pred_tokens = self.sequence_generator(encoded, encoded_mask) - predictions = self.sequence_generator.decode_semantics_from_tokens(pred_tokens) - return predictions - - def validation_pass(self, batch, batch_idx, dataloader_idx=0): - if len(batch) == 4: - signal, signal_len, semantics, semantics_len = batch - else: - signal, signal_len, semantics, semantics_len, sample_id = batch - - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - log_probs, pred_len, predictions = self.forward( - processed_signal=signal, - processed_signal_length=signal_len, - target_semantics=semantics, - target_semantics_length=semantics_len, - ) - else: - log_probs, pred_len, predictions = self.forward( - input_signal=signal, - input_signal_length=signal_len, - target_semantics=semantics, - target_semantics_length=semantics_len, - ) - - eos_semantics = semantics[:, 1:] - eos_semantics_len = semantics_len - 1 # subtract 1 for bos&eos tokens - - loss_value = self.loss(log_probs=log_probs, labels=eos_semantics, lengths=eos_semantics_len) - - self._wer.update( - predictions=predictions, - targets=eos_semantics, - predictions_lengths=pred_len, - target_lengths=eos_semantics_len, - ) - wer, wer_num, wer_denom = self._wer.compute() - self._wer.reset() - - return { - 'val_loss': loss_value, - 'val_wer_num': wer_num, - 'val_wer_denom': wer_denom, - 'val_wer': wer, - } - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - metrics = self.validation_pass(batch, batch_idx, dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(metrics) - else: - self.validation_step_outputs.append(metrics) - return metrics - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - def test_dataloader(self): - if self._test_dl is None: - # None dataloader no longer supported in PTL2.0 - 
self._test_dl = [] - - return self._test_dl - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - - shuffle = config['shuffle'] - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if config.get('use_dali', False): - device_id = self.local_rank if device == 'gpu' else None - dataset = audio_to_text_dataset.get_dali_bpe_dataset( - config=config, - tokenizer=self.tokenizer, - shuffle=shuffle, - device_id=device_id, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.preprocessor, - ) - return dataset - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = audio_to_text_dataset.get_tarred_dataset( - config=config, - tokenizer=self.tokenizer, - shuffle_n=shuffle_n, - global_rank=self.global_rank, - world_size=self.world_size, - augmentor=augmentor, - ) - shuffle = False - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - - dataset = audio_to_text_dataset.get_bpe_dataset( - config=config, tokenizer=self.tokenizer, augmentor=augmentor - ) - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. 
- # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. 
- num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given audio file(s). - """ - - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'sample_rate': self.preprocessor._sample_rate, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - num_workers: int = 0, - verbose: bool = True, - ) -> List[str]: - """ - Uses greedy decoding to transcribe audio files into SLU semantics. - Use this method for debugging and prototyping. - - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of transcripts. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - verbose: (bool) whether to display tqdm progress bar - - Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." 
- ) - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # We will store transcriptions here - hypotheses = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - dither_value = self.preprocessor.featurizer.dither - pad_to_value = self.preprocessor.featurizer.pad_to - - try: - self.preprocessor.featurizer.dither = 0.0 - self.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - self.eval() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in paths2audio_files: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2audio_files': paths2audio_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - } - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): - predictions = self.predict( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - hypotheses += predictions - - del predictions - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - self.preprocessor.featurizer.dither = dither_value - self.preprocessor.featurizer.pad_to = pad_to_value - logging.set_verbosity(logging_level) - - return hypotheses - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="slu_conformer_transformer_large_slurp", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:slu_conformer_transformer_large_slurp", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/slu_conformer_transformer_large_slurp/versions/1.13.0/files/slu_conformer_transformer_large_slurp.nemo", - ) - results.append(model) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/ssl_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/ssl_models.py deleted file mode 100644 index 6dca3815119fb36e2437d72eaabf77a52830ad17..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/ssl_models.py +++ /dev/null @@ -1,572 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
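
A brief usage note (commentary on the deleted SLU model above, not part of the diff): the `transcribe()` helper documented above writes a temporary manifest, builds a dataloader around the provided audio paths, switches the model to eval mode, and returns one decoded semantics string per file. A minimal sketch of calling it, assuming the checkpoint name from `list_available_models` above and NeMo's standard `from_pretrained` restore path; the audio file names are hypothetical:

```python
import nemo.collections.asr as nemo_asr

# Restore the pretrained SLU checkpoint listed in list_available_models above.
slu_model = nemo_asr.models.ASRModel.from_pretrained(
    model_name="slu_conformer_transformer_large_slurp"
)

# transcribe() handles manifest/dataloader creation internally and runs greedy
# decoding; it returns a list of semantics strings, one per input file.
semantics = slu_model.transcribe(
    paths2audio_files=["utt1.wav", "utt2.wav"],  # hypothetical paths
    batch_size=4,
    verbose=True,
)
print(semantics)
```
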
- -from math import ceil -from typing import Dict, List, Optional, Union - -import torch -import torch.nn as nn -from omegaconf import DictConfig -from pytorch_lightning import Trainer - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.parts.mixins import ASRModuleMixin -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.core.classes import ModelPT -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.classes.mixins import AccessMixin, set_access_cfg -from nemo.core.neural_types import ( - AcousticEncodedRepresentation, - AudioSignal, - LabelsType, - LengthsType, - NeuralType, - SpectrogramType, -) -from nemo.utils import logging - -__all__ = ['SpeechEncDecSelfSupervisedModel'] - - -class SpeechEncDecSelfSupervisedModel(ModelPT, ASRModuleMixin, AccessMixin): - """Base class for encoder-decoder models used for self-supervised encoder pre-training""" - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - model = PretrainedModelInfo( - pretrained_model_name="ssl_en_conformer_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_large/versions/1.10.1/files/ssl_en_conformer_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="ssl_en_conformer_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_xlarge/versions/1.10.0/files/ssl_en_conformer_xlarge.nemo", - ) - results.append(model) - - return results - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - super().__init__(cfg=cfg, trainer=trainer) - self.preprocessor = SpeechEncDecSelfSupervisedModel.from_config_dict(self._cfg.preprocessor) - self.encoder = SpeechEncDecSelfSupervisedModel.from_config_dict(self._cfg.encoder) - - self.decoder_losses = None - - if "loss_list" in self._cfg: - - self.decoder_losses = {} - self.loss_alphas = {} - self.start_step = {} - self.output_from_layer = {} - self.transpose_encoded = {} - self.targets_from_loss = {} - self.decoder_losses_active = {} - # need to be separate for moduledict - - for decoder_loss_name, decoder_loss_cfg in self._cfg.loss_list.items(): - if not decoder_loss_cfg.get("is_active", True): # active by default - continue - - new_decoder_loss = { - 'decoder': SpeechEncDecSelfSupervisedModel.from_config_dict(decoder_loss_cfg.decoder), - 'loss': SpeechEncDecSelfSupervisedModel.from_config_dict(decoder_loss_cfg.loss), - } - new_decoder_loss = nn.ModuleDict(new_decoder_loss) - self.decoder_losses[decoder_loss_name] = new_decoder_loss - self.loss_alphas[decoder_loss_name] = decoder_loss_cfg.get("loss_alpha", 1.0) - self.output_from_layer[decoder_loss_name] = 
decoder_loss_cfg.get("output_from_layer", None) - self.targets_from_loss[decoder_loss_name] = decoder_loss_cfg.get("targets_from_loss", None) - self.start_step[decoder_loss_name] = decoder_loss_cfg.get("start_step", 0) - self.transpose_encoded[decoder_loss_name] = decoder_loss_cfg.get("transpose_encoded", False) - self.decoder_losses_active[decoder_loss_name] = True - - self.decoder_losses = nn.ModuleDict(self.decoder_losses) - - else: - self.decoder_ssl = SpeechEncDecSelfSupervisedModel.from_config_dict(self._cfg.decoder) - self.loss = SpeechEncDecSelfSupervisedModel.from_config_dict(self._cfg.loss) - - self.spec_augmentation = SpeechEncDecSelfSupervisedModel.from_config_dict(self._cfg.spec_augment) - - # dropout for features/spectrograms (applied before masking) - self.dropout_features = ( - torch.nn.Dropout(self._cfg.dropout_features) if "dropout_features" in self._cfg else None - ) - - # dropout for targets (applied before quantization) - self.dropout_features_q = ( - torch.nn.Dropout(self._cfg.dropout_features_q) if "dropout_features_q" in self._cfg else None - ) - - # Feature penalty for preprocessor encodings (for Wav2Vec training) - if "feature_penalty" in self._cfg: - self.feat_pen, self.pen_factor = 0.0, self._cfg.feature_penalty - else: - self.feat_pen, self.pen_factor = None, None - - if "access" in self._cfg: - set_access_cfg(self._cfg.access) - - self.apply_masking = True - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - if 'augmentor' in config: - augmentor = process_augmentations(config['augmentor']) - else: - augmentor = None - - # Automatically inject args from model config to dataloader config - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - - shuffle = config['shuffle'] - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if config.get('use_dali', False): - device_id = self.local_rank if device == 'gpu' else None - dataset = audio_to_text_dataset.get_dali_char_dataset( - config=config, - shuffle=shuffle, - device_id=device_id, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.preprocessor, - ) - return dataset - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - dataset = audio_to_text_dataset.get_tarred_dataset( - config=config, - shuffle_n=shuffle_n, - global_rank=self.global_rank, - world_size=self.world_size, - augmentor=augmentor, - ) - shuffle = False - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. 
Provided config : {config}") - return None - - dataset = audio_to_text_dataset.get_char_dataset(config=config, augmentor=augmentor) - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._train_dl is not None - and hasattr(self._train_dl, 'dataset') - and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if ( - self._validation_dl is not None - and hasattr(self._validation_dl, 'dataset') - and isinstance(self._validation_dl.dataset, torch.utils.data.IterableDataset) - ): - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if isinstance(self._trainer.limit_val_batches, float): - self._trainer.limit_val_batches = int( - self._trainer.limit_val_batches - * ceil((len(self._validation_dl.dataset) / self.world_size) / val_data_config['batch_size']) - ) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - input_signal_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "targets": NeuralType(('B', 'T'), LabelsType(), optional=True), - "target_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "spectrograms": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "spec_masks": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "encoded": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_len": NeuralType(tuple('B'), LengthsType()), - } - - @typecheck() - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None, - ): - """ - Forward pass of the model. - - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. 
- - Returns: - A tuple of 4 elements - - 1) Processed spectrograms of shape [B, D, T]. - 2) Masks applied to spectrograms of shape [B, D, T]. - 3) The encoded features tensor of shape [B, D, T]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. - """ - # Reset access registry - if self.is_access_enabled(): - self.reset_registry() - - # Check for special flag for validation step - if hasattr(self, '_in_validation_step'): - in_validation_step = self._in_validation_step - else: - in_validation_step = False - - # reset module registry from AccessMixin - if ( - (self.training or in_validation_step) - and self.decoder_losses is not None - and self.output_from_layer is not None - and len(self.output_from_layer) > 0 - ): - layer_names = list(self.output_from_layer.values()) - register_layer = any([name is not None for name in layer_names]) - - if register_layer: - self.access_cfg['save_encoder_tensors'] = True - self.set_access_enabled(access_enabled=True) - - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." - ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length, - ) - - if self.pen_factor: - self.feat_pen = processed_signal.float().pow(2).mean() * self.pen_factor - spectrograms = processed_signal.detach().clone() - - if self.dropout_features: - processed_signal = self.dropout_features(processed_signal) - if self.dropout_features_q: - spectrograms = self.dropout_features_q(spectrograms) - - if self.apply_masking: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - masked_spectrograms = processed_signal.detach() - spec_masks = torch.logical_and(masked_spectrograms < 1e-5, masked_spectrograms > -1e-5).float() - for idx, proc_len in enumerate(processed_signal_length): - spec_masks[idx, :, proc_len:] = 0.0 - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - - return spectrograms, spec_masks, encoded, encoded_len - - def decoder_loss_step(self, spectrograms, spec_masks, encoded, encoded_len, targets=None, target_lengths=None): - """ - Forward pass through all decoders and calculate corresponding losses. - Args: - spectrograms: Processed spectrograms of shape [B, D, T]. - spec_masks: Masks applied to spectrograms of shape [B, D, T]. - encoded: The encoded features tensor of shape [B, D, T]. - encoded_len: The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- targets: Optional target labels of shape [B, T] - target_lengths: Optional target label lengths of shape [B] - - Returns: - A tuple of 2 elements - - 1) Total sum of losses weighted by corresponding loss_alphas - 2) Dictionary of unweighted losses - """ - loss_val_dict = {} - - if self.decoder_losses is None: - if hasattr(self.decoder_ssl, "needs_labels") and self.decoder_ssl.needs_labels: - outputs = self.decoder_ssl(encoder_output=encoded, targets=targets, target_lengths=target_lengths) - else: - outputs = self.decoder_ssl(encoder_output=encoded) - if self.loss.needs_labels: - loss_value = self.loss( - spec_masks=spec_masks, - decoder_outputs=outputs, - targets=targets, - decoder_lengths=encoded_len, - target_lengths=target_lengths, - ) - else: - loss_value = self.loss(spectrograms=spectrograms, spec_masks=spec_masks, decoder_outputs=outputs) - else: - - loss_value = encoded.new_zeros(1) - outputs = {} - registry = self.get_module_registry(self.encoder) - - for dec_loss_name, dec_loss in self.decoder_losses.items(): - # loop through decoders and corresponding losses - if not self.decoder_losses_active[dec_loss_name]: - continue - - if self.output_from_layer[dec_loss_name] is None: - dec_input = encoded - else: - # extract output from specified layer using AccessMixin registry - dec_input = registry[self.output_from_layer[dec_loss_name]]['encoder'][-1] - if self.transpose_encoded[dec_loss_name]: - dec_input = dec_input.transpose(-2, -1) - - if self.targets_from_loss[dec_loss_name] is not None: - # extract targets from specified loss - target_loss = self.targets_from_loss[dec_loss_name] - targets = self.decoder_losses[target_loss]['loss'].target_ids - target_lengths = self.decoder_losses[target_loss]['loss'].target_lengths - if target_lengths is None: - target_lengths = encoded_len - - if hasattr(dec_loss['decoder'], "needs_labels") and dec_loss['decoder'].needs_labels: - # if we are using a decoder which needs labels, provide them - outputs[dec_loss_name] = dec_loss['decoder']( - encoder_output=dec_input, targets=targets, target_lengths=target_lengths - ) - else: - outputs[dec_loss_name] = dec_loss['decoder'](encoder_output=dec_input) - - current_loss = dec_loss['loss'] - if current_loss.needs_labels: - # if we are using a loss which needs labels, provide them - current_loss_value = current_loss( - spec_masks=spec_masks, - decoder_outputs=outputs[dec_loss_name], - targets=targets, - decoder_lengths=encoded_len, - target_lengths=target_lengths, - ) - else: - current_loss_value = current_loss( - spectrograms=spectrograms, - spec_masks=spec_masks, - decoder_outputs=outputs[dec_loss_name], - decoder_lengths=encoded_len, - ) - loss_value = loss_value + current_loss_value * self.loss_alphas[dec_loss_name] - loss_val_dict[dec_loss_name] = current_loss_value - - return loss_value, loss_val_dict - - # PTL-specific methods - def training_step(self, batch, batch_nb): - signal, signal_len, targets, target_lengths = batch - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - spectrograms, spec_masks, encoded, encoded_len = self.forward( - processed_signal=signal, processed_signal_length=signal_len, - ) - else: - spectrograms, spec_masks, encoded, encoded_len = self.forward( - input_signal=signal, input_signal_length=signal_len, - ) - - if self.decoder_losses is not None: - for dec_loss_name, dec_loss in self.decoder_losses.items(): - self.decoder_losses_active[dec_loss_name] = self.trainer.global_step >= self.start_step[dec_loss_name] - loss = dec_loss['loss'] - if hasattr(loss, 
"set_num_updates"): - loss.set_num_updates(self.trainer.global_step) - else: - if hasattr(self.loss, "set_num_updates"): - self.loss.set_num_updates(self.trainer.global_step) - - loss_value, loss_val_dict = self.decoder_loss_step( - spectrograms, spec_masks, encoded, encoded_len, targets, target_lengths - ) - - tensorboard_logs = { - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': self.trainer.global_step, - } - - for loss_name, loss_val in loss_val_dict.items(): - tensorboard_logs['train_' + loss_name] = loss_val - - if self.feat_pen: - loss_value += self.feat_pen - - # Reset access registry - self.reset_registry() - - return {'loss': loss_value, 'log': tensorboard_logs} - - def validation_pass(self, batch, batch_idx, dataloader_idx=0): - # Set flag to register tensors - self._in_validation_step = True - - signal, signal_len, targets, target_lengths = batch - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - spectrograms, spec_masks, encoded, encoded_len = self.forward( - processed_signal=signal, processed_signal_length=signal_len, - ) - else: - spectrograms, spec_masks, encoded, encoded_len = self.forward( - input_signal=signal, input_signal_length=signal_len, - ) - - if self.decoder_losses is not None: - for dec_loss_name, dec_loss in self.decoder_losses.items(): - self.decoder_losses_active[dec_loss_name] = self.trainer.global_step >= self.start_step[dec_loss_name] - - loss_value, _ = self.decoder_loss_step(spectrograms, spec_masks, encoded, encoded_len, targets, target_lengths) - - if self.feat_pen: - loss_value += self.feat_pen - - # reset access registry - self.reset_registry() - del self._in_validation_step - - metrics = {'val_loss': loss_value} - - return metrics - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - metrics = self.validation_pass(batch, batch_idx, dataloader_idx) - if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: - self.validation_step_outputs[dataloader_idx].append(metrics) - else: - self.validation_step_outputs.append(metrics) - return metrics - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': val_loss_mean} - return {'val_loss': val_loss_mean, 'log': tensorboard_logs} diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/models/transformer_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/asr/models/transformer_bpe_models.py deleted file mode 100644 index 178746795ae84e3319b7faf9995f81a802fed497..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/models/transformer_bpe_models.py +++ /dev/null @@ -1,614 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -import json -import os -import tempfile -from math import ceil -from typing import Dict, List, Optional, Union - -import editdistance -import torch -import torch.distributed as dist -from omegaconf import DictConfig, OmegaConf -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.common.losses import SmoothedCrossEntropyLoss -from nemo.collections.common.metrics import GlobalAverageLossMetric -from nemo.collections.common.parts import transformer_weights_init - -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import ( - AudioSignal, - ChannelType, - LabelsType, - LengthsType, - LogprobsType, - MaskType, - NeuralType, - SpectrogramType, -) -from nemo.utils import logging - -try: - from sacrebleu import corpus_bleu - from nemo.collections.nlp.modules.common import TokenClassifier - from nemo.collections.nlp.modules.common.lm_utils import get_transformer - from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder - - NLP_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - NLP_AVAILABLE = False - logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") - -__all__ = ['EncDecTransfModelBPE'] - - -def lens_to_mask(lens, max_length): - batch_size = lens.shape[0] - mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None] - return mask - - -class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin): - """Base class for encoder decoder CTC-based models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup audio preprocessor - self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor) - - # Setup audio encoder - self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder) - - # Add projection layer if encoder and decoder differ in hidden size - if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']: - self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size']) - else: - self.adapter = torch.nn.Identity() - - transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) - - # Whether to add Transformer Encoder block between Conformer and Transformer Decoder - self.use_transf_encoder = False - if transf_encoder_cfg_dict['num_layers'] > 0: - self.use_transf_encoder = True - - self.transf_encoder = TransformerEncoder( - num_layers=transf_encoder_cfg_dict['num_layers'], - hidden_size=transf_encoder_cfg_dict['hidden_size'], - inner_size=transf_encoder_cfg_dict['inner_size'], - mask_future=False, - num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'], - attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'], - attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'], - ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'], - pre_ln=transf_encoder_cfg_dict.get('pre_ln', True), - 
pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True), - ) - std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5 - self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range)) - - transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder')) - - # Transformer decoder - vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) - transf_decoder_cfg_dict['vocab_size'] = vocab_size - library = transf_decoder_cfg_dict.pop('library', 'nemo') - model_name = transf_decoder_cfg_dict.pop('model_name', None) - pretrained = transf_decoder_cfg_dict.pop('pretrained', False) - self.transf_decoder = get_transformer( - library=library, - model_name=model_name, - pretrained=pretrained, - config_dict=transf_decoder_cfg_dict, - encoder=False, - pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False), - ) - - self.log_softmax = TokenClassifier( - hidden_size=self.transf_decoder.hidden_size, - num_classes=vocab_size, - activation=self.cfg.head.activation, - log_softmax=self.cfg.head.log_softmax, - dropout=self.cfg.head.dropout, - use_transformer_init=self.cfg.head.use_transformer_init, - ) - self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight - std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 - self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range)) - self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range)) - - # Beam Search decoding - self.beam_search = BeamSearchSequenceGenerator( - embedding=self.transf_decoder.embedding, - decoder=self.transf_decoder.decoder, - log_softmax=self.log_softmax, - max_sequence_length=self.transf_decoder.max_sequence_length, - beam_size=self.cfg.beam_search.beam_size, - bos=self.tokenizer.bos_id, - pad=self.tokenizer.pad_id, - eos=self.tokenizer.eos_id, - len_pen=self.cfg.beam_search.len_pen, - max_delta_length=self.cfg.beam_search.max_generation_delta, - ) - - # Define autoregressive CE loss - self.transf_loss = SmoothedCrossEntropyLoss( - pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing - ) - - if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None: - self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment) - else: - self.spec_augmentation = None - - self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) - - @torch.no_grad() - def translate( - self, - paths2audio_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - ) -> List[str]: - hypotheses = self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses) - return hypotheses - - @torch.no_grad() - def transcribe( - self, - paths2audio_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - ) -> List[str]: - """ - Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. - Args: - paths2audio_files: (a list) of paths to audio files. \ - Recommended length per file is between 5 and 25 seconds. \ - But it is possible to pass a few hours long file if enough GPU memory is available. - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of transcripts. 
- return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." - ) - - # We will store transcriptions here - hypotheses = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - dither_value = self.preprocessor.featurizer.dither - pad_to_value = self.preprocessor.featurizer.pad_to - - try: - self.preprocessor.featurizer.dither = 0.0 - self.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - self.eval() - # Freeze the encoder and decoder modules - self.encoder.freeze() - self.transf_decoder.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp: - for audio_file in paths2audio_files: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} - fp.write(json.dumps(entry) + '\n') - - config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - beam_hypotheses = ( - self.beam_search( - encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False - ) - .detach() - .cpu() - .numpy() - ) - beam_hypotheses = [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses] - - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] - - hypotheses += beam_hypotheses - - del test_batch, log_probs, encoded_len, enc_states, enc_mask - finally: - # set mode back to its original value - self.train(mode=mode) - self.preprocessor.featurizer.dither = dither_value - self.preprocessor.featurizer.pad_to = pad_to_value - if mode is True: - self.encoder.unfreeze() - self.transf_decoder.unfreeze() - logging.set_verbosity(logging_level) - - return hypotheses - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def 
setup_training_data(self, train_data_config: Optional[DictConfig]): - - # create audio-only data loader - self._update_dataset_config(dataset_name='train', config=train_data_config) - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the - # dataloader is the total number of samples rather than the number of batches, - # and this messes up the tqdm progress bar. So we set the number of steps manually - # (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, - # i.e. <= # training batches, and don't change it. Otherwise, adjust - # batches accordingly if it's a float (including 1.0). - if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- Supported Datasets: - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` - - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` - - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) - else: - input_signal_eltype = AudioSignal() - return { - "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), - "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True), - "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "encoder_states": NeuralType(('B', 'T', 'D'), ChannelType()), - "encoder_mask": NeuralType(('B', 'T'), MaskType()), - } - - @typecheck() - def forward( - self, - input_signal=None, - input_signal_length=None, - processed_signal=None, - processed_signal_length=None, - transcript=None, - transcript_length=None, - ): - """ - Forward pass of the model. - Args: - input_signal: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps, with 1 second of audio represented as - `self.sample_rate` number of floating point values. - input_signal_length: Vector of length B, that contains the individual lengths of the audio - sequences. - processed_signal: Tensor that represents a batch of processed audio signals, - of shape (B, D, T) that has undergone processing via some DALI preprocessor. - processed_signal_length: Vector of length B, that contains the individual lengths of the - processed audio sequences. - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. - 3) The greedy token predictions of the model of shape [B, T] (via argmax) - """ - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) == False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." 
- ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, length=input_signal_length - ) - - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - - enc_states = encoded.permute(0, 2, 1) - enc_states = self.adapter(enc_states) - enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype) - if self.use_transf_encoder: - enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) - - transf_log_probs = None - if transcript is not None: - dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) - dec_states = self.transf_decoder( - input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask - ) - transf_log_probs = self.log_softmax(hidden_states=dec_states) - - return transf_log_probs, encoded_len, enc_states, enc_mask - - def compute_audio_loss(self, batch): - - if batch is None: - return 0 - - signal, signal_len, transcript, transcript_len = batch - input_ids, labels = transcript[:, :-1], transcript[:, 1:] - - transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=signal, - input_signal_length=signal_len, - transcript=input_ids, - transcript_length=transcript_len, - ) - - transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) - - return transf_loss - - # PTL-specific methods - def training_step(self, batch, batch_nb): - - audio_loss = self.compute_audio_loss(batch) - - tensorboard_logs = { - 'train_loss': audio_loss, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - } - - return {'loss': audio_loss, 'log': tensorboard_logs} - - def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): - signal, signal_len, transcript, transcript_len = batch - input_ids, labels = transcript[:, :-1], transcript[:, 1:] - - if isinstance(batch, DALIOutputs) and batch.has_processed_signal: - transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( - processed_signal=signal, - processed_signal_length=signal_len, - transcript=input_ids, - transcript_length=transcript_len, - ) - else: - transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( - input_signal=signal, - input_signal_length=signal_len, - transcript=input_ids, - transcript_length=transcript_len, - ) - - beam_hypotheses = self.beam_search( - encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False - ) - transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) - - ground_truths = [self.tokenizer.ids_to_text(sent) for sent in transcript.detach().cpu().tolist()] - translations = [self.tokenizer.ids_to_text(sent) for sent in beam_hypotheses.detach().cpu().tolist()] - - self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1]) - - return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths} - - def test_step(self, batch, batch_idx, dataloader_idx=0): - return self.validation_step(batch, batch_idx, dataloader_idx, eval_mode="test") - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: str = "val"): - """ - Called at the end of validation to aggregate outputs. 
- :param outputs: list of individual outputs of each validation step. - """ - if not outputs: - return - - if isinstance(outputs[0], dict): - outputs = [outputs] - - for output in outputs: - eval_loss = getattr(self, 'val_loss').compute() - translations = list(itertools.chain(*[x['translations'] for x in output])) - ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output])) - - # Gather translations and ground truths from all workers - tr_and_gt = [None for _ in range(self.world_size)] - # we also need to drop pairs where ground truth is an empty string - if self.world_size > 1: - dist.all_gather_object( - tr_and_gt, [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] - ) - else: - tr_and_gt[0] = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] - - if self.global_rank == 0: - _translations = [] - _ground_truths = [] - for rank in range(0, self.world_size): - _translations += [t for (t, g) in tr_and_gt[rank]] - _ground_truths += [g for (t, g) in tr_and_gt[rank]] - - sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a") - sb_score = sacre_bleu.score * self.world_size - - wer_scores, wer_words = 0, 0 - for h, r in zip(_translations, _ground_truths): - wer_words += len(r.split()) - wer_scores += editdistance.eval(h.split(), r.split()) - wer_score = 1.0 * wer_scores * self.world_size / wer_words - - else: - sb_score = 0.0 - wer_score = 0.0 - - self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True) - self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True) - self.log(f"{eval_mode}_WER", wer_score, sync_dist=True) - self.val_loss.reset() - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_validation_epoch_end(outputs, dataloader_idx, eval_mode="test") - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided audio file. - Args: - config: A python dictionary which contains the following keys: - paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the audio manifest is temporarily - stored. - Returns: - A pytorch DataLoader for the given audio file(s). - """ - batch_size = min(config['batch_size'], len(config['paths2audio_files'])) - dl_config = { - 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), - 'sample_rate': self.preprocessor._sample_rate, - 'batch_size': batch_size, - 'trim_silence': False, - 'shuffle': False, - 'num_workers': min(batch_size, os.cpu_count() - 1), - 'pin_memory': True, - } - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__init__.py deleted file mode 100644 index 0265d9e306878c3d17286d8e3e6f2a79499b8dda..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__init__.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.modules.audio_modules import ( - MaskBasedBeamformer, - MaskEstimatorFlexChannels, - MaskEstimatorRNN, - MaskReferenceChannel, -) -from nemo.collections.asr.modules.audio_preprocessing import ( - AudioToMelSpectrogramPreprocessor, - AudioToMFCCPreprocessor, - AudioToSpectrogram, - CropOrPadSpectrogramAugmentation, - MaskedPatchAugmentation, - SpectrogramAugmentation, - SpectrogramToAudio, -) -from nemo.collections.asr.modules.beam_search_decoder import BeamSearchDecoderWithLM -from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder, ConformerEncoderAdapter -from nemo.collections.asr.modules.conv_asr import ( - ConvASRDecoder, - ConvASRDecoderClassification, - ConvASRDecoderReconstruction, - ConvASREncoder, - ConvASREncoderAdapter, - ECAPAEncoder, - ParallelConvASREncoder, - SpeakerDecoder, -) -from nemo.collections.asr.modules.graph_decoder import ViterbiDecoderWithGraph -from nemo.collections.asr.modules.hybrid_autoregressive_transducer import HATJoint -from nemo.collections.asr.modules.lstm_decoder import LSTMDecoder -from nemo.collections.asr.modules.msdd_diarizer import MSDD_module -from nemo.collections.asr.modules.rnn_encoder import RNNEncoder -from nemo.collections.asr.modules.rnnt import ( - RNNTDecoder, - RNNTDecoderJointSSL, - RNNTJoint, - SampledRNNTJoint, - StatelessTransducerDecoder, -) -from nemo.collections.asr.modules.squeezeformer_encoder import SqueezeformerEncoder, SqueezeformerEncoderAdapter diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index d432d559993a19f4251654f5e4eebcc3e5a288fc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_modules.cpython-310.pyc deleted file mode 100644 index ab122e6d0a8ed21eec480fafffa6fd57e8eedfbc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_preprocessing.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_preprocessing.cpython-310.pyc deleted file mode 100644 index 0f2e2e1098aa9468f5327b90ec6160f9308a1f16..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/audio_preprocessing.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/beam_search_decoder.cpython-310.pyc 
b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/beam_search_decoder.cpython-310.pyc deleted file mode 100644 index 785c7f07635e3ad0ca3b5a6407043a7d51937e28..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/beam_search_decoder.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conformer_encoder.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conformer_encoder.cpython-310.pyc deleted file mode 100644 index 5c0e59e42382d7c52b64ce63c26c712f51a26e71..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conformer_encoder.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conv_asr.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conv_asr.cpython-310.pyc deleted file mode 100644 index e67896b192971e249619888aecc3d869e4d3e86c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/conv_asr.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/graph_decoder.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/graph_decoder.cpython-310.pyc deleted file mode 100644 index 7f1c652b9295c79c970fd16982c88e9a33d683b5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/graph_decoder.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/hybrid_autoregressive_transducer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/hybrid_autoregressive_transducer.cpython-310.pyc deleted file mode 100644 index 22f6996accef4e9c032aa90493a5e27b12702791..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/hybrid_autoregressive_transducer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/lstm_decoder.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/lstm_decoder.cpython-310.pyc deleted file mode 100644 index 26620984b1cd45f27ba853cb7398fe36a281b4b8..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/lstm_decoder.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/msdd_diarizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/msdd_diarizer.cpython-310.pyc deleted file mode 100644 index a35f48575e647804b6daeb353831ef83bcb5f1b9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/msdd_diarizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnn_encoder.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnn_encoder.cpython-310.pyc deleted file mode 100644 index a4a454fd3ae2d39c376b369c8fa8165f445297f9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnn_encoder.cpython-310.pyc and /dev/null differ diff --git 
a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt.cpython-310.pyc deleted file mode 100644 index 1c27279c1cea0f5c4c59aa71fcbbb626de6d5743..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt_abstract.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt_abstract.cpython-310.pyc deleted file mode 100644 index 5bbb7d7803365a47f7cd440840a73a51e4b0c4a6..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/rnnt_abstract.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/squeezeformer_encoder.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/squeezeformer_encoder.cpython-310.pyc deleted file mode 100644 index 1d267f378f95c700f5a4f78aa1c94a0ce8b03fa9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/__pycache__/squeezeformer_encoder.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_modules.py deleted file mode 100644 index 3abbd287051a784e7ef2b49bce4e5cb5694ce835..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_modules.py +++ /dev/null @@ -1,1433 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List, Optional, Tuple - -import numpy as np -import torch - -from nemo.collections.asr.losses.audio_losses import temporal_mean -from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder -from nemo.collections.asr.parts.preprocessing.features import make_seq_mask_like -from nemo.collections.asr.parts.submodules.multichannel_modules import ( - ChannelAttentionPool, - ChannelAveragePool, - ParametricMultichannelWienerFilter, - TransformAttendConcatenate, - TransformAverageConcatenate, -) -from nemo.collections.asr.parts.utils.audio_utils import db2mag, wrap_to_pi -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import FloatType, LengthsType, NeuralType, SpectrogramType -from nemo.utils import logging -from nemo.utils.decorators import experimental - -__all__ = [ - 'MaskEstimatorRNN', - 'MaskEstimatorFlexChannels', - 'MaskReferenceChannel', - 'MaskBasedBeamformer', - 'MaskBasedDereverbWPE', -] - - -class SpectrogramToMultichannelFeatures(NeuralModule): - """Convert a complex-valued multi-channel spectrogram to - multichannel features. 
- - Args: - num_subbands: Expected number of subbands in the input signal - num_input_channels: Optional, provides the number of channels - of the input signal. Used to infer the number - of output channels. - mag_reduction: Reduction across channels. Default `None`, will calculate - magnitude of each channel. - mag_power: Optional, apply power on the magnitude. - use_ipd: Use inter-channel phase difference (IPD). - mag_normalization: Normalization for magnitude features - ipd_normalization: Normalization for IPD features - eps: Small regularization constant. - """ - - def __init__( - self, - num_subbands: int, - num_input_channels: Optional[int] = None, - mag_reduction: Optional[str] = None, - mag_power: Optional[float] = None, - use_ipd: bool = False, - mag_normalization: Optional[str] = None, - ipd_normalization: Optional[str] = None, - eps: float = 1e-8, - ): - super().__init__() - self.mag_reduction = mag_reduction - self.mag_power = mag_power - self.use_ipd = use_ipd - - if mag_normalization not in [None, 'mean', 'mean_var']: - raise NotImplementedError(f'Unknown magnitude normalization {mag_normalization}') - self.mag_normalization = mag_normalization - - if ipd_normalization not in [None, 'mean', 'mean_var']: - raise NotImplementedError(f'Unknown ipd normalization {ipd_normalization}') - self.ipd_normalization = ipd_normalization - - if self.use_ipd: - self._num_features = 2 * num_subbands - self._num_channels = num_input_channels - else: - self._num_features = num_subbands - self._num_channels = num_input_channels if self.mag_reduction is None else 1 - - self.eps = eps - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tnum_subbands: %d', num_subbands) - logging.debug('\tmag_reduction: %s', self.mag_reduction) - logging.debug('\tmag_power: %s', self.mag_power) - logging.debug('\tuse_ipd: %s', self.use_ipd) - logging.debug('\tmag_normalization: %s', self.mag_normalization) - logging.debug('\tipd_normalization: %s', self.ipd_normalization) - logging.debug('\teps: %f', self.eps) - logging.debug('\t_num_features: %s', self._num_features) - logging.debug('\t_num_channels: %s', self._num_channels) - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @property - def num_features(self) -> int: - """Configured number of features - """ - return self._num_features - - @property - def num_channels(self) -> int: - """Configured number of channels - """ - if self._num_channels is not None: - return self._num_channels - else: - raise ValueError( - 'Num channels is not configured. To configure this, `num_input_channels` ' - 'must be provided when constructing the object.' - ) - - @staticmethod - def get_mean_time_channel(input: torch.Tensor, input_length: Optional[torch.Tensor] = None) -> torch.Tensor: - """Calculate mean across time and channel dimensions. 
- - Args: - input: tensor with shape (B, C, F, T) - input_length: tensor with shape (B,) - - Returns: - Mean of `input` calculated across time and channel dimension - with shape (B, 1, F, 1) - """ - assert input.ndim == 4, f'Expected input to have 4 dimensions, got {input.ndim}' - - if input_length is None: - mean = torch.mean(input, dim=(-1, -3), keepdim=True) - else: - # temporal mean - mean = temporal_mean(input, input_length, keepdim=True) - # channel mean - mean = torch.mean(mean, dim=-3, keepdim=True) - - return mean - - @classmethod - def get_mean_std_time_channel( - cls, input: torch.Tensor, input_length: Optional[torch.Tensor] = None, eps: float = 1e-10 - ) -> torch.Tensor: - """Calculate mean and standard deviation across time and channel dimensions. - - Args: - input: tensor with shape (B, C, F, T) - input_length: tensor with shape (B,) - - Returns: - Mean and standard deviation of the `input` calculated across time and - channel dimension, each with shape (B, 1, F, 1). - """ - assert input.ndim == 4, f'Expected input to have 4 dimensions, got {input.ndim}' - - if input_length is None: - std, mean = torch.std_mean(input, dim=(-1, -3), unbiased=False, keepdim=True) - else: - mean = cls.get_mean_time_channel(input, input_length) - std = (input - mean).pow(2) - # temporal mean - std = temporal_mean(std, input_length, keepdim=True) - # channel mean - std = torch.mean(std, dim=-3, keepdim=True) - # final value - std = torch.sqrt(std.clamp(eps)) - - return mean, std - - @typecheck( - input_types={ - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - 'input_length': NeuralType(tuple('B'), LengthsType()), - }, - output_types={'output': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()),}, - ) - def normalize_mean(self, input: torch.Tensor, input_length: torch.Tensor) -> torch.Tensor: - """Mean normalization for the input tensor. - - Args: - input: input tensor - input_length: valid length for each example - - Returns: - Mean normalized input. - """ - mean = self.get_mean_time_channel(input=input, input_length=input_length) - output = input - mean - return output - - @typecheck( - input_types={ - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - 'input_length': NeuralType(tuple('B'), LengthsType()), - }, - output_types={'output': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()),}, - ) - def normalize_mean_var(self, input: torch.Tensor, input_length: torch.Tensor) -> torch.Tensor: - """Mean and variance normalization for the input tensor. - - Args: - input: input tensor - input_length: valid length for each example - - Returns: - Mean and variance normalized input. - """ - mean, std = self.get_mean_std_time_channel(input=input, input_length=input_length, eps=self.eps) - output = (input - mean) / std - return output - - @typecheck() - def forward(self, input: torch.Tensor, input_length: torch.Tensor) -> torch.Tensor: - """Convert input batch of C-channel spectrograms into - a batch of time-frequency features with dimension num_feat. - The output number of channels may be the same as input, or - reduced to 1, e.g., if averaging over magnitude and not appending individual IPDs. 
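[Editor's note] For illustration, the length-aware statistics above (mean over channels and valid time frames only) can be sketched in plain PyTorch without NeMo's `temporal_mean` helper; the shapes and the function name are assumptions made for this sketch.

```python
import torch

def masked_mean_time_channel(x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Mean of a (B, C, F, T) tensor over channels and valid time frames only."""
    B, C, F, T = x.shape
    # Boolean mask of valid frames, broadcast to (B, 1, 1, T)
    valid = torch.arange(T, device=x.device)[None, :] < lengths[:, None]
    valid = valid[:, None, None, :]
    # Number of valid (channel, frame) entries per example, shape (B, 1, 1, 1)
    num_valid = (lengths[:, None, None, None] * C).to(x.dtype)
    return (x * valid).sum(dim=(-1, -3), keepdim=True) / num_valid

x = torch.randn(2, 4, 257, 100)
lengths = torch.tensor([100, 60])
mean = masked_mean_time_channel(x, lengths)   # shape (2, 1, 257, 1)
```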
- - Args: - input: Spectrogram for C channels with F subbands and N time frames, (B, C, F, N) - input_length: Length of valid entries along the time dimension, shape (B,) - - Returns: - num_feat_channels channels with num_feat features, shape (B, num_feat_channels, num_feat, N) - """ - # Magnitude spectrum - if self.mag_reduction is None: - mag = torch.abs(input) - elif self.mag_reduction == 'abs_mean': - mag = torch.abs(torch.mean(input, axis=1, keepdim=True)) - elif self.mag_reduction == 'mean_abs': - mag = torch.mean(torch.abs(input), axis=1, keepdim=True) - elif self.mag_reduction == 'rms': - mag = torch.sqrt(torch.mean(torch.abs(input) ** 2, axis=1, keepdim=True)) - else: - raise ValueError(f'Unexpected magnitude reduction {self.mag_reduction}') - - if self.mag_power is not None: - mag = torch.pow(mag, self.mag_power) - - if self.mag_normalization == 'mean': - # normalize mean across channels and time steps - mag = self.normalize_mean(input=mag, input_length=input_length) - elif self.mag_normalization == 'mean_var': - mag = self.normalize_mean_var(input=mag, input_length=input_length) - - features = mag - - if self.use_ipd: - # Calculate IPD relative to the average spec - spec_mean = torch.mean(input, axis=1, keepdim=True) # channel average - ipd = torch.angle(input) - torch.angle(spec_mean) - # Modulo to [-pi, pi] - ipd = wrap_to_pi(ipd) - - if self.ipd_normalization == 'mean': - # normalize mean across channels and time steps - # mean across time - ipd = self.normalize_mean(input=ipd, input_length=input_length) - elif self.ipd_normalization == 'mean_var': - ipd = self.normalize_mean_var(input=ipd, input_length=input_length) - - # Concatenate to existing features - features = torch.cat([features.expand(ipd.shape), ipd], axis=2) - - if self._num_channels is not None and features.size(1) != self._num_channels: - raise RuntimeError( - f'Number of channels in features {features.size(1)} is different than the configured number of channels {self._num_channels}' - ) - - return features, input_length - - -class MaskEstimatorRNN(NeuralModule): - """Estimate `num_outputs` masks from the input spectrogram - using stacked RNNs and projections. - - The module is structured as follows: - input --> spatial features --> input projection --> - --> stacked RNNs --> output projection for each output --> sigmoid - - Reference: - Multi-microphone neural speech separation for far-field multi-talker - speech recognition (https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8462081) - - Args: - num_outputs: Number of output masks to estimate - num_subbands: Number of subbands of the input spectrogram - num_features: Number of features after the input projections - num_layers: Number of RNN layers - num_hidden_features: Number of hidden features in RNN layers - num_input_channels: Number of input channels - dropout: If non-zero, introduces dropout on the outputs of each RNN layer except the last layer, with dropout - probability equal to `dropout`. Default: 0 - bidirectional: If `True`, use bidirectional RNN. - rnn_type: Type of RNN, either `lstm` or `gru`. 
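[Editor's note] A rough standalone sketch of the magnitude-plus-IPD feature construction described above, with the angle wrapping written out explicitly since the NeMo `wrap_to_pi` helper is not imported here; no normalization or channel reduction is applied in this toy version.

```python
import torch

def wrap_to_pi_sketch(x: torch.Tensor) -> torch.Tensor:
    # Wrap angles to the interval [-pi, pi)
    return torch.remainder(x + torch.pi, 2 * torch.pi) - torch.pi

spec = torch.randn(2, 4, 257, 100, dtype=torch.cfloat)  # (B, C, F, T) complex spectrogram
mag = torch.abs(spec)                                   # per-channel magnitude
spec_mean = spec.mean(dim=1, keepdim=True)              # channel-averaged spectrum
ipd = wrap_to_pi_sketch(spec.angle() - spec_mean.angle())
features = torch.cat([mag, ipd], dim=2)                 # (B, C, 2*F, T)
```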
Default: `lstm` - mag_reduction: Channel-wise reduction for magnitude features - use_ipd: Use inter-channel phase difference (IPD) features - """ - - def __init__( - self, - num_outputs: int, - num_subbands: int, - num_features: int = 1024, - num_layers: int = 3, - num_hidden_features: Optional[int] = None, - num_input_channels: Optional[int] = None, - dropout: float = 0, - bidirectional=True, - rnn_type: str = 'lstm', - mag_reduction: str = 'rms', - use_ipd: bool = None, - ): - super().__init__() - if num_hidden_features is None: - num_hidden_features = num_features - - self.features = SpectrogramToMultichannelFeatures( - num_subbands=num_subbands, - num_input_channels=num_input_channels, - mag_reduction=mag_reduction, - use_ipd=use_ipd, - ) - - self.input_projection = torch.nn.Linear( - in_features=self.features.num_features * self.features.num_channels, out_features=num_features - ) - - if rnn_type == 'lstm': - self.rnn = torch.nn.LSTM( - input_size=num_features, - hidden_size=num_hidden_features, - num_layers=num_layers, - batch_first=True, - dropout=dropout, - bidirectional=bidirectional, - ) - elif rnn_type == 'gru': - self.rnn = torch.nn.GRU( - input_size=num_features, - hidden_size=num_hidden_features, - num_layers=num_layers, - batch_first=True, - dropout=dropout, - bidirectional=bidirectional, - ) - else: - raise ValueError(f'Unknown rnn_type: {rnn_type}') - - self.fc = torch.nn.Linear( - in_features=2 * num_features if bidirectional else num_features, out_features=num_features - ) - self.norm = torch.nn.LayerNorm(num_features) - - # Each output shares the RNN and has a separate projection - self.output_projections = torch.nn.ModuleList( - [torch.nn.Linear(in_features=num_features, out_features=num_subbands) for _ in range(num_outputs)] - ) - self.output_nonlinearity = torch.nn.Sigmoid() - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), FloatType()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @typecheck() - def forward(self, input: torch.Tensor, input_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Estimate `num_outputs` masks from the input spectrogram. 
- - Args: - input: C-channel input, shape (B, C, F, N) - input_length: Length of valid entries along the time dimension, shape (B,) - - Returns: - Returns `num_outputs` masks in a tensor, shape (B, num_outputs, F, N), - and output length with shape (B,) - """ - input, _ = self.features(input=input, input_length=input_length) - B, num_feature_channels, num_features, N = input.shape - - # (B, num_feat_channels, num_feat, N) -> (B, N, num_feat_channels, num_feat) - input = input.permute(0, 3, 1, 2) - - # (B, N, num_feat_channels, num_feat) -> (B, N, num_feat_channels * num_features) - input = input.view(B, N, -1) - - # Apply projection on num_feat - input = self.input_projection(input) - - # Apply RNN on the input sequence - input_packed = torch.nn.utils.rnn.pack_padded_sequence( - input, input_length.cpu(), batch_first=True, enforce_sorted=False - ).to(input.device) - self.rnn.flatten_parameters() - input_packed, _ = self.rnn(input_packed) - output, output_length = torch.nn.utils.rnn.pad_packed_sequence(input_packed, batch_first=True) - output_length = output_length.to(input.device) - - # Layer normalization and skip connection - output = self.norm(self.fc(output)) + input - - # Create `num_outputs` masks - masks = [] - for output_projection in self.output_projections: - # Output projection - mask = output_projection(output) - mask = self.output_nonlinearity(mask) - - # Back to the original format - # (B, N, F) -> (B, F, N) - mask = mask.transpose(2, 1) - - # Append to the output - masks.append(mask) - - # Stack along channel dimension to get (B, M, F, N) - masks = torch.stack(masks, axis=1) - - # Mask frames beyond output length - length_mask: torch.Tensor = make_seq_mask_like( - lengths=output_length, like=masks, time_dim=-1, valid_ones=False - ) - masks = masks.masked_fill(length_mask, 0.0) - - return masks, output_length - - -class MaskEstimatorFlexChannels(NeuralModule): - """Estimate `num_outputs` masks from the input spectrogram - using stacked channel-wise and temporal layers. - - This model is using interlaved channel blocks and temporal blocks, and - it can process arbitrary number of input channels. - Default channel block is the transform-average-concatenate layer. - Default temporal block is the Conformer encoder. - Reduction from multichannel signal to single-channel signal is performed - after `channel_reduction_position` blocks. Only temporal blocks are used afterwards. - After the sequence of blocks, the output mask is computed using an additional - output temporal layer and a nonlinearity. - - References: - - Yoshioka et al, VarArray: Array-Geometry-Agnostic Continuous Speech Separation, 2022 - - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023 - - Args: - num_outputs: Number of output masks. - num_subbands: Number of subbands on the input spectrogram. - num_blocks: Number of blocks in the model. - channel_reduction_position: After this block, the signal will be reduced across channels. 
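[Editor's note] The mask estimator above packs the padded batch before the recurrence so that padded frames do not affect the RNN state; a self-contained sketch of that generic pattern (toy sizes, unrelated to the NeMo classes) looks like:

```python
import torch

rnn = torch.nn.LSTM(input_size=16, hidden_size=32, batch_first=True, bidirectional=True)
x = torch.randn(3, 50, 16)                      # (batch, time, features)
lengths = torch.tensor([50, 42, 17])

packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
packed_out, _ = rnn(packed)
out, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
# out: (3, 50, 64); frames beyond each sequence length are zero-padded
```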
- channel_reduction_type: Reduction across channels: 'average' or 'attention' - channel_block_type: Block for channel processing: 'transform_average_concatenate' or 'transform_attend_concatenate' - temporal_block_type: Block for temporal processing: 'conformer_encoder' - temporal_block_num_layers: Number of layers for the temporal block - temporal_block_num_heads: Number of heads for the temporal block - temporal_block_dimension: The hidden size of the model - temporal_block_self_attention_model: Self attention model for the temporal block - temporal_block_att_context_size: Attention context size for the temporal block - mag_reduction: Channel-wise reduction for magnitude features - mag_power: Power to apply on magnitude features - use_ipd: Use inter-channel phase difference (IPD) features - mag_normalization: Normalize using mean ('mean') or mean and variance ('mean_var') - ipd_normalization: Normalize using mean ('mean') or mean and variance ('mean_var') - """ - - def __init__( - self, - num_outputs: int, - num_subbands: int, - num_blocks: int, - channel_reduction_position: int = -1, # if 0, apply before block 0, if -1 apply at the end - channel_reduction_type: str = 'attention', - channel_block_type: str = 'transform_attend_concatenate', - temporal_block_type: str = 'conformer_encoder', - temporal_block_num_layers: int = 5, - temporal_block_num_heads: int = 4, - temporal_block_dimension: int = 128, - temporal_block_self_attention_model: str = 'rel_pos', - temporal_block_att_context_size: Optional[List[int]] = None, - num_input_channels: Optional[int] = None, - mag_reduction: str = 'abs_mean', - mag_power: Optional[float] = None, - use_ipd: bool = True, - mag_normalization: Optional[str] = None, - ipd_normalization: Optional[str] = None, - ): - super().__init__() - - self.features = SpectrogramToMultichannelFeatures( - num_subbands=num_subbands, - num_input_channels=num_input_channels, - mag_reduction=mag_reduction, - mag_power=mag_power, - use_ipd=use_ipd, - mag_normalization=mag_normalization, - ipd_normalization=ipd_normalization, - ) - self.num_blocks = num_blocks - logging.debug('Total number of blocks: %d', self.num_blocks) - - # Channel reduction - if channel_reduction_position == -1: - # Apply reduction after the last layer - channel_reduction_position = num_blocks - - if channel_reduction_position > num_blocks: - raise ValueError( - f'Channel reduction position {channel_reduction_position} exceeds the number of blocks {num_blocks}' - ) - self.channel_reduction_position = channel_reduction_position - logging.debug('Channel reduction will be applied before block %d', self.channel_reduction_position) - - # Prepare processing blocks - self.channel_blocks = torch.nn.ModuleList() - self.temporal_blocks = torch.nn.ModuleList() - - for n in range(num_blocks): - logging.debug('Prepare block %d', n) - - # Setup channel block - if n < channel_reduction_position: - # Number of input features is either the number of input channels or the number of temporal block features - channel_in_features = self.features.num_features if n == 0 else temporal_block_dimension - logging.debug( - 'Setup channel block %s with %d input features and %d output features', - channel_block_type, - channel_in_features, - temporal_block_dimension, - ) - - # Instantiante the channel block - if channel_block_type == 'transform_average_concatenate': - channel_block = TransformAverageConcatenate( - in_features=channel_in_features, out_features=temporal_block_dimension - ) - elif channel_block_type == 
'transform_attend_concatenate': - channel_block = TransformAttendConcatenate( - in_features=channel_in_features, out_features=temporal_block_dimension - ) - else: - raise ValueError(f'Unknown channel layer type: {channel_block_type}') - self.channel_blocks.append(channel_block) - - # Setup temporal block - temporal_in_features = ( - self.features.num_features if n == self.channel_reduction_position == 0 else temporal_block_dimension - ) - logging.debug('Setup temporal block %s', temporal_block_type) - if temporal_block_type == 'conformer_encoder': - temporal_block = ConformerEncoder( - feat_in=temporal_in_features, - n_layers=temporal_block_num_layers, - d_model=temporal_block_dimension, - subsampling_factor=1, - self_attention_model=temporal_block_self_attention_model, - att_context_size=temporal_block_att_context_size, - n_heads=temporal_block_num_heads, - ) - else: - raise ValueError(f'Unknown temporal block {temporal_block}.') - - self.temporal_blocks.append(temporal_block) - - logging.debug('Setup channel reduction %s', channel_reduction_type) - if channel_reduction_type == 'average': - # Mean across channel dimension - self.channel_reduction = ChannelAveragePool() - elif channel_reduction_type == 'attention': - # Number of input features is either the number of input channels or the number of temporal block features - channel_reduction_in_features = ( - self.features.num_features if self.channel_reduction_position == 0 else temporal_block_dimension - ) - # Attention across channel dimension - self.channel_reduction = ChannelAttentionPool(in_features=channel_reduction_in_features) - else: - raise ValueError(f'Unknown channel reduction type: {channel_reduction_type}') - - logging.debug('Setup %d output layers', num_outputs) - self.output_layers = torch.nn.ModuleList( - [ - ConformerEncoder( - feat_in=temporal_block_dimension, - n_layers=1, - d_model=temporal_block_dimension, - feat_out=num_subbands, - subsampling_factor=1, - self_attention_model=temporal_block_self_attention_model, - att_context_size=temporal_block_att_context_size, - n_heads=temporal_block_num_heads, - ) - for _ in range(num_outputs) - ] - ) - - # Output nonlinearity - self.output_nonlinearity = torch.nn.Sigmoid() - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), FloatType()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @typecheck() - def forward(self, input: torch.Tensor, input_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Estimate `num_outputs` masks from the input spectrogram. 
- """ - # get input features from a complex-valued spectrogram, (B, C, F, T) - output, output_length = self.features(input=input, input_length=input_length) - - # batch and num channels - B, M = input.size(0), input.size(1) - - # process all blocks - for n in range(self.num_blocks): - if n < self.channel_reduction_position: - # apply multichannel block - output = self.channel_blocks[n](input=output) - # change to a single-stream format - F, T = output.size(-2), output.size(-1) - # (B, M, F, T) -> (B * M, F, T) - output = output.reshape(-1, F, T) - if M > 1: - # adjust the lengths accordingly - output_length = output_length.repeat_interleave(M) - - elif n == self.channel_reduction_position: - # apply channel reduction - # (B, M, F, T) -> (B, F, T) - output = self.channel_reduction(input=output) - - # apply temporal model on each channel independently - with typecheck.disable_checks(): - # output is AcousticEncodedRepresentation, conformer encoder requires SpectrogramType - output, output_length = self.temporal_blocks[n](audio_signal=output, length=output_length) - - # if channel reduction has not been applied yet, go back to multichannel layout - if n < self.channel_reduction_position: - # back to multi-channel format with possibly a different number of features - T = output.size(-1) - # (B * M, F, T) -> (B, M, F, T) - output = output.reshape(B, M, -1, T) - if M > 1: - # convert lengths from single-stream format to original multichannel - output_length = output_length[0:-1:M] - - if self.channel_reduction_position == self.num_blocks: - # apply channel reduction after the last layer - # (B, M, F, T) -> (B, F, T) - output = self.channel_reduction(input=output) - - # final mask for each output - masks = [] - for output_layer in self.output_layers: - # calculate mask - with typecheck.disable_checks(): - # output is AcousticEncodedRepresentation, conformer encoder requires SpectrogramType - mask, mask_length = output_layer(audio_signal=output, length=output_length) - mask = self.output_nonlinearity(mask) - # append to all masks - masks.append(mask) - - # stack masks along channel dimensions - masks = torch.stack(masks, dim=1) - - return masks, mask_length - - -class MaskReferenceChannel(NeuralModule): - """A simple mask processor which applies mask - on ref_channel of the input signal. - - Args: - ref_channel: Index of the reference channel. - mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB - mask_max_db: Threshold mask to a maximal value before applying it, defaults to 0dB - """ - - def __init__(self, ref_channel: int = 0, mask_min_db: float = -200, mask_max_db: float = 0): - super().__init__() - self.ref_channel = ref_channel - # Mask thresholding - self.mask_min = db2mag(mask_min_db) - self.mask_max = db2mag(mask_max_db) - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tref_channel: %d', self.ref_channel) - logging.debug('\tmask_min: %f', self.mask_min) - logging.debug('\tmask_max: %f', self.mask_max) - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType()), - "mask": NeuralType(('B', 'C', 'D', 'T'), FloatType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. 
- """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @typecheck() - def forward( - self, input: torch.Tensor, input_length: torch.Tensor, mask: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Apply mask on `ref_channel` of the input signal. - This can be used to generate multi-channel output. - If `mask` has `M` channels, the output will have `M` channels as well. - - Args: - input: Input signal complex-valued spectrogram, shape (B, C, F, N) - input_length: Length of valid entries along the time dimension, shape (B,) - mask: Mask for M outputs, shape (B, M, F, N) - - Returns: - M-channel output complex-valed spectrogram with shape (B, M, F, N) - """ - # Apply thresholds - mask = torch.clamp(mask, min=self.mask_min, max=self.mask_max) - - # Apply each output mask on the ref channel - output = mask * input[:, self.ref_channel : self.ref_channel + 1, ...] - return output, input_length - - -class MaskBasedBeamformer(NeuralModule): - """Multi-channel processor using masks to estimate signal statistics. - - Args: - filter_type: string denoting the type of the filter. Defaults to `mvdr` - filter_beta: Parameter of the parameteric multichannel Wiener filter - filter_rank: Parameter of the parametric multichannel Wiener filter - filter_postfilter: Optional, postprocessing of the filter - ref_channel: Optional, reference channel. If None, it will be estimated automatically - ref_hard: If true, hard (one-hot) reference. If false, a soft reference - ref_hard_use_grad: If true, use straight-through gradient when using the hard reference - ref_subband_weighting: If true, use subband weighting when estimating reference channel - num_subbands: Optional, used to determine the parameter size for reference estimation - mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB - mask_max_db: Threshold mask to a maximal value before applying it, defaults to 0dB - diag_reg: Optional, diagonal regularization for the multichannel filter - eps: Small regularization constant to avoid division by zero - """ - - def __init__( - self, - filter_type: str = 'mvdr_souden', - filter_beta: float = 0.0, - filter_rank: str = 'one', - filter_postfilter: Optional[str] = None, - ref_channel: Optional[int] = 0, - ref_hard: bool = True, - ref_hard_use_grad: bool = False, - ref_subband_weighting: bool = False, - num_subbands: Optional[int] = None, - mask_min_db: float = -200, - mask_max_db: float = 0, - postmask_min_db: float = 0, - postmask_max_db: float = 0, - diag_reg: Optional[float] = 1e-6, - eps: float = 1e-8, - ): - super().__init__() - if filter_type not in ['pmwf', 'mvdr_souden']: - raise ValueError(f'Unknown filter type {filter_type}') - - self.filter_type = filter_type - if self.filter_type == 'mvdr_souden' and filter_beta != 0: - logging.warning( - 'Using filter type %s: beta will be automatically set to zero (current beta %f) and rank to one (current rank %s).', - self.filter_type, - filter_beta, - filter_rank, - ) - filter_beta = 0.0 - filter_rank = 'one' - # Prepare filter - self.filter = ParametricMultichannelWienerFilter( - beta=filter_beta, - rank=filter_rank, - postfilter=filter_postfilter, - ref_channel=ref_channel, - ref_hard=ref_hard, - ref_hard_use_grad=ref_hard_use_grad, - ref_subband_weighting=ref_subband_weighting, - num_subbands=num_subbands, - diag_reg=diag_reg, - eps=eps, - ) - # Mask thresholding - if mask_min_db >= mask_max_db: - raise ValueError( - f'Lower bound for the 
mask {mask_min_db}dB must be smaller than the upper bound {mask_max_db}dB' - ) - self.mask_min = db2mag(mask_min_db) - self.mask_max = db2mag(mask_max_db) - # Postmask thresholding - if postmask_min_db > postmask_max_db: - raise ValueError( - f'Lower bound for the postmask {postmask_min_db}dB must be smaller or equal to the upper bound {postmask_max_db}dB' - ) - self.postmask_min = db2mag(postmask_min_db) - self.postmask_max = db2mag(postmask_max_db) - - logging.debug('Initialized %s', self.__class__.__name__) - logging.debug('\tfilter_type: %s', self.filter_type) - logging.debug('\tmask_min: %e', self.mask_min) - logging.debug('\tmask_max: %e', self.mask_max) - logging.debug('\tpostmask_min: %e', self.postmask_min) - logging.debug('\tpostmask_max: %e', self.postmask_max) - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "mask": NeuralType(('B', 'C', 'D', 'T'), FloatType()), - "mask_undesired": NeuralType(('B', 'C', 'D', 'T'), FloatType(), optional=True), - "input_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @typecheck() - def forward( - self, - input: torch.Tensor, - mask: torch.Tensor, - mask_undesired: Optional[torch.Tensor] = None, - input_length: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Apply a mask-based beamformer to the input spectrogram. - This can be used to generate multi-channel output. - If `mask` has multiple channels, a multichannel filter is created for each mask, - and the output is concatenation of individual outputs along the channel dimension. - The total number of outputs is `num_masks * M`, where `M` is the number of channels - at the filter output. - - Args: - input: Input signal complex-valued spectrogram, shape (B, C, F, N) - mask: Mask for M output signals, shape (B, num_masks, F, N) - input_length: Length of valid entries along the time dimension, shape (B,) - - Returns: - Multichannel output signal complex-valued spectrogram, shape (B, num_masks * M, F, N) - """ - # Length mask - if input_length is not None: - length_mask: torch.Tensor = make_seq_mask_like( - lengths=input_length, like=mask[:, 0, ...], time_dim=-1, valid_ones=False - ) - - # Use each mask to generate an output - output, num_masks = [], mask.size(1) - for m in range(num_masks): - # Desired signal mask - mask_d = mask[:, m, ...] - # Undesired signal mask - if mask_undesired is not None: - mask_u = mask_undesired[:, m, ...] 
- elif num_masks == 1: - # If a single mask is estimated, use the complement - mask_u = 1 - mask_d - else: - # Use sum of all other sources - mask_u = torch.sum(mask, dim=1) - mask_d - - # Threshold masks - mask_d = torch.clamp(mask_d, min=self.mask_min, max=self.mask_max) - mask_u = torch.clamp(mask_u, min=self.mask_min, max=self.mask_max) - - if input_length is not None: - mask_d = mask_d.masked_fill(length_mask, 0.0) - mask_u = mask_u.masked_fill(length_mask, 0.0) - - # Apply filter - output_m = self.filter(input=input, mask_s=mask_d, mask_n=mask_u) - - # Optional: apply a postmask with min and max thresholds - if self.postmask_min < self.postmask_max: - postmask_m = torch.clamp(mask[:, m, ...], min=self.postmask_min, max=self.postmask_max) - output_m = output_m * postmask_m.unsqueeze(1) - - # Save the current output (B, M, F, T) - output.append(output_m) - - # Combine outputs along the channel dimension - # Each output is (B, M, F, T) - output = torch.concatenate(output, axis=1) - - # Apply masking - if input_length is not None: - output = output.masked_fill(length_mask[:, None, ...], 0.0) - - return output, input_length - - -class WPEFilter(NeuralModule): - """A weighted prediction error filter. - Given input signal, and expected power of the desired signal, this - class estimates a multiple-input multiple-output prediction filter - and returns the filtered signal. Currently, estimation of statistics - and processing is performed in batch mode. - - Args: - filter_length: Length of the prediction filter in frames, per channel - prediction_delay: Prediction delay in frames - diag_reg: Diagonal regularization for the correlation matrix Q, applied as diag_reg * trace(Q) + eps - eps: Small positive constant for regularization - - References: - - Yoshioka and Nakatani, Generalization of Multi-Channel Linear Prediction - Methods for Blind MIMO Impulse Response Shortening, 2012 - - Jukić et al, Group sparsity for MIMO speech dereverberation, 2015 - """ - - def __init__(self, filter_length: int, prediction_delay: int, diag_reg: Optional[float] = 1e-6, eps: float = 1e-8): - super().__init__() - self.filter_length = filter_length - self.prediction_delay = prediction_delay - self.diag_reg = diag_reg - self.eps = eps - - logging.debug('Initialized %s', self.__class__.__name__) - logging.debug('\tfilter_length: %d', self.filter_length) - logging.debug('\tprediction_delay: %d', self.prediction_delay) - logging.debug('\tdiag_reg: %g', self.diag_reg) - logging.debug('\teps: %g', self.eps) - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "power": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @typecheck() - def forward( - self, input: torch.Tensor, power: torch.Tensor, input_length: Optional[torch.Tensor] = None - ) -> torch.Tensor: - """Given input and the predicted power for the desired signal, estimate - the WPE filter and return the processed signal. 
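[Editor's note] In the loop above, the undesired-signal mask defaults to the complement of the desired mask when only one mask is estimated, and to the sum of the remaining masks otherwise; stripped of the filtering itself, that rule is simply:

```python
import torch

mask = torch.rand(2, 3, 257, 100)   # (B, num_masks, F, T), random masks for illustration

undesired = []
for m in range(mask.size(1)):
    mask_d = mask[:, m, ...]
    if mask.size(1) == 1:
        mask_u = 1 - mask_d                  # single source: complement of the desired mask
    else:
        mask_u = mask.sum(dim=1) - mask_d    # several sources: sum of all the other masks
    undesired.append(mask_u)

undesired = torch.stack(undesired, dim=1)    # (B, num_masks, F, T)
```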
- - Args: - input: Input signal, shape (B, C, F, N) - power: Predicted power of the desired signal, shape (B, C, F, N) - input_length: Optional, length of valid frames in `input`. Defaults to `None` - - Returns: - Tuple of (processed_signal, output_length). Processed signal has the same - shape as the input signal (B, C, F, N), and the output length is the same - as the input length. - """ - # Temporal weighting: average power over channels, output shape (B, F, N) - weight = torch.mean(power, dim=1) - # Use inverse power as the weight - weight = 1 / (weight + self.eps) - - # Multi-channel convolution matrix for each subband - tilde_input = self.convtensor(input, filter_length=self.filter_length, delay=self.prediction_delay) - - # Estimate correlation matrices - Q, R = self.estimate_correlations( - input=input, weight=weight, tilde_input=tilde_input, input_length=input_length - ) - - # Estimate prediction filter - G = self.estimate_filter(Q=Q, R=R) - - # Apply prediction filter - undesired_signal = self.apply_filter(filter=G, tilde_input=tilde_input) - - # Dereverberation - desired_signal = input - undesired_signal - - if input_length is not None: - # Mask padded frames - length_mask: torch.Tensor = make_seq_mask_like( - lengths=input_length, like=desired_signal, time_dim=-1, valid_ones=False - ) - desired_signal = desired_signal.masked_fill(length_mask, 0.0) - - return desired_signal, input_length - - @classmethod - def convtensor( - cls, x: torch.Tensor, filter_length: int, delay: int = 0, n_steps: Optional[int] = None - ) -> torch.Tensor: - """Create a tensor equivalent of convmtx_mc for each example in the batch. - The input signal tensor `x` has shape (B, C, F, N). - Convtensor returns a view of the input signal `x`. - - Note: We avoid reshaping the output to collapse channels and filter taps into - a single dimension, e.g., (B, F, N, -1). In this way, the output is a view of the input, - while an additional reshape would result in a contiguous array and more memory use. - - Args: - x: input tensor, shape (B, C, F, N) - filter_length: length of the filter, determines the shape of the convolution tensor - delay: delay to add to the input signal `x` before constructing the convolution tensor - n_steps: Optional, number of time steps to keep in the out. Defaults to the number of - time steps in the input tensor. - - Returns: - Return a convolutional tensor with shape (B, C, F, n_steps, filter_length) - """ - if x.ndim != 4: - raise RuntimeError(f'Expecting a 4-D input. Received input with shape {x.shape}') - - B, C, F, N = x.shape - - if n_steps is None: - # Keep the same length as the input signal - n_steps = N - - # Pad temporal dimension - x = torch.nn.functional.pad(x, (filter_length - 1 + delay, 0)) - - # Build Toeplitz-like matrix view by unfolding across time - tilde_X = x.unfold(-1, filter_length, 1) - - # Trim to the set number of time steps - tilde_X = tilde_X[:, :, :, :n_steps, :] - - return tilde_X - - @classmethod - def permute_convtensor(cls, x: torch.Tensor) -> torch.Tensor: - """Reshape and permute columns to convert the result of - convtensor to be equal to convmtx_mc. This is used for verification - purposes and it is not required to use the filter. - - Args: - x: output of self.convtensor, shape (B, C, F, N, filter_length) - - Returns: - Output has shape (B, F, N, C*filter_length) that corresponds to - the layout of convmtx_mc. 
- """ - B, C, F, N, filter_length = x.shape - - # .view will not work, so a copy will have to be created with .reshape - # That will result in more memory use, since we don't use a view of the original - # multi-channel signal - x = x.permute(0, 2, 3, 1, 4) - x = x.reshape(B, F, N, C * filter_length) - - permute = [] - for m in range(C): - permute[m * filter_length : (m + 1) * filter_length] = m * filter_length + np.flip( - np.arange(filter_length) - ) - return x[..., permute] - - def estimate_correlations( - self, - input: torch.Tensor, - weight: torch.Tensor, - tilde_input: torch.Tensor, - input_length: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor]: - """ - Args: - input: Input signal, shape (B, C, F, N) - weight: Time-frequency weight, shape (B, F, N) - tilde_input: Multi-channel convolution tensor, shape (B, C, F, N, filter_length) - input_length: Length of each input example, shape (B) - - Returns: - Returns a tuple of correlation matrices for each batch. - - Let `X` denote the input signal in a single subband, - `tilde{X}` the corresponding multi-channel correlation matrix, - and `w` the vector of weights. - - The first output is - Q = tilde{X}^H * diag(w) * tilde{X} (1) - for each (b, f). - The matrix calculated in (1) has shape (C * filter_length, C * filter_length) - The output is returned in a tensor with shape (B, F, C, filter_length, C, filter_length). - - The second output is - R = tilde{X}^H * diag(w) * X (2) - for each (b, f). - The matrix calculated in (2) has shape (C * filter_length, C) - The output is returned in a tensor with shape (B, F, C, filter_length, C). The last - dimension corresponds to output channels. - """ - if input_length is not None: - # Take only valid samples into account - length_mask: torch.Tensor = make_seq_mask_like( - lengths=input_length, like=weight, time_dim=-1, valid_ones=False - ) - weight = weight.masked_fill(length_mask, 0.0) - - # Calculate (1) - # result: (B, F, C, filter_length, C, filter_length) - Q = torch.einsum('bjfik,bmfin->bfjkmn', tilde_input.conj(), weight[:, None, :, :, None] * tilde_input) - - # Calculate (2) - # result: (B, F, C, filter_length, C) - R = torch.einsum('bjfik,bmfi->bfjkm', tilde_input.conj(), weight[:, None, :, :] * input) - - return Q, R - - def estimate_filter(self, Q: torch.Tensor, R: torch.Tensor) -> torch.Tensor: - """Estimate the MIMO prediction filter as - G(b,f) = Q(b,f) \ R(b,f) - for each subband in each example in the batch (b, f). 
- - Args: - Q: shape (B, F, C, filter_length, C, filter_length) - R: shape (B, F, C, filter_length, C) - - Returns: - Complex-valued prediction filter, shape (B, C, F, C, filter_length) - """ - B, F, C, filter_length, _, _ = Q.shape - assert ( - filter_length == self.filter_length - ), f'Shape of Q {Q.shape} is not matching filter length {self.filter_length}' - - # Reshape to analytical dimensions for each (b, f) - Q = Q.reshape(B, F, C * self.filter_length, C * filter_length) - R = R.reshape(B, F, C * self.filter_length, C) - - # Diagonal regularization - if self.diag_reg: - # Regularization: diag_reg * trace(Q) + eps - diag_reg = self.diag_reg * torch.diagonal(Q, dim1=-2, dim2=-1).sum(-1).real + self.eps - # Apply regularization on Q - Q = Q + torch.diag_embed(diag_reg.unsqueeze(-1) * torch.ones(Q.shape[-1], device=Q.device)) - - # Solve for the filter - G = torch.linalg.solve(Q, R) - - # Reshape to desired representation: (B, F, input channels, filter_length, output channels) - G = G.reshape(B, F, C, filter_length, C) - # Move output channels to front: (B, output channels, F, input channels, filter_length) - G = G.permute(0, 4, 1, 2, 3) - - return G - - def apply_filter( - self, filter: torch.Tensor, input: Optional[torch.Tensor] = None, tilde_input: Optional[torch.Tensor] = None - ) -> torch.Tensor: - """Apply a prediction filter `filter` on the input `input` as - - output(b,f) = tilde{input(b,f)} * filter(b,f) - - If available, directly use the convolution matrix `tilde_input`. - - Args: - input: Input signal, shape (B, C, F, N) - tilde_input: Convolution matrix for the input signal, shape (B, C, F, N, filter_length) - filter: Prediction filter, shape (B, C, F, C, filter_length) - - Returns: - Multi-channel signal obtained by applying the prediction filter on - the input signal, same shape as input (B, C, F, N) - """ - if input is None and tilde_input is None: - raise RuntimeError(f'Both inputs cannot be None simultaneously.') - if input is not None and tilde_input is not None: - raise RuntimeError(f'Both inputs cannot be provided simultaneously.') - - if tilde_input is None: - tilde_input = self.convtensor(input, filter_length=self.filter_length, delay=self.prediction_delay) - - # For each (batch, output channel, f, time step), sum across (input channel, filter tap) - output = torch.einsum('bjfik,bmfjk->bmfi', tilde_input, filter) - - return output - - -class MaskBasedDereverbWPE(NeuralModule): - """Multi-channel linear prediction-based dereverberation using - weighted prediction error for filter estimation. - - An optional mask to estimate the signal power can be provided. - If a time-frequency mask is not provided, the algorithm corresponds - to the conventional WPE algorithm. - - Args: - filter_length: Length of the convolutional filter for each channel in frames. - prediction_delay: Delay of the input signal for multi-channel linear prediction in frames. 
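[Editor's note] Putting the pieces of the WPE filter above together, here is a compact single-channel, single-subband sketch of the same estimation: build a delayed convolution matrix, weight by inverse power, solve for the prediction filter, and subtract the prediction. It uses a plain ridge term instead of the trace-scaled regularization and is only meant to show the structure of the computation.

```python
import torch

def wpe_single_band(x: torch.Tensor, power: torch.Tensor, filter_length: int = 10,
                    delay: int = 3, eps: float = 1e-8) -> torch.Tensor:
    """x, power: (T,) complex signal and its estimated desired-signal power for one subband."""
    T = x.shape[0]
    w = 1.0 / (power + eps)                                    # inverse-power temporal weights

    # Delayed convolution matrix: row t holds x[t - delay - filter_length + 1 : t - delay + 1]
    x_pad = torch.cat([torch.zeros(filter_length - 1 + delay, dtype=x.dtype), x])
    tilde_x = x_pad.unfold(0, filter_length, 1)[:T]            # (T, filter_length)

    # Weighted correlation statistics and the filter solve G = Q^{-1} R
    Q = tilde_x.conj().T @ (w[:, None] * tilde_x)
    R = tilde_x.conj().T @ (w * x)
    G = torch.linalg.solve(Q + eps * torch.eye(filter_length).to(Q), R)

    # Subtract the predicted (late reverberant) component
    return x - tilde_x @ G

x = torch.randn(200, dtype=torch.cdouble)
dereverberated = wpe_single_band(x, power=x.abs() ** 2)
```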
- num_iterations: Number of iterations for reweighting - mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB - mask_max_db: Threshold mask to a maximal value before applying it, defaults to 0dB - diag_reg: Diagonal regularization for WPE - eps: Small regularization constant - dtype: Data type for internal computations - - References: - - Kinoshita et al, Neural network-based spectrum estimation for online WPE dereverberation, 2017 - - Yoshioka and Nakatani, Generalization of Multi-Channel Linear Prediction Methods for Blind MIMO Impulse Response Shortening, 2012 - """ - - def __init__( - self, - filter_length: int, - prediction_delay: int, - num_iterations: int = 1, - mask_min_db: float = -200, - mask_max_db: float = 0, - diag_reg: Optional[float] = 1e-6, - eps: float = 1e-8, - dtype: torch.dtype = torch.cdouble, - ): - super().__init__() - # Filter setup - self.filter = WPEFilter( - filter_length=filter_length, prediction_delay=prediction_delay, diag_reg=diag_reg, eps=eps - ) - self.num_iterations = num_iterations - # Mask thresholding - self.mask_min = db2mag(mask_min_db) - self.mask_max = db2mag(mask_max_db) - # Internal calculations - if dtype not in [torch.cfloat, torch.cdouble]: - raise ValueError(f'Unsupported dtype {dtype}, expecting torch.cfloat or torch.cdouble') - self.dtype = dtype - - logging.debug('Initialized %s', self.__class__.__name__) - logging.debug('\tnum_iterations: %s', self.num_iterations) - logging.debug('\tmask_min: %g', self.mask_min) - logging.debug('\tmask_max: %g', self.mask_max) - logging.debug('\tdtype: %s', self.dtype) - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module input ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType(), optional=True), - "mask": NeuralType(('B', 'C', 'D', 'T'), FloatType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @typecheck() - def forward( - self, input: torch.Tensor, input_length: Optional[torch.Tensor] = None, mask: Optional[torch.Tensor] = None - ) -> torch.Tensor: - """Given an input signal `input`, apply the WPE dereverberation algorithm. - - Args: - input: C-channel complex-valued spectrogram, shape (B, C, F, T) - input_length: Optional length for each signal in the batch, shape (B,) - mask: Optional mask, shape (B, 1, F, T) or (B, C, F, T) - - Returns: - Processed tensor with the same number of channels as the input, - shape (B, C, F, T). - """ - io_dtype = input.dtype - - with torch.cuda.amp.autocast(enabled=False): - output = input.to(dtype=self.dtype) - - if not output.is_complex(): - raise RuntimeError(f'Expecting complex input, got {output.dtype}') - - for i in range(self.num_iterations): - magnitude = torch.abs(output) - if i == 0 and mask is not None: - # Apply thresholds - mask = torch.clamp(mask, min=self.mask_min, max=self.mask_max) - # Mask magnitude - magnitude = mask * magnitude - # Calculate power - power = magnitude ** 2 - # Apply filter - output, output_length = self.filter(input=output, input_length=input_length, power=power) - - return output.to(io_dtype), output_length - - -class MixtureConsistencyProjection(NeuralModule): - """Ensure estimated sources are consistent with the input mixture.
- Note that the input mixture is assumed to be a single-channel signal. - - Args: - weighting: Optional weighting mode for the consistency constraint. - If `None`, use uniform weighting. If `power`, use the power of the - estimated source as the weight. - eps: Small positive value for regularization - - Reference: - Wisdom et al, Differentiable consistency constraints for improved deep speech enhancement, 2018 - """ - - def __init__(self, weighting: Optional[str] = None, eps: float = 1e-8): - super().__init__() - self.weighting = weighting - self.eps = eps - - if self.weighting not in [None, 'power']: - raise NotImplementedError(f'Weighting mode {self.weighting} not implemented') - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module input ports. - """ - return { - "mixture": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "estimate": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, mixture: torch.Tensor, estimate: torch.Tensor) -> torch.Tensor: - """Enforce mixture consistency on the estimated sources. - Args: - mixture: Single-channel mixture, shape (B, 1, F, N) - estimate: M estimated sources, shape (B, M, F, N) - - Returns: - Source estimates consistent with the mixture, shape (B, M, F, N) - """ - # number of sources - M = estimate.size(-3) - # estimated mixture based on the estimated sources - estimated_mixture = torch.sum(estimate, dim=-3, keepdim=True) - - # weighting - if self.weighting is None: - weight = 1 / M - elif self.weighting == 'power': - weight = estimate.abs().pow(2) - weight = weight / (weight.sum(dim=-3, keepdim=True) + self.eps) - else: - raise NotImplementedError(f'Weighting mode {self.weighting} not implemented') - - # consistent estimate - consistent_estimate = estimate + weight * (mixture - estimated_mixture) - - return consistent_estimate diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_preprocessing.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_preprocessing.py deleted file mode 100644 index cc5312403255871316ddf46b02b2d5384a8c53e9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/audio_preprocessing.py +++ /dev/null @@ -1,986 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
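[Editor's note] A quick numerical check of the projection above with uniform weighting: after the correction, the per-source estimates sum back to the mixture, which is the property the class is designed to enforce.

```python
import torch

mixture = torch.randn(1, 1, 257, 50, dtype=torch.cfloat)   # (B, 1, F, T) single-channel mixture
estimate = torch.randn(1, 3, 257, 50, dtype=torch.cfloat)  # (B, M, F, T), M estimated sources

M = estimate.size(-3)
estimated_mixture = estimate.sum(dim=-3, keepdim=True)
consistent = estimate + (1.0 / M) * (mixture - estimated_mixture)

# The corrected sources are mixture-consistent (up to numerical precision)
assert torch.allclose(consistent.sum(dim=-3, keepdim=True), mixture, atol=1e-5)
```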
- -import math -import random -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple - -import torch -from packaging import version - -from nemo.collections.asr.parts.numba.spec_augment import SpecAugmentNumba, spec_augment_launch_heuristics -from nemo.collections.asr.parts.preprocessing.features import ( - FilterbankFeatures, - FilterbankFeaturesTA, - make_seq_mask_like, -) -from nemo.collections.asr.parts.submodules.spectr_augment import SpecAugment, SpecCutout -from nemo.core.classes import Exportable, NeuralModule, typecheck -from nemo.core.neural_types import ( - AudioSignal, - LengthsType, - MelSpectrogramType, - MFCCSpectrogramType, - NeuralType, - SpectrogramType, -) -from nemo.core.utils import numba_utils -from nemo.core.utils.numba_utils import __NUMBA_MINIMUM_VERSION__ -from nemo.utils import logging - -try: - import torchaudio - import torchaudio.functional - import torchaudio.transforms - - TORCHAUDIO_VERSION = version.parse(torchaudio.__version__) - TORCHAUDIO_VERSION_MIN = version.parse('0.5') - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - -__all__ = [ - 'AudioToMelSpectrogramPreprocessor', - 'AudioToSpectrogram', - 'SpectrogramToAudio', - 'AudioToMFCCPreprocessor', - 'SpectrogramAugmentation', - 'MaskedPatchAugmentation', - 'CropOrPadSpectrogramAugmentation', -] - - -class AudioPreprocessor(NeuralModule, ABC): - """ - An interface for Neural Modules that performs audio pre-processing, - transforming the wav files to features. - """ - - def __init__(self, win_length, hop_length): - super().__init__() - - self.win_length = win_length - self.hop_length = hop_length - - self.torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'ones': torch.ones, - None: torch.ones, - } - - @typecheck() - @torch.no_grad() - def forward(self, input_signal, length): - processed_signal, processed_length = self.get_features(input_signal, length) - - return processed_signal, processed_length - - @abstractmethod - def get_features(self, input_signal, length): - # Called by forward(). Subclasses should implement this. - pass - - -class AudioToMelSpectrogramPreprocessor(AudioPreprocessor, Exportable): - """Featurizer module that converts wavs to mel spectrograms. - - Args: - sample_rate (int): Sample rate of the input audio data. - Defaults to 16000 - window_size (float): Size of window for fft in seconds - Defaults to 0.02 - window_stride (float): Stride of window for fft in seconds - Defaults to 0.01 - n_window_size (int): Size of window for fft in samples - Defaults to None. Use one of window_size or n_window_size. - n_window_stride (int): Stride of window for fft in samples - Defaults to None. Use one of window_stride or n_window_stride. - window (str): Windowing function for fft. can be one of ['hann', - 'hamming', 'blackman', 'bartlett'] - Defaults to "hann" - normalize (str): Can be one of ['per_feature', 'all_features']; all - other options disable feature normalization. 'all_features' - normalizes the entire spectrogram to be mean 0 with std 1. - 'pre_features' normalizes per channel / freq instead. - Defaults to "per_feature" - n_fft (int): Length of FT window. If None, it uses the smallest power - of 2 that is larger than n_window_size. - Defaults to None - preemph (float): Amount of pre emphasis to add to audio. Can be - disabled by passing None. 
- Defaults to 0.97 - features (int): Number of mel spectrogram freq bins to output. - Defaults to 64 - lowfreq (int): Lower bound on mel basis in Hz. - Defaults to 0 - highfreq (int): Lower bound on mel basis in Hz. - Defaults to None - log (bool): Log features. - Defaults to True - log_zero_guard_type(str): Need to avoid taking the log of zero. There - are two options: "add" or "clamp". - Defaults to "add". - log_zero_guard_value(float, or str): Add or clamp requires the number - to add with or clamp to. log_zero_guard_value can either be a float - or "tiny" or "eps". torch.finfo is used if "tiny" or "eps" is - passed. - Defaults to 2**-24. - dither (float): Amount of white-noise dithering. - Defaults to 1e-5 - pad_to (int): Ensures that the output size of the time dimension is - a multiple of pad_to. - Defaults to 16 - frame_splicing (int): Defaults to 1 - exact_pad (bool): If True, sets stft center to False and adds padding, such that num_frames = audio_length - // hop_length. Defaults to False. - pad_value (float): The value that shorter mels are padded with. - Defaults to 0 - mag_power (float): The power that the linear spectrogram is raised to - prior to multiplication with mel basis. - Defaults to 2 for a power spec - rng : Random number generator - nb_augmentation_prob (float) : Probability with which narrowband augmentation would be applied to - samples in the batch. - Defaults to 0.0 - nb_max_freq (int) : Frequency above which all frequencies will be masked for narrowband augmentation. - Defaults to 4000 - use_torchaudio: Whether to use the `torchaudio` implementation. - mel_norm: Normalization used for mel filterbank weights. - Defaults to 'slaney' (area normalization) - stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints. - stft_conv: Deprecated argument, kept for compatibility with older checkpoints. - """ - - def save_to(self, save_path: str): - pass - - @classmethod - def restore_from(cls, restore_path: str): - pass - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "input_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), - "length": NeuralType( - tuple('B'), LengthsType() - ), # Please note that length should be in samples not seconds. - } - - @property - def output_types(self): - """Returns definitions of module output ports. 
- - processed_signal: - 0: AxisType(BatchTag) - 1: AxisType(MelSpectrogramSignalTag) - 2: AxisType(ProcessedTimeTag) - processed_length: - 0: AxisType(BatchTag) - """ - return { - "processed_signal": NeuralType(('B', 'D', 'T'), MelSpectrogramType()), - "processed_length": NeuralType(tuple('B'), LengthsType()), - } - - def __init__( - self, - sample_rate=16000, - window_size=0.02, - window_stride=0.01, - n_window_size=None, - n_window_stride=None, - window="hann", - normalize="per_feature", - n_fft=None, - preemph=0.97, - features=64, - lowfreq=0, - highfreq=None, - log=True, - log_zero_guard_type="add", - log_zero_guard_value=2 ** -24, - dither=1e-5, - pad_to=16, - frame_splicing=1, - exact_pad=False, - pad_value=0, - mag_power=2.0, - rng=None, - nb_augmentation_prob=0.0, - nb_max_freq=4000, - use_torchaudio: bool = False, - mel_norm="slaney", - stft_exact_pad=False, # Deprecated arguments; kept for config compatibility - stft_conv=False, # Deprecated arguments; kept for config compatibility - ): - super().__init__(n_window_size, n_window_stride) - - self._sample_rate = sample_rate - if window_size and n_window_size: - raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.") - if window_stride and n_window_stride: - raise ValueError( - f"{self} received both window_stride and " f"n_window_stride. Only one should be specified." - ) - if window_size: - n_window_size = int(window_size * self._sample_rate) - if window_stride: - n_window_stride = int(window_stride * self._sample_rate) - - # Given the long and similar argument list, point to the class and instantiate it by reference - if not use_torchaudio: - featurizer_class = FilterbankFeatures - else: - featurizer_class = FilterbankFeaturesTA - self.featurizer = featurizer_class( - sample_rate=self._sample_rate, - n_window_size=n_window_size, - n_window_stride=n_window_stride, - window=window, - normalize=normalize, - n_fft=n_fft, - preemph=preemph, - nfilt=features, - lowfreq=lowfreq, - highfreq=highfreq, - log=log, - log_zero_guard_type=log_zero_guard_type, - log_zero_guard_value=log_zero_guard_value, - dither=dither, - pad_to=pad_to, - frame_splicing=frame_splicing, - exact_pad=exact_pad, - pad_value=pad_value, - mag_power=mag_power, - rng=rng, - nb_augmentation_prob=nb_augmentation_prob, - nb_max_freq=nb_max_freq, - mel_norm=mel_norm, - stft_exact_pad=stft_exact_pad, # Deprecated arguments; kept for config compatibility - stft_conv=stft_conv, # Deprecated arguments; kept for config compatibility - ) - - def input_example(self, max_batch: int = 8, max_dim: int = 32000, min_length: int = 200): - batch_size = torch.randint(low=1, high=max_batch, size=[1]).item() - max_length = torch.randint(low=min_length, high=max_dim, size=[1]).item() - signals = torch.rand(size=[batch_size, max_length]) * 2 - 1 - lengths = torch.randint(low=min_length, high=max_dim, size=[batch_size]) - lengths[0] = max_length - return signals, lengths - - def get_features(self, input_signal, length): - return self.featurizer(input_signal, length) - - @property - def filter_banks(self): - return self.featurizer.filter_banks - - -class AudioToMFCCPreprocessor(AudioPreprocessor): - """Preprocessor that converts wavs to MFCCs. - Uses torchaudio.transforms.MFCC. - - Args: - sample_rate: The sample rate of the audio. - Defaults to 16000. - window_size: Size of window for fft in seconds. Used to calculate the - win_length arg for mel spectrogram. - Defaults to 0.02 - window_stride: Stride of window for fft in seconds. 
Used to caculate - the hop_length arg for mel spect. - Defaults to 0.01 - n_window_size: Size of window for fft in samples - Defaults to None. Use one of window_size or n_window_size. - n_window_stride: Stride of window for fft in samples - Defaults to None. Use one of window_stride or n_window_stride. - window: Windowing function for fft. can be one of ['hann', - 'hamming', 'blackman', 'bartlett', 'none', 'null']. - Defaults to 'hann' - n_fft: Length of FT window. If None, it uses the smallest power of 2 - that is larger than n_window_size. - Defaults to None - lowfreq (int): Lower bound on mel basis in Hz. - Defaults to 0 - highfreq (int): Lower bound on mel basis in Hz. - Defaults to None - n_mels: Number of mel filterbanks. - Defaults to 64 - n_mfcc: Number of coefficients to retain - Defaults to 64 - dct_type: Type of discrete cosine transform to use - norm: Type of norm to use - log: Whether to use log-mel spectrograms instead of db-scaled. - Defaults to True. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "input_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return { - "processed_signal": NeuralType(('B', 'D', 'T'), MFCCSpectrogramType()), - "processed_length": NeuralType(tuple('B'), LengthsType()), - } - - def save_to(self, save_path: str): - pass - - @classmethod - def restore_from(cls, restore_path: str): - pass - - def __init__( - self, - sample_rate=16000, - window_size=0.02, - window_stride=0.01, - n_window_size=None, - n_window_stride=None, - window='hann', - n_fft=None, - lowfreq=0.0, - highfreq=None, - n_mels=64, - n_mfcc=64, - dct_type=2, - norm='ortho', - log=True, - ): - self._sample_rate = sample_rate - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. Some features might not work.') - - raise ModuleNotFoundError( - "torchaudio is not installed but is necessary for " - "AudioToMFCCPreprocessor. We recommend you try " - "building it from source for the PyTorch version you have." - ) - if window_size and n_window_size: - raise ValueError(f"{self} received both window_size and " f"n_window_size. Only one should be specified.") - if window_stride and n_window_stride: - raise ValueError( - f"{self} received both window_stride and " f"n_window_stride. Only one should be specified." - ) - # Get win_length (n_window_size) and hop_length (n_window_stride) - if window_size: - n_window_size = int(window_size * self._sample_rate) - if window_stride: - n_window_stride = int(window_stride * self._sample_rate) - - super().__init__(n_window_size, n_window_stride) - - mel_kwargs = {} - - mel_kwargs['f_min'] = lowfreq - mel_kwargs['f_max'] = highfreq - mel_kwargs['n_mels'] = n_mels - - mel_kwargs['n_fft'] = n_fft or 2 ** math.ceil(math.log2(n_window_size)) - - mel_kwargs['win_length'] = n_window_size - mel_kwargs['hop_length'] = n_window_stride - - # Set window_fn. None defaults to torch.ones. - window_fn = self.torch_windows.get(window, None) - if window_fn is None: - raise ValueError( - f"Window argument for AudioProcessor is invalid: {window}." - f"For no window function, use 'ones' or None." 
- ) - mel_kwargs['window_fn'] = window_fn - - # Use torchaudio's implementation of MFCCs as featurizer - self.featurizer = torchaudio.transforms.MFCC( - sample_rate=self._sample_rate, - n_mfcc=n_mfcc, - dct_type=dct_type, - norm=norm, - log_mels=log, - melkwargs=mel_kwargs, - ) - - def get_features(self, input_signal, length): - features = self.featurizer(input_signal) - seq_len = torch.ceil(length.to(torch.float32) / self.hop_length).to(dtype=torch.long) - return features, seq_len - - -class SpectrogramAugmentation(NeuralModule): - """ - Performs time and freq cuts in one of two ways. - SpecAugment zeroes out vertical and horizontal sections as described in - SpecAugment (https://arxiv.org/abs/1904.08779). Arguments for use with - SpecAugment are `freq_masks`, `time_masks`, `freq_width`, and `time_width`. - SpecCutout zeroes out rectangulars as described in Cutout - (https://arxiv.org/abs/1708.04552). Arguments for use with Cutout are - `rect_masks`, `rect_freq`, and `rect_time`. - - Args: - freq_masks (int): how many frequency segments should be cut. - Defaults to 0. - time_masks (int): how many time segments should be cut - Defaults to 0. - freq_width (int): maximum number of frequencies to be cut in one - segment. - Defaults to 10. - time_width (int): maximum number of time steps to be cut in one - segment - Defaults to 10. - rect_masks (int): how many rectangular masks should be cut - Defaults to 0. - rect_freq (int): maximum size of cut rectangles along the frequency - dimension - Defaults to 5. - rect_time (int): maximum size of cut rectangles along the time - dimension - Defaults to 25. - """ - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return {"augmented_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - def __init__( - self, - freq_masks=0, - time_masks=0, - freq_width=10, - time_width=10, - rect_masks=0, - rect_time=5, - rect_freq=20, - rng=None, - mask_value=0.0, - use_numba_spec_augment: bool = True, - ): - super().__init__() - - if rect_masks > 0: - self.spec_cutout = SpecCutout(rect_masks=rect_masks, rect_time=rect_time, rect_freq=rect_freq, rng=rng,) - # self.spec_cutout.to(self._device) - else: - self.spec_cutout = lambda input_spec: input_spec - if freq_masks + time_masks > 0: - self.spec_augment = SpecAugment( - freq_masks=freq_masks, - time_masks=time_masks, - freq_width=freq_width, - time_width=time_width, - rng=rng, - mask_value=mask_value, - ) - else: - self.spec_augment = lambda input_spec, length: input_spec - - # Check if numba is supported, and use a Numba kernel if it is - if use_numba_spec_augment and numba_utils.numba_cuda_is_supported(__NUMBA_MINIMUM_VERSION__): - logging.info('Numba CUDA SpecAugment kernel is being used') - self.spec_augment_numba = SpecAugmentNumba( - freq_masks=freq_masks, - time_masks=time_masks, - freq_width=freq_width, - time_width=time_width, - rng=rng, - mask_value=mask_value, - ) - else: - self.spec_augment_numba = None - - @typecheck() - def forward(self, input_spec, length): - augmented_spec = self.spec_cutout(input_spec=input_spec) - - # To run the Numba kernel, correct numba version is required as well as - # tensor must be on GPU and length must be provided - if self.spec_augment_numba is not None and spec_augment_launch_heuristics(augmented_spec, 
length): - augmented_spec = self.spec_augment_numba(input_spec=augmented_spec, length=length) - else: - augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length) - return augmented_spec - - -class MaskedPatchAugmentation(NeuralModule): - """ - Zeroes out fixed size time patches of the spectrogram. - All samples in batch are guaranteed to have the same amount of masked time steps. - Optionally also performs frequency masking in the same way as SpecAugment. - Args: - patch_size (int): up to how many time steps does one patch consist of. - Defaults to 48. - mask_patches (float): how many patches should be masked in each sample. - if >= 1., interpreted as number of patches (after converting to int) - if <1., interpreted as fraction of total tokens to be masked (number of patches is rounded up) - Defaults to 10. - freq_masks (int): how many frequency segments should be cut. - Defaults to 0. - freq_width (int): maximum number of frequencies to be cut in a segment. - Defaults to 0. - """ - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return {"augmented_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - def __init__( - self, patch_size: int = 48, mask_patches: float = 10.0, freq_masks: int = 0, freq_width: int = 0, - ): - super().__init__() - self.patch_size = patch_size - if mask_patches >= 1: - self.mask_patches = int(mask_patches) - elif mask_patches >= 0: - self._mask_fraction = mask_patches - self.mask_patches = None - else: - raise ValueError('mask_patches cannot be negative') - - if freq_masks > 0: - self.spec_augment = SpecAugment(freq_masks=freq_masks, time_masks=0, freq_width=freq_width, time_width=0,) - else: - self.spec_augment = None - - @typecheck() - def forward(self, input_spec, length): - augmented_spec = input_spec - - min_len = torch.min(length) - - if self.mask_patches is None: - # masking specified as fraction - len_fraction = int(min_len * self._mask_fraction) - mask_patches = len_fraction // self.patch_size + int(len_fraction % self.patch_size != 0) - else: - mask_patches = self.mask_patches - - if min_len < self.patch_size * mask_patches: - mask_patches = min_len // self.patch_size - - for idx in range(input_spec.shape[0]): - cur_len = length[idx] - patches = range(cur_len // self.patch_size) - masked_patches = random.sample(patches, mask_patches) - - for mp in masked_patches: - augmented_spec[idx, :, mp * self.patch_size : (mp + 1) * self.patch_size] = 0.0 - - if self.spec_augment is not None: - augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length) - - return augmented_spec - - -class CropOrPadSpectrogramAugmentation(NeuralModule): - """ - Pad or Crop the incoming Spectrogram to a certain shape. - - Args: - audio_length (int): the final number of timesteps that is required. - The signal will be either padded or cropped temporally to this - size. - """ - - def __init__(self, audio_length): - super(CropOrPadSpectrogramAugmentation, self).__init__() - self.audio_length = audio_length - - if self.audio_length < 0: - raise ValueError( - 'audio_length must be non-negative. If using a dataclass with OmegaConf, ' - 'please call OmegaConf.to_object(cfg) to call appropriate __post_init__ methods.' 
- ) - - @typecheck() - @torch.no_grad() - def forward(self, input_signal, length): - image = input_signal - num_images = image.shape[0] - - audio_length = self.audio_length - image_len = image.shape[-1] - - # Crop long signal - if image_len > audio_length: # randomly slice - cutout_images = [] - offset = torch.randint(low=0, high=image_len - audio_length + 1, size=[num_images]) - - for idx, offset in enumerate(offset): - cutout_images.append(image[idx : idx + 1, :, offset : offset + audio_length]) - - image = torch.cat(cutout_images, dim=0) - del cutout_images - - else: # symmetrically pad short signal with zeros - pad_left = (audio_length - image_len) // 2 - pad_right = (audio_length - image_len) // 2 - - if (audio_length - image_len) % 2 == 1: - pad_right += 1 - - image = torch.nn.functional.pad(image, [pad_left, pad_right], mode="constant", value=0) - - # Replace dynamic length sequences with static number of timesteps - length = (length * 0) + audio_length - - return image, length - - @property - def input_types(self): - """Returns definitions of module output ports. - """ - return { - "input_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return { - "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "processed_length": NeuralType(tuple('B'), LengthsType()), - } - - def save_to(self, save_path: str): - pass - - @classmethod - def restore_from(cls, restore_path: str): - pass - - -class AudioToSpectrogram(NeuralModule): - """Transform a batch of input multi-channel signals into a batch of - STFT-based spectrograms. - - Args: - fft_length: length of FFT - hop_length: length of hops/shifts of the sliding window - power: exponent for magnitude spectrogram. Default `None` will - return a complex-valued spectrogram - """ - - def __init__(self, fft_length: int, hop_length: int, power: Optional[float] = None): - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. Some features might not work.') - - raise ModuleNotFoundError( - f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}" - ) - - super().__init__() - - # For now, assume FFT length is divisible by two - if fft_length % 2 != 0: - raise ValueError(f'fft_length = {fft_length} must be divisible by 2') - - self.stft = torchaudio.transforms.Spectrogram( - n_fft=fft_length, hop_length=hop_length, power=power, pad_mode='constant' - ) - - # number of subbands - self.F = fft_length // 2 + 1 - - @property - def num_subbands(self) -> int: - return self.F - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'T'), AudioSignal()), - "input_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @typecheck() - def forward( - self, input: torch.Tensor, input_length: Optional[torch.Tensor] = None - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Convert a batch of C-channel input signals - into a batch of complex-valued spectrograms. 
- - Args: - input: Time-domain input signal with C channels, shape (B, C, T) - input_length: Length of valid entries along the time dimension, shape (B,) - - Returns: - Output spectrogram with F subbands and N time frames, shape (B, C, F, N) - and output length with shape (B,). - """ - B, T = input.size(0), input.size(-1) - input = input.view(B, -1, T) - - # STFT output (B, C, F, N) - with torch.cuda.amp.autocast(enabled=False): - output = self.stft(input.float()) - - if input_length is not None: - # Mask padded frames - output_length = self.get_output_length(input_length=input_length) - - length_mask: torch.Tensor = make_seq_mask_like( - lengths=output_length, like=output, time_dim=-1, valid_ones=False - ) - output = output.masked_fill(length_mask, 0.0) - else: - # Assume all frames are valid for all examples in the batch - output_length = output.size(-1) * torch.ones(B, device=output.device).long() - - return output, output_length - - def get_output_length(self, input_length: torch.Tensor) -> torch.Tensor: - """Get length of valid frames for the output. - - Args: - input_length: number of valid samples, shape (B,) - - Returns: - Number of valid frames, shape (B,) - """ - output_length = input_length.div(self.stft.hop_length, rounding_mode='floor').add(1).long() - return output_length - - -class SpectrogramToAudio(NeuralModule): - """Transform a batch of input multi-channel spectrograms into a batch of - time-domain multi-channel signals. - - Args: - fft_length: length of FFT - hop_length: length of hops/shifts of the sliding window - power: exponent for magnitude spectrogram. Default `None` will - return a complex-valued spectrogram - """ - - def __init__(self, fft_length: int, hop_length: int): - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. Some features might not work.') - - raise ModuleNotFoundError( - f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}" - ) - - super().__init__() - - # For now, assume FFT length is divisible by two - if fft_length % 2 != 0: - raise ValueError(f'fft_length = {fft_length} must be divisible by 2') - - self.istft = torchaudio.transforms.InverseSpectrogram( - n_fft=fft_length, hop_length=hop_length, pad_mode='constant' - ) - - self.F = fft_length // 2 + 1 - - @property - def num_subbands(self) -> int: - return self.F - - @property - def input_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - "input_length": NeuralType(('B',), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Dict[str, NeuralType]: - """Returns definitions of module output ports. - """ - return { - "output": NeuralType(('B', 'C', 'T'), AudioSignal()), - "output_length": NeuralType(('B',), LengthsType()), - } - - @typecheck() - def forward(self, input: torch.Tensor, input_length: Optional[torch.Tensor] = None) -> torch.Tensor: - """Convert input complex-valued spectrogram to a time-domain - signal. Multi-channel IO is supported. - - Args: - input: Input spectrogram for C channels, shape (B, C, F, N) - input_length: Length of valid entries along the time dimension, shape (B,) - - Returns: - Time-domain signal with T time-domain samples and C channels, (B, C, T) - and output length with shape (B,). 
- """ - B, F, N = input.size(0), input.size(-2), input.size(-1) - assert F == self.F, f'Number of subbands F={F} not matching self.F={self.F}' - input = input.view(B, -1, F, N) - - # iSTFT output (B, C, T) - with torch.cuda.amp.autocast(enabled=False): - output = self.istft(input.cfloat()) - - if input_length is not None: - # Mask padded samples - output_length = self.get_output_length(input_length=input_length) - - length_mask: torch.Tensor = make_seq_mask_like( - lengths=output_length, like=output, time_dim=-1, valid_ones=False - ) - output = output.masked_fill(length_mask, 0.0) - else: - # Assume all frames are valid for all examples in the batch - output_length = output.size(-1) * torch.ones(B, device=output.device).long() - - return output, output_length - - def get_output_length(self, input_length: torch.Tensor) -> torch.Tensor: - """Get length of valid samples for the output. - - Args: - input_length: number of valid frames, shape (B,) - - Returns: - Number of valid samples, shape (B,) - """ - output_length = input_length.sub(1).mul(self.istft.hop_length).long() - return output_length - - -@dataclass -class AudioToMelSpectrogramPreprocessorConfig: - _target_: str = "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor" - sample_rate: int = 16000 - window_size: float = 0.02 - window_stride: float = 0.01 - n_window_size: Optional[int] = None - n_window_stride: Optional[int] = None - window: str = "hann" - normalize: str = "per_feature" - n_fft: Optional[int] = None - preemph: float = 0.97 - features: int = 64 - lowfreq: int = 0 - highfreq: Optional[int] = None - log: bool = True - log_zero_guard_type: str = "add" - log_zero_guard_value: float = 2 ** -24 - dither: float = 1e-5 - pad_to: int = 16 - frame_splicing: int = 1 - exact_pad: bool = False - pad_value: int = 0 - mag_power: float = 2.0 - rng: Optional[str] = None - nb_augmentation_prob: float = 0.0 - nb_max_freq: int = 4000 - use_torchaudio: bool = False - mel_norm: str = "slaney" - stft_exact_pad: bool = False # Deprecated argument, kept for compatibility with older checkpoints. - stft_conv: bool = False # Deprecated argument, kept for compatibility with older checkpoints. 
- - -@dataclass -class AudioToMFCCPreprocessorConfig: - _target_: str = 'nemo.collections.asr.modules.AudioToMFCCPreprocessor' - sample_rate: int = 16000 - window_size: float = 0.02 - window_stride: float = 0.01 - n_window_size: Optional[int] = None - n_window_stride: Optional[int] = None - window: str = 'hann' - n_fft: Optional[int] = None - lowfreq: Optional[float] = 0.0 - highfreq: Optional[float] = None - n_mels: int = 64 - n_mfcc: int = 64 - dct_type: int = 2 - norm: str = 'ortho' - log: bool = True - - -@dataclass -class SpectrogramAugmentationConfig: - _target_: str = "nemo.collections.asr.modules.SpectrogramAugmentation" - freq_masks: int = 0 - time_masks: int = 0 - freq_width: int = 0 - time_width: Optional[Any] = 0 - rect_masks: int = 0 - rect_time: int = 0 - rect_freq: int = 0 - mask_value: float = 0 - rng: Optional[Any] = None # random.Random() type - use_numba_spec_augment: bool = True - - -@dataclass -class CropOrPadSpectrogramAugmentationConfig: - audio_length: int - _target_: str = "nemo.collections.asr.modules.CropOrPadSpectrogramAugmentation" - - -@dataclass -class MaskedPatchAugmentationConfig: - patch_size: int = 48 - mask_patches: float = 10.0 - freq_masks: int = 0 - freq_width: int = 0 - _target_: str = "nemo.collections.asr.modules.MaskedPatchAugmentation" diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/beam_search_decoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/beam_search_decoder.py deleted file mode 100644 index b39804ae8e658eccb77c41e12bf1ab8ef63a51f9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/beam_search_decoder.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import LengthsType, LogprobsType, NeuralType, PredictionsType - - -class BeamSearchDecoderWithLM(NeuralModule): - """Neural Module that does CTC beam search with a N-gram language model. - It takes a batch of log_probabilities. Note the bigger the batch, the - better as processing is parallelized. Outputs a list of size batch_size. - Each element in the list is a list of size beam_search, and each element - in that list is a tuple of (final_log_prob, hyp_string). - Args: - vocab (list): List of characters that can be output by the ASR model. For English, this is the 28 character set - {a-z '}. The CTC blank symbol is automatically added. - beam_width (int): Size of beams to keep and expand upon. Larger beams result in more accurate but slower - predictions - alpha (float): The amount of importance to place on the N-gram language model. Larger alpha means more - importance on the LM and less importance on the acoustic model. - beta (float): A penalty term given to longer word sequences. Larger beta will result in shorter sequences. 
- lm_path (str): Path to N-gram language model - num_cpus (int): Number of CPUs to use - cutoff_prob (float): Cutoff probability in vocabulary pruning, default 1.0, no pruning - cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n characters with highest probs in - vocabulary will be used in beam search, default 40. - input_tensor (bool): Set to True if you intend to pass PyTorch Tensors, set to False if you intend to pass - NumPy arrays. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), - "log_probs_length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"predictions": NeuralType(('B', 'T'), PredictionsType())} - - def __init__( - self, vocab, beam_width, alpha, beta, lm_path, num_cpus, cutoff_prob=1.0, cutoff_top_n=40, input_tensor=False - ): - - try: - from ctc_decoders import Scorer, ctc_beam_search_decoder_batch - except ModuleNotFoundError: - raise ModuleNotFoundError( - "BeamSearchDecoderWithLM requires the installation of ctc_decoders " - "from scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh" - ) - - super().__init__() - - if lm_path is not None: - self.scorer = Scorer(alpha, beta, model_path=lm_path, vocabulary=vocab) - else: - self.scorer = None - self.beam_search_func = ctc_beam_search_decoder_batch - self.vocab = vocab - self.beam_width = beam_width - self.num_cpus = num_cpus - self.cutoff_prob = cutoff_prob - self.cutoff_top_n = cutoff_top_n - self.input_tensor = input_tensor - - @typecheck(ignore_collections=True) - @torch.no_grad() - def forward(self, log_probs, log_probs_length): - probs_list = log_probs - if self.input_tensor: - probs = torch.exp(log_probs) - probs_list = [] - for i, prob in enumerate(probs): - probs_list.append(prob[: log_probs_length[i], :]) - res = self.beam_search_func( - probs_list, - self.vocab, - beam_size=self.beam_width, - num_processes=self.num_cpus, - ext_scoring_func=self.scorer, - cutoff_prob=self.cutoff_prob, - cutoff_top_n=self.cutoff_top_n, - ) - return res diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/conformer_encoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/conformer_encoder.py deleted file mode 100644 index 6d2879e02a116952c2fe3ca0e2b9f2758f923a5e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/conformer_encoder.py +++ /dev/null @@ -1,1120 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
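The `BeamSearchDecoderWithLM` module removed above takes batched log-probabilities and returns per-sample beam hypotheses. A hypothetical usage sketch, assuming the optional `ctc_decoders` package has been installed via the script referenced in the module, and using a placeholder vocabulary and LM path:

```
import torch

# Import path assumed from the deleted file's location (beam_search_decoder.py).
from nemo.collections.asr.modules.beam_search_decoder import BeamSearchDecoderWithLM

vocab = list("abcdefghijklmnopqrstuvwxyz' ")  # 28-character English set; CTC blank is added internally
decoder = BeamSearchDecoderWithLM(
    vocab=vocab,
    beam_width=64,
    alpha=1.0,             # weight of the N-gram LM score
    beta=0.5,              # word-insertion bonus
    lm_path='lm.binary',   # placeholder path to a KenLM binary
    num_cpus=4,
    input_tensor=True,     # log_probs below is a PyTorch tensor, not a NumPy array
)

log_probs = torch.log_softmax(torch.randn(2, 100, len(vocab) + 1), dim=-1)  # (B, T, D)
log_probs_length = torch.tensor([100, 80])
hypotheses = decoder(log_probs=log_probs, log_probs_length=log_probs_length)
# hypotheses: list of length B, each entry a list of (score, transcript) beam tuples
```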
- -import math -import random -from collections import OrderedDict -from dataclasses import dataclass -from typing import List, Optional, Set - -import torch -import torch.distributed -import torch.nn as nn -from omegaconf import DictConfig, ListConfig, open_dict - -from nemo.collections.asr.models.configs import CacheAwareStreamingConfig -from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder -from nemo.collections.asr.parts.submodules.causal_convs import CausalConv1D -from nemo.collections.asr.parts.submodules.conformer_modules import ConformerLayer -from nemo.collections.asr.parts.submodules.multi_head_attention import ( - LocalAttRelPositionalEncoding, - MultiHeadAttention, - PositionalEncoding, - RelPositionalEncoding, - RelPositionMultiHeadAttention, - RelPositionMultiHeadAttentionLongformer, -) -from nemo.collections.asr.parts.submodules.subsampling import ( - ConvSubsampling, - StackingSubsampling, - SubsamplingReductionModule, -) -from nemo.collections.asr.parts.utils import adapter_utils -from nemo.collections.asr.parts.utils.regularization_utils import compute_stochastic_depth_drop_probs -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.mixins import AccessMixin, adapter_mixins -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import AcousticEncodedRepresentation, ChannelType, LengthsType, NeuralType, SpectrogramType -from nemo.utils import logging - -__all__ = ['ConformerEncoder'] - - -class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): - """ - The encoder for ASR model of Conformer. - Based on this paper: - 'Conformer: Convolution-augmented Transformer for Speech Recognition' by Anmol Gulati et al. - https://arxiv.org/abs/2005.08100 - - Args: - feat_in (int): the size of feature channels - n_layers (int): number of layers of ConformerBlock - d_model (int): the hidden size of the model - feat_out (int): the size of the output features - Defaults to -1 (means feat_out is d_model) - subsampling (str): the method of subsampling, choices=['vggnet', 'striding', 'dw-striding', 'stacking', 'stacking_norm'] - Defaults to striding. - subsampling_factor (int): the subsampling factor which should be power of 2 - Defaults to 4. - subsampling_conv_chunking_factor(int): optionally, force chunk inputs (helpful for large inputs) - Should be power of 2, 1 (auto-chunking, default), or -1 (no chunking) - subsampling_conv_channels (int): the size of the convolutions in the subsampling module - Defaults to -1 which would set it to d_model. - reduction (str, Optional): the method of reduction, choices=['pooling', 'striding']. If no value - is passed, then no reduction is performed and the models runs with the original 4x subsampling. - reduction_position (int, Optional): the index of the layer to apply reduction. If -1, apply reduction - at the end. - reduction_factor (int): the reduction factor which should be either 1 or a power of 2 - Defaults to 1. - ff_expansion_factor (int): the expansion factor in feed forward layers - Defaults to 4. - self_attention_model (str): type of the attention layer and positional encoding - 'rel_pos': relative positional embedding and Transformer-XL - 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - overlapping chunks. Attention context is determined by att_context_size parameter. - 'abs_pos': absolute positional embedding and Transformer - Default is rel_pos. 
- pos_emb_max_len (int): the maximum length of positional embeddings - Defaults to 5000 - n_heads (int): number of heads in multi-headed attention layers - Defaults to 4. - att_context_size (List[Union[List[int],int]]): specifies the context sizes on each side. Each context size should be a list of two integers like [100,100]. - A list of context sizes like [[100,100],[100,50]] can also be passed. -1 means unlimited context. - Defaults to [-1,-1] - att_context_probs (List[float]): a list of probabilities of each one of the att_context_size when a list of them is passed. If not specified, uniform distribution is being used. - Defaults to None - att_context_style (str): 'regular' or 'chunked_limited'. - Defaults to 'regular' - xscaling (bool): enables scaling the inputs to the multi-headed attention layers by sqrt(d_model) - Defaults to True. - untie_biases (bool): whether to not share (untie) the bias weights between layers of Transformer-XL - Defaults to True. - conv_kernel_size (int): the size of the convolutions in the convolutional modules - Defaults to 31. - conv_norm_type (str): the type of the normalization in the convolutional modules - Defaults to 'batch_norm'. - conv_context_size (list): it can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size. - None means [(conv_kernel_size-1)//2, (conv_kernel_size-1)//2], and 'causal' means [(conv_kernel_size-1), 0]. - Defaults to None. - conv_dual_mode (bool): specifies if convolution should be dual mode when dual_offline mode is being used. When enables, the left half of the convolution kernel would get masked in streaming cases. - Defaults to False - dropout (float): the dropout rate used in all layers except the attention layers - Defaults to 0.1. - dropout_pre_encoder (float): the dropout rate used before the encoder - Defaults to 0.1. - dropout_emb (float): the dropout rate used for the positional embeddings - Defaults to 0.1. - dropout_att (float): the dropout rate used for the attention layer - Defaults to 0.0. - stochastic_depth_drop_prob (float): if non-zero, will randomly drop - layers during training. The higher this value, the more often layers - are dropped. Defaults to 0.0. - stochastic_depth_mode (str): can be either "linear" or "uniform". If - set to "uniform", all layers have the same probability of drop. If - set to "linear", the drop probability grows linearly from 0 for the - first layer to the desired value for the final layer. Defaults to - "linear". - stochastic_depth_start_layer (int): starting layer for stochastic depth. - All layers before this will never be dropped. Note that drop - probability will be adjusted accordingly if mode is "linear" when - start layer is > 1. Defaults to 1. - global_tokens (int): number of tokens to be used for global attention. - Only relevant if self_attention_model is 'rel_pos_local_attn'. - Defaults to 0. - global_tokens_spacing (int): how far apart the global tokens are - Defaults to 1. - global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate. - Defaults to False. - - """ - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
- """ - dev = next(self.parameters()).device - if self.export_cache_support: - window_size = max_dim - if self.streaming_cfg is not None: - if isinstance(self.streaming_cfg.chunk_size, list): - chunk_size = self.streaming_cfg.chunk_size[1] - else: - chunk_size = self.streaming_cfg.chunk_size - if isinstance(self.streaming_cfg.pre_encode_cache_size, list): - pre_encode_cache_size = self.streaming_cfg.pre_encode_cache_size[1] - else: - pre_encode_cache_size = self.streaming_cfg.pre_encode_cache_size - window_size = chunk_size + pre_encode_cache_size - input_example = torch.randn(max_batch, self._feat_in, window_size, device=dev) - input_example_length = torch.randint( - window_size // 4, window_size, (max_batch,), device=dev, dtype=torch.int64 - ) - cache_last_channel, cache_last_time, cache_last_channel_len = self.get_initial_cache_state( - batch_size=max_batch, device=dev, max_dim=max_dim - ) - all_input_example = tuple( - [ - input_example, - input_example_length, - cache_last_channel.transpose(0, 1), - cache_last_time.transpose(0, 1), - cache_last_channel_len, - ] - ) - else: - input_example = torch.randn(max_batch, self._feat_in, max_dim, device=dev) - input_example_length = torch.randint(max_dim // 4, max_dim, (max_batch,), device=dev, dtype=torch.int64) - all_input_example = tuple([input_example, input_example_length]) - - return all_input_example - - @property - def input_types(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - "cache_last_channel": NeuralType(('D', 'B', 'T', 'D'), ChannelType(), optional=True), - "cache_last_time": NeuralType(('D', 'B', 'D', 'T'), ChannelType(), optional=True), - "cache_last_channel_len": NeuralType(tuple('B'), LengthsType(), optional=True), - } - ) - - @property - def input_types_for_export(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - "cache_last_channel": NeuralType(('B', 'D', 'T', 'D'), ChannelType(), optional=True), - "cache_last_time": NeuralType(('B', 'D', 'D', 'T'), ChannelType(), optional=True), - "cache_last_channel_len": NeuralType(tuple('B'), LengthsType(), optional=True), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports.""" - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "cache_last_channel_next": NeuralType(('D', 'B', 'T', 'D'), ChannelType(), optional=True), - "cache_last_time_next": NeuralType(('D', 'B', 'D', 'T'), ChannelType(), optional=True), - "cache_last_channel_next_len": NeuralType(tuple('B'), LengthsType(), optional=True), - } - ) - - @property - def output_types_for_export(self): - """Returns definitions of module output ports.""" - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "cache_last_channel_next": NeuralType(('B', 'D', 'T', 'D'), ChannelType(), optional=True), - "cache_last_time_next": NeuralType(('B', 'D', 'D', 'T'), ChannelType(), optional=True), - "cache_last_channel_next_len": NeuralType(tuple('B'), LengthsType(), optional=True), - } - ) - - @property - def disabled_deployment_input_names(self): - if not self.export_cache_support: - return 
set(["cache_last_channel", "cache_last_time", "cache_last_channel_len"]) - else: - return set() - - @property - def disabled_deployment_output_names(self): - if not self.export_cache_support: - return set(["cache_last_channel_next", "cache_last_time_next", "cache_last_channel_next_len"]) - else: - return set() - - def __init__( - self, - feat_in, - n_layers, - d_model, - feat_out=-1, - causal_downsampling=False, - subsampling='striding', - subsampling_factor=4, - subsampling_conv_chunking_factor=1, - subsampling_conv_channels=-1, - reduction=None, - reduction_position=None, - reduction_factor=1, - ff_expansion_factor=4, - self_attention_model='rel_pos', - n_heads=4, - att_context_size=None, - att_context_probs=None, - att_context_style='regular', - xscaling=True, - untie_biases=True, - pos_emb_max_len=5000, - conv_kernel_size=31, - conv_norm_type='batch_norm', - conv_context_size=None, - dropout=0.1, - dropout_pre_encoder=0.1, - dropout_emb=0.1, - dropout_att=0.0, - stochastic_depth_drop_prob: float = 0.0, - stochastic_depth_mode: str = "linear", - stochastic_depth_start_layer: int = 1, - global_tokens: int = 0, - global_tokens_spacing: int = 1, - global_attn_separate: bool = False, - ): - super().__init__() - d_ff = d_model * ff_expansion_factor - self.d_model = d_model - self.n_layers = n_layers - self._feat_in = feat_in - self.att_context_style = att_context_style - self.subsampling_factor = subsampling_factor - self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor - - self.self_attention_model = self_attention_model - self.global_tokens = global_tokens - self.global_attn_separate = global_attn_separate - self.global_tokens_spacing = global_tokens_spacing - - # Setting up the att_context_size - ( - self.att_context_size_all, - self.att_context_size, - self.att_context_probs, - self.conv_context_size, - ) = self._calc_context_sizes( - att_context_style=att_context_style, - att_context_size=att_context_size, - att_context_probs=att_context_probs, - conv_context_size=conv_context_size, - conv_kernel_size=conv_kernel_size, - ) - - if xscaling: - self.xscale = math.sqrt(d_model) - else: - self.xscale = None - - # Subsampling - if subsampling_conv_channels == -1: - subsampling_conv_channels = d_model - if subsampling and subsampling_factor > 1: - if subsampling in ['stacking', 'stacking_norm']: - # stacking_norm has an extra layer norm after stacking comparing to stacking - self.pre_encode = StackingSubsampling( - subsampling_factor=subsampling_factor, - feat_in=feat_in, - feat_out=d_model, - norm=True if subsampling == 'stacking_norm' else False, - ) - else: - self.pre_encode = ConvSubsampling( - subsampling=subsampling, - subsampling_factor=subsampling_factor, - feat_in=feat_in, - feat_out=d_model, - conv_channels=subsampling_conv_channels, - subsampling_conv_chunking_factor=subsampling_conv_chunking_factor, - activation=nn.ReLU(True), - is_causal=causal_downsampling, - ) - else: - self.pre_encode = nn.Linear(feat_in, d_model) - - # Reduction - if reduction and reduction_factor > 1: - assert reduction_position >= -1 and reduction_position < n_layers - self.reduction_subsampling = SubsamplingReductionModule( - reduction=reduction, d_model=d_model, reduction_factor=reduction_factor, - ) - self.reduction_position = reduction_position - else: - self.reduction_subsampling = None - self.reduction_position = None - - self._feat_out = d_model - - # Biases for relative positional encoding - if not untie_biases and self_attention_model == "rel_pos": - d_head = d_model // n_heads - 
pos_bias_u = nn.Parameter(torch.Tensor(n_heads, d_head)) - pos_bias_v = nn.Parameter(torch.Tensor(n_heads, d_head)) - nn.init.zeros_(pos_bias_u) - nn.init.zeros_(pos_bias_v) - else: - pos_bias_u = None - pos_bias_v = None - - # Positional encodings - self.pos_emb_max_len = pos_emb_max_len - if self_attention_model == "rel_pos": - self.pos_enc = RelPositionalEncoding( - d_model=d_model, - dropout_rate=dropout_pre_encoder, - max_len=pos_emb_max_len, - xscale=self.xscale, - dropout_rate_emb=dropout_emb, - ) - elif self_attention_model == 'rel_pos_local_attn': - if max(att_context_size) <= 0: - raise ValueError("When using local attention, context size must be set > 0") - self.pos_enc = LocalAttRelPositionalEncoding( - att_context_size=att_context_size, - d_model=d_model, - dropout_rate=dropout, - max_len=pos_emb_max_len, - xscale=self.xscale, - dropout_rate_emb=dropout_emb, - ) - elif self_attention_model == "abs_pos": - pos_bias_u = None - pos_bias_v = None - self.pos_enc = PositionalEncoding( - d_model=d_model, dropout_rate=dropout_pre_encoder, max_len=pos_emb_max_len, xscale=self.xscale - ) - else: - raise ValueError(f"Not valid self_attention_model: '{self_attention_model}'!") - - self.layers = nn.ModuleList() - for i in range(n_layers): - layer = ConformerLayer( - d_model=d_model, - d_ff=d_ff, - self_attention_model=self_attention_model, - global_tokens=global_tokens, - global_tokens_spacing=global_tokens_spacing, - global_attn_separate=global_attn_separate, - n_heads=n_heads, - conv_kernel_size=conv_kernel_size, - conv_norm_type=conv_norm_type, - conv_context_size=self.conv_context_size, - dropout=dropout, - dropout_att=dropout_att, - pos_bias_u=pos_bias_u, - pos_bias_v=pos_bias_v, - att_context_size=self.att_context_size, - ) - self.layers.append(layer) - - if feat_out > 0 and feat_out != self._feat_out: - self.out_proj = nn.Linear(self._feat_out, feat_out) - self._feat_out = feat_out - else: - self.out_proj = None - self._feat_out = d_model - self.set_max_audio_length(self.pos_emb_max_len) - self.use_pad_mask = True - - self.setup_streaming_params() - self.export_cache_support = False - - self.layer_drop_probs = compute_stochastic_depth_drop_probs( - len(self.layers), stochastic_depth_drop_prob, stochastic_depth_mode, stochastic_depth_start_layer - ) - # will be set in self.forward() if defined in AccessMixin config - self.interctc_capture_at_layers = None - - def forward_for_export( - self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ): - if cache_last_channel is not None: - cache_last_channel = cache_last_channel.transpose(0, 1) - cache_last_time = cache_last_time.transpose(0, 1) - - rets = self.forward_internal( - audio_signal, - length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - rets = self.streaming_post_process(rets, keep_all_outputs=False) - if len(rets) == 2: - return rets - elif rets[2] is None and rets[3] is None and rets[4] is None: - return (rets[0], rets[1]) - else: - return ( - rets[0], - rets[1], - rets[2].transpose(0, 1), - rets[3].transpose(0, 1), - rets[4], - ) - - def streaming_post_process(self, rets, keep_all_outputs=True): - if len(rets) == 2: - return rets[0], rets[1], None, None, None - - (encoded, encoded_len, cache_last_channel_next, cache_last_time_next, cache_last_channel_next_len) = rets - - if cache_last_channel_next is not None and self.streaming_cfg.last_channel_cache_size >= 0: - if 
self.streaming_cfg.last_channel_cache_size > 0: - cache_last_channel_next = cache_last_channel_next[ - :, :, -self.streaming_cfg.last_channel_cache_size :, : - ] - - if self.streaming_cfg.valid_out_len > 0 and (not keep_all_outputs or self.att_context_style == "regular"): - encoded = encoded[:, :, : self.streaming_cfg.valid_out_len] - encoded_len = torch.clamp(encoded_len, max=self.streaming_cfg.valid_out_len) - - return (encoded, encoded_len, cache_last_channel_next, cache_last_time_next, cache_last_channel_next_len) - - @typecheck() - def forward( - self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ): - return self.forward_internal( - audio_signal, - length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - def forward_internal( - self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ): - self.update_max_seq_length(seq_length=audio_signal.size(2), device=audio_signal.device) - - if length is None: - length = audio_signal.new_full( - (audio_signal.size(0),), audio_signal.size(-1), dtype=torch.int64, device=audio_signal.device - ) - - # select a random att_context_size with the distribution specified by att_context_probs during training - # for non-validation cases like test, validation or inference, it uses the first mode in self.att_context_size - if self.training and len(self.att_context_size_all) > 1: - cur_att_context_size = random.choices(self.att_context_size_all, weights=self.att_context_probs)[0] - else: - cur_att_context_size = self.att_context_size - - audio_signal = torch.transpose(audio_signal, 1, 2) - - if isinstance(self.pre_encode, nn.Linear): - audio_signal = self.pre_encode(audio_signal) - else: - audio_signal, length = self.pre_encode(x=audio_signal, lengths=length) - length = length.to(torch.int64) - # self.streaming_cfg is set by setup_streaming_cfg(), called in the init - if self.streaming_cfg.drop_extra_pre_encoded > 0 and cache_last_channel is not None: - audio_signal = audio_signal[:, self.streaming_cfg.drop_extra_pre_encoded :, :] - length = (length - self.streaming_cfg.drop_extra_pre_encoded).clamp(min=0) - - if self.reduction_position is not None and cache_last_channel is not None: - raise ValueError("Caching with reduction feature is not supported yet!") - - max_audio_length = audio_signal.size(1) - if cache_last_channel is not None: - cache_len = self.streaming_cfg.last_channel_cache_size - cache_keep_size = max_audio_length - self.streaming_cfg.cache_drop_size - max_audio_length = max_audio_length + cache_len - padding_length = length + cache_len - offset = torch.neg(cache_last_channel_len) + cache_len - else: - padding_length = length - cache_last_channel_next = None - cache_len = 0 - offset = None - - audio_signal, pos_emb = self.pos_enc(x=audio_signal, cache_len=cache_len) - - # Create the self-attention and padding masks - pad_mask, att_mask = self._create_masks( - att_context_size=cur_att_context_size, - padding_length=padding_length, - max_audio_length=max_audio_length, - offset=offset, - device=audio_signal.device, - ) - - if cache_last_channel is not None: - pad_mask = pad_mask[:, cache_len:] - if att_mask is not None: - att_mask = att_mask[:, cache_len:] - # Convert caches from the tensor to list - cache_last_time_next = [] - cache_last_channel_next = [] - - for lth, (drop_prob, layer) in enumerate(zip(self.layer_drop_probs, self.layers)): - original_signal = 
audio_signal - if cache_last_channel is not None: - cache_last_channel_cur = cache_last_channel[lth] - cache_last_time_cur = cache_last_time[lth] - else: - cache_last_channel_cur = None - cache_last_time_cur = None - audio_signal = layer( - x=audio_signal, - att_mask=att_mask, - pos_emb=pos_emb, - pad_mask=pad_mask, - cache_last_channel=cache_last_channel_cur, - cache_last_time=cache_last_time_cur, - ) - - if cache_last_channel_cur is not None: - (audio_signal, cache_last_channel_cur, cache_last_time_cur) = audio_signal - cache_last_channel_next.append(cache_last_channel_cur) - cache_last_time_next.append(cache_last_time_cur) - - # applying stochastic depth logic from https://arxiv.org/abs/2102.03216 - if self.training and drop_prob > 0.0: - should_drop = torch.rand(1) < drop_prob - # adjusting to match expectation - if should_drop: - # that's not efficient, but it's hard to implement distributed - # version of dropping layers without deadlock or random seed meddling - # so multiplying the signal by 0 to ensure all weights get gradients - audio_signal = audio_signal * 0.0 + original_signal - else: - # not doing this operation if drop prob is 0 as it's identity in that case - audio_signal = (audio_signal - original_signal) / (1.0 - drop_prob) + original_signal - - if self.reduction_position == lth: - audio_signal, length = self.reduction_subsampling(x=audio_signal, lengths=length) - max_audio_length = audio_signal.size(1) - # Don't update the audio_signal here because then it will again scale the audio_signal - # and cause an increase in the WER - _, pos_emb = self.pos_enc(x=audio_signal, cache_len=cache_len) - pad_mask, att_mask = self._create_masks( - att_context_size=cur_att_context_size, - padding_length=length, - max_audio_length=max_audio_length, - offset=offset, - device=audio_signal.device, - ) - - # saving tensors if required for interctc loss - if self.is_access_enabled(): - if self.interctc_capture_at_layers is None: - self.interctc_capture_at_layers = self.access_cfg.get('interctc', {}).get('capture_layers', []) - if lth in self.interctc_capture_at_layers: - lth_audio_signal = audio_signal - if self.out_proj is not None: - lth_audio_signal = self.out_proj(audio_signal) - # shape is the same as the shape of audio_signal output, i.e. 
[B, D, T] - self.register_accessible_tensor( - name=f'interctc/layer_output_{lth}', tensor=torch.transpose(lth_audio_signal, 1, 2) - ) - self.register_accessible_tensor(name=f'interctc/layer_length_{lth}', tensor=length) - - if self.out_proj is not None: - audio_signal = self.out_proj(audio_signal) - - # Reduction - if self.reduction_position == -1: - audio_signal, length = self.reduction_subsampling(x=audio_signal, lengths=length) - - audio_signal = torch.transpose(audio_signal, 1, 2) - length = length.to(dtype=torch.int64) - - if cache_last_channel is not None: - cache_last_channel_next = torch.stack(cache_last_channel_next, dim=0) - cache_last_time_next = torch.stack(cache_last_time_next, dim=0) - return ( - audio_signal, - length, - cache_last_channel_next, - cache_last_time_next, - torch.clamp(cache_last_channel_len + cache_keep_size, max=cache_len), - ) - else: - return audio_signal, length - - def update_max_seq_length(self, seq_length: int, device): - # Find global max audio length across all nodes - if torch.distributed.is_initialized(): - global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) - - # Update across all ranks in the distributed system - torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) - - seq_length = global_max_len.int().item() - - if seq_length > self.max_audio_length: - self.set_max_audio_length(seq_length) - - def set_max_audio_length(self, max_audio_length): - """ - Sets maximum input length. - Pre-calculates internal seq_range mask. - """ - self.max_audio_length = max_audio_length - device = next(self.parameters()).device - self.pos_enc.extend_pe(max_audio_length, device) - - def _create_masks(self, att_context_size, padding_length, max_audio_length, offset, device): - if self.self_attention_model != "rel_pos_local_attn": - att_mask = torch.ones(1, max_audio_length, max_audio_length, dtype=torch.bool, device=device) - - if self.att_context_style == "regular": - if att_context_size[0] >= 0: - att_mask = att_mask.triu(diagonal=-att_context_size[0]) - if att_context_size[1] >= 0: - att_mask = att_mask.tril(diagonal=att_context_size[1]) - elif self.att_context_style == "chunked_limited": - # When right context is unlimited, just the left side of the masking need to get updated - if att_context_size[1] == -1: - if att_context_size[0] >= 0: - att_mask = att_mask.triu(diagonal=-att_context_size[0]) - else: - chunk_size = att_context_size[1] + 1 - # left_chunks_num specifies the number of chunks to be visible by each chunk on the left side - if att_context_size[0] >= 0: - left_chunks_num = att_context_size[0] // chunk_size - else: - left_chunks_num = 10000 - - chunk_idx = torch.arange(0, max_audio_length, dtype=torch.int, device=att_mask.device) - chunk_idx = torch.div(chunk_idx, chunk_size, rounding_mode="trunc") - diff_chunks = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0) - chunked_limited_mask = torch.logical_and( - torch.le(diff_chunks, left_chunks_num), torch.ge(diff_chunks, 0) - ) - att_mask = torch.logical_and(att_mask, chunked_limited_mask.unsqueeze(0)) - else: - att_mask = None - - # pad_mask is the masking to be used to ignore paddings - pad_mask = torch.arange(0, max_audio_length, device=device).expand( - padding_length.size(0), -1 - ) < padding_length.unsqueeze(-1) - - if offset is not None: - pad_mask_off = torch.arange(0, max_audio_length, device=device).expand( - padding_length.size(0), -1 - ) >= offset.unsqueeze(-1) - pad_mask = pad_mask_off.logical_and(pad_mask) - - if att_mask is not None: - # 
pad_mask_for_att_mask is the mask which helps to ignore paddings - pad_mask_for_att_mask = pad_mask.unsqueeze(1).repeat([1, max_audio_length, 1]) - pad_mask_for_att_mask = torch.logical_and(pad_mask_for_att_mask, pad_mask_for_att_mask.transpose(1, 2)) - # att_mask is the masking to be used by the MHA layers to ignore the tokens not supposed to be visible - att_mask = att_mask[:, :max_audio_length, :max_audio_length] - # paddings should also get ignored, so pad_mask_for_att_mask is used to ignore their corresponding scores - att_mask = torch.logical_and(pad_mask_for_att_mask, att_mask.to(pad_mask_for_att_mask.device)) - att_mask = ~att_mask - - pad_mask = ~pad_mask - return pad_mask, att_mask - - def enable_pad_mask(self, on=True): - # On inference, user may choose to disable pad mask - mask = self.use_pad_mask - self.use_pad_mask = on - return mask - - def _calc_context_sizes( - self, att_context_size, att_context_probs, att_context_style, conv_context_size, conv_kernel_size - ): - # convert att_context_size to a standard list of lists - if att_context_size: - att_context_size_all = list(att_context_size) - if isinstance(att_context_size_all[0], int): - att_context_size_all = [att_context_size_all] - for i, att_cs in enumerate(att_context_size_all): - if isinstance(att_cs, ListConfig): - att_context_size_all[i] = list(att_cs) - if att_context_style == "chunked_limited": - if att_cs[0] > 0 and att_cs[0] % (att_cs[1] + 1) > 0: - raise ValueError(f"att_context_size[{i}][0] % (att_context_size[{i}][1] + 1) should be zero!") - if att_cs[1] < 0 and len(att_context_size_all) <= 1: - raise ValueError( - f"Right context (att_context_size[{i}][1]) can not be unlimited for chunked_limited style!" - ) - else: - att_context_size_all = [[-1, -1]] - - if att_context_probs: - if len(att_context_probs) != len(att_context_size_all): - raise ValueError("The size of the att_context_probs should be the same as att_context_size.") - att_context_probs = list(att_context_probs) - if sum(att_context_probs) != 1: - raise ValueError( - "The sum of numbers in att_context_probs should be equal to one to be a distribution." - ) - else: - att_context_probs = [1.0 / len(att_context_size_all)] * len(att_context_size_all) - - if conv_context_size is not None: - if isinstance(conv_context_size, ListConfig): - conv_context_size = list(conv_context_size) - if not isinstance(conv_context_size, list) and not isinstance(conv_context_size, str): - raise ValueError( - f"Invalid conv_context_size! It should be the string 'causal' or a list of two integers." 
- ) - if conv_context_size == "causal": - conv_context_size = [conv_kernel_size - 1, 0] - else: - if conv_context_size[0] + conv_context_size[1] + 1 != conv_kernel_size: - raise ValueError(f"Invalid conv_context_size: {self.conv_context_size}!") - else: - conv_context_size = [(conv_kernel_size - 1) // 2, (conv_kernel_size - 1) // 2] - return att_context_size_all, att_context_size_all[0], att_context_probs, conv_context_size - - def set_default_att_context_size(self, att_context_size): - if att_context_size not in self.att_context_size_all: - logging.warning( - f"att_context_size={att_context_size} is not among the list of the supported look-aheads: {self.att_context_size_all}" - ) - if att_context_size is not None: - self.att_context_size = att_context_size - - def setup_streaming_params( - self, - chunk_size: int = None, - shift_size: int = None, - left_chunks: int = None, - att_context_size: list = None, - max_context: int = 10000, - ): - """ - This function sets the needed values and parameters to perform streaming. The configuration would be stored in self.streaming_cfg. - The streaming configuration is needed to simulate streaming inference. - Args: - chunk_size (int): overrides the chunk size - shift_size (int): overrides the shift size for chunks - left_chunks (int): overrides the number of left chunks visible to each chunk - max_context (int): the value used for the cache size of last_channel layers if left context is set to infinity (-1) - Defaults to -1 (means feat_out is d_model) - """ - streaming_cfg = CacheAwareStreamingConfig() - - # When att_context_size is not specified, it uses the default_att_context_size - if att_context_size is None: - att_context_size = self.att_context_size - - if chunk_size is not None: - if chunk_size < 1: - raise ValueError("chunk_size needs to be a number larger or equal to one.") - lookahead_steps = chunk_size - 1 - streaming_cfg.cache_drop_size = chunk_size - shift_size - elif self.att_context_style == "chunked_limited": - lookahead_steps = att_context_size[1] - streaming_cfg.cache_drop_size = 0 - elif self.att_context_style == "regular": - lookahead_steps = att_context_size[1] * self.n_layers + self.conv_context_size[1] * self.n_layers - streaming_cfg.cache_drop_size = lookahead_steps - else: - streaming_cfg.cache_drop_size = 0 - lookahead_steps = None - - if chunk_size is None: - streaming_cfg.last_channel_cache_size = att_context_size[0] if att_context_size[0] >= 0 else max_context - else: - if left_chunks is None: - raise ValueError("left_chunks can not be None when chunk_size is set.") - streaming_cfg.last_channel_cache_size = left_chunks * chunk_size - - if hasattr(self.pre_encode, "get_sampling_frames"): - sampling_frames = self.pre_encode.get_sampling_frames() - else: - sampling_frames = 0 - - if isinstance(sampling_frames, list): - streaming_cfg.chunk_size = [ - sampling_frames[0] + self.subsampling_factor * lookahead_steps, - sampling_frames[1] + self.subsampling_factor * lookahead_steps, - ] - else: - streaming_cfg.chunk_size = sampling_frames * (1 + lookahead_steps) - - if isinstance(sampling_frames, list): - streaming_cfg.shift_size = [ - sampling_frames[0] + sampling_frames[1] * (lookahead_steps - streaming_cfg.cache_drop_size), - sampling_frames[1] + sampling_frames[1] * (lookahead_steps - streaming_cfg.cache_drop_size), - ] - else: - streaming_cfg.shift_size = sampling_frames * (1 + lookahead_steps - streaming_cfg.cache_drop_size) - - if isinstance(streaming_cfg.shift_size, list): - streaming_cfg.valid_out_len = ( - 
streaming_cfg.shift_size[1] - sampling_frames[1] - ) // self.subsampling_factor + 1 - else: - streaming_cfg.valid_out_len = streaming_cfg.shift_size // self.subsampling_factor - - if hasattr(self.pre_encode, "get_streaming_cache_size"): - streaming_cfg.pre_encode_cache_size = self.pre_encode.get_streaming_cache_size() - else: - streaming_cfg.pre_encode_cache_size = 0 - - if isinstance(streaming_cfg.pre_encode_cache_size, list): - if streaming_cfg.pre_encode_cache_size[1] >= 1: - streaming_cfg.drop_extra_pre_encoded = ( - 1 + (streaming_cfg.pre_encode_cache_size[1] - 1) // self.subsampling_factor - ) - else: - streaming_cfg.drop_extra_pre_encoded = 0 - else: - streaming_cfg.drop_extra_pre_encoded = streaming_cfg.pre_encode_cache_size // self.subsampling_factor - - for m in self.layers.modules(): - if hasattr(m, "_max_cache_len"): - if isinstance(m, MultiHeadAttention): - m.cache_drop_size = streaming_cfg.cache_drop_size - if isinstance(m, CausalConv1D): - m.cache_drop_size = streaming_cfg.cache_drop_size - - self.streaming_cfg = streaming_cfg - - def get_initial_cache_state(self, batch_size=1, dtype=torch.float32, device=None, max_dim=0): - if device is None: - device = next(self.parameters()).device - if max_dim > 0: - create_tensor = torch.randn - else: - create_tensor = torch.zeros - last_time_cache_size = self.conv_context_size[0] - cache_last_channel = create_tensor( - (len(self.layers), batch_size, self.streaming_cfg.last_channel_cache_size, self.d_model,), - device=device, - dtype=dtype, - ) - cache_last_time = create_tensor( - (len(self.layers), batch_size, self.d_model, last_time_cache_size), device=device, dtype=dtype, - ) - if max_dim > 0: - cache_last_channel_len = torch.randint( - 0, - min(max_dim, self.streaming_cfg.last_channel_cache_size), - (batch_size,), - device=device, - dtype=torch.int64, - ) - for i in range(batch_size): - cache_last_channel[:, i, cache_last_channel_len[i] :, :] = 0 - # what is the right rule to zero out cache_last_time? - if cache_last_channel_len[i] == 0: - cache_last_time[:, i, :, :] = 0 - else: - cache_last_channel_len = torch.zeros(batch_size, device=device, dtype=torch.int64) - return cache_last_channel, cache_last_time, cache_last_channel_len - - def change_attention_model( - self, - self_attention_model: str = None, - att_context_size: List[int] = None, - update_config: bool = True, - device: torch.device = None, - ): - - """ - Update the self_attention_model which changes the positional encoding and attention layers. - - Args: - self_attention_model (str): type of the attention layer and positional encoding - 'rel_pos': relative positional embedding and Transformer-XL - 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - overlapping windows. Attention context is determined by att_context_size parameter. - 'abs_pos': absolute positional embedding and Transformer - If None is provided, the self_attention_model isn't changed. Defaults to None. - att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes, - or None to keep as it is. Defaults to None. - update_config (bool): Whether to update the config or not with the new attention model. - Defaults to True. - device (torch.device): If provided, new layers will be moved to the device. - Defaults to None. 
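        Example:
            Editor's illustrative sketch (not part of the original docstring); it assumes an
            already-constructed ``ConformerEncoder`` instance named ``encoder``:

            .. code-block:: python

                # Swap full-context relative-position attention for local attention with a
                # 128-frame left/right window and persist the change in the model config.
                encoder.change_attention_model(
                    self_attention_model="rel_pos_local_attn",
                    att_context_size=[128, 128],
                    update_config=True,
                )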
- """ - - if att_context_size: - att_context_size = list(att_context_size) - else: - att_context_size = self.att_context_size - - if self_attention_model is None: - self_attention_model = self.self_attention_model - - if self_attention_model == 'rel_pos_local_attn' and max(att_context_size) <= 0: - raise ValueError("When using local attention, context size must be set > 0") - - if self_attention_model == "rel_pos": - new_pos_enc = RelPositionalEncoding( - d_model=self._cfg.d_model, - dropout_rate=self._cfg.dropout, - max_len=self._cfg.pos_emb_max_len, - xscale=self.xscale, - dropout_rate_emb=self._cfg.dropout_emb, - ) - elif self_attention_model == 'rel_pos_local_attn': - new_pos_enc = LocalAttRelPositionalEncoding( - att_context_size=att_context_size, - d_model=self._cfg.d_model, - dropout_rate=self._cfg.dropout, - max_len=self._cfg.pos_emb_max_len, - xscale=self.xscale, - dropout_rate_emb=self._cfg.dropout_emb, - ) - elif self_attention_model == "abs_pos": - new_pos_enc = PositionalEncoding( - d_model=self._cfg.d_model, - dropout_rate=self._cfg.dropout, - max_len=self._cfg.pos_emb_max_len, - xscale=self.xscale, - ) - else: - raise ValueError(f"Not valid self_attention_model: '{self_attention_model}'!") - - if device is not None: - new_pos_enc = new_pos_enc.to(device=device) - del self.pos_enc - self.pos_enc = new_pos_enc - self.self_attention_model = self_attention_model - self.att_context_size = att_context_size - self.set_max_audio_length(self.pos_emb_max_len) - - for name, m in self.named_modules(): - if type(m) == ConformerLayer: - if self_attention_model == 'rel_pos': - new_attn = RelPositionMultiHeadAttention( - n_head=self._cfg.n_heads, - n_feat=self._cfg.d_model, - dropout_rate=self._cfg.dropout_att, - max_cache_len=att_context_size[0], - pos_bias_u=None, - pos_bias_v=None, - ) - elif self_attention_model == 'rel_pos_local_attn': - new_attn = RelPositionMultiHeadAttentionLongformer( - n_head=self._cfg.n_heads, - n_feat=self._cfg.d_model, - dropout_rate=self._cfg.dropout_att, - max_cache_len=att_context_size[0], - att_context_size=att_context_size, - pos_bias_u=None, - pos_bias_v=None, - ) - elif self_attention_model == 'abs_pos': - new_attn = MultiHeadAttention( - n_head=self._cfg.n_heads, - n_feat=self._cfg.d_model, - dropout_rate=self._cfg.dropout_att, - max_cache_len=att_context_size[0], - ) - else: - raise ValueError( - f"'{self_attention_model}' is not not a valid value for 'self_attention_model', " - f"valid values can be from ['rel_pos', 'rel_pos_local_attn', 'abs_pos']" - ) - if device is not None: - new_attn = new_attn.to(device=device) - new_attn.load_state_dict(m.self_attn.state_dict(), strict=False) - del m.self_attn - m.self_attn = new_attn - m.self_attention_model = self_attention_model - - if update_config: - with open_dict(self._cfg): - self._cfg.self_attention_model = self_attention_model - self._cfg.att_context_size = att_context_size - - def change_subsampling_conv_chunking_factor(self, subsampling_conv_chunking_factor: int): - """ - Update the conv_chunking_factor (int) - Default is 1 (auto) - Set it to -1 (disabled) or to a specific value (power of 2) if you OOM in the conv subsampling layers - - - Args: - subsampling_conv_chunking_factor (int) - """ - - if not hasattr(self.pre_encode, "change_subsampling_conv_chunking_factor"): - logging.info("Model pre_encoder doesn't have a change_subsampling_conv_chunking_factor method ") - return - - self.pre_encode.change_subsampling_conv_chunking_factor( - 
subsampling_conv_chunking_factor=subsampling_conv_chunking_factor - ) - - -class ConformerEncoderAdapter(ConformerEncoder, adapter_mixins.AdapterModuleMixin): - - # Higher level forwarding - def add_adapter(self, name: str, cfg: dict): - cfg = self._update_adapter_cfg_input_dim(cfg) - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conformer_layer.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any([conformer_layer.is_adapter_available() for conformer_layer in self.layers]) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conformer_layer.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = set([]) - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - names.update(conformer_layer.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.d_model) - return cfg - - def get_accepted_adapter_types(self,) -> Set[type]: - types = super().get_accepted_adapter_types() - - if len(types) == 0: - self.set_accepted_adapter_types( - [ - adapter_utils.LINEAR_ADAPTER_CLASSPATH, - adapter_utils.MHA_ADAPTER_CLASSPATH, - adapter_utils.RELMHA_ADAPTER_CLASSPATH, - ] - ) - types = self.get_accepted_adapter_types() - return types - - -""" -Register any additional information -""" -if adapter_mixins.get_registered_adapter(ConformerEncoder) is None: - adapter_mixins.register_adapter(base_class=ConformerEncoder, adapter_class=ConformerEncoderAdapter) - - -@dataclass -class ConformerChangeConfig: - # Change self_attention_model for Conformer - # Options: - # 'rel_pos': relative positional embedding and Transformer-XL - # 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - # overlapping chunks. Attention context is determined by att_context_size parameter. - # 'abs_pos': absolute positional embedding and Transformer - # If None is provided, self_attention_model is not changed. - self_attention_model: Optional[str] = None - - # Change the attention context size by providing 2 integers, - # corresponding to left and right context, or -1 for full context. - # If None is provided, the attention context size isn't changed. - att_context_size: Optional[List[int]] = None diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/conv_asr.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/conv_asr.py deleted file mode 100644 index a05ee894f050fb776a98b7ff81558aff98c8a5eb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/conv_asr.py +++ /dev/null @@ -1,994 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
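# --------------------------------------------------------------------------------------
# Editor's note: the ConformerEncoder deleted above exposes a cache-aware streaming API
# (setup_streaming_params, get_initial_cache_state, and a forward that accepts and returns
# the attention and convolution caches). The hypothetical helper below is an illustrative
# sketch, not part of the original module: `encoder` is assumed to be an already-built
# ConformerEncoder and `chunks` an iterable of [B, feat_in, T] spectrogram chunks.
import torch


def _stream_chunks_sketch(encoder, chunks, chunk_size=16, shift_size=16, left_chunks=2):
    # Populate encoder.streaming_cfg (cache sizes, cache_drop_size, valid_out_len, ...).
    encoder.setup_streaming_params(chunk_size=chunk_size, shift_size=shift_size, left_chunks=left_chunks)
    # Zero-initialized caches for the last-channel (self-attention) and last-time (conv) states.
    cache_ch, cache_t, cache_ch_len = encoder.get_initial_cache_state(batch_size=1)
    encoded_chunks = []
    for chunk in chunks:
        lengths = torch.full((chunk.size(0),), chunk.size(-1), dtype=torch.int64, device=chunk.device)
        encoded, encoded_len, cache_ch, cache_t, cache_ch_len = encoder(
            audio_signal=chunk,
            length=lengths,
            cache_last_channel=cache_ch,
            cache_last_time=cache_t,
            cache_last_channel_len=cache_ch_len,
        )
        encoded_chunks.append(encoded)
    # Concatenate the per-chunk encoder outputs along the time axis ([B, D, T]).
    return torch.cat(encoded_chunks, dim=-1)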
-from collections import OrderedDict -from dataclasses import dataclass, field -from typing import List, Optional, Set, Union - -import torch -import torch.distributed -import torch.nn as nn -import torch.nn.functional as F -from omegaconf import MISSING, DictConfig, ListConfig, OmegaConf - -from nemo.collections.asr.parts.submodules.jasper import ( - JasperBlock, - MaskedConv1d, - ParallelBlock, - SqueezeExcite, - init_weights, - jasper_activations, -) -from nemo.collections.asr.parts.submodules.tdnn_attention import ( - AttentivePoolLayer, - StatsPoolLayer, - TDNNModule, - TDNNSEModule, -) -from nemo.collections.asr.parts.utils import adapter_utils -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.mixins import AccessMixin, adapter_mixins -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import ( - AcousticEncodedRepresentation, - LengthsType, - LogitsType, - LogprobsType, - NeuralType, - SpectrogramType, -) -from nemo.utils import logging - -__all__ = ['ConvASRDecoder', 'ConvASREncoder', 'ConvASRDecoderClassification'] - - -class ConvASREncoder(NeuralModule, Exportable, AccessMixin): - """ - Convolutional encoder for ASR models. With this class you can implement JasperNet and QuartzNet models. - - Based on these papers: - https://arxiv.org/pdf/1904.03288.pdf - https://arxiv.org/pdf/1910.10261.pdf - """ - - def _prepare_for_export(self, **kwargs): - m_count = 0 - for name, m in self.named_modules(): - if isinstance(m, MaskedConv1d): - m.use_mask = False - m_count += 1 - - Exportable._prepare_for_export(self, **kwargs) - logging.warning(f"Turned off {m_count} masked convolutions") - - def input_example(self, max_batch=1, max_dim=8192): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - device = next(self.parameters()).device - input_example = torch.randn(max_batch, self._feat_in, max_dim, device=device) - lens = torch.full(size=(input_example.shape[0],), fill_value=max_dim, device=device) - return tuple([input_example, lens]) - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - def __init__( - self, - jasper, - activation: str, - feat_in: int, - normalization_mode: str = "batch", - residual_mode: str = "add", - norm_groups: int = -1, - conv_mask: bool = True, - frame_splicing: int = 1, - init_mode: Optional[str] = 'xavier_uniform', - quantize: bool = False, - ): - super().__init__() - if isinstance(jasper, ListConfig): - jasper = OmegaConf.to_container(jasper) - - activation = jasper_activations[activation]() - - # If the activation can be executed in place, do so. 
- if hasattr(activation, 'inplace'): - activation.inplace = True - - feat_in = feat_in * frame_splicing - - self._feat_in = feat_in - - residual_panes = [] - encoder_layers = [] - self.dense_residual = False - for layer_idx, lcfg in enumerate(jasper): - dense_res = [] - if lcfg.get('residual_dense', False): - residual_panes.append(feat_in) - dense_res = residual_panes - self.dense_residual = True - groups = lcfg.get('groups', 1) - separable = lcfg.get('separable', False) - heads = lcfg.get('heads', -1) - residual_mode = lcfg.get('residual_mode', residual_mode) - se = lcfg.get('se', False) - se_reduction_ratio = lcfg.get('se_reduction_ratio', 8) - se_context_window = lcfg.get('se_context_size', -1) - se_interpolation_mode = lcfg.get('se_interpolation_mode', 'nearest') - kernel_size_factor = lcfg.get('kernel_size_factor', 1.0) - stride_last = lcfg.get('stride_last', False) - future_context = lcfg.get('future_context', -1) - encoder_layers.append( - JasperBlock( - feat_in, - lcfg['filters'], - repeat=lcfg['repeat'], - kernel_size=lcfg['kernel'], - stride=lcfg['stride'], - dilation=lcfg['dilation'], - dropout=lcfg['dropout'], - residual=lcfg['residual'], - groups=groups, - separable=separable, - heads=heads, - residual_mode=residual_mode, - normalization=normalization_mode, - norm_groups=norm_groups, - activation=activation, - residual_panes=dense_res, - conv_mask=conv_mask, - se=se, - se_reduction_ratio=se_reduction_ratio, - se_context_window=se_context_window, - se_interpolation_mode=se_interpolation_mode, - kernel_size_factor=kernel_size_factor, - stride_last=stride_last, - future_context=future_context, - quantize=quantize, - layer_idx=layer_idx, - ) - ) - feat_in = lcfg['filters'] - - self._feat_out = feat_in - - self.encoder = torch.nn.Sequential(*encoder_layers) - self.apply(lambda x: init_weights(x, mode=init_mode)) - - self.max_audio_length = 0 - - @typecheck() - def forward(self, audio_signal, length): - self.update_max_sequence_length(seq_length=audio_signal.size(2), device=audio_signal.device) - s_input, length = self.encoder(([audio_signal], length)) - if length is None: - return s_input[-1] - - return s_input[-1], length - - def update_max_sequence_length(self, seq_length: int, device): - # Find global max audio length across all nodes - if torch.distributed.is_initialized(): - global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) - - # Update across all ranks in the distributed system - torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) - - seq_length = global_max_len.int().item() - - if seq_length > self.max_audio_length: - if seq_length < 5000: - seq_length = seq_length * 2 - elif seq_length < 10000: - seq_length = seq_length * 1.5 - self.max_audio_length = seq_length - - device = next(self.parameters()).device - seq_range = torch.arange(0, self.max_audio_length, device=device) - if hasattr(self, 'seq_range'): - self.seq_range = seq_range - else: - self.register_buffer('seq_range', seq_range, persistent=False) - - # Update all submodules - for name, m in self.named_modules(): - if isinstance(m, MaskedConv1d): - m.update_masked_length(self.max_audio_length, seq_range=self.seq_range) - elif isinstance(m, SqueezeExcite): - m.set_max_len(self.max_audio_length, seq_range=self.seq_range) - - -class ParallelConvASREncoder(NeuralModule, Exportable): - """ - Convolutional encoder for ASR models with parallel blocks. CarneliNet can be implemented with this class. 
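    Example:
        Editor's illustrative sketch of a single parallel-block entry in the ``jasper``
        config (only keys consumed in ``__init__`` below; the values are made up):

        .. code-block:: python

            jasper = [{
                "filters": 256,
                "repeat": 1,
                "kernel": [11, 13, 15],     # one JasperBlock per kernel size, run in parallel
                "stride": [1],
                "dilation": [1],
                "dropout": 0.0,
                "residual": True,
                "aggregation_mode": "sum",  # how the parallel branches are combined
            }]
            encoder = ParallelConvASREncoder(jasper=jasper, activation="relu", feat_in=80)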
- """ - - def _prepare_for_export(self): - m_count = 0 - for m in self.modules(): - if isinstance(m, MaskedConv1d): - m.use_mask = False - m_count += 1 - logging.warning(f"Turned off {m_count} masked convolutions") - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - @property - def disabled_deployment_input_names(self): - """Implement this method to return a set of input names disabled for export""" - return set(["length"]) - - @property - def disabled_deployment_output_names(self): - """Implement this method to return a set of output names disabled for export""" - return set(["encoded_lengths"]) - - def save_to(self, save_path: str): - pass - - @classmethod - def restore_from(cls, restore_path: str): - pass - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - def __init__( - self, - jasper, - activation: str, - feat_in: int, - normalization_mode: str = "batch", - residual_mode: str = "add", - norm_groups: int = -1, - conv_mask: bool = True, - frame_splicing: int = 1, - init_mode: Optional[str] = 'xavier_uniform', - aggregation_mode: Optional[str] = None, - quantize: bool = False, - ): - super().__init__() - if isinstance(jasper, ListConfig): - jasper = OmegaConf.to_container(jasper) - - activation = jasper_activations[activation]() - feat_in = feat_in * frame_splicing - - self._feat_in = feat_in - - residual_panes = [] - encoder_layers = [] - self.dense_residual = False - for lcfg in jasper: - dense_res = [] - if lcfg.get('residual_dense', False): - residual_panes.append(feat_in) - dense_res = residual_panes - self.dense_residual = True - groups = lcfg.get('groups', 1) - separable = lcfg.get('separable', False) - heads = lcfg.get('heads', -1) - residual_mode = lcfg.get('residual_mode', residual_mode) - se = lcfg.get('se', False) - se_reduction_ratio = lcfg.get('se_reduction_ratio', 8) - se_context_window = lcfg.get('se_context_size', -1) - se_interpolation_mode = lcfg.get('se_interpolation_mode', 'nearest') - kernel_size_factor = lcfg.get('kernel_size_factor', 1.0) - stride_last = lcfg.get('stride_last', False) - aggregation_mode = lcfg.get('aggregation_mode', 'sum') - block_dropout = lcfg.get('block_dropout', 0.0) - parallel_residual_mode = lcfg.get('parallel_residual_mode', 'sum') - - parallel_blocks = [] - for kernel_size in lcfg['kernel']: - parallel_blocks.append( - JasperBlock( - feat_in, - lcfg['filters'], - repeat=lcfg['repeat'], - kernel_size=[kernel_size], - stride=lcfg['stride'], - dilation=lcfg['dilation'], - dropout=lcfg['dropout'], - residual=lcfg['residual'], - groups=groups, - separable=separable, - heads=heads, - residual_mode=residual_mode, - normalization=normalization_mode, - norm_groups=norm_groups, - activation=activation, - residual_panes=dense_res, - conv_mask=conv_mask, - se=se, - se_reduction_ratio=se_reduction_ratio, - se_context_window=se_context_window, - 
se_interpolation_mode=se_interpolation_mode, - kernel_size_factor=kernel_size_factor, - stride_last=stride_last, - quantize=quantize, - ) - ) - if len(parallel_blocks) == 1: - encoder_layers.append(parallel_blocks[0]) - else: - encoder_layers.append( - ParallelBlock( - parallel_blocks, - aggregation_mode=aggregation_mode, - block_dropout_prob=block_dropout, - residual_mode=parallel_residual_mode, - in_filters=feat_in, - out_filters=lcfg['filters'], - ) - ) - feat_in = lcfg['filters'] - - self._feat_out = feat_in - - self.encoder = torch.nn.Sequential(*encoder_layers) - self.apply(lambda x: init_weights(x, mode=init_mode)) - - @typecheck() - def forward(self, audio_signal, length=None): - s_input, length = self.encoder(([audio_signal], length)) - if length is None: - return s_input[-1] - - return s_input[-1], length - - -class ConvASRDecoder(NeuralModule, Exportable, adapter_mixins.AdapterModuleMixin): - """Simple ASR Decoder for use with CTC-based models such as JasperNet and QuartzNet - - Based on these papers: - https://arxiv.org/pdf/1904.03288.pdf - https://arxiv.org/pdf/1910.10261.pdf - https://arxiv.org/pdf/2005.04290.pdf - """ - - @property - def input_types(self): - return OrderedDict({"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}) - - @property - def output_types(self): - return OrderedDict({"logprobs": NeuralType(('B', 'T', 'D'), LogprobsType())}) - - def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary=None): - super().__init__() - - if vocabulary is None and num_classes < 0: - raise ValueError( - f"Neither of the vocabulary and num_classes are set! At least one of them need to be set." - ) - - if num_classes <= 0: - num_classes = len(vocabulary) - logging.info(f"num_classes of ConvASRDecoder is set to the size of the vocabulary: {num_classes}.") - - if vocabulary is not None: - if num_classes != len(vocabulary): - raise ValueError( - f"If vocabulary is specified, it's length should be equal to the num_classes. Instead got: num_classes={num_classes} and len(vocabulary)={len(vocabulary)}" - ) - self.__vocabulary = vocabulary - self._feat_in = feat_in - # Add 1 for blank char - self._num_classes = num_classes + 1 - - self.decoder_layers = torch.nn.Sequential( - torch.nn.Conv1d(self._feat_in, self._num_classes, kernel_size=1, bias=True) - ) - self.apply(lambda x: init_weights(x, mode=init_mode)) - - accepted_adapters = [adapter_utils.LINEAR_ADAPTER_CLASSPATH] - self.set_accepted_adapter_types(accepted_adapters) - - # to change, requires running ``model.temperature = T`` explicitly - self.temperature = 1.0 - - @typecheck() - def forward(self, encoder_output): - # Adapter module forward step - if self.is_adapter_available(): - encoder_output = encoder_output.transpose(1, 2) # [B, T, C] - encoder_output = self.forward_enabled_adapters(encoder_output) - encoder_output = encoder_output.transpose(1, 2) # [B, C, T] - - if self.temperature != 1.0: - return torch.nn.functional.log_softmax( - self.decoder_layers(encoder_output).transpose(1, 2) / self.temperature, dim=-1 - ) - return torch.nn.functional.log_softmax(self.decoder_layers(encoder_output).transpose(1, 2), dim=-1) - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
- """ - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - def _prepare_for_export(self, **kwargs): - m_count = 0 - for m in self.modules(): - if type(m).__name__ == "MaskedConv1d": - m.use_mask = False - m_count += 1 - if m_count > 0: - logging.warning(f"Turned off {m_count} masked convolutions") - Exportable._prepare_for_export(self, **kwargs) - - # Adapter method overrides - def add_adapter(self, name: str, cfg: DictConfig): - # Update the config with correct input dim - cfg = self._update_adapter_cfg_input_dim(cfg) - # Add the adapter - super().add_adapter(name=name, cfg=cfg) - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self._feat_in) - return cfg - - @property - def vocabulary(self): - return self.__vocabulary - - @property - def num_classes_with_blank(self): - return self._num_classes - - -class ConvASRDecoderReconstruction(NeuralModule, Exportable): - """ASR Decoder for reconstructing masked regions of spectrogram - """ - - @property - def input_types(self): - return OrderedDict({"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}) - - @property - def output_types(self): - if self.apply_softmax: - return OrderedDict({"out": NeuralType(('B', 'T', 'D'), LogprobsType())}) - else: - return OrderedDict({"out": NeuralType(('B', 'T', 'D'), AcousticEncodedRepresentation())}) - - def __init__( - self, - feat_in, - feat_out, - feat_hidden, - stride_layers=0, - non_stride_layers=0, - kernel_size=11, - init_mode="xavier_uniform", - activation="relu", - stride_transpose=True, - apply_softmax=False, - ): - super().__init__() - - if ((stride_layers + non_stride_layers) > 0) and (kernel_size < 3 or kernel_size % 2 == 0): - raise ValueError("Kernel size in this decoder needs to be >= 3 and odd when using at least 1 conv layer.") - - activation = jasper_activations[activation]() - - self.feat_in = feat_in - self.feat_out = feat_out - self.feat_hidden = feat_hidden - - self.decoder_layers = [nn.Conv1d(self.feat_in, self.feat_hidden, kernel_size=1, bias=True)] - for i in range(stride_layers): - self.decoder_layers.append(activation) - if stride_transpose: - self.decoder_layers.append( - nn.ConvTranspose1d( - self.feat_hidden, - self.feat_hidden, - kernel_size, - stride=2, - padding=(kernel_size - 3) // 2 + 1, - output_padding=1, - bias=True, - groups=self.feat_hidden, - ) - ) - else: - self.decoder_layers.append( - nn.Conv1d( - self.feat_hidden, - self.feat_hidden, - kernel_size, - stride=2, - padding=(kernel_size - 1) // 2, - bias=True, - groups=self.feat_hidden, - ) - ) - self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_hidden, kernel_size=1, bias=True)) - self.decoder_layers.append(nn.BatchNorm1d(self.feat_hidden, eps=1e-3, momentum=0.1)) - for i in range(non_stride_layers): - self.decoder_layers.append(activation) - self.decoder_layers.append( - nn.Conv1d( - self.feat_hidden, - self.feat_hidden, - kernel_size, - bias=True, - groups=self.feat_hidden, - padding=kernel_size // 2, - ) - ) - self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_hidden, kernel_size=1, bias=True)) - self.decoder_layers.append(nn.BatchNorm1d(self.feat_hidden, eps=1e-3, momentum=0.1)) - - self.decoder_layers.append(activation) - self.decoder_layers.append(nn.Conv1d(self.feat_hidden, self.feat_out, kernel_size=1, bias=True)) - - self.decoder_layers = nn.Sequential(*self.decoder_layers) - 
self.apply_softmax = apply_softmax - - self.apply(lambda x: init_weights(x, mode=init_mode)) - - @typecheck() - def forward(self, encoder_output): - out = self.decoder_layers(encoder_output).transpose(-2, -1) - if self.apply_softmax: - out = torch.nn.functional.log_softmax(out, dim=-1) - return out - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - def _prepare_for_export(self, **kwargs): - m_count = 0 - for m in self.modules(): - if type(m).__name__ == "MaskedConv1d": - m.use_mask = False - m_count += 1 - if m_count > 0: - logging.warning(f"Turned off {m_count} masked convolutions") - Exportable._prepare_for_export(self, **kwargs) - - -class ConvASRDecoderClassification(NeuralModule, Exportable): - """Simple ASR Decoder for use with classification models such as JasperNet and QuartzNet - - Based on these papers: - https://arxiv.org/pdf/2005.04290.pdf - """ - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - @property - def input_types(self): - return OrderedDict({"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}) - - @property - def output_types(self): - return OrderedDict({"logits": NeuralType(('B', 'D'), LogitsType())}) - - def __init__( - self, - feat_in: int, - num_classes: int, - init_mode: Optional[str] = "xavier_uniform", - return_logits: bool = True, - pooling_type='avg', - ): - super().__init__() - - self._feat_in = feat_in - self._return_logits = return_logits - self._num_classes = num_classes - - if pooling_type == 'avg': - self.pooling = torch.nn.AdaptiveAvgPool1d(1) - elif pooling_type == 'max': - self.pooling = torch.nn.AdaptiveMaxPool1d(1) - else: - raise ValueError('Pooling type chosen is not valid. Must be either `avg` or `max`') - - self.decoder_layers = torch.nn.Sequential(torch.nn.Linear(self._feat_in, self._num_classes, bias=True)) - self.apply(lambda x: init_weights(x, mode=init_mode)) - - @typecheck() - def forward(self, encoder_output): - batch, in_channels, timesteps = encoder_output.size() - - encoder_output = self.pooling(encoder_output).view(batch, in_channels) # [B, C] - logits = self.decoder_layers(encoder_output) # [B, num_classes] - - if self._return_logits: - return logits - - return torch.nn.functional.softmax(logits, dim=-1) - - @property - def num_classes(self): - return self._num_classes - - -class ECAPAEncoder(NeuralModule, Exportable): - """ - Modified ECAPA Encoder layer without Res2Net module for faster training and inference which achieves - better numbers on speaker diarization tasks - Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf) - - input: - feat_in: input feature shape (mel spec feature shape) - filters: list of filter shapes for SE_TDNN modules - kernel_sizes: list of kernel shapes for SE_TDNN modules - dilations: list of dilations for group conv se layer - scale: scale value to group wider conv channels (deafult:8) - - output: - outputs : encoded output - output_length: masked output lengths - """ - - @property - def input_types(self): - """Returns definitions of module input ports. 
- """ - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - def __init__( - self, - feat_in: int, - filters: list, - kernel_sizes: list, - dilations: list, - scale: int = 8, - init_mode: str = 'xavier_uniform', - ): - super().__init__() - self.layers = nn.ModuleList() - self.layers.append(TDNNModule(feat_in, filters[0], kernel_size=kernel_sizes[0], dilation=dilations[0])) - - for i in range(len(filters) - 2): - self.layers.append( - TDNNSEModule( - filters[i], - filters[i + 1], - group_scale=scale, - se_channels=128, - kernel_size=kernel_sizes[i + 1], - dilation=dilations[i + 1], - ) - ) - self.feature_agg = TDNNModule(filters[-1], filters[-1], kernel_sizes[-1], dilations[-1]) - self.apply(lambda x: init_weights(x, mode=init_mode)) - - def forward(self, audio_signal, length=None): - x = audio_signal - outputs = [] - - for layer in self.layers: - x = layer(x, length=length) - outputs.append(x) - - x = torch.cat(outputs[1:], dim=1) - x = self.feature_agg(x) - return x, length - - -class SpeakerDecoder(NeuralModule, Exportable): - """ - Speaker Decoder creates the final neural layers that maps from the outputs - of Jasper Encoder to the embedding layer followed by speaker based softmax loss. - Args: - feat_in (int): Number of channels being input to this module - num_classes (int): Number of unique speakers in dataset - emb_sizes (list) : shapes of intermediate embedding layers (we consider speaker embbeddings from 1st of this layers) - Defaults to [1024,1024] - pool_mode (str) : Pooling strategy type. options are 'xvector','tap', 'attention' - Defaults to 'xvector (mean and variance)' - tap (temporal average pooling: just mean) - attention (attention based pooling) - - init_mode (str): Describes how neural network parameters are - initialized. Options are ['xavier_uniform', 'xavier_normal', - 'kaiming_uniform','kaiming_normal']. - Defaults to "xavier_uniform". - """ - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
- """ - input_example = torch.randn(max_batch, self.input_feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - @property - def input_types(self): - return OrderedDict( - { - "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "length": NeuralType(('B',), LengthsType(), optional=True), - } - ) - - @property - def output_types(self): - return OrderedDict( - { - "logits": NeuralType(('B', 'D'), LogitsType()), - "embs": NeuralType(('B', 'D'), AcousticEncodedRepresentation()), - } - ) - - def __init__( - self, - feat_in: int, - num_classes: int, - emb_sizes: Optional[Union[int, list]] = 256, - pool_mode: str = 'xvector', - angular: bool = False, - attention_channels: int = 128, - init_mode: str = "xavier_uniform", - ): - super().__init__() - self.angular = angular - self.emb_id = 2 - bias = False if self.angular else True - emb_sizes = [emb_sizes] if type(emb_sizes) is int else emb_sizes - - self._num_classes = num_classes - self.pool_mode = pool_mode.lower() - if self.pool_mode == 'xvector' or self.pool_mode == 'tap': - self._pooling = StatsPoolLayer(feat_in=feat_in, pool_mode=self.pool_mode) - affine_type = 'linear' - elif self.pool_mode == 'attention': - self._pooling = AttentivePoolLayer(inp_filters=feat_in, attention_channels=attention_channels) - affine_type = 'conv' - - shapes = [self._pooling.feat_in] - for size in emb_sizes: - shapes.append(int(size)) - - emb_layers = [] - for shape_in, shape_out in zip(shapes[:-1], shapes[1:]): - layer = self.affine_layer(shape_in, shape_out, learn_mean=False, affine_type=affine_type) - emb_layers.append(layer) - - self.emb_layers = nn.ModuleList(emb_layers) - - self.final = nn.Linear(shapes[-1], self._num_classes, bias=bias) - - self.apply(lambda x: init_weights(x, mode=init_mode)) - - def affine_layer( - self, inp_shape, out_shape, learn_mean=True, affine_type='conv', - ): - if affine_type == 'conv': - layer = nn.Sequential( - nn.BatchNorm1d(inp_shape, affine=True, track_running_stats=True), - nn.Conv1d(inp_shape, out_shape, kernel_size=1), - ) - - else: - layer = nn.Sequential( - nn.Linear(inp_shape, out_shape), - nn.BatchNorm1d(out_shape, affine=learn_mean, track_running_stats=True), - nn.ReLU(), - ) - - return layer - - @typecheck() - def forward(self, encoder_output, length=None): - pool = self._pooling(encoder_output, length) - embs = [] - - for layer in self.emb_layers: - pool, emb = layer(pool), layer[: self.emb_id](pool) - embs.append(emb) - - pool = pool.squeeze(-1) - if self.angular: - for W in self.final.parameters(): - W = F.normalize(W, p=2, dim=1) - pool = F.normalize(pool, p=2, dim=1) - - out = self.final(pool) - - return out, embs[-1].squeeze(-1) - - -class ConvASREncoderAdapter(ConvASREncoder, adapter_mixins.AdapterModuleMixin): - - # Higher level forwarding - def add_adapter(self, name: str, cfg: dict): - for jasper_block in self.encoder: # type: adapter_mixins.AdapterModuleMixin - cfg = self._update_adapter_cfg_input_dim(jasper_block, cfg) - - jasper_block.set_accepted_adapter_types(self.get_accepted_adapter_types()) - jasper_block.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any([jasper_block.is_adapter_available() for jasper_block in self.encoder]) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for jasper_block in self.encoder: # type: adapter_mixins.AdapterModuleMixin - jasper_block.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = 
set([]) - for jasper_block in self.encoder: # type: adapter_mixins.AdapterModuleMixin - names.update(jasper_block.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, block: JasperBlock, cfg): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=block.planes) - return cfg - - def get_accepted_adapter_types(self,) -> Set[type]: - types = super().get_accepted_adapter_types() - - if len(types) == 0: - self.set_accepted_adapter_types( - [adapter_utils.LINEAR_ADAPTER_CLASSPATH,] - ) - types = self.get_accepted_adapter_types() - return types - - -@dataclass -class JasperEncoderConfig: - filters: int = MISSING - repeat: int = MISSING - kernel: List[int] = MISSING - stride: List[int] = MISSING - dilation: List[int] = MISSING - dropout: float = MISSING - residual: bool = MISSING - - # Optional arguments - groups: int = 1 - separable: bool = False - heads: int = -1 - residual_mode: str = "add" - residual_dense: bool = False - se: bool = False - se_reduction_ratio: int = 8 - se_context_size: int = -1 - se_interpolation_mode: str = 'nearest' - kernel_size_factor: float = 1.0 - stride_last: bool = False - - -@dataclass -class ConvASREncoderConfig: - _target_: str = 'nemo.collections.asr.modules.ConvASREncoder' - jasper: Optional[List[JasperEncoderConfig]] = field(default_factory=list) - activation: str = MISSING - feat_in: int = MISSING - normalization_mode: str = "batch" - residual_mode: str = "add" - norm_groups: int = -1 - conv_mask: bool = True - frame_splicing: int = 1 - init_mode: Optional[str] = "xavier_uniform" - - -@dataclass -class ConvASRDecoderConfig: - _target_: str = 'nemo.collections.asr.modules.ConvASRDecoder' - feat_in: int = MISSING - num_classes: int = MISSING - init_mode: Optional[str] = "xavier_uniform" - vocabulary: Optional[List[str]] = field(default_factory=list) - - -@dataclass -class ConvASRDecoderClassificationConfig: - _target_: str = 'nemo.collections.asr.modules.ConvASRDecoderClassification' - feat_in: int = MISSING - num_classes: int = MISSING - init_mode: Optional[str] = "xavier_uniform" - return_logits: bool = True - pooling_type: str = 'avg' - - -""" -Register any additional information -""" -if adapter_mixins.get_registered_adapter(ConvASREncoder) is None: - adapter_mixins.register_adapter(base_class=ConvASREncoder, adapter_class=ConvASREncoderAdapter) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/flashlight_decoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/flashlight_decoder.py deleted file mode 100644 index 05a111ebb7e76516fb1c3a3ae84d0998c53d5f22..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/flashlight_decoder.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
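# --------------------------------------------------------------------------------------
# Editor's note: a hypothetical end-to-end sketch of the ConvASREncoder / ConvASRDecoder
# classes deleted in conv_asr.py above (not part of the original module). The single-block
# `jasper` config and the tiny vocabulary are made up for illustration only.
import torch


def _conv_asr_ctc_sketch():
    from nemo.collections.asr.modules.conv_asr import ConvASRDecoder, ConvASREncoder

    jasper = [
        {
            "filters": 128,
            "repeat": 1,
            "kernel": [11],
            "stride": [1],
            "dilation": [1],
            "dropout": 0.0,
            "residual": False,
        }
    ]
    encoder = ConvASREncoder(jasper=jasper, activation="relu", feat_in=64)
    # num_classes=-1 lets the decoder size its output from the vocabulary (+1 for the CTC blank).
    decoder = ConvASRDecoder(feat_in=128, num_classes=-1, vocabulary=list("abc "))

    spec = torch.randn(2, 64, 100)               # [B, feat_in, T] log-mel features
    lengths = torch.tensor([100, 80])
    encoded, encoded_len = encoder(audio_signal=spec, length=lengths)
    log_probs = decoder(encoder_output=encoded)  # [B, T, len(vocabulary) + 1] CTC log-probs
    return log_probs, encoded_len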
- -import itertools -import math -from typing import Iterable, List, Optional, Tuple, Union - -import numpy as np -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import LengthsType, LogprobsType, NeuralType, PredictionsType - - -class _TokensWrapper: - def __init__(self, vocabulary: List[str], tokenizer: TokenizerSpec): - self.vocabulary = vocabulary - self.tokenizer = tokenizer - - if tokenizer is None: - self.reverse_map = {self.vocabulary[i]: i for i in range(len(self.vocabulary))} - - self.vocab_len = len(self.vocabulary) - - if (self.tokenizer is not None) and hasattr(self.tokenizer, 'unk_id') and self.tokenizer.unk_id is not None: - self.unknown_id = self.tokenizer.unk_id - elif ' ' in self.vocabulary: - self.unknown_id = self.token_to_id(' ') - elif '' in self.vocabulary: - self.unknown_id = self.token_to_id('') - else: - self.unknown_id = -1 - - @property - def blank(self): - return self.vocab_len - - @property - def unk_id(self): - return self.unknown_id - - @property - def vocab(self): - return self.vocabulary - - @property - def vocab_size(self): - # the +1 is because we add the blank id - return self.vocab_len + 1 - - def token_to_id(self, token: str): - if token == self.blank: - return -1 - - if self.tokenizer is not None: - return self.tokenizer.token_to_id(token) - else: - return self.reverse_map[token] - - def text_to_tokens(self, text: str): - if self.tokenizer is not None: - return self.tokenizer.text_to_tokens(text) - else: - return list(text) - - -class FlashLightKenLMBeamSearchDecoder(NeuralModule): - ''' - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"hypos": NeuralType(('B'), PredictionsType())} - ''' - - def __init__( - self, - lm_path: str, - vocabulary: List[str], - tokenizer: Optional[TokenizerSpec] = None, - lexicon_path: Optional[str] = None, - boost_path: Optional[str] = None, - beam_size: int = 32, - beam_size_token: int = 32, - beam_threshold: float = 25.0, - lm_weight: float = 2.0, - word_score: float = -1.0, - unk_weight: float = -math.inf, - sil_weight: float = 0.0, - ): - - try: - from flashlight.lib.text.decoder import ( - LM, - CriterionType, - KenLM, - LexiconDecoder, - LexiconDecoderOptions, - SmearingMode, - Trie, - ) - from flashlight.lib.text.dictionary import create_word_dict, load_words - except ModuleNotFoundError: - raise ModuleNotFoundError( - "FlashLightKenLMBeamSearchDecoder requires the installation of flashlight python bindings " - "from https://github.com/flashlight/text. Please follow the build instructions there." 
- ) - - super().__init__() - - self.criterion_type = CriterionType.CTC - self.tokenizer_wrapper = _TokensWrapper(vocabulary, tokenizer) - self.vocab_size = self.tokenizer_wrapper.vocab_size - self.blank = self.tokenizer_wrapper.blank - self.silence = self.tokenizer_wrapper.unk_id - - if lexicon_path is not None: - self.lexicon = load_words(lexicon_path) - self.word_dict = create_word_dict(self.lexicon) - self.unk_word = self.word_dict.get_index("") - - # loads in the boosted words if given via a file - if boost_path is not None: - with open(boost_path, 'r', encoding='utf_8') as fr: - boost_words = [line.strip().split('\t') for line in fr] - boost_words = {w[0]: w[1] for w in boost_words} - else: - boost_words = {} - - # add OOV boosted words to word_dict so it gets picked up in LM obj creation - for word in boost_words.keys(): - if word not in self.lexicon: - self.word_dict.add_entry(word) - - # loads in the kenlm binary and combines in with the dictionary object from the lexicon - # this gives a mapping between each entry in the kenlm binary and its mapping to whatever - # numeraire is used by the AM, which is explicitly mapped via the lexicon - # this information is ued to build a vocabulary trie for decoding - self.lm = KenLM(lm_path, self.word_dict) - self.trie = Trie(self.vocab_size, self.silence) - - start_state = self.lm.start(False) - for i, (word, spellings) in enumerate(self.lexicon.items()): - word_idx = self.word_dict.get_index(word) - _, score = self.lm.score(start_state, word_idx) - for spelling in spellings: - spelling_idxs = [self.tokenizer_wrapper.token_to_id(token) for token in spelling] - if self.tokenizer_wrapper.unk_id in spelling_idxs: - print(f'tokenizer has unknown id for word[ {word} ] {spelling} {spelling_idxs}', flush=True) - continue - self.trie.insert( - spelling_idxs, word_idx, score if word not in boost_words else float(boost_words[word]) - ) - # handle OOV boosted words - for word, boost in boost_words.items(): - if word not in self.lexicon: - word_idx = self.word_dict.get_index(word) - spelling = self.tokenizer_wrapper.text_to_tokens(word) - spelling_idxs = [self.tokenizer_wrapper.token_to_id(token) for token in spelling] - if self.tokenizer_wrapper.unk_id in spelling_idxs: - print(f'tokenizer has unknown id for word[ {word} ] {spelling} {spelling_idxs}', flush=True) - continue - self.trie.insert(spelling_idxs, word_idx, float(boost)) - self.trie.smear(SmearingMode.MAX) - - self.decoder_opts = LexiconDecoderOptions( - beam_size=beam_size, - beam_size_token=int(beam_size_token), - beam_threshold=beam_threshold, - lm_weight=lm_weight, - word_score=word_score, - unk_score=unk_weight, - sil_score=sil_weight, - log_add=False, - criterion_type=self.criterion_type, - ) - - self.decoder = LexiconDecoder( - self.decoder_opts, self.trie, self.lm, self.silence, self.blank, self.unk_word, [], False, - ) - else: - from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions - - d = { - w: [[w]] - for w in self.tokenizer_wrapper.vocab + ([] if '' in self.tokenizer_wrapper.vocab else ['']) - } - self.word_dict = create_word_dict(d) - self.lm = KenLM(lm_path, self.word_dict) - self.decoder_opts = LexiconFreeDecoderOptions( - beam_size=beam_size, - beam_size_token=int(beam_size_token), - beam_threshold=beam_threshold, - lm_weight=lm_weight, - sil_score=sil_weight, - log_add=False, - criterion_type=self.criterion_type, - ) - self.decoder = LexiconFreeDecoder(self.decoder_opts, self.lm, self.silence, self.blank, []) - - def _get_tokens(self, idxs: 
List[int]): - """Normalize tokens by handling CTC blank, ASG replabels, etc.""" - - idxs = (g[0] for g in itertools.groupby(idxs)) - if self.silence < 0: - idxs = filter(lambda x: x != self.blank and x != self.silence, idxs) - else: - idxs = filter(lambda x: x != self.blank, idxs) - idxs = list(idxs) - if idxs[0] == self.silence: - idxs = idxs[1:] - if idxs[-1] == self.silence: - idxs = idxs[:-1] - - return torch.LongTensor(idxs) - - def _get_timesteps(self, token_idxs: List[int]): - """Returns frame numbers corresponding to every non-blank token. - Parameters - ---------- - token_idxs : List[int] - IDs of decoded tokens. - Returns - ------- - List[int] - Frame numbers corresponding to every non-blank token. - """ - - timesteps = [] - for i, token_idx in enumerate(token_idxs): - if token_idx == self.blank: - continue - if i == 0 or token_idx != token_idxs[i - 1]: - timesteps.append(i) - - return timesteps - - # @typecheck(ignore_collections=True) - @torch.no_grad() - def forward(self, log_probs: Union[np.ndarray, torch.Tensor]): - if isinstance(log_probs, np.ndarray): - log_probs = torch.from_numpy(log_probs).float() - if log_probs.dim() == 2: - log_probs = log_probs.unsqueeze(0) - - emissions = log_probs.cpu().contiguous() - - B, T, N = emissions.size() - hypos = [] - # we iterate over the batch dimension of our input tensor log probabilities - for b in range(B): - # the flashlight C++ expects a C style pointer, so the memory address - # which is what we obtain here. Then we pass it to pybinding method which - # is bound to the underlying C++ code - emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) - results = self.decoder.decode(emissions_ptr, T, N) - - hypos.append( - [ - { - "tokens": self._get_tokens(result.tokens), - "score": result.score, - "timesteps": self._get_timesteps(result.tokens), - "words": [self.word_dict.get_entry(x) for x in result.words if x >= 0], - } - for result in results - ] - ) - - return hypos diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/graph_decoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/graph_decoder.py deleted file mode 100644 index d66dbd734a69b8a1668e55d3ebf842c144e0a7d7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/graph_decoder.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -import torch -from omegaconf import DictConfig - -from nemo.core.classes import NeuralModule -from nemo.core.neural_types import LengthsType, LogprobsType, NeuralType, PredictionsType - - -class ViterbiDecoderWithGraph(NeuralModule): - """Viterbi Decoder with WFSA (Weighted Finite State Automaton) graphs. - - Note: - Requires k2 v1.14 or later to be installed to use this module. - - Decoder can be set up via the config, and optionally be passed keyword arguments as follows. - - Examples: - .. code-block:: yaml - - model: # Model config - ... 
- graph_module_cfg: # Config for graph modules, e.g. ViterbiDecoderWithGraph - split_batch_size: 0 - backend_cfg: - topo_type: "default" # other options: "compact", "shared_blank", "minimal" - topo_with_self_loops: true - token_lm: # must be provided for criterion_type: "map" - - Args: - num_classes: Number of target classes for the decoder network to predict. - (Excluding the blank token). - - backend: Which backend to use for decoding. Currently only `k2` is supported. - - dec_type: Type of decoding graph to use. Choices: `topo` and `token_lm`, - with `topo` standing for the loss topology graph only - and `token_lm` for the topology composed with a token_lm graph. - - return_type: Type of output. Choices: `1best` and `lattice`. - `1best` is represented as a list of 1D tensors. - `lattice` can be of type corresponding to the backend (e.g. k2.Fsa). - - return_ilabels: For return_type=`1best`. - Whether to return input labels of a lattice (otherwise output labels). - - output_aligned: For return_type=`1best`. - Whether the tensors length will correspond to log_probs_length - and the labels will be aligned to the frames of emission - (otherwise there will be only the necessary labels). - - split_batch_size: Local batch size. Used for memory consumption reduction at the cost of speed performance. - Effective if complies 0 < split_batch_size < batch_size. - - graph_module_cfg: Optional Dict of (str, value) pairs that are passed to the backend graph decoder. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "log_probs": NeuralType(("B", "T", "D") if self._3d_input else ("B", "T", "T", "D"), LogprobsType()), - "input_lengths": NeuralType(tuple("B"), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. 
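A minimal sketch of building the `graph_module_cfg` described above with OmegaConf and passing it to the decoder; the numbers are illustrative and the instantiation is left commented out because the `k2` backend must be installed for it to run:

```python
from omegaconf import OmegaConf

# Illustrative graph_module_cfg mirroring the YAML snippet above.
graph_module_cfg = OmegaConf.create(
    {
        "split_batch_size": 0,
        "backend_cfg": {"topo_type": "default", "topo_with_self_loops": True},
    }
)

# decoder = ViterbiDecoderWithGraph(
#     num_classes=128,            # assumed vocabulary size, excluding blank
#     backend="k2",
#     dec_type="topo",
#     return_type="1best",
#     graph_module_cfg=graph_module_cfg,
# )
```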
- """ - return {"predictions": NeuralType(("B", "T"), PredictionsType())} - - def __init__( - self, - num_classes, - backend: str = "k2", - dec_type: str = "topo", - return_type: str = "1best", - return_ilabels: bool = True, - output_aligned: bool = True, - split_batch_size: int = 0, - graph_module_cfg: Optional[DictConfig] = None, - ): - self._blank = num_classes - self.return_ilabels = return_ilabels - self.output_aligned = output_aligned - self.split_batch_size = split_batch_size - self.dec_type = dec_type - - if return_type == "1best": - self.return_lattices = False - elif return_type == "lattice": - self.return_lattices = True - elif return_type == "nbest": - raise NotImplementedError(f"return_type {return_type} is not supported at the moment") - else: - raise ValueError(f"Unsupported return_type: {return_type}") - - # we assume that self._blank + 1 == num_classes - if backend == "k2": - if self.dec_type == "topo": - from nemo.collections.asr.parts.k2.graph_decoders import CtcDecoder as Decoder - elif self.dec_type == "topo_rnnt_ali": - from nemo.collections.asr.parts.k2.graph_decoders import RnntAligner as Decoder - elif self.dec_type == "token_lm": - from nemo.collections.asr.parts.k2.graph_decoders import TokenLMDecoder as Decoder - elif self.dec_type == "loose_ali": - raise NotImplementedError() - elif self.dec_type == "tlg": - raise NotImplementedError(f"dec_type {self.dec_type} is not supported at the moment") - else: - raise ValueError(f"Unsupported dec_type: {self.dec_type}") - - self._decoder = Decoder(num_classes=self._blank + 1, blank=self._blank, cfg=graph_module_cfg) - elif backend == "gtn": - raise NotImplementedError("gtn-backed decoding is not implemented") - - self._3d_input = self.dec_type != "topo_rnnt" - super().__init__() - - def update_graph(self, graph): - """Updates graph of the backend graph decoder. 
- """ - self._decoder.update_graph(graph) - - def _forward_impl(self, log_probs, log_probs_length, targets=None, target_length=None): - if targets is None and target_length is not None or targets is not None and target_length is None: - raise RuntimeError( - f"Both targets and target_length have to be None or not None: {targets}, {target_length}" - ) - # do not use self.return_lattices for now - if targets is None: - align = False - decode_func = lambda a, b: self._decoder.decode( - a, b, return_lattices=False, return_ilabels=self.return_ilabels, output_aligned=self.output_aligned - ) - else: - align = True - decode_func = lambda a, b, c, d: self._decoder.align( - a, b, c, d, return_lattices=False, return_ilabels=False, output_aligned=True - ) - batch_size = log_probs.shape[0] - if self.split_batch_size > 0 and self.split_batch_size <= batch_size: - predictions = [] - probs = [] - for batch_idx in range(0, batch_size, self.split_batch_size): - begin = batch_idx - end = min(begin + self.split_batch_size, batch_size) - log_probs_length_part = log_probs_length[begin:end] - log_probs_part = log_probs[begin:end, : log_probs_length_part.max()] - if align: - target_length_part = target_length[begin:end] - targets_part = targets[begin:end, : target_length_part.max()] - predictions_part, probs_part = decode_func( - log_probs_part, log_probs_length_part, targets_part, target_length_part - ) - del targets_part, target_length_part - else: - predictions_part, probs_part = decode_func(log_probs_part, log_probs_length_part) - del log_probs_part, log_probs_length_part - predictions += predictions_part - probs += probs_part - else: - predictions, probs = ( - decode_func(log_probs, log_probs_length, targets, target_length) - if align - else decode_func(log_probs, log_probs_length) - ) - assert len(predictions) == len(probs) - return predictions, probs - - @torch.no_grad() - def forward(self, log_probs, log_probs_length): - if self.dec_type == "looseali": - raise RuntimeError(f"Decoder with dec_type=`{self.dec_type}` is not intended for regular decoding.") - predictions, probs = self._forward_impl(log_probs, log_probs_length) - lengths = torch.tensor([len(pred) for pred in predictions], device=predictions[0].device) - predictions_tensor = torch.full((len(predictions), lengths.max()), self._blank).to( - device=predictions[0].device - ) - probs_tensor = torch.full((len(probs), lengths.max()), 1.0).to(device=predictions[0].device) - for i, (pred, prob) in enumerate(zip(predictions, probs)): - predictions_tensor[i, : lengths[i]] = pred - probs_tensor[i, : lengths[i]] = prob - return predictions_tensor, lengths, probs_tensor - - @torch.no_grad() - def align(self, log_probs, log_probs_length, targets, target_length): - len_enough = (log_probs_length >= target_length) & (target_length > 0) - if torch.all(len_enough) or self.dec_type == "looseali": - results = self._forward_impl(log_probs, log_probs_length, targets, target_length) - else: - results = self._forward_impl( - log_probs[len_enough], log_probs_length[len_enough], targets[len_enough], target_length[len_enough] - ) - for i, computed in enumerate(len_enough): - if not computed: - results[0].insert(i, torch.empty(0, dtype=torch.int32)) - results[1].insert(i, torch.empty(0, dtype=torch.float)) - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/hybrid_autoregressive_transducer.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/hybrid_autoregressive_transducer.py deleted file mode 100644 index 
b806692d44f40dd274edbba1fe05a77f9d4af30b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/hybrid_autoregressive_transducer.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch - -from nemo.collections.asr.modules import rnnt -from nemo.collections.asr.parts.utils.rnnt_utils import HATJointOutput - -from nemo.utils import logging - - -class HATJoint(rnnt.RNNTJoint): - """A Hybrid Autoregressive Transducer Joint Network (HAT Joint Network). - A HAT Joint network, comprised of a feedforward model. - - Args: - jointnet: A dict-like object which contains the following key-value pairs. - encoder_hidden: int specifying the hidden dimension of the encoder net. - pred_hidden: int specifying the hidden dimension of the prediction net. - joint_hidden: int specifying the hidden dimension of the joint net - activation: Activation function used in the joint step. Can be one of - ['relu', 'tanh', 'sigmoid']. - - Optionally, it may also contain the following: - dropout: float, set to 0.0 by default. Optional dropout applied at the end of the joint net. - - num_classes: int, specifying the vocabulary size that the joint network must predict, - excluding the HAT blank token. - - vocabulary: Optional list of strings/tokens that comprise the vocabulary of the joint network. - Unused and kept only for easy access for character based encoding HAT models. - - log_softmax: Optional bool, set to None by default. If set as None, will compute the log_softmax() - based on the value provided. - - preserve_memory: Optional bool, set to False by default. If the model crashes due to the memory - intensive joint step, one might try this flag to empty the tensor cache in pytorch. - - Warning: This will make the forward-backward pass much slower than normal. - It also might not fix the OOM if the GPU simply does not have enough memory to compute the joint. - - fuse_loss_wer: Optional bool, set to False by default. - - Fuses the joint forward, loss forward and - wer forward steps. 
In doing so, it trades of speed for memory conservation by creating sub-batches - of the provided batch of inputs, and performs Joint forward, loss forward and wer forward (optional), - all on sub-batches, then collates results to be exactly equal to results from the entire batch. - - When this flag is set, prior to calling forward, the fields `loss` and `wer` (either one) *must* - be set using the `HATJoint.set_loss()` or `HATJoint.set_wer()` methods. - - Further, when this flag is set, the following argument `fused_batch_size` *must* be provided - as a non negative integer. This value refers to the size of the sub-batch. - - When the flag is set, the input and output signature of `forward()` of this method changes. - Input - in addition to `encoder_outputs` (mandatory argument), the following arguments can be provided. - - decoder_outputs (optional). Required if loss computation is required. - - encoder_lengths (required) - - transcripts (optional). Required for wer calculation. - - transcript_lengths (optional). Required for wer calculation. - - compute_wer (bool, default false). Whether to compute WER or not for the fused batch. - - Output - instead of the usual `joint` log prob tensor, the following results can be returned. - - loss (optional). Returned if decoder_outputs, transcripts and transript_lengths are not None. - - wer_numerator + wer_denominator (optional). Returned if transcripts, transcripts_lengths are provided - and compute_wer is set. - - fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the - sub-batches. Should be any value below the actual batch size per GPU. - """ - - def __init__( - self, - jointnet: Dict[str, Any], - num_classes: int, - num_extra_outputs: int = 0, - vocabulary: Optional[List] = None, - log_softmax: Optional[bool] = None, - preserve_memory: bool = False, - fuse_loss_wer: bool = False, - fused_batch_size: Optional[int] = None, - experimental_fuse_loss_wer: Any = None, - ): - super().__init__( - jointnet=jointnet, - num_classes=num_classes, - num_extra_outputs=num_extra_outputs, - vocabulary=vocabulary, - log_softmax=log_softmax, - preserve_memory=preserve_memory, - fuse_loss_wer=fuse_loss_wer, - fused_batch_size=fused_batch_size, - experimental_fuse_loss_wer=experimental_fuse_loss_wer, - ) - - self.pred, self.enc, self.joint_net, self.blank_pred = self._joint_hat_net_modules( - num_classes=self._vocab_size, # non blank symbol - pred_n_hidden=self.pred_hidden, - enc_n_hidden=self.encoder_hidden, - joint_n_hidden=self.joint_hidden, - activation=self.activation, - dropout=jointnet.get('dropout', 0.0), - ) - self._return_hat_ilm = False - - @property - def return_hat_ilm(self): - return self._return_hat_ilm - - @return_hat_ilm.setter - def return_hat_ilm(self, hat_subtract_ilm): - self._return_hat_ilm = hat_subtract_ilm - - def joint(self, f: torch.Tensor, g: torch.Tensor) -> Union[torch.Tensor, HATJointOutput]: - """ - Compute the joint step of the network. - - Here, - B = Batch size - T = Acoustic model timesteps - U = Target sequence length - H1, H2 = Hidden dimensions of the Encoder / Decoder respectively - H = Hidden dimension of the Joint hidden step. - V = Vocabulary size of the Decoder (excluding the HAT blank token). - - NOTE: - The implementation of this model is slightly modified from the original paper. 
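For reference, an illustrative `jointnet` dictionary matching the keys documented above; every value here is an assumption made for the sketch, not a recommended setting:

```python
# Illustrative constructor arguments for HATJoint; all numbers are made up.
jointnet_cfg = {
    "encoder_hidden": 512,   # must match the encoder output dimension
    "pred_hidden": 320,      # must match the prediction network output dimension
    "joint_hidden": 320,
    "activation": "relu",
    "dropout": 0.1,
}

# hat_joint = HATJoint(
#     jointnet=jointnet_cfg,
#     num_classes=1024,        # vocabulary size, excluding the HAT blank token
#     fuse_loss_wer=False,     # if True, fused_batch_size must also be provided
# )
```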
- The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. - - We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. - - Args: - f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] - g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2] - - Returns: - Log softmaxed tensor of shape (B, T, U, V + 1). - Internal LM probability (B, 1, U, V) -- in case of return_ilm==True. - """ - # f = [B, T, H1] - f = self.enc(f) - f.unsqueeze_(dim=2) # (B, T, 1, H) - - # g = [B, U, H2] - g = self.pred(g) - g.unsqueeze_(dim=1) # (B, 1, U, H) - - inp = f + g # [B, T, U, H] - - del f - - # Forward adapter modules on joint hidden - if self.is_adapter_available(): - inp = self.forward_enabled_adapters(inp) - - blank_logprob = self.blank_pred(inp) # [B, T, U, 1] - label_logit = self.joint_net(inp) # [B, T, U, V] - - del inp - - label_logprob = label_logit.log_softmax(dim=-1) - scale_prob = torch.clamp(1 - torch.exp(blank_logprob), min=1e-6) - label_logprob_scaled = torch.log(scale_prob) + label_logprob # [B, T, U, V] - - res = torch.cat((label_logprob_scaled, blank_logprob), dim=-1).contiguous() # [B, T, U, V+1] - - if self.return_hat_ilm: - ilm_logprobs = self.joint_net(g).log_softmax(dim=-1) # [B, 1, U, V] - res = HATJointOutput(hat_logprobs=res, ilm_logprobs=ilm_logprobs) - - del g, blank_logprob, label_logprob, label_logit, scale_prob, label_logprob_scaled - - if self.preserve_memory: - torch.cuda.empty_cache() - - return res - - def _joint_hat_net_modules(self, num_classes, pred_n_hidden, enc_n_hidden, joint_n_hidden, activation, dropout): - """ - Prepare the trainable modules of the Joint Network - - Args: - num_classes: Number of output classes (vocab size) excluding the HAT blank token. - pred_n_hidden: Hidden size of the prediction network. - enc_n_hidden: Hidden size of the encoder network. - joint_n_hidden: Hidden size of the joint network. - activation: Activation of the joint. Can be one of [relu, tanh, sigmoid] - dropout: Dropout value to apply to joint. 
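The HAT-specific step above rescales the label distribution by `log(1 - P(blank))` so that, together with the separately predicted blank log-probability, the concatenated output still behaves as a distribution over V + 1 symbols. A small self-contained sketch of that arithmetic on random tensors (shapes are illustrative):

```python
import torch

B, T, U, V = 2, 3, 4, 10  # illustrative shapes
blank_logprob = torch.nn.functional.logsigmoid(torch.randn(B, T, U, 1))  # log P(blank)
label_logprob = torch.randn(B, T, U, V).log_softmax(dim=-1)              # log P(label)

scale_prob = torch.clamp(1 - torch.exp(blank_logprob), min=1e-6)         # P(non-blank)
label_logprob_scaled = torch.log(scale_prob) + label_logprob             # log P(non-blank) + log P(label)

hat_logprobs = torch.cat((label_logprob_scaled, blank_logprob), dim=-1)  # [B, T, U, V + 1]

# The V + 1 entries now sum to (approximately) one in probability space.
print(hat_logprobs.exp().sum(dim=-1))
```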
- """ - pred = torch.nn.Linear(pred_n_hidden, joint_n_hidden) - enc = torch.nn.Linear(enc_n_hidden, joint_n_hidden) - blank_pred = torch.nn.Sequential( - torch.nn.Tanh(), torch.nn.Dropout(p=dropout), torch.nn.Linear(joint_n_hidden, 1), torch.nn.LogSigmoid() - ) - - if activation not in ['relu', 'sigmoid', 'tanh']: - raise ValueError("Unsupported activation for joint step - please pass one of " "[relu, sigmoid, tanh]") - - activation = activation.lower() - - if activation == 'relu': - activation = torch.nn.ReLU(inplace=True) - elif activation == 'sigmoid': - activation = torch.nn.Sigmoid() - elif activation == 'tanh': - activation = torch.nn.Tanh() - - layers = ( - [activation] - + ([torch.nn.Dropout(p=dropout)] if dropout else []) - + [torch.nn.Linear(joint_n_hidden, num_classes)] - ) - return pred, enc, torch.nn.Sequential(*layers), blank_pred diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/lstm_decoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/lstm_decoder.py deleted file mode 100644 index 9bb60e2fabca99fdb0ec8c1b21e73e10e02f1623..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/lstm_decoder.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import torch -import torch.nn as nn - -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import AcousticEncodedRepresentation, LogprobsType, NeuralType - -__all__ = ['LSTMDecoder'] - - -class LSTMDecoder(NeuralModule, Exportable): - """ - Simple LSTM Decoder for ASR models - Args: - feat_in (int): size of the input features - num_classes (int): the size of the vocabulary - lstm_hidden_size (int): hidden size of the LSTM layers - vocabulary (vocab): The vocabulary - bidirectional (bool): default is False. Whether LSTMs are bidirectional or not - num_layers (int): default is 1. Number of LSTM layers stacked - """ - - @property - def input_types(self): - return OrderedDict({"encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation())}) - - @property - def output_types(self): - return OrderedDict({"logprobs": NeuralType(('B', 'T', 'D'), LogprobsType())}) - - def __init__(self, feat_in, num_classes, lstm_hidden_size, vocabulary=None, bidirectional=False, num_layers=1): - super().__init__() - - if vocabulary is not None: - if num_classes != len(vocabulary): - raise ValueError( - f"If vocabulary is specified, it's length should be equal to the num_classes. 
" - f"Instead got: num_classes={num_classes} and len(vocabulary)={len(vocabulary)}" - ) - self.__vocabulary = vocabulary - self._feat_in = feat_in - # Add 1 for blank char - self._num_classes = num_classes + 1 - - self.lstm_layer = nn.LSTM( - input_size=feat_in, - hidden_size=lstm_hidden_size, - num_layers=num_layers, - batch_first=True, - bidirectional=bidirectional, - ) - lstm_hidden_size = 2 * lstm_hidden_size if bidirectional else lstm_hidden_size - self.linear_layer = torch.nn.Linear(in_features=lstm_hidden_size, out_features=self._num_classes) - - @typecheck() - def forward(self, encoder_output): - output = encoder_output.transpose(1, 2) - output, _ = self.lstm_layer(output) - output = self.linear_layer(output) - return torch.nn.functional.log_softmax(output, dim=-1) - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(next(self.parameters()).device) - return tuple([input_example]) - - @property - def vocabulary(self): - return self.__vocabulary - - @property - def num_classes_with_blank(self): - return self._num_classes diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/msdd_diarizer.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/msdd_diarizer.py deleted file mode 100644 index 949960ed33ee7a523ea647f55c3f37e95cf92454..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/msdd_diarizer.py +++ /dev/null @@ -1,442 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import EncodedRepresentation, LengthsType, NeuralType, SpectrogramType -from nemo.core.neural_types.elements import ProbsType - -__all__ = ['MSDD_module'] - - -class ConvLayer(nn.Module): - def __init__(self, in_channels=1, out_channels=1, kernel_size=(3, 1), stride=(1, 1)): - super(ConvLayer, self).__init__() - self.cnn = nn.Sequential( - nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride), - nn.ReLU(), - nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.99), - ) - - def forward(self, feature): - feature = self.cnn(feature) - return feature - - -class MSDD_module(NeuralModule, Exportable): - """ - Multi-scale Diarization Decoder (MSDD) for overlap-aware diarization and improved diarization accuracy from clustering diarizer. - Based on the paper: Taejin Park et. al, "Multi-scale Speaker Diarization with Dynamic Scale Weighting", Interspeech 2022. - Arxiv version: https://arxiv.org/pdf/2203.15974.pdf - - Args: - num_spks (int): - Max number of speakers that are processed by the model. 
In `MSDD_module`, `num_spks=2` for pairwise inference. - hidden_size (int): - Number of hidden units in sequence models and intermediate layers. - num_lstm_layers (int): - Number of the stacked LSTM layers. - dropout_rate (float): - Dropout rate for linear layers, CNN and LSTM. - cnn_output_ch (int): - Number of channels per each CNN layer. - emb_dim (int): - Dimension of the embedding vectors. - scale_n (int): - Number of scales in multi-scale system. - clamp_max (float): - Maximum value for limiting the scale weight values. - conv_repeat (int): - Number of CNN layers after the first CNN layer. - weighting_scheme (str): - Name of the methods for estimating the scale weights. - context_vector_type (str): - If 'cos_sim', cosine similarity values are used for the input of the sequence models. - If 'elem_prod', element-wise product values are used for the input of the sequence models. - """ - - @property - def output_types(self): - """ - Return definitions of module output ports. - """ - return OrderedDict( - { - "probs": NeuralType(('B', 'T', 'C'), ProbsType()), - "scale_weights": NeuralType(('B', 'T', 'C', 'D'), ProbsType()), - } - ) - - @property - def input_types(self): - """ - Return definitions of module input ports. - """ - return OrderedDict( - { - "ms_emb_seq": NeuralType(('B', 'T', 'C', 'D'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - "ms_avg_embs": NeuralType(('B', 'C', 'D', 'C'), EncodedRepresentation()), - "targets": NeuralType(('B', 'T', 'C'), ProbsType()), - } - ) - - def init_weights(self, m): - if type(m) == nn.Linear: - torch.nn.init.xavier_uniform_(m.weight) - m.bias.data.fill_(0.01) - elif type(m) in [nn.GRU, nn.LSTM, nn.RNN]: - for name, param in m.named_parameters(): - if 'weight_ih' in name: - torch.nn.init.xavier_uniform_(param.data) - elif 'weight_hh' in name: - torch.nn.init.orthogonal_(param.data) - elif 'bias' in name: - param.data.fill_(0.01) - - def __init__( - self, - num_spks: int = 2, - hidden_size: int = 256, - num_lstm_layers: int = 2, - dropout_rate: float = 0.5, - cnn_output_ch: int = 16, - emb_dim: int = 192, - scale_n: int = 5, - clamp_max: float = 1.0, - conv_repeat: int = 1, - weighting_scheme: str = 'conv_scale_weight', - context_vector_type: str = 'cos_sim', - ): - super().__init__() - self._speaker_model = None - self.batch_size: int = 1 - self.length: int = 50 - self.emb_dim: int = emb_dim - self.num_spks: int = num_spks - self.scale_n: int = scale_n - self.cnn_output_ch: int = cnn_output_ch - self.conv_repeat: int = conv_repeat - self.chan: int = 2 - self.eps: float = 1e-6 - self.num_lstm_layers: int = num_lstm_layers - self.weighting_scheme: str = weighting_scheme - self.context_vector_type: bool = context_vector_type - - self.softmax = torch.nn.Softmax(dim=2) - self.cos_dist = torch.nn.CosineSimilarity(dim=3, eps=self.eps) - self.lstm = nn.LSTM( - hidden_size, - hidden_size, - num_layers=self.num_lstm_layers, - batch_first=True, - bidirectional=True, - dropout=dropout_rate, - ) - - if self.weighting_scheme == 'conv_scale_weight': - self.conv = nn.ModuleList( - [ - ConvLayer( - in_channels=1, - out_channels=cnn_output_ch, - kernel_size=(self.scale_n + self.scale_n * num_spks, 1), - stride=(1, 1), - ) - ] - ) - for conv_idx in range(1, conv_repeat + 1): - self.conv.append( - ConvLayer( - in_channels=1, out_channels=cnn_output_ch, kernel_size=(self.cnn_output_ch, 1), stride=(1, 1) - ) - ) - self.conv_bn = nn.ModuleList() - for conv_idx in range(self.conv_repeat + 1): - self.conv_bn.append(nn.BatchNorm2d(self.emb_dim, 
affine=False)) - self.conv_to_linear = nn.Linear(emb_dim * cnn_output_ch, hidden_size) - self.linear_to_weights = nn.Linear(hidden_size, self.scale_n) - - elif self.weighting_scheme == 'attn_scale_weight': - self.W_a = nn.Linear(emb_dim, emb_dim, bias=False) - nn.init.eye_(self.W_a.weight) - else: - raise ValueError(f"No such weighting scheme as {self.weighting_scheme}") - - self.hidden_to_spks = nn.Linear(2 * hidden_size, self.num_spks) - if self.context_vector_type == "cos_sim": - self.dist_to_emb = nn.Linear(self.scale_n * self.num_spks, hidden_size) - self.dist_to_emb.apply(self.init_weights) - elif self.context_vector_type == "elem_prod": - self.product_to_emb = nn.Linear(self.emb_dim * self.num_spks, hidden_size) - else: - raise ValueError(f"No such context vector type as {self.context_vector_type}") - - self.dropout = nn.Dropout(dropout_rate) - self.hidden_to_spks.apply(self.init_weights) - self.lstm.apply(self.init_weights) - self.clamp_max = clamp_max - - def core_model(self, ms_emb_seq, length, ms_avg_embs, targets): - """ - Core model that accepts multi-scale cosine similarity values and estimates per-speaker binary label. - - Args: - ms_emb_seq (Tensor): - Multiscale input embedding sequence - Shape: (batch_size, length, scale_n, emb_dim) - length (Tensor): - The actual length of embedding sequences without zero padding - Shape: (batch_size,) - ms_avg_embs (Tensor): - Cluster-average speaker embedding vectors. - Shape: (batch_size, scale_n, self.emb_dim, max_spks) - targets (Tensor): - Ground-truth labels for the finest segment. - Shape: (batch_size, feats_len, max_spks) - - Returns: - preds (Tensor): - Predicted binary speaker label for each speaker. - Shape: (batch_size, feats_len, max_spks) - scale_weights (Tensor): - Multiscale weights per each base-scale segment. - Shape: (batch_size, length, scale_n, max_spks) - - """ - self.batch_size = ms_emb_seq.shape[0] - self.length = ms_emb_seq.shape[1] - self.emb_dim = ms_emb_seq.shape[-1] - - _ms_emb_seq = ms_emb_seq.unsqueeze(4).expand(-1, -1, -1, -1, self.num_spks) - ms_emb_seq_single = ms_emb_seq - ms_avg_embs = ms_avg_embs.unsqueeze(1).expand(-1, self.length, -1, -1, -1) - - ms_avg_embs_perm = ms_avg_embs.permute(0, 1, 2, 4, 3).reshape(self.batch_size, self.length, -1, self.emb_dim) - - if self.weighting_scheme == "conv_scale_weight": - scale_weights = self.conv_scale_weights(ms_avg_embs_perm, ms_emb_seq_single) - elif self.weighting_scheme == "attn_scale_weight": - scale_weights = self.attention_scale_weights(ms_avg_embs_perm, ms_emb_seq_single) - else: - raise ValueError(f"No such weighting scheme as {self.weighting_scheme}") - scale_weights = scale_weights.to(ms_emb_seq.device) - - if self.context_vector_type == "cos_sim": - context_emb = self.cosine_similarity(scale_weights, ms_avg_embs, _ms_emb_seq) - elif self.context_vector_type == "elem_prod": - context_emb = self.element_wise_product(scale_weights, ms_avg_embs, _ms_emb_seq) - else: - raise ValueError(f"No such context vector type as {self.context_vector_type}") - - context_emb = self.dropout(F.relu(context_emb)) - lstm_output = self.lstm(context_emb) - lstm_hidden_out = self.dropout(F.relu(lstm_output[0])) - spk_preds = self.hidden_to_spks(lstm_hidden_out) - preds = nn.Sigmoid()(spk_preds) - return preds, scale_weights - - def element_wise_product(self, scale_weights, ms_avg_embs, ms_emb_seq): - """ - Calculate element wise product values among cluster-average embedding vectors and input embedding vector sequences. 
- This function is selected by assigning `self.context_vector_type = "elem_prod"`. `elem_prod` method usually takes more - time to converge compared to `cos_sim` method. - - Args: - scale_weights (Tensor): - Multiscale weight vector. - Shape: (batch_size, feats_len, scale_n, max_spks) - ms_avg_embs_perm (Tensor): - Tensor containing cluster-average speaker embeddings for each scale. - Shape: (batch_size, length, scale_n, emb_dim) - ms_emb_seq (Tensor): - Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is a single channel input from the - given audio stream input. - Shape: (batch_size, length, num_spks, emb_dim) - - Returns: - context_emb (Tensor): - Output of `dist_to_emb` linear layer containing context for speaker label estimation. - """ - scale_weight_flatten = scale_weights.reshape(self.batch_size * self.length, self.num_spks, self.scale_n) - ms_avg_embs_flatten = ms_avg_embs.reshape( - self.batch_size * self.length, self.scale_n, self.emb_dim, self.num_spks - ) - ms_emb_seq_flatten = ms_emb_seq.reshape(-1, self.scale_n, self.emb_dim) - ms_emb_seq_flatten_rep = ms_emb_seq_flatten.unsqueeze(3).reshape(-1, self.scale_n, self.emb_dim, self.num_spks) - elemwise_product = ms_avg_embs_flatten * ms_emb_seq_flatten_rep - context_vectors = torch.bmm( - scale_weight_flatten.reshape(self.batch_size * self.num_spks * self.length, 1, self.scale_n), - elemwise_product.reshape(self.batch_size * self.num_spks * self.length, self.scale_n, self.emb_dim), - ) - context_vectors = context_vectors.reshape(self.batch_size, self.length, self.emb_dim * self.num_spks) - context_emb = self.product_to_emb(context_vectors) - return context_emb - - def cosine_similarity(self, scale_weights, ms_avg_embs, _ms_emb_seq): - """ - Calculate cosine similarity values among cluster-average embedding vectors and input embedding vector sequences. - This function is selected by assigning self.context_vector_type = "cos_sim". - - Args: - scale_weights (Tensor): - Multiscale weight vector. - Shape: (batch_size, feats_len, scale_n, max_spks) - ms_avg_embs_perm (Tensor): - Tensor containing cluster-average speaker embeddings for each scale. - Shape: (batch_size, length, scale_n, emb_dim) - _ms_emb_seq (Tensor): - Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is a single channel input from the - given audio stream input. - Shape: (batch_size, length, num_spks, emb_dim) - - Returns: - context_emb (Tensor): - Output of `dist_to_emb` linear layer containing context for speaker label estimation. - """ - cos_dist_seq = self.cos_dist(_ms_emb_seq, ms_avg_embs) - context_vectors = torch.mul(scale_weights, cos_dist_seq) - context_vectors = context_vectors.view(self.batch_size, self.length, -1) - context_emb = self.dist_to_emb(context_vectors) - return context_emb - - def attention_scale_weights(self, ms_avg_embs_perm, ms_emb_seq): - """ - Use weighted inner product for calculating each scale weight. W_a matrix has (emb_dim * emb_dim) learnable parameters - and W_a matrix is initialized with an identity matrix. Compared to "conv_scale_weight" method, this method shows more evenly - distributed scale weights. - - Args: - ms_avg_embs_perm (Tensor): - Tensor containing cluster-average speaker embeddings for each scale. - Shape: (batch_size, length, scale_n, emb_dim) - ms_emb_seq (Tensor): - Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is input from the - given audio stream input. 
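A standalone sketch of the `cos_sim` context vector described above: cosine similarity between each frame's multi-scale embeddings and the cluster-average speaker embeddings, weighted per scale before being flattened for the LSTM. Shapes are illustrative:

```python
import torch

B, T, scale_n, emb_dim, num_spks = 2, 10, 5, 192, 2  # illustrative shapes

ms_emb_seq = torch.randn(B, T, scale_n, emb_dim)
ms_avg_embs = torch.randn(B, scale_n, emb_dim, num_spks)
scale_weights = torch.softmax(torch.randn(B, T, scale_n, num_spks), dim=2)

emb = ms_emb_seq.unsqueeze(4).expand(-1, -1, -1, -1, num_spks)  # [B, T, S, E, spk]
avg = ms_avg_embs.unsqueeze(1).expand(-1, T, -1, -1, -1)        # [B, T, S, E, spk]

cos_sim = torch.nn.CosineSimilarity(dim=3, eps=1e-6)(emb, avg)  # [B, T, S, spk]
context = (scale_weights * cos_sim).view(B, T, -1)              # [B, T, S * spk]
print(context.shape)  # torch.Size([2, 10, 10])
```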
- Shape: (batch_size, length, num_spks, emb_dim) - - Returns: - scale_weights (Tensor): - Weight vectors that determine the weight of each scale. - Shape: (batch_size, length, num_spks, emb_dim) - """ - self.W_a(ms_emb_seq.flatten(0, 1)) - mat_a = self.W_a(ms_emb_seq.flatten(0, 1)) - mat_b = ms_avg_embs_perm.flatten(0, 1).permute(0, 2, 1) - - weighted_corr = torch.matmul(mat_a, mat_b).reshape(-1, self.scale_n, self.scale_n, self.num_spks) - scale_weights = torch.sigmoid(torch.diagonal(weighted_corr, dim1=1, dim2=2)) - scale_weights = scale_weights.reshape(self.batch_size, self.length, self.scale_n, self.num_spks) - scale_weights = self.softmax(scale_weights) - return scale_weights - - def conv_scale_weights(self, ms_avg_embs_perm, ms_emb_seq_single): - """ - Use multiple Convnet layers to estimate the scale weights based on the cluster-average embedding and - input embedding sequence. - - Args: - ms_avg_embs_perm (Tensor): - Tensor containing cluster-average speaker embeddings for each scale. - Shape: (batch_size, length, scale_n, emb_dim) - ms_emb_seq_single (Tensor): - Tensor containing multi-scale speaker embedding sequences. ms_emb_seq_single is input from the - given audio stream input. - Shape: (batch_size, length, num_spks, emb_dim) - - Returns: - scale_weights (Tensor): - Weight vectors that determine the weight of each scale. - Shape: (batch_size, length, num_spks, emb_dim) - """ - ms_cnn_input_seq = torch.cat([ms_avg_embs_perm, ms_emb_seq_single], dim=2) - ms_cnn_input_seq = ms_cnn_input_seq.unsqueeze(2).flatten(0, 1) - - conv_out = self.conv_forward( - ms_cnn_input_seq, conv_module=self.conv[0], bn_module=self.conv_bn[0], first_layer=True - ) - for conv_idx in range(1, self.conv_repeat + 1): - conv_out = self.conv_forward( - conv_input=conv_out, - conv_module=self.conv[conv_idx], - bn_module=self.conv_bn[conv_idx], - first_layer=False, - ) - - lin_input_seq = conv_out.view(self.batch_size, self.length, self.cnn_output_ch * self.emb_dim) - hidden_seq = self.conv_to_linear(lin_input_seq) - hidden_seq = self.dropout(F.leaky_relu(hidden_seq)) - scale_weights = self.softmax(self.linear_to_weights(hidden_seq)) - scale_weights = scale_weights.unsqueeze(3).expand(-1, -1, -1, self.num_spks) - return scale_weights - - def conv_forward(self, conv_input, conv_module, bn_module, first_layer=False): - """ - A module for convolutional neural networks with 1-D filters. As a unit layer batch normalization, non-linear layer and dropout - modules are included. - - Note: - If `first_layer=True`, the input shape is set for processing embedding input. - If `first_layer=False`, then the input shape is set for processing the output from another `conv_forward` module. - - Args: - conv_input (Tensor): - Reshaped tensor containing cluster-average embeddings and multi-scale embedding sequences. - Shape: (batch_size*length, 1, scale_n*(num_spks+1), emb_dim) - conv_module (ConvLayer): - ConvLayer instance containing torch.nn.modules.conv modules. - bn_module (torch.nn.modules.batchnorm.BatchNorm2d): - Predefined Batchnorm module. - first_layer (bool): - Boolean for switching between the first layer and the others. - Default: `False` - - Returns: - conv_out (Tensor): - Convnet output that can be fed to another ConvLayer module or linear layer. 
- Shape: (batch_size*length, 1, cnn_output_ch, emb_dim) - """ - conv_out = conv_module(conv_input) - conv_out = conv_out.permute(0, 2, 1, 3) if not first_layer else conv_out - conv_out = conv_out.reshape(self.batch_size, self.length, self.cnn_output_ch, self.emb_dim) - conv_out = conv_out.unsqueeze(2).flatten(0, 1) - conv_out = bn_module(conv_out.permute(0, 3, 2, 1)).permute(0, 3, 2, 1) - conv_out = self.dropout(F.leaky_relu(conv_out)) - return conv_out - - @typecheck() - def forward(self, ms_emb_seq, length, ms_avg_embs, targets): - preds, scale_weights = self.core_model(ms_emb_seq, length, ms_avg_embs, targets) - return preds, scale_weights - - def input_example(self): - """ - Generate input examples for tracing etc. - - Returns (tuple): - A tuple of input examples. - """ - device = next(self.parameters()).device - lens = torch.full(size=(input_example.shape[0],), fill_value=123, device=device) - input_example = torch.randn(1, lens, self.scale_n, self.emb_dim, device=device) - avg_embs = torch.randn(1, self.scale_n, self.emb_dim, self.num_spks, device=device) - targets = torch.randn(1, lens, self.num_spks).round().float() - return tuple([input_example, lens, avg_embs, targets]) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnn_encoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnn_encoder.py deleted file mode 100644 index 0ebb89f545e27cf8309a78bb218efdf90e116e5c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnn_encoder.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import torch -import torch.distributed -import torch.nn as nn - -from nemo.collections.asr.parts.submodules.subsampling import ConvSubsampling, StackingSubsampling -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import AcousticEncodedRepresentation, LengthsType, NeuralType, SpectrogramType - -__all__ = ['RNNEncoder'] - - -class RNNEncoder(NeuralModule, Exportable): - """ - The RNN-based encoder for ASR models. - Followed the architecture suggested in the following paper: - 'STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES' by Yanzhang He et al. - https://arxiv.org/pdf/1811.06621.pdf - - - Args: - feat_in (int): the size of feature channels - n_layers (int): number of layers of RNN - d_model (int): the hidden size of the model - proj_size (int): the size of the output projection after each RNN layer - rnn_type (str): the type of the RNN layers, choices=['lstm, 'gru', 'rnn'] - bidirectional (float): specifies whether RNN layers should be bidirectional or not - Defaults to True. 
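As written, `MSDD_module.input_example` above reads `input_example.shape[0]` before that tensor is created and passes a tensor where `torch.randn` expects an integer length, so it cannot run. A hedged corrected sketch, assuming a fixed example length of 123 frames:

```python
import torch

def input_example(self):
    """Corrected sketch: generate (ms_emb_seq, length, ms_avg_embs, targets) for tracing."""
    device = next(self.parameters()).device
    seq_len = 123  # assumed fixed example length
    input_example = torch.randn(1, seq_len, self.scale_n, self.emb_dim, device=device)
    lens = torch.full(size=(input_example.shape[0],), fill_value=seq_len, device=device)
    avg_embs = torch.randn(1, self.scale_n, self.emb_dim, self.num_spks, device=device)
    targets = torch.randn(1, seq_len, self.num_spks, device=device).round().float()
    return tuple([input_example, lens, avg_embs, targets])
```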
- feat_out (int): the size of the output features - Defaults to -1 (means feat_out is d_model) - subsampling (str): the method of subsampling, choices=['stacking, 'vggnet', 'striding'] - Defaults to stacking. - subsampling_factor (int): the subsampling factor - Defaults to 4. - subsampling_conv_channels (int): the size of the convolutions in the subsampling module for vggnet and striding - Defaults to -1 which would set it to d_model. - dropout (float): the dropout rate used between all layers - Defaults to 0.2. - """ - - def input_example(self): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - input_example = torch.randn(16, self._feat_in, 256).to(next(self.parameters()).device) - input_example_length = torch.randint(0, 256, (16,)).to(next(self.parameters()).device) - return tuple([input_example, input_example_length]) - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - def __init__( - self, - feat_in: int, - n_layers: int, - d_model: int, - proj_size: int = -1, - rnn_type: str = 'lstm', - bidirectional: bool = True, - subsampling: str = 'striding', - subsampling_factor: int = 4, - subsampling_conv_channels: int = -1, - dropout: float = 0.2, - ): - super().__init__() - - self.d_model = d_model - self._feat_in = feat_in - - if subsampling_conv_channels == -1: - subsampling_conv_channels = proj_size - if subsampling and subsampling_factor > 1: - if subsampling in ['stacking', 'stacking_norm']: - self.pre_encode = StackingSubsampling( - subsampling_factor=subsampling_factor, - feat_in=feat_in, - feat_out=proj_size, - norm=True if 'norm' in subsampling else False, - ) - else: - self.pre_encode = ConvSubsampling( - subsampling=subsampling, - subsampling_factor=subsampling_factor, - feat_in=feat_in, - feat_out=proj_size, - conv_channels=subsampling_conv_channels, - activation=nn.ReLU(), - ) - else: - self.pre_encode = nn.Linear(feat_in, proj_size) - - self._feat_out = proj_size - - self.layers = nn.ModuleList() - - SUPPORTED_RNN = {"lstm": nn.LSTM, "gru": nn.GRU, "rnn": nn.RNN} - if rnn_type not in SUPPORTED_RNN: - raise ValueError(f"rnn_type can be one from the following:{SUPPORTED_RNN.keys()}") - else: - rnn_module = SUPPORTED_RNN[rnn_type] - - for i in range(n_layers): - rnn_proj_size = proj_size // 2 if bidirectional else proj_size - if rnn_type == "lstm": - layer = rnn_module( - input_size=self._feat_out, - hidden_size=d_model, - num_layers=1, - batch_first=True, - bidirectional=bidirectional, - proj_size=rnn_proj_size, - ) - self.layers.append(layer) - self.layers.append(nn.LayerNorm(proj_size)) - self.layers.append(nn.Dropout(p=dropout)) - self._feat_out = proj_size - - @typecheck() - def forward(self, audio_signal, length=None): - max_audio_length: int = audio_signal.size(-1) - - if length is None: - length = audio_signal.new_full( - audio_signal.size(0), max_audio_length, dtype=torch.int32, device=self.seq_range.device - ) - - audio_signal = torch.transpose(audio_signal, 1, 2) - - if isinstance(self.pre_encode, nn.Linear): - audio_signal = self.pre_encode(audio_signal) - else: - 
audio_signal, length = self.pre_encode(audio_signal, length) - - for lth, layer in enumerate(self.layers): - audio_signal = layer(audio_signal) - if isinstance(audio_signal, tuple): - audio_signal, _ = audio_signal - - audio_signal = torch.transpose(audio_signal, 1, 2) - return audio_signal, length diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt.py deleted file mode 100644 index 04bdd25ac35139ea9a62b147b70ec07834872ddd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt.py +++ /dev/null @@ -1,2051 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.modules import rnnt_abstract -from nemo.collections.asr.parts.submodules import stateless_net -from nemo.collections.asr.parts.utils import adapter_utils, rnnt_utils -from nemo.collections.common.parts import rnn -from nemo.core.classes import adapter_mixins, typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.mixins import AdapterModuleMixin -from nemo.core.neural_types import ( - AcousticEncodedRepresentation, - ElementType, - EmbeddedTextType, - LabelsType, - LengthsType, - LogprobsType, - LossType, - NeuralType, - SpectrogramType, -) -from nemo.utils import logging - - -class StatelessTransducerDecoder(rnnt_abstract.AbstractRNNTDecoder, Exportable): - """A Stateless Neural Network Transducer Decoder / Prediction Network. - An RNN-T Decoder/Prediction stateless network that simply takes concatenation of embeddings of the history tokens as the output. - - Args: - prednet: A dict-like object which contains the following key-value pairs. - pred_hidden: int specifying the hidden dimension of the prediction net. - - dropout: float, set to 0.0 by default. Optional dropout applied at the end of the final LSTM RNN layer. - - vocab_size: int, specifying the vocabulary size of the embedding layer of the Prediction network, - excluding the RNNT blank token. - - context_size: int, specifying the size of the history context used for this decoder. - - normalization_mode: Can be either None, 'layer'. By default, is set to None. 
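In the `length is None` branch of `RNNEncoder.forward` above, `new_full` receives an integer instead of a size tuple and `self.seq_range` is never defined on this module, so the default-length path fails as written. A corrected sketch of that default, kept outside the class for illustration:

```python
import torch

def default_lengths(audio_signal: torch.Tensor) -> torch.Tensor:
    """One length per batch element, equal to the padded time dimension."""
    max_audio_length = audio_signal.size(-1)
    return torch.full(
        (audio_signal.size(0),),
        max_audio_length,
        dtype=torch.int32,
        device=audio_signal.device,
    )

print(default_lengths(torch.randn(16, 80, 256)))  # 16 entries, all 256
```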
- Defines the type of normalization applied to the RNN layer. - - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "targets": NeuralType(('B', 'T'), LabelsType()), - "target_length": NeuralType(tuple('B'), LengthsType()), - "states": [NeuralType(('B', 'T'), LabelsType(), optional=True)], - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return { - "outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), - "prednet_lengths": NeuralType(tuple('B'), LengthsType()), - "states": [NeuralType(('B', 'T'), LabelsType(), optional=True)], - } - - def input_example(self, max_batch=1, max_dim=1): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - length = max_dim - targets = torch.full(fill_value=self.blank_idx, size=(max_batch, length), dtype=torch.int32).to( - next(self.parameters()).device - ) - target_length = torch.randint(0, length, size=(max_batch,), dtype=torch.int32).to( - next(self.parameters()).device - ) - states = tuple(self.initialize_state(targets.float())) - return (targets, target_length, states) - - def _prepare_for_export(self, **kwargs): - self._rnnt_export = True - super()._prepare_for_export(**kwargs) - - def __init__( - self, - prednet: Dict[str, Any], - vocab_size: int, - context_size: int = 1, - normalization_mode: Optional[str] = None, - ): - # Required arguments - self.pred_hidden = prednet['pred_hidden'] - self.blank_idx = vocab_size - self.context_size = context_size - - # Initialize the model (blank token increases vocab size by 1) - super().__init__(vocab_size=vocab_size, blank_idx=self.blank_idx, blank_as_pad=True) - - # Optional arguments - dropout = prednet.get('dropout', 0.0) - - self.prediction = self._predict_modules( - **{ - "context_size": context_size, - "vocab_size": vocab_size, - "emb_dim": self.pred_hidden, - "blank_idx": self.blank_idx, - "normalization_mode": normalization_mode, - "dropout": dropout, - } - ) - self._rnnt_export = False - - @typecheck() - def forward(self, targets, target_length, states=None): - # y: (B, U) - y = rnn.label_collate(targets) - - # state maintenance is unnecessary during training forward call - # to get state, use .predict() method. - if self._rnnt_export: - add_sos = False - else: - add_sos = True - - g, state = self.predict(y, state=states, add_sos=add_sos) # (B, U, D) - g = g.transpose(1, 2) # (B, D, U) - - return g, target_length, state - - def predict( - self, - y: Optional[torch.Tensor] = None, - state: Optional[torch.Tensor] = None, - add_sos: bool = True, - batch_size: Optional[int] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Stateful prediction of scores and state for a tokenset. - - Here: - B - batch size - U - label length - C - context size for stateless decoder - D - total embedding size - - Args: - y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding. - If None, creates a zero tensor of shape [B, 1, D] which mimics output of pad-token on Embedding. - - state: An optional one-element list of one tensor. The tensor is used to store previous context labels. - The tensor uses type long and is of shape [B, C]. - - add_sos: bool flag, whether a zero vector describing a "start of signal" token should be - prepended to the above "y" tensor. When set, output size is (B, U + 1, D). - - batch_size: An optional int, specifying the batch size of the `y` tensor. - Can be infered if `y` and `state` is None. 
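The stateless decoder above keeps only the last `context_size` label ids as its "state" and emits a projection of their concatenated embeddings. A toy illustration of that idea (this is a sketch, not the actual `StatelessNet` implementation in `stateless_net.py`):

```python
import torch

# Assumed toy sizes for the sketch.
vocab_size, context_size, emb_dim = 32, 2, 8
blank_idx = vocab_size  # blank doubles as the padding token

embedding = torch.nn.Embedding(vocab_size + 1, emb_dim, padding_idx=blank_idx)
proj = torch.nn.Linear(context_size * emb_dim, emb_dim)

state = torch.full((1, context_size), blank_idx, dtype=torch.long)  # [B, C] history, all blanks
new_label = torch.tensor([[7]])                                     # [B, 1] newly emitted label

state = torch.cat([state[:, 1:], new_label], dim=1)                 # shift the history window
g = proj(embedding(state).flatten(1))                               # [B, emb_dim] decoder output
print(g.shape, state)
```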
But if both are None, then batch_size cannot be None. - - Returns: - A tuple (g, state) such that - - - If add_sos is False: - g: (B, U, D) - state: [(B, C)] storing the history context including the new words in y. - - If add_sos is True: - g: (B, U + 1, D) - state: [(B, C)] storing the history context including the new words in y. - - """ - # Get device and dtype of current module - _p = next(self.parameters()) - device = _p.device - dtype = _p.dtype - - # If y is not None, it is of shape [B, U] with dtype long. - if y is not None: - if y.device != device: - y = y.to(device) - - y, state = self.prediction(y, state) - - else: - # Y is not provided, assume zero tensor with shape [B, 1, D] is required - # Emulates output of embedding of pad token. - if batch_size is None: - B = 1 if state is None else state[0].size(1) - else: - B = batch_size - - y = torch.zeros((B, 1, self.pred_hidden), device=device, dtype=dtype) - - # Prepend blank "start of sequence" symbol (zero tensor) - if add_sos: - B, U, D = y.shape - start = torch.zeros((B, 1, D), device=y.device, dtype=y.dtype) - y = torch.cat([start, y], dim=1).contiguous() # (B, U + 1, D) - else: - start = None # makes del call later easier - - del start - return y, state - - def _predict_modules(self, **kwargs): - """ - Prepare the trainable parameters of the Prediction Network. - - Args: - vocab_size: Vocab size (excluding the blank token). - pred_n_hidden: Hidden size of the RNNs. - norm: Type of normalization to perform in RNN. - dropout: Whether to apply dropout to RNN. - """ - - net = stateless_net.StatelessNet(**kwargs) - return net - - def score_hypothesis( - self, hypothesis: rnnt_utils.Hypothesis, cache: Dict[Tuple[int], Any] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Similar to the predict() method, instead this method scores a Hypothesis during beam search. - Hypothesis is a dataclass representing one hypothesis in a Beam Search. - - Args: - hypothesis: Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - - Returns: - Returns a tuple (y, states, lm_token) such that: - y is a torch.Tensor of shape [1, 1, H] representing the score of the last token in the Hypothesis. - state is a list of RNN states, each of shape [L, 1, H]. - lm_token is the final integer token of the hypothesis. 
- """ - if hypothesis.dec_state is not None: - device = hypothesis.dec_state[0].device - else: - _p = next(self.parameters()) - device = _p.device - - # parse "blank" tokens in hypothesis - if len(hypothesis.y_sequence) > 0 and hypothesis.y_sequence[-1] == self.blank_idx: - blank_state = True - else: - blank_state = False - - # Convert last token of hypothesis to torch.Tensor - target = torch.full([1, 1], fill_value=hypothesis.y_sequence[-1], device=device, dtype=torch.long) - lm_token = target[:, -1] # [1] - - # Convert current hypothesis into a tuple to preserve in cache - sequence = tuple(hypothesis.y_sequence) - - if sequence in cache: - y, new_state = cache[sequence] - else: - # Obtain score for target token and new states - if blank_state: - y, new_state = self.predict(None, state=None, add_sos=False, batch_size=1) # [1, 1, H] - - else: - y, new_state = self.predict( - target, state=hypothesis.dec_state, add_sos=False, batch_size=1 - ) # [1, 1, H] - - y = y[:, -1:, :] # Extract just last state : [1, 1, H] - cache[sequence] = (y, new_state) - - return y, new_state, lm_token - - def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: - batch = y.size(0) - state = [torch.ones([batch, self.context_size], dtype=torch.long, device=y.device) * self.blank_idx] - return state - - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): - """ - Create batch of decoder states. - - Args: - batch_states (list): batch of decoder states - ([(B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([(1, C)]] - - Returns: - batch_states (tuple): batch of decoder states - ([(B, C)]) - """ - new_state = torch.stack([s[0] for s in decoder_states]) - - return [new_state] - - def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: - """Get decoder state from batch of states, for given id. - - Args: - batch_states (list): batch of decoder states - [(B, C)] - - idx (int): index to extract state from batch of states - - Returns: - (tuple): decoder states for given id - [(C)] - """ - if batch_states is not None: - states = batch_states[0][idx] - states = ( - states.long() - ) # beam search code assumes the batch_states tensor is always of float type, so need conversion - return [states] - else: - return None - - def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """Concatenate a batch of decoder state to a packed state. - - Args: - batch_states (list): batch of decoder states - B x ([(C)] - - Returns: - (tuple): decoder states - [(B x C)] - """ - state_list = [] - batch_list = [] - for sample_id in range(len(batch_states)): - tensor = torch.stack(batch_states[sample_id]) # [1, H] - batch_list.append(tensor) - - state_tensor = torch.cat(batch_list, 0) # [B, H] - state_list.append(state_tensor) - - return state_list - - def batch_copy_states( - self, - old_states: List[torch.Tensor], - new_states: List[torch.Tensor], - ids: List[int], - value: Optional[float] = None, - ) -> List[torch.Tensor]: - """Copy states from new state to old state at certain indices. - - Args: - old_states: packed decoder states - single element list of (B x C) - - new_states: packed decoder states - single element list of (B x C) - - ids (list): List of indices to copy states at. 
- - value (optional float): If a value should be copied instead of a state slice, a float should be provided - - Returns: - batch of decoder states with partial copy at ids (or a specific value). - (B x C) - """ - - if value is None: - old_states[0][ids, :] = new_states[0][ids, :] - - return old_states - - def batch_score_hypothesis( - self, hypotheses: List[rnnt_utils.Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Used for batched beam search algorithms. Similar to score_hypothesis method. - - Args: - hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] - - Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. - """ - final_batch = len(hypotheses) - if final_batch == 0: - raise ValueError("No hypotheses was provided for the batch!") - - _p = next(self.parameters()) - device = _p.device - dtype = _p.dtype - - tokens = [] - process = [] - done = [None for _ in range(final_batch)] - - # For each hypothesis, cache the last token of the sequence and the current states - for i, hyp in enumerate(hypotheses): - sequence = tuple(hyp.y_sequence) - - if sequence in cache: - done[i] = cache[sequence] - else: - tokens.append(hyp.y_sequence[-1]) - process.append((sequence, hyp.dec_state)) - - if process: - batch = len(process) - - # convert list of tokens to torch.Tensor, then reshape. - tokens = torch.tensor(tokens, device=device, dtype=torch.long).view(batch, -1) - dec_states = self.initialize_state(tokens) # [B, C] - dec_states = self.batch_initialize_states(dec_states, [d_state for seq, d_state in process]) - - y, dec_states = self.predict( - tokens, state=dec_states, add_sos=False, batch_size=batch - ) # [B, 1, H], List([L, 1, H]) - - dec_states = tuple(state.to(dtype=dtype) for state in dec_states) - - # Update done states and cache shared by entire batch. - j = 0 - for i in range(final_batch): - if done[i] is None: - # Select sample's state from the batch state list - new_state = self.batch_select_state(dec_states, j) - - # Cache [1, H] scores of the current y_j, and its corresponding state - done[i] = (y[j], new_state) - cache[process[j][0]] = (y[j], new_state) - - j += 1 - - # Set the incoming batch states with the new states obtained from `done`. - batch_states = self.batch_initialize_states(batch_states, [d_state for y_j, d_state in done]) - - # Create batch of all output scores - # List[1, 1, H] -> [B, 1, H] - batch_y = torch.stack([y_j for y_j, d_state in done]) - - # Extract the last tokens from all hypotheses and convert to a tensor - lm_tokens = torch.tensor([h.y_sequence[-1] for h in hypotheses], device=device, dtype=torch.long).view( - final_batch - ) - - return batch_y, batch_states, lm_tokens - - -class RNNTDecoder(rnnt_abstract.AbstractRNNTDecoder, Exportable, AdapterModuleMixin): - """A Recurrent Neural Network Transducer Decoder / Prediction Network (RNN-T Prediction Network). - An RNN-T Decoder/Prediction network, comprised of a stateful LSTM model. 
- - Args: - prednet: A dict-like object which contains the following key-value pairs. - pred_hidden: int specifying the hidden dimension of the prediction net. - pred_rnn_layers: int specifying the number of rnn layers. - - Optionally, it may also contain the following: - forget_gate_bias: float, set by default to 1.0, which constructs a forget gate - initialized to 1.0. - Reference: - [An Empirical Exploration of Recurrent Network Architectures](http://proceedings.mlr.press/v37/jozefowicz15.pdf) - t_max: int value, set to None by default. If an int is specified, performs Chrono Initialization - of the LSTM network, based on the maximum number of timesteps `t_max` expected during the course - of training. - Reference: - [Can recurrent neural networks warp time?](https://openreview.net/forum?id=SJcKhk-Ab) - weights_init_scale: Float scale of the weights after initialization. Setting to lower than one - sometimes helps reduce variance between runs. - hidden_hidden_bias_scale: Float scale for the hidden-to-hidden bias scale. Set to 0.0 for - the default behaviour. - dropout: float, set to 0.0 by default. Optional dropout applied at the end of the final LSTM RNN layer. - - vocab_size: int, specifying the vocabulary size of the embedding layer of the Prediction network, - excluding the RNNT blank token. - - normalization_mode: Can be either None, 'batch' or 'layer'. By default, is set to None. - Defines the type of normalization applied to the RNN layer. - - random_state_sampling: bool, set to False by default. When set, provides normal-distribution - sampled state tensors instead of zero tensors during training. - Reference: - [Recognizing long-form speech using streaming end-to-end models](https://arxiv.org/abs/1910.11455) - - blank_as_pad: bool, set to True by default. When set, will add a token to the Embedding layer of this - prediction network, and will treat this token as a pad token. In essence, the RNNT pad token will - be treated as a pad token, and the embedding layer will return a zero tensor for this token. - - It is set by default as it enables various batch optimizations required for batched beam search. - Therefore, it is not recommended to disable this flag. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "targets": NeuralType(('B', 'T'), LabelsType()), - "target_length": NeuralType(tuple('B'), LengthsType()), - "states": [NeuralType(('D', 'B', 'D'), ElementType(), optional=True)], # must always be last - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return { - "outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), - "prednet_lengths": NeuralType(tuple('B'), LengthsType()), - "states": [NeuralType((('D', 'B', 'D')), ElementType(), optional=True)], # must always be last - } - - def input_example(self, max_batch=1, max_dim=1): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
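Hedged construction sketch for the decoder documented above: the `prednet` keys mirror the Args list, while the import path and the concrete sizes (320 hidden units, a vocabulary of 128) are assumptions chosen for illustration only.

```python
import torch
from nemo.collections.asr.modules.rnnt import RNNTDecoder  # path assumed; adjust to this repo's layout

prednet = {
    "pred_hidden": 320,       # hidden size of the prediction net
    "pred_rnn_layers": 1,     # number of LSTM layers
    "forget_gate_bias": 1.0,  # optional: unit forget-gate initialization
    "dropout": 0.1,           # optional: dropout after the final LSTM layer
}

decoder = RNNTDecoder(prednet=prednet, vocab_size=128)  # blank index becomes 128

# Training-style forward pass: token targets [B, U] plus their lengths.
targets = torch.randint(0, 128, (4, 10), dtype=torch.long)
target_lengths = torch.tensor([10, 9, 7, 10])
g, lengths, states = decoder(targets=targets, target_length=target_lengths)
print(g.shape)  # (B, pred_hidden, U + 1): a start-of-sequence frame is prepended
```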
- """ - length = max_dim - targets = torch.full(fill_value=self.blank_idx, size=(max_batch, length), dtype=torch.int32).to( - next(self.parameters()).device - ) - target_length = torch.randint(0, length, size=(max_batch,), dtype=torch.int32).to( - next(self.parameters()).device - ) - states = tuple(self.initialize_state(targets.float())) - return (targets, target_length, states) - - def _prepare_for_export(self, **kwargs): - self._rnnt_export = True - super()._prepare_for_export(**kwargs) - - def __init__( - self, - prednet: Dict[str, Any], - vocab_size: int, - normalization_mode: Optional[str] = None, - random_state_sampling: bool = False, - blank_as_pad: bool = True, - ): - # Required arguments - self.pred_hidden = prednet['pred_hidden'] - self.pred_rnn_layers = prednet["pred_rnn_layers"] - self.blank_idx = vocab_size - - # Initialize the model (blank token increases vocab size by 1) - super().__init__(vocab_size=vocab_size, blank_idx=self.blank_idx, blank_as_pad=blank_as_pad) - - # Optional arguments - forget_gate_bias = prednet.get('forget_gate_bias', 1.0) - t_max = prednet.get('t_max', None) - weights_init_scale = prednet.get('weights_init_scale', 1.0) - hidden_hidden_bias_scale = prednet.get('hidden_hidden_bias_scale', 0.0) - dropout = prednet.get('dropout', 0.0) - self.random_state_sampling = random_state_sampling - - self.prediction = self._predict_modules( - vocab_size=vocab_size, # add 1 for blank symbol - pred_n_hidden=self.pred_hidden, - pred_rnn_layers=self.pred_rnn_layers, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - norm=normalization_mode, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - dropout=dropout, - rnn_hidden_size=prednet.get("rnn_hidden_size", -1), - ) - self._rnnt_export = False - - @typecheck() - def forward(self, targets, target_length, states=None): - # y: (B, U) - y = rnn.label_collate(targets) - - # state maintenance is unnecessary during training forward call - # to get state, use .predict() method. - if self._rnnt_export: - add_sos = False - else: - add_sos = True - - g, states = self.predict(y, state=states, add_sos=add_sos) # (B, U, D) - g = g.transpose(1, 2) # (B, D, U) - - return g, target_length, states - - def predict( - self, - y: Optional[torch.Tensor] = None, - state: Optional[List[torch.Tensor]] = None, - add_sos: bool = True, - batch_size: Optional[int] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Stateful prediction of scores and state for a (possibly null) tokenset. - This method takes various cases into consideration : - - No token, no state - used for priming the RNN - - No token, state provided - used for blank token scoring - - Given token, states - used for scores + new states - - Here: - B - batch size - U - label length - H - Hidden dimension size of RNN - L - Number of RNN layers - - Args: - y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding. - If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on EmbeddiNg. - - state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2. - Each state must be a tensor of shape [L, B, H]. - If None, and during training mode and `random_state_sampling` is set, will sample a - normal distribution tensor of the above shape. Otherwise, None will be passed to the RNN. - - add_sos: bool flag, whether a zero vector describing a "start of signal" token should be - prepended to the above "y" tensor. 
When set, output size is (B, U + 1, H). - - batch_size: An optional int, specifying the batch size of the `y` tensor. - Can be infered if `y` and `state` is None. But if both are None, then batch_size cannot be None. - - Returns: - A tuple (g, hid) such that - - - If add_sos is False: - g: (B, U, H) - hid: (h, c) where h is the final sequence hidden state and c is the final cell state: - h (tensor), shape (L, B, H) - c (tensor), shape (L, B, H) - - If add_sos is True: - g: (B, U + 1, H) - hid: (h, c) where h is the final sequence hidden state and c is the final cell state: - h (tensor), shape (L, B, H) - c (tensor), shape (L, B, H) - - """ - # Get device and dtype of current module - _p = next(self.parameters()) - device = _p.device - dtype = _p.dtype - - # If y is not None, it is of shape [B, U] with dtype long. - if y is not None: - if y.device != device: - y = y.to(device) - - # (B, U) -> (B, U, H) - y = self.prediction["embed"](y) - else: - # Y is not provided, assume zero tensor with shape [B, 1, H] is required - # Emulates output of embedding of pad token. - if batch_size is None: - B = 1 if state is None else state[0].size(1) - else: - B = batch_size - - y = torch.zeros((B, 1, self.pred_hidden), device=device, dtype=dtype) - - # Prepend blank "start of sequence" symbol (zero tensor) - if add_sos: - B, U, H = y.shape - start = torch.zeros((B, 1, H), device=y.device, dtype=y.dtype) - y = torch.cat([start, y], dim=1).contiguous() # (B, U + 1, H) - else: - start = None # makes del call later easier - - # If in training mode, and random_state_sampling is set, - # initialize state to random normal distribution tensor. - if state is None: - if self.random_state_sampling and self.training: - state = self.initialize_state(y) - - # Forward step through RNN - y = y.transpose(0, 1) # (U + 1, B, H) - g, hid = self.prediction["dec_rnn"](y, state) - g = g.transpose(0, 1) # (B, U + 1, H) - - del y, start, state - - # Adapter module forward step - if self.is_adapter_available(): - g = self.forward_enabled_adapters(g) - - return g, hid - - def _predict_modules( - self, - vocab_size, - pred_n_hidden, - pred_rnn_layers, - forget_gate_bias, - t_max, - norm, - weights_init_scale, - hidden_hidden_bias_scale, - dropout, - rnn_hidden_size, - ): - """ - Prepare the trainable parameters of the Prediction Network. - - Args: - vocab_size: Vocab size (excluding the blank token). - pred_n_hidden: Hidden size of the RNNs. - pred_rnn_layers: Number of RNN layers. - forget_gate_bias: Whether to perform unit forget gate bias. - t_max: Whether to perform Chrono LSTM init. - norm: Type of normalization to perform in RNN. - weights_init_scale: Float scale of the weights after initialization. Setting to lower than one - sometimes helps reduce variance between runs. - hidden_hidden_bias_scale: Float scale for the hidden-to-hidden bias scale. Set to 0.0 for - the default behaviour. - dropout: Whether to apply dropout to RNN. 
- rnn_hidden_size: the hidden size of the RNN, if not specified, pred_n_hidden would be used - """ - if self.blank_as_pad: - embed = torch.nn.Embedding(vocab_size + 1, pred_n_hidden, padding_idx=self.blank_idx) - else: - embed = torch.nn.Embedding(vocab_size, pred_n_hidden) - - layers = torch.nn.ModuleDict( - { - "embed": embed, - "dec_rnn": rnn.rnn( - input_size=pred_n_hidden, - hidden_size=rnn_hidden_size if rnn_hidden_size > 0 else pred_n_hidden, - num_layers=pred_rnn_layers, - norm=norm, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - dropout=dropout, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - proj_size=pred_n_hidden if pred_n_hidden < rnn_hidden_size else 0, - ), - } - ) - return layers - - def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: - """ - Initialize the state of the RNN layers, with same dtype and device as input `y`. - - Args: - y: A torch.Tensor whose device the generated states will be placed on. - - Returns: - List of torch.Tensor, each of shape [L, B, H], where - L = Number of RNN layers - B = Batch size - H = Hidden size of RNN. - """ - batch = y.size(0) - if self.random_state_sampling and self.training: - state = [ - torch.randn(self.pred_rnn_layers, batch, self.pred_hidden, dtype=y.dtype, device=y.device), - torch.randn(self.pred_rnn_layers, batch, self.pred_hidden, dtype=y.dtype, device=y.device), - ] - - else: - state = [ - torch.zeros(self.pred_rnn_layers, batch, self.pred_hidden, dtype=y.dtype, device=y.device), - torch.zeros(self.pred_rnn_layers, batch, self.pred_hidden, dtype=y.dtype, device=y.device), - ] - return state - - def score_hypothesis( - self, hypothesis: rnnt_utils.Hypothesis, cache: Dict[Tuple[int], Any] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Similar to the predict() method, instead this method scores a Hypothesis during beam search. - Hypothesis is a dataclass representing one hypothesis in a Beam Search. - - Args: - hypothesis: Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - - Returns: - Returns a tuple (y, states, lm_token) such that: - y is a torch.Tensor of shape [1, 1, H] representing the score of the last token in the Hypothesis. - state is a list of RNN states, each of shape [L, 1, H]. - lm_token is the final integer token of the hypothesis. 
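A tiny standalone demonstration (illustrative sizes only) of the `blank_as_pad` behavior used by `_predict_modules` above: registering the blank index as `padding_idx` makes the embedding return an all-zero vector for it, which is what `predict(None, ...)` emulates when a hypothesis ends in blank.

```python
import torch

vocab_size, hidden = 5, 8
blank_idx = vocab_size  # blank is appended after the real vocabulary
embed = torch.nn.Embedding(vocab_size + 1, hidden, padding_idx=blank_idx)

tokens = torch.tensor([[0, 3, blank_idx]])
out = embed(tokens)                   # (1, 3, hidden)
print(out[0, -1].abs().sum().item())  # 0.0 -> the blank/pad row embeds to zeros
```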
- """ - if hypothesis.dec_state is not None: - device = hypothesis.dec_state[0].device - else: - _p = next(self.parameters()) - device = _p.device - - # parse "blank" tokens in hypothesis - if len(hypothesis.y_sequence) > 0 and hypothesis.y_sequence[-1] == self.blank_idx: - blank_state = True - else: - blank_state = False - - # Convert last token of hypothesis to torch.Tensor - target = torch.full([1, 1], fill_value=hypothesis.y_sequence[-1], device=device, dtype=torch.long) - lm_token = target[:, -1] # [1] - - # Convert current hypothesis into a tuple to preserve in cache - sequence = tuple(hypothesis.y_sequence) - - if sequence in cache: - y, new_state = cache[sequence] - else: - # Obtain score for target token and new states - if blank_state: - y, new_state = self.predict(None, state=None, add_sos=False, batch_size=1) # [1, 1, H] - - else: - y, new_state = self.predict( - target, state=hypothesis.dec_state, add_sos=False, batch_size=1 - ) # [1, 1, H] - - y = y[:, -1:, :] # Extract just last state : [1, 1, H] - cache[sequence] = (y, new_state) - - return y, new_state, lm_token - - def batch_score_hypothesis( - self, hypotheses: List[rnnt_utils.Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Used for batched beam search algorithms. Similar to score_hypothesis method. - - Args: - hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] - - Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. - """ - final_batch = len(hypotheses) - - if final_batch == 0: - raise ValueError("No hypotheses was provided for the batch!") - - _p = next(self.parameters()) - device = _p.device - dtype = _p.dtype - - tokens = [] - process = [] - done = [None for _ in range(final_batch)] - - # For each hypothesis, cache the last token of the sequence and the current states - for i, hyp in enumerate(hypotheses): - sequence = tuple(hyp.y_sequence) - - if sequence in cache: - done[i] = cache[sequence] - else: - tokens.append(hyp.y_sequence[-1]) - process.append((sequence, hyp.dec_state)) - - if process: - batch = len(process) - - # convert list of tokens to torch.Tensor, then reshape. - tokens = torch.tensor(tokens, device=device, dtype=torch.long).view(batch, -1) - dec_states = self.initialize_state(tokens.to(dtype=dtype)) # [L, B, H] - dec_states = self.batch_initialize_states(dec_states, [d_state for seq, d_state in process]) - - y, dec_states = self.predict( - tokens, state=dec_states, add_sos=False, batch_size=batch - ) # [B, 1, H], List([L, 1, H]) - - dec_states = tuple(state.to(dtype=dtype) for state in dec_states) - - # Update done states and cache shared by entire batch. 
- j = 0 - for i in range(final_batch): - if done[i] is None: - # Select sample's state from the batch state list - new_state = self.batch_select_state(dec_states, j) - - # Cache [1, H] scores of the current y_j, and its corresponding state - done[i] = (y[j], new_state) - cache[process[j][0]] = (y[j], new_state) - - j += 1 - - # Set the incoming batch states with the new states obtained from `done`. - batch_states = self.batch_initialize_states(batch_states, [d_state for y_j, d_state in done]) - - # Create batch of all output scores - # List[1, 1, H] -> [B, 1, H] - batch_y = torch.stack([y_j for y_j, d_state in done]) - - # Extract the last tokens from all hypotheses and convert to a tensor - lm_tokens = torch.tensor([h.y_sequence[-1] for h in hypotheses], device=device, dtype=torch.long).view( - final_batch - ) - - return batch_y, batch_states, lm_tokens - - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): - """ - Create batch of decoder states. - - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([L x (1, H)], [L x (1, H)])] - - Returns: - batch_states (tuple): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - """ - # LSTM has 2 states - new_states = [[] for _ in range(len(decoder_states[0]))] - for layer in range(self.pred_rnn_layers): - for state_id in range(len(decoder_states[0])): - # batch_states[state_id][layer] = torch.stack([s[state_id][layer] for s in decoder_states]) - new_state_for_layer = torch.stack([s[state_id][layer] for s in decoder_states]) - new_states[state_id].append(new_state_for_layer) - - for state_id in range(len(decoder_states[0])): - new_states[state_id] = torch.stack([state for state in new_states[state_id]]) - - return new_states - - def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: - """Get decoder state from batch of states, for given id. - - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - idx (int): index to extract state from batch of states - - Returns: - (tuple): decoder states for given id - ([L x (1, H)], [L x (1, H)]) - """ - if batch_states is not None: - state_list = [] - for state_id in range(len(batch_states)): - states = [batch_states[state_id][layer][idx] for layer in range(self.pred_rnn_layers)] - state_list.append(states) - - return state_list - else: - return None - - def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """Concatenate a batch of decoder state to a packed state. - - Args: - batch_states (list): batch of decoder states - B x ([L x (H)], [L x (H)]) - - Returns: - (tuple): decoder states - (L x B x H, L x B x H) - """ - state_list = [] - - for state_id in range(len(batch_states[0])): - batch_list = [] - for sample_id in range(len(batch_states)): - tensor = torch.stack(batch_states[sample_id][state_id]) # [L, H] - tensor = tensor.unsqueeze(0) # [1, L, H] - batch_list.append(tensor) - - state_tensor = torch.cat(batch_list, 0) # [B, L, H] - state_tensor = state_tensor.transpose(1, 0) # [L, B, H] - state_list.append(state_tensor) - - return state_list - - def batch_copy_states( - self, - old_states: List[torch.Tensor], - new_states: List[torch.Tensor], - ids: List[int], - value: Optional[float] = None, - ) -> List[torch.Tensor]: - """Copy states from new state to old state at certain indices. 
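Loosely analogous, self-contained sketch of the state layout the batching helpers above convert between: per-hypothesis LSTM states shaped `[L, 1, H]` versus batched states shaped `[L, B, H]`. The exact per-layer list structure returned by `batch_select_state` is simplified here; all names and sizes are illustrative.

```python
import torch

L, B, H = 2, 3, 4
# Per-sample decoder states, as kept per beam-search hypothesis: (h, c), each [L, 1, H]
per_sample = [(torch.randn(L, 1, H), torch.randn(L, 1, H)) for _ in range(B)]

# Pack into batch form, analogous to batch_initialize_states: (h, c), each [L, B, H]
h_batch = torch.cat([h for h, _ in per_sample], dim=1)
c_batch = torch.cat([c for _, c in per_sample], dim=1)
print(h_batch.shape, c_batch.shape)  # torch.Size([2, 3, 4]) twice

# Pull one sample back out, analogous to batch_select_state with idx=1
idx = 1
h_i, c_i = h_batch[:, idx : idx + 1, :], c_batch[:, idx : idx + 1, :]
print(h_i.shape)  # torch.Size([2, 1, 4])
```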
- - Args: - old_states(list): packed decoder states - (L x B x H, L x B x H) - - new_states: packed decoder states - (L x B x H, L x B x H) - - ids (list): List of indices to copy states at. - - value (optional float): If a value should be copied instead of a state slice, a float should be provided - - Returns: - batch of decoder states with partial copy at ids (or a specific value). - (L x B x H, L x B x H) - """ - for state_id in range(len(old_states)): - if value is None: - old_states[state_id][:, ids, :] = new_states[state_id][:, ids, :] - else: - old_states[state_id][:, ids, :] *= 0.0 - old_states[state_id][:, ids, :] += value - - return old_states - - # Adapter method overrides - def add_adapter(self, name: str, cfg: DictConfig): - # Update the config with correct input dim - cfg = self._update_adapter_cfg_input_dim(cfg) - # Add the adapter - super().add_adapter(name=name, cfg=cfg) - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.pred_hidden) - return cfg - - -class RNNTJoint(rnnt_abstract.AbstractRNNTJoint, Exportable, AdapterModuleMixin): - """A Recurrent Neural Network Transducer Joint Network (RNN-T Joint Network). - An RNN-T Joint network, comprised of a feedforward model. - - Args: - jointnet: A dict-like object which contains the following key-value pairs. - encoder_hidden: int specifying the hidden dimension of the encoder net. - pred_hidden: int specifying the hidden dimension of the prediction net. - joint_hidden: int specifying the hidden dimension of the joint net - activation: Activation function used in the joint step. Can be one of - ['relu', 'tanh', 'sigmoid']. - - Optionally, it may also contain the following: - dropout: float, set to 0.0 by default. Optional dropout applied at the end of the joint net. - - num_classes: int, specifying the vocabulary size that the joint network must predict, - excluding the RNNT blank token. - - vocabulary: Optional list of strings/tokens that comprise the vocabulary of the joint network. - Unused and kept only for easy access for character based encoding RNNT models. - - log_softmax: Optional bool, set to None by default. If set as None, will compute the log_softmax() - based on the value provided. - - preserve_memory: Optional bool, set to False by default. If the model crashes due to the memory - intensive joint step, one might try this flag to empty the tensor cache in pytorch. - - Warning: This will make the forward-backward pass much slower than normal. - It also might not fix the OOM if the GPU simply does not have enough memory to compute the joint. - - fuse_loss_wer: Optional bool, set to False by default. - - Fuses the joint forward, loss forward and - wer forward steps. In doing so, it trades of speed for memory conservation by creating sub-batches - of the provided batch of inputs, and performs Joint forward, loss forward and wer forward (optional), - all on sub-batches, then collates results to be exactly equal to results from the entire batch. - - When this flag is set, prior to calling forward, the fields `loss` and `wer` (either one) *must* - be set using the `RNNTJoint.set_loss()` or `RNNTJoint.set_wer()` methods. - - Further, when this flag is set, the following argument `fused_batch_size` *must* be provided - as a non negative integer. This value refers to the size of the sub-batch. - - When the flag is set, the input and output signature of `forward()` of this method changes. 
- Input - in addition to `encoder_outputs` (mandatory argument), the following arguments can be provided. - - decoder_outputs (optional). Required if loss computation is required. - - encoder_lengths (required) - - transcripts (optional). Required for wer calculation. - - transcript_lengths (optional). Required for wer calculation. - - compute_wer (bool, default false). Whether to compute WER or not for the fused batch. - - Output - instead of the usual `joint` log prob tensor, the following results can be returned. - - loss (optional). Returned if decoder_outputs, transcripts and transript_lengths are not None. - - wer_numerator + wer_denominator (optional). Returned if transcripts, transcripts_lengths are provided - and compute_wer is set. - - fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the - sub-batches. Should be any value below the actual batch size per GPU. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "encoder_outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "decoder_outputs": NeuralType(('B', 'D', 'T'), EmbeddedTextType()), - "encoder_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "transcripts": NeuralType(('B', 'T'), LabelsType(), optional=True), - "transcript_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), - "compute_wer": NeuralType(optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - if not self._fuse_loss_wer: - return { - "outputs": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - } - - else: - return { - "loss": NeuralType(elements_type=LossType(), optional=True), - "wer": NeuralType(elements_type=ElementType(), optional=True), - "wer_numer": NeuralType(elements_type=ElementType(), optional=True), - "wer_denom": NeuralType(elements_type=ElementType(), optional=True), - } - - def _prepare_for_export(self, **kwargs): - self._fuse_loss_wer = False - self.log_softmax = False - super()._prepare_for_export(**kwargs) - - def input_example(self, max_batch=1, max_dim=8192): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. 
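Illustrative-only sketch of the fused sub-batching idea described in the class docstring: the batch is processed in `fused_batch_size` slices via `narrow`, a loss is computed per slice, and the per-slice results are combined afterwards. The quadratic placeholder below stands in for the real joint + transducer loss.

```python
import torch

batch_size, fused_batch_size = 10, 4
encoder_outputs = torch.randn(batch_size, 50, 256)  # (B, T, D) after transpose
losses = []

for begin in range(0, batch_size, fused_batch_size):
    end = min(begin + fused_batch_size, batch_size)
    sub_enc = encoder_outputs.narrow(dim=0, start=begin, length=end - begin)
    # ... the joint and the transducer loss would run here on the sub-batch only ...
    sub_loss = sub_enc.pow(2).mean(dim=(1, 2))  # placeholder per-sample "loss"
    losses.append(sub_loss)

loss = torch.cat(losses).mean()  # combine sub-batch results
print(loss.item())
```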
- """ - B, T, U = max_batch, max_dim, max_batch - encoder_outputs = torch.randn(B, self.encoder_hidden, T).to(next(self.parameters()).device) - decoder_outputs = torch.randn(B, self.pred_hidden, U).to(next(self.parameters()).device) - return (encoder_outputs, decoder_outputs) - - @property - def disabled_deployment_input_names(self): - """Implement this method to return a set of input names disabled for export""" - return set(["encoder_lengths", "transcripts", "transcript_lengths", "compute_wer"]) - - def __init__( - self, - jointnet: Dict[str, Any], - num_classes: int, - num_extra_outputs: int = 0, - vocabulary: Optional[List] = None, - log_softmax: Optional[bool] = None, - preserve_memory: bool = False, - fuse_loss_wer: bool = False, - fused_batch_size: Optional[int] = None, - experimental_fuse_loss_wer: Any = None, - ): - super().__init__() - - self.vocabulary = vocabulary - - self._vocab_size = num_classes - self._num_extra_outputs = num_extra_outputs - self._num_classes = num_classes + 1 + num_extra_outputs # 1 is for blank - - if experimental_fuse_loss_wer is not None: - # Override fuse_loss_wer from deprecated argument - fuse_loss_wer = experimental_fuse_loss_wer - - self._fuse_loss_wer = fuse_loss_wer - self._fused_batch_size = fused_batch_size - - if fuse_loss_wer and (fused_batch_size is None): - raise ValueError("If `fuse_loss_wer` is set, then `fused_batch_size` cannot be None!") - - self._loss = None - self._wer = None - - # Log softmax should be applied explicitly only for CPU - self.log_softmax = log_softmax - self.preserve_memory = preserve_memory - - if preserve_memory: - logging.warning( - "`preserve_memory` was set for the Joint Model. Please be aware this will severely impact " - "the forward-backward step time. It also might not solve OOM issues if the GPU simply " - "does not have enough memory to compute the joint." - ) - - # Required arguments - self.encoder_hidden = jointnet['encoder_hidden'] - self.pred_hidden = jointnet['pred_hidden'] - self.joint_hidden = jointnet['joint_hidden'] - self.activation = jointnet['activation'] - - # Optional arguments - dropout = jointnet.get('dropout', 0.0) - - self.pred, self.enc, self.joint_net = self._joint_net_modules( - num_classes=self._num_classes, # add 1 for blank symbol - pred_n_hidden=self.pred_hidden, - enc_n_hidden=self.encoder_hidden, - joint_n_hidden=self.joint_hidden, - activation=self.activation, - dropout=dropout, - ) - - # Flag needed for RNNT export support - self._rnnt_export = False - - # to change, requires running ``model.temperature = T`` explicitly - self.temperature = 1.0 - - @typecheck() - def forward( - self, - encoder_outputs: torch.Tensor, - decoder_outputs: Optional[torch.Tensor], - encoder_lengths: Optional[torch.Tensor] = None, - transcripts: Optional[torch.Tensor] = None, - transcript_lengths: Optional[torch.Tensor] = None, - compute_wer: bool = False, - ) -> Union[torch.Tensor, List[Optional[torch.Tensor]]]: - # encoder = (B, D, T) - # decoder = (B, D, U) if passed, else None - encoder_outputs = encoder_outputs.transpose(1, 2) # (B, T, D) - - if decoder_outputs is not None: - decoder_outputs = decoder_outputs.transpose(1, 2) # (B, U, D) - - if not self._fuse_loss_wer: - if decoder_outputs is None: - raise ValueError( - "decoder_outputs passed is None, and `fuse_loss_wer` is not set. " - "decoder_outputs can only be None for fused step!" 
- ) - - out = self.joint(encoder_outputs, decoder_outputs) # [B, T, U, V + 1] - return out - - else: - # At least the loss module must be supplied during fused joint - if self._loss is None or self._wer is None: - raise ValueError("`fuse_loss_wer` flag is set, but `loss` and `wer` modules were not provided! ") - - # If fused joint step is required, fused batch size is required as well - if self._fused_batch_size is None: - raise ValueError("If `fuse_loss_wer` is set, then `fused_batch_size` cannot be None!") - - # When using fused joint step, both encoder and transcript lengths must be provided - if (encoder_lengths is None) or (transcript_lengths is None): - raise ValueError( - "`fuse_loss_wer` is set, therefore encoder and target lengths " "must be provided as well!" - ) - - losses = [] - target_lengths = [] - batch_size = int(encoder_outputs.size(0)) # actual batch size - - # Iterate over batch using fused_batch_size steps - for batch_idx in range(0, batch_size, self._fused_batch_size): - begin = batch_idx - end = min(begin + self._fused_batch_size, batch_size) - - # Extract the sub batch inputs - # sub_enc = encoder_outputs[begin:end, ...] - # sub_transcripts = transcripts[begin:end, ...] - sub_enc = encoder_outputs.narrow(dim=0, start=begin, length=int(end - begin)) - sub_transcripts = transcripts.narrow(dim=0, start=begin, length=int(end - begin)) - - sub_enc_lens = encoder_lengths[begin:end] - sub_transcript_lens = transcript_lengths[begin:end] - - # Sub transcripts does not need the full padding of the entire batch - # Therefore reduce the decoder time steps to match - max_sub_enc_length = sub_enc_lens.max() - max_sub_transcript_length = sub_transcript_lens.max() - - if decoder_outputs is not None: - # Reduce encoder length to preserve computation - # Encoder: [sub-batch, T, D] -> [sub-batch, T', D]; T' < T - if sub_enc.shape[1] != max_sub_enc_length: - sub_enc = sub_enc.narrow(dim=1, start=0, length=int(max_sub_enc_length)) - - # sub_dec = decoder_outputs[begin:end, ...] 
# [sub-batch, U, D] - sub_dec = decoder_outputs.narrow(dim=0, start=begin, length=int(end - begin)) # [sub-batch, U, D] - - # Reduce decoder length to preserve computation - # Decoder: [sub-batch, U, D] -> [sub-batch, U', D]; U' < U - if sub_dec.shape[1] != max_sub_transcript_length + 1: - sub_dec = sub_dec.narrow(dim=1, start=0, length=int(max_sub_transcript_length + 1)) - - # Perform joint => [sub-batch, T', U', V + 1] - sub_joint = self.joint(sub_enc, sub_dec) - - del sub_dec - - # Reduce transcript length to correct alignment - # Transcript: [sub-batch, L] -> [sub-batch, L']; L' <= L - if sub_transcripts.shape[1] != max_sub_transcript_length: - sub_transcripts = sub_transcripts.narrow(dim=1, start=0, length=int(max_sub_transcript_length)) - - # Compute sub batch loss - # preserve loss reduction type - loss_reduction = self.loss.reduction - - # override loss reduction to sum - self.loss.reduction = None - - # compute and preserve loss - loss_batch = self.loss( - log_probs=sub_joint, - targets=sub_transcripts, - input_lengths=sub_enc_lens, - target_lengths=sub_transcript_lens, - ) - losses.append(loss_batch) - target_lengths.append(sub_transcript_lens) - - # reset loss reduction type - self.loss.reduction = loss_reduction - - else: - losses = None - - # Update WER for sub batch - if compute_wer: - sub_enc = sub_enc.transpose(1, 2) # [B, T, D] -> [B, D, T] - sub_enc = sub_enc.detach() - sub_transcripts = sub_transcripts.detach() - - # Update WER on each process without syncing - self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) - - del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens - - # Reduce over sub batches - if losses is not None: - losses = self.loss.reduce(losses, target_lengths) - - # Collect sub batch wer results - if compute_wer: - # Sync and all_reduce on all processes, compute global WER - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - else: - wer = None - wer_num = None - wer_denom = None - - return losses, wer, wer_num, wer_denom - - def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: - """ - Compute the joint step of the network. - - Here, - B = Batch size - T = Acoustic model timesteps - U = Target sequence length - H1, H2 = Hidden dimensions of the Encoder / Decoder respectively - H = Hidden dimension of the Joint hidden step. - V = Vocabulary size of the Decoder (excluding the RNNT blank token). - - NOTE: - The implementation of this model is slightly modified from the original paper. - The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. - - We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. - - Args: - f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] - g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2] - - Returns: - Logits / log softmaxed tensor of shape (B, T, U, V + 1). 
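Standalone sketch of the factorized joint described in the docstring above; the layer sizes and the ReLU activation are assumptions chosen for illustration.

```python
import torch

B, T, U, H1, H2, H, V = 2, 25, 6, 512, 320, 640, 128
f = torch.randn(B, T, H1)          # encoder output
g = torch.randn(B, U, H2)          # prediction-network output

enc_proj = torch.nn.Linear(H1, H)  # plays the role of joint_hidden_enc
pred_proj = torch.nn.Linear(H2, H)  # plays the role of joint_hidden_dec
joint_final = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Linear(H, V + 1))

# (B, T, 1, H) + (B, 1, U, H) broadcast-sum to (B, T, U, H)
x = enc_proj(f).unsqueeze(2) + pred_proj(g).unsqueeze(1)
logits = joint_final(x)            # (B, T, U, V + 1)
print(logits.shape)                # torch.Size([2, 25, 6, 129])
```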
- """ - # f = [B, T, H1] - f = self.enc(f) - f.unsqueeze_(dim=2) # (B, T, 1, H) - - # g = [B, U, H2] - g = self.pred(g) - g.unsqueeze_(dim=1) # (B, 1, U, H) - - inp = f + g # [B, T, U, H] - - del f, g - - # Forward adapter modules on joint hidden - if self.is_adapter_available(): - inp = self.forward_enabled_adapters(inp) - - res = self.joint_net(inp) # [B, T, U, V + 1] - - del inp - - if self.preserve_memory: - torch.cuda.empty_cache() - - # If log_softmax is automatic - if self.log_softmax is None: - if not res.is_cuda: # Use log softmax only if on CPU - if self.temperature != 1.0: - res = (res / self.temperature).log_softmax(dim=-1) - else: - res = res.log_softmax(dim=-1) - else: - if self.log_softmax: - if self.temperature != 1.0: - res = (res / self.temperature).log_softmax(dim=-1) - else: - res = res.log_softmax(dim=-1) - - return res - - def _joint_net_modules(self, num_classes, pred_n_hidden, enc_n_hidden, joint_n_hidden, activation, dropout): - """ - Prepare the trainable modules of the Joint Network - - Args: - num_classes: Number of output classes (vocab size) excluding the RNNT blank token. - pred_n_hidden: Hidden size of the prediction network. - enc_n_hidden: Hidden size of the encoder network. - joint_n_hidden: Hidden size of the joint network. - activation: Activation of the joint. Can be one of [relu, tanh, sigmoid] - dropout: Dropout value to apply to joint. - """ - pred = torch.nn.Linear(pred_n_hidden, joint_n_hidden) - enc = torch.nn.Linear(enc_n_hidden, joint_n_hidden) - - if activation not in ['relu', 'sigmoid', 'tanh']: - raise ValueError("Unsupported activation for joint step - please pass one of " "[relu, sigmoid, tanh]") - - activation = activation.lower() - - if activation == 'relu': - activation = torch.nn.ReLU(inplace=True) - elif activation == 'sigmoid': - activation = torch.nn.Sigmoid() - elif activation == 'tanh': - activation = torch.nn.Tanh() - - layers = ( - [activation] - + ([torch.nn.Dropout(p=dropout)] if dropout else []) - + [torch.nn.Linear(joint_n_hidden, num_classes)] - ) - return pred, enc, torch.nn.Sequential(*layers) - - # Adapter method overrides - def add_adapter(self, name: str, cfg: DictConfig): - # Update the config with correct input dim - cfg = self._update_adapter_cfg_input_dim(cfg) - # Add the adapter - super().add_adapter(name=name, cfg=cfg) - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.joint_hidden) - return cfg - - @property - def num_classes_with_blank(self): - return self._num_classes - - @property - def num_extra_outputs(self): - return self._num_extra_outputs - - @property - def loss(self): - return self._loss - - def set_loss(self, loss): - if not self._fuse_loss_wer: - raise ValueError("Attempting to set loss module even though `fuse_loss_wer` is not set!") - - self._loss = loss - - @property - def wer(self): - return self._wer - - def set_wer(self, wer): - if not self._fuse_loss_wer: - raise ValueError("Attempting to set WER module even though `fuse_loss_wer` is not set!") - - self._wer = wer - - @property - def fuse_loss_wer(self): - return self._fuse_loss_wer - - def set_fuse_loss_wer(self, fuse_loss_wer, loss=None, metric=None): - self._fuse_loss_wer = fuse_loss_wer - - self._loss = loss - self._wer = metric - - @property - def fused_batch_size(self): - return self._fuse_loss_wer - - def set_fused_batch_size(self, fused_batch_size): - self._fused_batch_size = fused_batch_size - - -class RNNTDecoderJoint(torch.nn.Module, 
Exportable): - """ - Utility class to export Decoder+Joint as a single module - """ - - def __init__(self, decoder, joint): - super().__init__() - self.decoder = decoder - self.joint = joint - - @property - def input_types(self): - state_type = NeuralType(('D', 'B', 'D'), ElementType()) - mytypes = { - 'encoder_outputs': NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "target_length": NeuralType(tuple('B'), LengthsType()), - 'input_states_1': state_type, - 'input_states_2': state_type, - } - - return mytypes - - def input_example(self, max_batch=1, max_dim=1): - decoder_example = self.decoder.input_example(max_batch=max_batch, max_dim=max_dim) - state1, state2 = decoder_example[-1] - return tuple([self.joint.input_example()[0]]) + decoder_example[:2] + (state1, state2) - - @property - def output_types(self): - return { - "outputs": NeuralType(('B', 'T', 'T', 'D'), LogprobsType()), - "prednet_lengths": NeuralType(tuple('B'), LengthsType()), - "output_states_1": NeuralType((('D', 'B', 'D')), ElementType()), - "output_states_2": NeuralType((('D', 'B', 'D')), ElementType()), - } - - def forward(self, encoder_outputs, targets, target_length, input_states_1, input_states_2): - decoder_outputs = self.decoder(targets, target_length, (input_states_1, input_states_2)) - decoder_output = decoder_outputs[0] - decoder_length = decoder_outputs[1] - input_states_1, input_states_2 = decoder_outputs[2][0], decoder_outputs[2][1] - joint_output = self.joint(encoder_outputs, decoder_output) - return (joint_output, decoder_length, input_states_1, input_states_2) - - -class RNNTDecoderJointSSL(torch.nn.Module): - def __init__(self, decoder, joint): - super().__init__() - self.decoder = decoder - self.joint = joint - - @property - def needs_labels(self): - return True - - @property - def input_types(self): - return { - "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "targets": NeuralType(('B', 'T'), LabelsType()), - "target_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - return {"log_probs": NeuralType(('B', 'T', 'D'), SpectrogramType())} - - def forward(self, encoder_output, targets, target_lengths): - - decoder, target_length, states = self.decoder(targets=targets, target_length=target_lengths) - log_probs = self.joint(encoder_outputs=encoder_output, decoder_outputs=decoder) - - return log_probs - - -class SampledRNNTJoint(RNNTJoint): - """A Sampled Recurrent Neural Network Transducer Joint Network (RNN-T Joint Network). - An RNN-T Joint network, comprised of a feedforward model, where the vocab size will be sampled instead - of computing the full vocabulary joint. - - Args: - jointnet: A dict-like object which contains the following key-value pairs. - encoder_hidden: int specifying the hidden dimension of the encoder net. - pred_hidden: int specifying the hidden dimension of the prediction net. - joint_hidden: int specifying the hidden dimension of the joint net - activation: Activation function used in the joint step. Can be one of - ['relu', 'tanh', 'sigmoid']. - - Optionally, it may also contain the following: - dropout: float, set to 0.0 by default. Optional dropout applied at the end of the joint net. - - num_classes: int, specifying the vocabulary size that the joint network must predict, - excluding the RNNT blank token. - - n_samples: int, specifies the number of tokens to sample from the vocabulary space, - excluding the RNNT blank token. 
If a given value is larger than the entire vocabulary size, - then the full vocabulary will be used. - - vocabulary: Optional list of strings/tokens that comprise the vocabulary of the joint network. - Unused and kept only for easy access for character-based encoding RNNT models. - - log_softmax: Optional bool, set to None by default. If set as None, will compute the log_softmax() - based on the value provided. - - preserve_memory: Optional bool, set to False by default. If the model crashes due to the memory - intensive joint step, one might try this flag to empty the tensor cache in pytorch. - - Warning: This will make the forward-backward pass much slower than normal. - It also might not fix the OOM if the GPU simply does not have enough memory to compute the joint. - - fuse_loss_wer: Optional bool, set to False by default. - - Fuses the joint forward, loss forward and - wer forward steps. In doing so, it trades off speed for memory conservation by creating sub-batches - of the provided batch of inputs, and performs Joint forward, loss forward and wer forward (optional), - all on sub-batches, then collates results to be exactly equal to results from the entire batch. - - When this flag is set, prior to calling forward, the fields `loss` and `wer` (either one) *must* - be set using the `RNNTJoint.set_loss()` or `RNNTJoint.set_wer()` methods. - - Further, when this flag is set, the following argument `fused_batch_size` *must* be provided - as a positive integer. This value refers to the size of the sub-batch. - - When the flag is set, the input and output signature of `forward()` of this method changes. - Input - in addition to `encoder_outputs` (mandatory argument), the following arguments can be provided. - - decoder_outputs (optional). Required if loss computation is required. - - encoder_lengths (required) - - transcripts (optional). Required for wer calculation. - - transcript_lengths (optional). Required for wer calculation. - - compute_wer (bool, default false). Whether to compute WER or not for the fused batch. - - Output - instead of the usual `joint` log prob tensor, the following results can be returned. - - loss (optional). Returned if decoder_outputs, transcripts and transcript_lengths are not None. - - wer_numerator + wer_denominator (optional). Returned if transcripts, transcripts_lengths are provided - and compute_wer is set. - - fused_batch_size: Optional int, required if `fuse_loss_wer` flag is set. Determines the size of the - sub-batches. Should be any value below the actual batch size per GPU.
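Hedged construction sketch matching the constructor defined just below; the import path and the concrete sizes are assumptions, and the loss/WER modules that `fuse_loss_wer` requires (via `set_loss` / `set_wer`) are omitted for brevity.

```python
from nemo.collections.asr.modules.rnnt import SampledRNNTJoint  # path assumed

jointnet = {
    "encoder_hidden": 512,
    "pred_hidden": 320,
    "joint_hidden": 640,
    "activation": "relu",
    "dropout": 0.1,
}

joint = SampledRNNTJoint(
    jointnet=jointnet,
    num_classes=128,     # vocabulary size excluding blank
    n_samples=64,        # number of negative tokens sampled per step
    fuse_loss_wer=True,  # the sampled joint is intended for fused training
    fused_batch_size=4,
)
```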
- """ - - def __init__( - self, - jointnet: Dict[str, Any], - num_classes: int, - n_samples: int, - vocabulary: Optional[List] = None, - log_softmax: Optional[bool] = None, - preserve_memory: bool = False, - fuse_loss_wer: bool = False, - fused_batch_size: Optional[int] = None, - ): - super().__init__( - jointnet=jointnet, - num_classes=num_classes, - vocabulary=vocabulary, - log_softmax=log_softmax, - preserve_memory=preserve_memory, - fuse_loss_wer=fuse_loss_wer, - fused_batch_size=fused_batch_size, - ) - self.n_samples = n_samples - self.register_buffer('blank_id', torch.tensor([self.num_classes_with_blank - 1]), persistent=False) - - @typecheck() - def forward( - self, - encoder_outputs: torch.Tensor, - decoder_outputs: Optional[torch.Tensor], - encoder_lengths: Optional[torch.Tensor] = None, - transcripts: Optional[torch.Tensor] = None, - transcript_lengths: Optional[torch.Tensor] = None, - compute_wer: bool = False, - ) -> Union[torch.Tensor, List[Optional[torch.Tensor]]]: - # If in inference mode, revert to basic RNNT Joint behaviour. - # Sampled RNNT is only used for training. - if not torch.is_grad_enabled() or torch.is_inference_mode_enabled(): - # Simply call full tensor joint - return super().forward( - encoder_outputs=encoder_outputs, - decoder_outputs=decoder_outputs, - encoder_lengths=encoder_lengths, - transcripts=transcripts, - transcript_lengths=transcript_lengths, - compute_wer=compute_wer, - ) - - if transcripts is None or transcript_lengths is None: - logging.warning( - "Sampled RNNT Joint currently only works with `fuse_loss_wer` set to True, " - "and when `fused_batch_size` is a positive integer." - ) - raise ValueError( - "Sampled RNNT loss only works when the transcripts are provided during training." - "Please ensure that you correctly pass the `transcripts` and `transcript_lengths`." - ) - - # encoder = (B, D, T) - # decoder = (B, D, U) if passed, else None - encoder_outputs = encoder_outputs.transpose(1, 2) # (B, T, D) - - if decoder_outputs is not None: - decoder_outputs = decoder_outputs.transpose(1, 2) # (B, U, D) - - # At least the loss module must be supplied during fused joint - if self._loss is None or self._wer is None: - raise ValueError("`fuse_loss_wer` flag is set, but `loss` and `wer` modules were not provided! ") - - # If fused joint step is required, fused batch size is required as well - if self._fused_batch_size is None: - raise ValueError("If `fuse_loss_wer` is set, then `fused_batch_size` cannot be None!") - - # When using fused joint step, both encoder and transcript lengths must be provided - if (encoder_lengths is None) or (transcript_lengths is None): - raise ValueError( - "`fuse_loss_wer` is set, therefore encoder and target lengths " "must be provided as well!" - ) - - losses = [] - target_lengths = [] - batch_size = int(encoder_outputs.size(0)) # actual batch size - - # Iterate over batch using fused_batch_size steps - for batch_idx in range(0, batch_size, self._fused_batch_size): - begin = batch_idx - end = min(begin + self._fused_batch_size, batch_size) - - # Extract the sub batch inputs - # sub_enc = encoder_outputs[begin:end, ...] - # sub_transcripts = transcripts[begin:end, ...] 
- sub_enc = encoder_outputs.narrow(dim=0, start=begin, length=int(end - begin)) - sub_transcripts = transcripts.narrow(dim=0, start=begin, length=int(end - begin)) - - sub_enc_lens = encoder_lengths[begin:end] - sub_transcript_lens = transcript_lengths[begin:end] - - # Sub transcripts does not need the full padding of the entire batch - # Therefore reduce the decoder time steps to match - max_sub_enc_length = sub_enc_lens.max() - max_sub_transcript_length = sub_transcript_lens.max() - - if decoder_outputs is not None: - # Reduce encoder length to preserve computation - # Encoder: [sub-batch, T, D] -> [sub-batch, T', D]; T' < T - if sub_enc.shape[1] != max_sub_enc_length: - sub_enc = sub_enc.narrow(dim=1, start=0, length=int(max_sub_enc_length)) - - # sub_dec = decoder_outputs[begin:end, ...] # [sub-batch, U, D] - sub_dec = decoder_outputs.narrow(dim=0, start=begin, length=int(end - begin)) # [sub-batch, U, D] - - # Reduce decoder length to preserve computation - # Decoder: [sub-batch, U, D] -> [sub-batch, U', D]; U' < U - if sub_dec.shape[1] != max_sub_transcript_length + 1: - sub_dec = sub_dec.narrow(dim=1, start=0, length=int(max_sub_transcript_length + 1)) - - # Reduce transcript length to correct alignment - # Transcript: [sub-batch, L] -> [sub-batch, L']; L' <= L - if sub_transcripts.shape[1] != max_sub_transcript_length: - sub_transcripts = sub_transcripts.narrow(dim=1, start=0, length=int(max_sub_transcript_length)) - - # Perform sampled joint => [sub-batch, T', U', {V' < V} + 1}] - sub_joint, sub_transcripts_remapped = self.sampled_joint( - sub_enc, sub_dec, transcript=sub_transcripts, transcript_lengths=sub_transcript_lens - ) - - del sub_dec - - # Compute sub batch loss - # preserve loss reduction type - loss_reduction = self.loss.reduction - - # override loss reduction to sum - self.loss.reduction = None - - # override blank idx in order to map to new vocabulary space - # in the new vocabulary space, we set the mapping of the RNNT Blank from index V+1 to 0 - # So the loss here needs to be updated accordingly. - # TODO: See if we can have some formal API for rnnt loss to update inner blank index. - cached_blank_id = self.loss._loss.blank - self.loss._loss.blank = 0 - - # compute and preserve loss - loss_batch = self.loss( - log_probs=sub_joint, - targets=sub_transcripts_remapped, # Note: We have to use remapped transcripts here ! - input_lengths=sub_enc_lens, - target_lengths=sub_transcript_lens, # Note: Even after remap, the transcript lengths remain intact. 
- ) - losses.append(loss_batch) - target_lengths.append(sub_transcript_lens) - - # reset loss reduction type and blank id - self.loss.reduction = loss_reduction - self.loss._loss.blank = cached_blank_id - - else: - losses = None - - # Update WER for sub batch - if compute_wer: - sub_enc = sub_enc.transpose(1, 2) # [B, T, D] -> [B, D, T] - sub_enc = sub_enc.detach() - sub_transcripts = sub_transcripts.detach() - - # Update WER on each process without syncing - self.wer.update(sub_enc, sub_enc_lens, sub_transcripts, sub_transcript_lens) - - del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens - - # Reduce over sub batches - if losses is not None: - losses = self.loss.reduce(losses, target_lengths) - - # Collect sub batch wer results - if compute_wer: - # Sync and all_reduce on all processes, compute global WER - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - else: - wer = None - wer_num = None - wer_denom = None - - return losses, wer, wer_num, wer_denom - - def sampled_joint( - self, f: torch.Tensor, g: torch.Tensor, transcript: torch.Tensor, transcript_lengths: torch.Tensor, - ) -> torch.Tensor: - """ - Compute the sampled joint step of the network. - - # Reference - - [Memory-Efficient Training of RNN-Transducer with Sampled Softmax](https://arxiv.org/abs/2203.16868) - - Here, - B = Batch size - T = Acoustic model timesteps - U = Target sequence length - H1, H2 = Hidden dimensions of the Encoder / Decoder respectively - H = Hidden dimension of the Joint hidden step. - V = Vocabulary size of the Decoder (excluding the RNNT blank token). - S = Sample size of vocabulary. - - NOTE: - The implementation of this joint model is slightly modified from the original paper. - The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. - - We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg -> - (V_Neg is sampled not uniformly by as a rand permutation of all vocab tokens, then eliminate - all Intersection(V_Pos, V_Neg) common tokens to avoid duplication of loss) -> - Concat new Vocab V_Sampled = Union(V_Pos, V_Neg) - -> Forward partially through the joint final to create [B, T, U, V_Sampled] - - Args: - f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] - g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2] - transcript: Batch of transcripts. A torch.Tensor of shape [B, U] - transcript_lengths: Batch of lengths of the transcripts. A torch.Tensor of shape [B] - - Returns: - Logits / log softmaxed tensor of shape (B, T, U, V + 1). - """ - # If under inference mode, ignore sampled joint and compute full joint. 
- if self.training is False or torch.is_grad_enabled() is False or torch.is_inference_mode_enabled(): - # Simply call full tensor joint - return super().joint(f=f, g=g) - - # Compute sampled softmax - # f = [B, T, H1] - f = self.enc(f) - f.unsqueeze_(dim=2) # (B, T, 1, H) - - # g = [B, U, H2] - g = self.pred(g) - g.unsqueeze_(dim=1) # (B, 1, U, H) - - inp = f + g # [B, T, U, H] - - del f, g - - # Forward adapter modules on joint hidden - if self.is_adapter_available(): - inp = self.forward_enabled_adapters(inp) - - # Do partial forward of joint net (skipping the final linear) - for module in self.joint_net[:-1]: - inp = module(inp) # [B, T, U, H] - - # Begin compute of sampled RNNT joint - with torch.no_grad(): - # gather true labels - transcript_vocab_ids = torch.unique(transcript) - - # augment with blank token id - transcript_vocab_ids = torch.cat([self.blank_id, transcript_vocab_ids]) - - # Remap the transcript label ids to new positions of label ids (in the transcript_vocab_ids) - # This is necessary cause the RNNT loss doesnt care about the value, only the position of the ids - # of the transcript tokens. We can skip this step for noise samples cause those are only used for softmax - # estimation, not for computing actual label. - # From `https://stackoverflow.com/a/68969697` - bucketize algo. - t_ids = torch.arange(transcript_vocab_ids.size(0), device='cpu') - mapping = {k: v for k, v in zip(transcript_vocab_ids.to('cpu'), t_ids)} - - # From `https://stackoverflow.com/questions/13572448`. - palette, key = zip(*mapping.items()) - - t_device = transcript.device - key = torch.tensor(key, device=t_device) - palette = torch.tensor(palette, device=t_device) - - # This step maps old token id to new token id in broadcasted manner. - # For example, if original transcript tokens were [2, 1, 4, 5, 4, 1] - # But after computing the unique token set of above we get - # transcript_vocab_ids = [1, 2, 4, 5] # note: pytorch returns sorted unique values thankfully - # Then we get the index map of the new vocab ids as: - # {0: 1, 1: 2, 2: 4, 3: 5} - # Now we need to map the original transcript tokens to new vocab id space - # So we construct the inverted map as follow : - # {1: 0, 2: 1, 4: 2, 5: 3} - # Then remap the original transcript tokens to new token ids - # new_transcript = [1, 0, 2, 3, 2, 0] - index = torch.bucketize(transcript.ravel(), palette) - transcript = key[index].reshape(transcript.shape) - transcript = transcript.to(t_device) - - # Extract out partial weight tensor and bias tensor of just the V_Pos vocabulary from the full joint. - true_weights = self.joint_net[-1].weight[transcript_vocab_ids, :] - true_bias = self.joint_net[-1].bias[transcript_vocab_ids] - - # Compute the transcript joint scores (only of vocab V_Pos) - transcript_scores = torch.matmul(inp, true_weights.transpose(0, 1)) + true_bias - - # Construct acceptance criteria in vocab space, reject all tokens in Intersection(V_Pos, V_Neg) - with torch.no_grad(): - # Instead of uniform sample, first we create arange V (ignoring blank), then randomly shuffle - # this range of ids, then subset `n_samples` amount of vocab tokens out of the permuted tensor. - # This is good because it guarentees that no token will ever be repeated in V_Neg; - # which dramatically complicates loss calculation. - # Further more, with this strategy, given a `n_samples` > V + 1; we are guarenteed to get the - # V_Samples = V (i.e., full vocabulary will be used in such a case). 
- # Useful to debug cases where you expect sampled vocab to get exact same training curve as - # full vocab. - sample_ids = torch.randperm(n=self.num_classes_with_blank - 1, device=transcript_scores.device)[ - : self.n_samples - ] - - # We need to compute the intersection(V_Pos, V_Neg), then eliminate the intersection arguments - # from inside V_Neg. - - # First, compute the pairwise commonality to find index inside `sample_ids` which match the token id - # inside transcript_vocab_ids. - # Note: It is important to ignore the hardcoded RNNT Blank token injected at id 0 of the transcript - # vocab ids, otherwise the blank may occur twice, once for RNNT blank and once as negative sample, - # doubling the gradient of the RNNT blank token. - reject_samples = torch.where(transcript_vocab_ids[1:, None] == sample_ids[None, :]) - - # Let accept samples be a set of ids which is a subset of sample_ids - # such that intersection(V_Pos, accept_samples) is a null set. - accept_samples = sample_ids.clone() - - # In order to construct such an accept_samples tensor, first we construct a bool map - # and fill all the indices where there is a match inside of sample_ids. - # reject_samples is a tuple (transcript_vocab_position, sample_position) which gives a - # many to many map between N values of transript and M values of sample_ids. - # We dont care about transcript side matches, only the ids inside of sample_ids that matched. - sample_mask = torch.ones_like(accept_samples, dtype=torch.bool) - sample_mask[reject_samples[1]] = False - - # Finally, compute the subset of tokens by selecting only those sample_ids which had no matches - accept_samples = accept_samples[sample_mask] - - # Extract out partial weight tensor and bias tensor of just the V_Neg vocabulary from the full joint. - sample_weights = self.joint_net[-1].weight[accept_samples, :] - sample_bias = self.joint_net[-1].bias[accept_samples] - - # Compute the noise joint scores (only of vocab V_Neg) to be used for softmax - # The quality of this sample determines the quality of the softmax gradient. - # We use naive algo broadcasted over batch, but it is more efficient than sample level computation. - # One can increase `n_samples` for better estimation of rejection samples and its gradient. - noise_scores = torch.matmul(inp, sample_weights.transpose(0, 1)) + sample_bias - - # Finally, construct the sampled joint as the V_Sampled = Union(V_Pos, V_Neg) - # Here, we simply concatenate the two tensors to construct the joint with V_Sampled vocab - # because before we have properly asserted that Intersection(V_Pos, V_Neg) is a null set. 
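The index manipulation above can be hard to follow, so here is a standalone sketch (toy vocabulary of 10 tokens, hypothetical values) of the two tricks it uses: remapping transcript tokens to their positions inside the reduced vocabulary, and rejecting sampled negatives that collide with positive tokens. The sketch sorts the reduced vocabulary before calling `torch.bucketize`, since `bucketize` expects sorted boundaries.

```python
import torch

blank_id = torch.tensor([10])                 # pretend V = 10, blank index = 10
transcript = torch.tensor([[2, 1, 4, 5, 4, 1]])

vocab_ids = torch.unique(transcript)          # tensor([1, 2, 4, 5])
vocab_ids = torch.cat([blank_id, vocab_ids])  # blank mapped to position 0

# (1) bucketize-based remap: original token id -> position inside vocab_ids
sorted_vocab, order = torch.sort(vocab_ids)
index = torch.bucketize(transcript.ravel(), sorted_vocab)
remapped = order[index].reshape(transcript.shape)
print(remapped)                               # tensor([[2, 1, 3, 4, 3, 1]])

# (2) sample negatives, then drop any that already appear among the positives
sample_ids = torch.randperm(10)[:6]
collisions = torch.where(vocab_ids[1:, None] == sample_ids[None, :])
keep = torch.ones_like(sample_ids, dtype=torch.bool)
keep[collisions[1]] = False
negatives = sample_ids[keep]
print(negatives)                              # sampled ids not in {1, 2, 4, 5}
```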
- res = torch.cat([transcript_scores, noise_scores], dim=-1) - - del inp - - if self.preserve_memory: - torch.cuda.empty_cache() - - # If log_softmax is automatic - if self.log_softmax is None: - if not res.is_cuda: # Use log softmax only if on CPU - res = res.log_softmax(dim=-1) - else: - if self.log_softmax: - res = res.log_softmax(dim=-1) - - return res, transcript - - -# Add the adapter compatible modules to the registry -for cls in [RNNTDecoder, RNNTJoint, SampledRNNTJoint]: - if adapter_mixins.get_registered_adapter(cls) is None: - adapter_mixins.register_adapter(cls, cls) # base class is adapter compatible itself diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt_abstract.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt_abstract.py deleted file mode 100644 index e473f64e5716cdf4c1f674a83a338d98915f294a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/rnnt_abstract.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Tuple - -import torch - -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.core import NeuralModule - - -class AbstractRNNTJoint(NeuralModule, ABC): - """ - An abstract RNNT Joint framework, which can possibly integrate with GreedyRNNTInfer and BeamRNNTInfer classes. - Represents the abstract RNNT Joint network, which accepts the acoustic model and prediction network - embeddings in order to compute the joint of the two prior to decoding the output sequence. - """ - - @abstractmethod - def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: - """ - Compute the joint step of the network. - - Here, - B = Batch size - T = Acoustic model timesteps - U = Target sequence length - H1, H2 = Hidden dimensions of the Encoder / Decoder respectively - H = Hidden dimension of the Joint hidden step. - V = Vocabulary size of the Decoder (excluding the RNNT blank token). - - NOTE: - The implementation of this model is slightly modified from the original paper. - The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. - - We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. - - Args: - f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] - g: Output of the Decoder model. A torch.Tensor of shape [B, U, H2] - - Returns: - Logits / log softmaxed tensor of shape (B, T, U, V + 1). 
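[Editor's note: the docstring above describes the factored joint: encoder and decoder features are projected separately, broadcast over the time and label axes, summed, and passed through the final vocabulary projection. A minimal sketch of that shape arithmetic with toy dimensions and hypothetical layer names:]

```python
import torch
import torch.nn as nn

B, T, U, H1, H2, H, V = 2, 8, 5, 16, 12, 32, 28   # toy sizes

enc_proj = nn.Linear(H1, H)        # plays the role of joint_hidden_enc
dec_proj = nn.Linear(H2, H)        # plays the role of joint_hidden_dec
joint_out = nn.Linear(H, V + 1)    # final projection over vocab + blank

f = enc_proj(torch.randn(B, T, H1)).unsqueeze(2)   # [B, T, 1, H]
g = dec_proj(torch.randn(B, U, H2)).unsqueeze(1)   # [B, 1, U, H]
logits = joint_out(torch.relu(f + g))              # [B, T, U, V + 1] via broadcasting
```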
- """ - raise NotImplementedError() - - @property - def num_classes_with_blank(self): - raise NotImplementedError() - - @property - def num_extra_outputs(self): - raise NotImplementedError() - - -class AbstractRNNTDecoder(NeuralModule, ABC): - """ - An abstract RNNT Decoder framework, which can possibly integrate with GreedyRNNTInfer and BeamRNNTInfer classes. - Represents the abstract RNNT Prediction/Decoder stateful network, which performs autoregressive decoding - in order to construct the output sequence. - - Args: - vocab_size: Size of the vocabulary, excluding the RNNT blank token. - blank_idx: Index of the blank token. Can be 0 or size(vocabulary). - blank_as_pad: Bool flag, whether to allocate an additional token in the Embedding layer - of this module in order to treat all RNNT `blank` tokens as pad tokens, thereby letting - the Embedding layer batch tokens more efficiently. - - It is mandatory to use this for certain Beam RNNT Infer methods - such as TSD, ALSD. - It is also more efficient to use greedy batch decoding with this flag. - """ - - def __init__(self, vocab_size, blank_idx, blank_as_pad): - super().__init__() - - self.vocab_size = vocab_size - self.blank_idx = blank_idx # first or last index of vocabulary - self.blank_as_pad = blank_as_pad - - if blank_idx not in [0, vocab_size]: - raise ValueError("`blank_idx` must be either 0 or the final token of the vocabulary") - - @abstractmethod - def predict( - self, - y: Optional[torch.Tensor] = None, - state: Optional[torch.Tensor] = None, - add_sos: bool = False, - batch_size: Optional[int] = None, - ) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Stateful prediction of scores and state for a (possibly null) tokenset. - This method takes various cases into consideration : - - No token, no state - used for priming the RNN - - No token, state provided - used for blank token scoring - - Given token, states - used for scores + new states - - Here: - B - batch size - U - label length - H - Hidden dimension size of RNN - L - Number of RNN layers - - Args: - y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding. - If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on Embedding. - - state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2. - Each state must be a tensor of shape [L, B, H]. - If None, and during training mode and `random_state_sampling` is set, will sample a - normal distribution tensor of the above shape. Otherwise, None will be passed to the RNN. - - add_sos: bool flag, whether a zero vector describing a "start of signal" token should be - prepended to the above "y" tensor. When set, output size is (B, U + 1, H). - - batch_size: An optional int, specifying the batch size of the `y` tensor. - Can be infered if `y` and `state` is None. But if both are None, then batch_size cannot be None. 
- - Returns: - A tuple (g, hid) such that - - - If add_sos is False: - g: (B, U, H) - hid: (h, c) where h is the final sequence hidden state and c is the final cell state: - h (tensor), shape (L, B, H) - c (tensor), shape (L, B, H) - - If add_sos is True: - g: (B, U + 1, H) - hid: (h, c) where h is the final sequence hidden state and c is the final cell state: - h (tensor), shape (L, B, H) - c (tensor), shape (L, B, H) - - """ - raise NotImplementedError() - - @abstractmethod - def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: - """ - Initialize the state of the RNN layers, with same dtype and device as input `y`. - - Args: - y: A torch.Tensor whose device the generated states will be placed on. - - Returns: - List of torch.Tensor, each of shape [L, B, H], where - L = Number of RNN layers - B = Batch size - H = Hidden size of RNN. - """ - raise NotImplementedError() - - @abstractmethod - def score_hypothesis( - self, hypothesis: Hypothesis, cache: Dict[Tuple[int], Any] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Similar to the predict() method, instead this method scores a Hypothesis during beam search. - Hypothesis is a dataclass representing one hypothesis in a Beam Search. - - Args: - hypothesis: Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - - Returns: - Returns a tuple (y, states, lm_token) such that: - y is a torch.Tensor of shape [1, 1, H] representing the score of the last token in the Hypothesis. - state is a list of RNN states, each of shape [L, 1, H]. - lm_token is the final integer token of the hypothesis. - """ - raise NotImplementedError() - - def batch_score_hypothesis( - self, hypotheses: List[Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor] - ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: - """ - Used for batched beam search algorithms. Similar to score_hypothesis method. - - Args: - hypothesis: List of Hypotheses. Refer to rnnt_utils.Hypothesis. - cache: Dict which contains a cache to avoid duplicate computations. - batch_states: List of torch.Tensor which represent the states of the RNN for this batch. - Each state is of shape [L, B, H] - - Returns: - Returns a tuple (b_y, b_states, lm_tokens) such that: - b_y is a torch.Tensor of shape [B, 1, H] representing the scores of the last tokens in the Hypotheses. - b_state is a list of list of RNN states, each of shape [L, B, H]. - Represented as B x List[states]. - lm_token is a list of the final integer tokens of the hypotheses in the batch. - """ - raise NotImplementedError() - - def batch_initialize_states(self, batch_states: List[torch.Tensor], decoder_states: List[List[torch.Tensor]]): - """ - Create batch of decoder states. - - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - decoder_states (list of list): list of decoder states - [B x ([L x (1, H)], [L x (1, H)])] - - Returns: - batch_states (tuple): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - """ - raise NotImplementedError() - - def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: - """Get decoder state from batch of states, for given id. 
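[Editor's note: the batched-state helpers documented here and below (`batch_select_state`, `batch_copy_states`) all manipulate lists of `[L, B, H]` tensors. A hedged toy sketch of the select/copy behaviour they describe, with hypothetical helper names:]

```python
import torch

L, B, H = 2, 4, 8
batch_states = [torch.randn(L, B, H), torch.randn(L, B, H)]   # packed (h, c)

def select_state(batch_states, idx):
    """Pick the decoder state of one batch element: a list of [L, 1, H] tensors."""
    return [s[:, idx : idx + 1, :] for s in batch_states]

def copy_states(old_states, new_states, ids):
    """Overwrite the old states at the given batch indices with the new ones."""
    for old, new in zip(old_states, new_states):
        old[:, ids, :] = new[:, ids, :]
    return old_states

h2, c2 = select_state(batch_states, idx=2)                    # each of shape [L, 1, H]
batch_states = copy_states(batch_states, [torch.zeros(L, B, H)] * 2, ids=[0, 3])
```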
- - Args: - batch_states (list): batch of decoder states - ([L x (B, H)], [L x (B, H)]) - - idx (int): index to extract state from batch of states - - Returns: - (tuple): decoder states for given id - ([L x (1, H)], [L x (1, H)]) - """ - raise NotImplementedError() - - def batch_concat_states(self, batch_states: List[List[torch.Tensor]]) -> List[torch.Tensor]: - """Concatenate a batch of decoder state to a packed state. - - Args: - batch_states (list): batch of decoder states - B x ([L x (H)], [L x (H)]) - - Returns: - (tuple): decoder states - (L x B x H, L x B x H) - """ - raise NotImplementedError() - - def batch_copy_states( - self, - old_states: List[torch.Tensor], - new_states: List[torch.Tensor], - ids: List[int], - value: Optional[float] = None, - ) -> List[torch.Tensor]: - """Copy states from new state to old state at certain indices. - - Args: - old_states(list): packed decoder states - (L x B x H, L x B x H) - - new_states: packed decoder states - (L x B x H, L x B x H) - - ids (list): List of indices to copy states at. - - value (optional float): If a value should be copied instead of a state slice, a float should be provided - - Returns: - batch of decoder states with partial copy at ids (or a specific value). - (L x B x H, L x B x H) - """ - raise NotImplementedError() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/squeezeformer_encoder.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/squeezeformer_encoder.py deleted file mode 100644 index a887abd19ebbd451a76e318db3ce84a3e600a41f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/squeezeformer_encoder.py +++ /dev/null @@ -1,456 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from collections import OrderedDict -from typing import List, Optional, Set - -import torch -import torch.distributed -import torch.nn as nn -from omegaconf import DictConfig - -from nemo.collections.asr.parts.submodules.multi_head_attention import PositionalEncoding, RelPositionalEncoding -from nemo.collections.asr.parts.submodules.squeezeformer_modules import SqueezeformerLayer -from nemo.collections.asr.parts.submodules.subsampling import ConvSubsampling, StackingSubsampling, TimeReductionModule -from nemo.collections.asr.parts.utils import adapter_utils -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.classes.mixins import AccessMixin, adapter_mixins -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import AcousticEncodedRepresentation, LengthsType, NeuralType, SpectrogramType - -__all__ = ['SqueezeformerEncoder'] - - -class SqueezeformerEncoder(NeuralModule, Exportable, AccessMixin): - """ - The encoder for ASR model of Squeezeformer. - Based on this paper: - 'Squeezeformer: An Efficient Transformer for Automatic Speech Recognition' by Sehoon Kim et al. 
- https://arxiv.org/abs/2206.00888 - - Args: - feat_in (int): the size of feature channels - n_layers (int): number of layers of ConformerBlock - d_model (int): the hidden size of the model - feat_out (int): the size of the output features - Defaults to -1 (means feat_out is d_model) - subsampling (str): the method of subsampling, choices=['vggnet', 'striding', 'dw_striding'] - Defaults to dw_striding. - subsampling_factor (int): the subsampling factor which should be power of 2 - Defaults to 4. - subsampling_conv_channels (int): the size of the convolutions in the subsampling module - Defaults to -1 which would set it to d_model. - ff_expansion_factor (int): the expansion factor in feed forward layers - Defaults to 4. - self_attention_model (str): type of the attention layer and positional encoding - 'rel_pos': relative positional embedding and Transformer-XL - 'abs_pos': absolute positional embedding and Transformer - default is rel_pos. - pos_emb_max_len (int): the maximum length of positional embeddings - Defaulst to 5000 - n_heads (int): number of heads in multi-headed attention layers - Defaults to 4. - xscaling (bool): enables scaling the inputs to the multi-headed attention layers by sqrt(d_model) - Defaults to True. - untie_biases (bool): whether to not share (untie) the bias weights between layers of Transformer-XL - Defaults to True. - conv_kernel_size (int): the size of the convolutions in the convolutional modules - Defaults to 31. - conv_norm_type (str): the type of the normalization in the convolutional modules - Defaults to 'batch_norm'. - dropout (float): the dropout rate used in all layers except the attention layers - Defaults to 0.1. - dropout_emb (float): the dropout rate used for the positional embeddings - Defaults to 0.1. - dropout_att (float): the dropout rate used for the attention layer - Defaults to 0.0. - adaptive_scale (bool): Whether to scale the inputs to each component by affine `scale` and `bias` layer. - Or use a fixed scale=1 and bias=0. - time_reduce_idx (int): Optional integer index of a layer where a time reduction operation will occur. - All operations beyond this point will only occur at the reduced resolution. - time_recovery_idx (int): Optional integer index of a layer where the time recovery operation will occur. - All operations beyond this point will occur at the original resolution (resolution after - primary downsampling). If no value is provided, assumed to be the last layer. - """ - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - dev = next(self.parameters()).device - input_example = torch.randn(max_batch, self._feat_in, max_dim).to(dev) - input_example_length = torch.randint(1, max_dim, (max_batch,)).to(dev) - return tuple([input_example, input_example_length]) - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - return OrderedDict( - { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - ) - - def __init__( - self, - feat_in: int, - n_layers: int, - d_model: int, - feat_out: int = -1, - subsampling: str = 'dw_striding', - subsampling_factor: int = 4, - subsampling_conv_channels: int = -1, - ff_expansion_factor: int = 4, - self_attention_model: str = 'rel_pos', - n_heads: int = 4, - att_context_size: Optional[List[int]] = None, - xscaling: bool = True, - untie_biases: bool = True, - pos_emb_max_len: int = 5000, - conv_kernel_size: int = 31, - conv_norm_type: str = 'batch_norm', - dropout: float = 0.1, - dropout_emb: float = 0.1, - dropout_att: float = 0.0, - adaptive_scale: bool = True, - time_reduce_idx: Optional[int] = None, - time_recovery_idx: Optional[int] = None, - ): - super().__init__() - - d_ff = d_model * ff_expansion_factor - self.d_model = d_model - self._feat_in = feat_in - if att_context_size: - self.att_context_size = att_context_size - else: - self.att_context_size = [-1, -1] - - if xscaling: - self.xscale = math.sqrt(d_model) - else: - self.xscale = None - self.adaptive_scale = adaptive_scale - - self.time_reduce_idx = time_reduce_idx - if time_reduce_idx is not None: - if time_recovery_idx is None: - self.time_recovery_idx = n_layers - 1 # recover at last layer - else: - self.time_recovery_idx = time_recovery_idx # recover at given layer - - if self.time_reduce_idx is not None: - if self.time_reduce_idx < 0 or self.time_recovery_idx >= n_layers: - raise ValueError(f"Time reduce index must lie between [0, {n_layers})") - if self.time_recovery_idx < 0 or self.time_recovery_idx >= n_layers: - raise ValueError(f"Time recovery index must lie between [0, {n_layers})") - - if subsampling_conv_channels == -1: - subsampling_conv_channels = d_model - if subsampling and subsampling_factor > 1: - if subsampling == 'stacking': - self.pre_encode = StackingSubsampling( - subsampling_factor=subsampling_factor, feat_in=feat_in, feat_out=d_model - ) - else: - self.pre_encode = ConvSubsampling( - subsampling=subsampling, - subsampling_factor=subsampling_factor, - feat_in=feat_in, - feat_out=d_model, - conv_channels=subsampling_conv_channels, - activation=nn.ReLU(), - ) - # For Squeezeformer, initialize the parameters as required. 
- self.pre_encode.reset_parameters() - else: - self.pre_encode = nn.Linear(feat_in, d_model) - - self._feat_out = d_model - - if not untie_biases and self_attention_model == "rel_pos": - d_head = d_model // n_heads - pos_bias_u = nn.Parameter(torch.Tensor(n_heads, d_head)) - pos_bias_v = nn.Parameter(torch.Tensor(n_heads, d_head)) - nn.init.zeros_(pos_bias_u) - nn.init.zeros_(pos_bias_v) - else: - pos_bias_u = None - pos_bias_v = None - - self.pos_emb_max_len = pos_emb_max_len - if self_attention_model == "rel_pos": - self.pos_enc = RelPositionalEncoding( - d_model=d_model, - dropout_rate=dropout, - max_len=pos_emb_max_len, - xscale=self.xscale, - dropout_rate_emb=dropout_emb, - ) - elif self_attention_model == "abs_pos": - pos_bias_u = None - pos_bias_v = None - self.pos_enc = PositionalEncoding( - d_model=d_model, dropout_rate=dropout, max_len=pos_emb_max_len, xscale=self.xscale - ) - else: - raise ValueError(f"Not valid self_attention_model: '{self_attention_model}'!") - - self.layers = nn.ModuleList() - for i in range(n_layers): - layer = SqueezeformerLayer( - d_model=d_model, - d_ff=d_ff, - self_attention_model=self_attention_model, - n_heads=n_heads, - conv_kernel_size=conv_kernel_size, - conv_norm_type=conv_norm_type, - dropout=dropout, - dropout_att=dropout_att, - pos_bias_u=pos_bias_u, - pos_bias_v=pos_bias_v, - adaptive_scale=adaptive_scale, - ) - self.layers.append(layer) - - # Time Reduction and Recovery layer setup - self.time_reduce_layer = None - self.time_recovery_layer = None - self.time_reduce_pos_enc = None - # Add time reduction layer - if self.time_reduce_idx is not None: - self.time_reduce_layer = TimeReductionModule(d_model, d_model, kernel_size=5, stride=2) - self.time_recovery_layer = nn.Linear(d_model, d_model) - - # Chose same type of positional encoding as the originally determined above - if self_attention_model == "rel_pos": - self.time_reduce_pos_enc = RelPositionalEncoding( - d_model=d_model, dropout_rate=0.0, max_len=pos_emb_max_len, xscale=None, dropout_rate_emb=0.0, - ) - else: - self.time_reduce_pos_enc = PositionalEncoding( - d_model=d_model, dropout_rate=0.0, max_len=pos_emb_max_len, xscale=None, dropout_rate_emb=0.0 - ) - - self.pre_ln = nn.LayerNorm(d_model) - - if feat_out > 0 and feat_out != self._feat_out: - self.out_proj = nn.Linear(self._feat_out, feat_out) - self._feat_out = feat_out - else: - self.out_proj = None - self._feat_out = d_model - self.set_max_audio_length(self.pos_emb_max_len) - self.use_pad_mask = True - - # will be set in self.forward() if defined in AccessMixin config - self.interctc_capture_at_layers = None - - def set_max_audio_length(self, max_audio_length): - """ Sets maximum input length. - Pre-calculates internal seq_range mask. 
- """ - self.max_audio_length = max_audio_length - device = next(self.parameters()).device - seq_range = torch.arange(0, self.max_audio_length, device=device) - if hasattr(self, 'seq_range'): - self.seq_range = seq_range - else: - self.register_buffer('seq_range', seq_range, persistent=False) - self.pos_enc.extend_pe(max_audio_length, device) - - if self.time_reduce_pos_enc is not None: - self.time_reduce_pos_enc.extend_pe(max_audio_length, device) - - @typecheck() - def forward(self, audio_signal, length=None): - self.update_max_seq_length(seq_length=audio_signal.size(2), device=audio_signal.device) - return self.forward_for_export(audio_signal=audio_signal, length=length) - - @typecheck() - def forward_for_export(self, audio_signal, length): - max_audio_length: int = audio_signal.size(-1) - - if max_audio_length > self.max_audio_length: - self.set_max_audio_length(max_audio_length) - - if length is None: - length = audio_signal.new_full( - audio_signal.size(0), max_audio_length, dtype=torch.int32, device=self.seq_range.device - ) - - audio_signal = torch.transpose(audio_signal, 1, 2) - - if isinstance(self.pre_encode, nn.Linear): - audio_signal = self.pre_encode(audio_signal) - else: - audio_signal, length = self.pre_encode(audio_signal, length) - - audio_signal, pos_emb = self.pos_enc(audio_signal) - # adjust size - max_audio_length = audio_signal.size(1) - # Create the self-attention and padding masks - - pad_mask = self.make_pad_mask(max_audio_length, length) - att_mask = pad_mask.unsqueeze(1).repeat([1, max_audio_length, 1]) - att_mask = torch.logical_and(att_mask, att_mask.transpose(1, 2)) - if self.att_context_size[0] >= 0: - att_mask = att_mask.triu(diagonal=-self.att_context_size[0]) - if self.att_context_size[1] >= 0: - att_mask = att_mask.tril(diagonal=self.att_context_size[1]) - att_mask = ~att_mask - - if self.use_pad_mask: - pad_mask = ~pad_mask - else: - pad_mask = None - - # Create cache of activations for the time reduction step - # Note: NeMo codebase allows only a single time reduction step to occur - recovery_activation_cache = [] - - audio_signal = self.pre_ln(audio_signal) - for lth, layer in enumerate(self.layers): - # Perform time reduction - if self.time_reduce_layer is not None and lth == self.time_reduce_idx: - # Perform time reduction - recovery_activation_cache.append((audio_signal, att_mask, pad_mask, pos_emb)) - audio_signal, att_mask, pad_mask = self.time_reduce_layer( - x=audio_signal, att_mask=att_mask, pad_mask=pad_mask - ) - # Only update PE, not the original audio_signal - _, pos_emb = self.time_reduce_pos_enc(audio_signal) - - # Perform time recovery - if self.time_recovery_layer is not None and lth == self.time_recovery_idx: - recovery_audio_signal, att_mask, pad_mask, pos_emb = recovery_activation_cache.pop(0) - # repeat interleaved values for 2x seq length - audio_signal = torch.repeat_interleave(audio_signal, repeats=2, dim=1) - - B, T, D = recovery_audio_signal.size() - audio_signal = audio_signal[:, :T, :] # Slice off the exact T timesteps as original cache value - audio_signal = self.time_recovery_layer(audio_signal) # learn non linear mapping - audio_signal = recovery_audio_signal + audio_signal # learn just the residual - - audio_signal = layer(x=audio_signal, att_mask=att_mask, pos_emb=pos_emb, pad_mask=pad_mask) - - # saving tensors if required for interctc loss - if self.is_access_enabled(): - if self.interctc_capture_at_layers is None: - self.interctc_capture_at_layers = self.access_cfg.get('interctc', {}).get('capture_layers', []) - if 
lth in self.interctc_capture_at_layers: - lth_audio_signal = audio_signal - if self.out_proj is not None: - lth_audio_signal = self.out_proj(audio_signal) - # shape is the same as the shape of audio_signal output, i.e. [B, D, T] - self.register_accessible_tensor( - name=f'interctc/layer_output_{lth}', tensor=torch.transpose(lth_audio_signal, 1, 2) - ) - self.register_accessible_tensor(name=f'interctc/layer_length_{lth}', tensor=length) - - if self.out_proj is not None: - audio_signal = self.out_proj(audio_signal) - - audio_signal = torch.transpose(audio_signal, 1, 2) - return audio_signal, length - - def update_max_seq_length(self, seq_length: int, device): - # Find global max audio length across all nodes - if torch.distributed.is_initialized(): - global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) - - # Update across all ranks in the distributed system - torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) - - seq_length = global_max_len.int().item() - - if seq_length > self.max_audio_length: - self.set_max_audio_length(seq_length) - - def make_pad_mask(self, max_audio_length, seq_lens): - """Make masking for padding.""" - mask = self.seq_range[:max_audio_length].expand(seq_lens.size(0), -1) < seq_lens.unsqueeze(-1) - return mask - - def enable_pad_mask(self, on=True): - # On inference, user may chose to disable pad mask - mask = self.use_pad_mask - self.use_pad_mask = on - return mask - - -class SqueezeformerEncoderAdapter(SqueezeformerEncoder, adapter_mixins.AdapterModuleMixin): - - # Higher level forwarding - def add_adapter(self, name: str, cfg: dict): - cfg = self._update_adapter_cfg_input_dim(cfg) - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conformer_layer.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any([conformer_layer.is_adapter_available() for conformer_layer in self.layers]) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conformer_layer.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = set([]) - for conformer_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - names.update(conformer_layer.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.d_model) - return cfg - - def get_accepted_adapter_types(self,) -> Set[type]: - types = super().get_accepted_adapter_types() - - if len(types) == 0: - self.set_accepted_adapter_types( - [ - adapter_utils.LINEAR_ADAPTER_CLASSPATH, - adapter_utils.MHA_ADAPTER_CLASSPATH, - adapter_utils.RELMHA_ADAPTER_CLASSPATH, - ] - ) - types = self.get_accepted_adapter_types() - return types - - -""" -Register any additional information -""" -if adapter_mixins.get_registered_adapter(SqueezeformerEncoder) is None: - adapter_mixins.register_adapter(base_class=SqueezeformerEncoder, adapter_class=SqueezeformerEncoderAdapter) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__init__.py deleted file mode 100644 index 4da13981c4b7becf9247d3b213897c5dbe6eec00..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__init__.py +++ /dev/null @@ 
-1,21 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.modules.transformer.bridge_encoders import * -from nemo.collections.asr.modules.transformer.perceiver_encoders import * -from nemo.collections.asr.modules.transformer.transformer_bottleneck import * -from nemo.collections.asr.modules.transformer.transformer_decoders import * -from nemo.collections.asr.modules.transformer.transformer_encoders import * -from nemo.collections.asr.modules.transformer.transformer_generators import * -from nemo.collections.asr.modules.transformer.transformer_modules import * diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 665b04bbd18ee573aec9aa4457f66c4d837bed98..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/bridge_encoders.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/bridge_encoders.cpython-310.pyc deleted file mode 100644 index 03040bef8b309222a1a3121abaa36cb960680681..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/bridge_encoders.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/decoder_module.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/decoder_module.cpython-310.pyc deleted file mode 100644 index eda731c34aac2292fafb98183fb249e14292cbed..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/decoder_module.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/encoder_module.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/encoder_module.cpython-310.pyc deleted file mode 100644 index 24e1417ad45177dd386584b10ec71f48d0e8f9c2..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/encoder_module.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/perceiver_encoders.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/perceiver_encoders.cpython-310.pyc deleted file mode 100644 index 2c1caac59cc7edeb8fc15cfbf9654fd2fb4658df..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/perceiver_encoders.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/reduction_encoders.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/reduction_encoders.cpython-310.pyc deleted file mode 100644 index 47ab66d9f6079658beba1935732c395b1f8490f0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/reduction_encoders.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer.cpython-310.pyc deleted file mode 100644 index 95e8dbe7ada1e3dab608b70fe4563f7b8bf10637..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_bottleneck.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_bottleneck.cpython-310.pyc deleted file mode 100644 index d36479e71056285df3c32c09bbfb097aed4cb5f3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_bottleneck.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_decoders.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_decoders.cpython-310.pyc deleted file mode 100644 index 7164190712b35cd8b918f031876343e4426a6227..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_decoders.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_encoders.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_encoders.cpython-310.pyc deleted file mode 100644 index 75bd4e70babe027622fc30f3352f746aefc45636..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_encoders.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_generators.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_generators.cpython-310.pyc deleted file mode 100644 index 6ee8bd974de609dfcde89747425c0e4b5d031737..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_generators.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_modules.cpython-310.pyc deleted file mode 100644 index 9bbe666d9e1b31ef4bdb94ab6123f1a4d5a3b956..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/__pycache__/transformer_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/bridge_encoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/bridge_encoders.py deleted file mode 100644 index 5c72d27b9ebf9a618c93346cf6b6371c58df6b45..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/bridge_encoders.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections.asr.modules.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.asr.modules.transformer.transformer_modules import AttentionBridge - -__all__ = ["BridgeEncoder"] - - -class BridgeEncoder(torch.nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 32, - hidden_init_method: str = "default", - hidden_blocks: int = 0, - ): - super().__init__() - - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - - if self._hidden_init_method == "default": - self._hidden_init_method = "enc_shared" - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - # attention bridge - self.att_bridge = AttentionBridge(hidden_size=hidden_size, k=hidden_steps, bridge_size=inner_size,) - - if self.hidden_init_method == "enc": - self.init_hidden_enc = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - # self attention - self.hidden_enc = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @property - def supported_init_methods(self): - return ["enc_shared", "identity", "enc"] - - @property - def hidden_steps(self): - return self._hidden_steps - - 
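[Editor's note: the `AttentionBridge` built in the constructor above projects a variable-length sequence onto a fixed number of `hidden_steps`. A hedged, simplified sketch of that idea using a single learned-query attention; the real NeMo module may differ in detail.]

```python
import torch
import torch.nn as nn

class ToyAttentionBridge(nn.Module):
    """Map [B, L, H] onto k fixed steps with learned attention queries."""

    def __init__(self, hidden_size, k):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(k, hidden_size) / hidden_size ** 0.5)

    def forward(self, hidden, hidden_mask):
        scores = torch.einsum('kh,blh->bkl', self.queries, hidden)            # [B, k, L]
        scores = scores.masked_fill(~hidden_mask.bool().unsqueeze(1), float('-inf'))
        return scores.softmax(dim=-1) @ hidden                                # [B, k, H]

bridge = ToyAttentionBridge(hidden_size=64, k=32)
summary = bridge(torch.randn(2, 50, 64), torch.ones(2, 50))                   # [2, 32, 64]
```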
@property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # self-attention over input - if self.hidden_init_method == "enc_shared": - residual = encoder_states - hidden_states = self.hidden_enc(encoder_states=encoder_states, encoder_mask=encoder_mask) - # residual connection - hidden_states += residual - elif self.hidden_init_method == "identity": - hidden_states = encoder_states - elif self.hidden_init_method == "enc": - residual = encoder_states - hidden_states = self.init_hidden_enc(encoder_states=encoder_states, encoder_mask=encoder_mask) - # residual connection - hidden_states += residual - - # project encoder states to a fixed steps hidden using k attention heads - hidden_states = self.att_bridge(hidden=hidden_states, hidden_mask=encoder_mask) - - # all hidden values are active - hidden_mask = torch.ones( - encoder_states.shape[0], self._hidden_steps, dtype=encoder_mask.dtype, device=encoder_mask.device - ) - - # apply self-attention over fixed-size hidden_states - for block in range(self._hidden_blocks): - residual = hidden_states - hidden_states = self.hidden_enc(encoder_states=hidden_states, encoder_mask=hidden_mask) - # residual connection - hidden_states += residual - - return hidden_states, hidden_mask diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/decoder_module.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/decoder_module.py deleted file mode 100644 index d1cb8ac9b1f049c93f3b679ae0a56962012f6e06..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/decoder_module.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from typing import Any, Dict, Optional - -from nemo.core.classes import NeuralModule -from nemo.core.neural_types import ChannelType, EncodedRepresentation, MaskType, NeuralType - -__all__ = ['DecoderModule'] - - -class DecoderModule(NeuralModule, ABC): - """ Base class for decoder neural module to be used in NLP models. 
""" - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "decoder_mask": NeuralType(('B', 'T'), MaskType(), optional=True), - "encoder_embeddings": NeuralType(('B', 'T', 'D'), ChannelType(), optional=True), - "encoder_mask": NeuralType(('B', 'T'), MaskType(), optional=True), - "decoder_mems": NeuralType(('B', 'D', 'T', 'D'), EncodedRepresentation(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def hidden_size(self) -> Optional[int]: - raise NotImplementedError - - @property - def vocab_size(self) -> Optional[int]: - raise NotImplementedError - - @property - def embedding(self) -> Optional[Any]: - raise NotImplementedError - - @property - def decoder(self) -> Optional[Any]: - raise NotImplementedError - - @property - def max_sequence_length(self) -> Optional[int]: - raise NotImplementedError diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/encoder_module.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/encoder_module.py deleted file mode 100644 index bd3912e0e693e1e3f5f32e7702bf894445a77485..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/encoder_module.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from typing import Dict, Optional - -from nemo.core.classes import NeuralModule -from nemo.core.neural_types import ChannelType, MaskType, NeuralType - -__all__ = ['EncoderModule'] - - -class EncoderModule(NeuralModule, ABC): - """ Base class for encoder neural module to be used in NLP models. """ - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_ids": NeuralType(('B', 'T'), ChannelType()), - "encoder_mask": NeuralType(('B', 'T'), MaskType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def hidden_size(self) -> Optional[int]: - raise NotImplementedError diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/perceiver_encoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/perceiver_encoders.py deleted file mode 100644 index e836e20be7bc6222b1612ec769069cd7c9e9023a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/perceiver_encoders.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch - -from nemo.collections.asr.modules.transformer.transformer_decoders import TransformerDecoder -from nemo.collections.asr.modules.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.asr.modules.transformer.transformer_modules import AttentionBridge - -__all__ = ["PerceiverEncoder"] - - -class PerceiverEncoder(torch.nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 32, - hidden_init_method: str = "default", - hidden_blocks: int = 2, - ): - super().__init__() - - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - - if self._hidden_init_method == "default": - self._hidden_init_method = "params" - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - diagonal = 0 if mask_future else None - - if self.hidden_init_method == "params": - # learnable initial hidden values - self.init_hidden = torch.nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(hidden_steps, hidden_size))) - self.init_cross_att = TransformerDecoder( - num_layers=1, - hidden_size=hidden_size, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.init_cross_att.diagonal = diagonal - elif self.hidden_init_method == "bridge": - # initialize latent with attention bridge - self.att_bridge = AttentionBridge(hidden_size=hidden_size, k=hidden_steps, bridge_size=inner_size,) - - # cross-attention encoder - layer = TransformerDecoder( - num_layers=1, - hidden_size=hidden_size, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - layer.diagonal = diagonal - self.cross_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - # self-attention encoder - layer = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.self_att_layers = 
torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - @property - def supported_init_methods(self): - return ["params", "bridge"] - - @property - def hidden_steps(self): - return self._hidden_steps - - @property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # all hidden values are active - hidden_mask = torch.ones( - encoder_states.shape[0], self._hidden_steps, dtype=encoder_mask.dtype, device=encoder_mask.device - ) - - # initialize hidden state - if self._hidden_init_method == "params": - # initialize latent with learned parameters - hidden_states = self.init_hidden.unsqueeze(0).expand(encoder_states.shape[0], -1, -1) - hidden_states = self.init_cross_att( - decoder_states=hidden_states, - decoder_mask=hidden_mask, - encoder_states=encoder_states, - encoder_mask=encoder_mask, - ) - elif self._hidden_init_method == "bridge": - # initialize latent with attention bridge - hidden_states = self.att_bridge(hidden=encoder_states, hidden_mask=encoder_mask,) - - # apply block (cross-attention, self-attention) multiple times - # for block in range(self._hidden_blocks): - for self_att, cross_att in zip(self.self_att_layers, self.cross_att_layers): - residual = hidden_states - - # cross attention of hidden over encoder states - hidden_states = cross_att( - decoder_states=hidden_states, - decoder_mask=hidden_mask, - encoder_states=encoder_states, - encoder_mask=encoder_mask, - ) - - # self-attention over hidden - hidden_states = self_att(encoder_states=hidden_states, encoder_mask=hidden_mask,) - - # residual connection - hidden_states += residual - - return hidden_states, hidden_mask diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/reduction_encoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/reduction_encoders.py deleted file mode 100644 index 0c3355b0949f54c220814131509a209956245d9c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/reduction_encoders.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
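[Editor's note: before moving on to the pooling encoder, a hedged sketch of the Perceiver-style loop in the forward pass above: a small set of latent vectors cross-attends to the encoder states, then self-attends, with a residual connection around each block. Plain torch built-ins are used here, not the NeMo layers.]

```python
import torch
import torch.nn as nn

B, L, H, K, blocks = 2, 50, 64, 16, 2
latents = nn.Parameter(nn.init.xavier_normal_(torch.empty(K, H)))
cross_att = nn.MultiheadAttention(H, num_heads=4, batch_first=True)
self_att = nn.MultiheadAttention(H, num_heads=4, batch_first=True)

encoder_states = torch.randn(B, L, H)
hidden = latents.unsqueeze(0).expand(B, -1, -1)       # [B, K, H] initial latent state
for _ in range(blocks):                               # the hidden_blocks iterations
    residual = hidden
    hidden, _ = cross_att(hidden, encoder_states, encoder_states)   # latents read the inputs
    hidden, _ = self_att(hidden, hidden, hidden)                    # latents mix among themselves
    hidden = hidden + residual
```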
- -import copy - -import torch - -from nemo.collections.asr.modules.transformer.transformer_encoders import TransformerEncoder - -__all__ = ["PoolingEncoder"] - - -class PoolingEncoder(torch.nn.Module): - - _SUPPORTED_ARCH = ["max", "avg"] - - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - hidden_steps: int = 4, - hidden_init_method: str = "default", - hidden_blocks: int = 2, - pooling_type: str = "max", - ): - super().__init__() - - # minimal steps to allow reduction - self._hidden_steps = hidden_steps - self._hidden_init_method = hidden_init_method - self._hidden_blocks = hidden_blocks - self._pooling_type = pooling_type - - if self._hidden_steps < 2: - raise ValueError("Expected hidden_steps >= 2 but received hidden_steps = {self._hidden_steps}") - - if self.hidden_init_method not in self.supported_init_methods: - raise ValueError( - "Unknown hidden_init_method = {hidden_init_method}, supported methods are {supported_init_methods}".format( - hidden_init_method=self.hidden_init_method, supported_init_methods=self.supported_init_methods, - ) - ) - - if self._pooling_type not in self.supported_arch: - raise ValueError(f"Unknown pooling_type = {pooling_type}. Available values = {self.supported_arch}") - - # self-attention encoder - layer = TransformerEncoder( - num_layers=num_layers, - hidden_size=hidden_size, - inner_size=inner_size, - mask_future=mask_future, - num_attention_heads=num_attention_heads, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - ffn_dropout=ffn_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - self.self_att_layers = torch.nn.ModuleList([copy.deepcopy(layer) for _ in range(hidden_blocks)]) - - self.pooling = self._build_pooling_module() - - def _build_pooling_module(self): - """ - Returns pooling module. - Allows to override for child classes. 
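[Editor's note: a toy sketch of the stride-2 pooling this method returns and the forward pass below applies to halve both the hidden states and the mask (assumed shapes):]

```python
import torch

pool = torch.nn.MaxPool1d(kernel_size=2, stride=2)

hidden_states = torch.randn(2, 10, 64)                 # [B, L, H]
hidden_mask = torch.ones(2, 10)                        # [B, L]

hidden_states = pool(hidden_states.permute(0, 2, 1)).permute(0, 2, 1)   # [B, L/2, H]
hidden_mask = pool(hidden_mask.unsqueeze(0)).squeeze(0)                 # [B, L/2]
print(hidden_states.shape, hidden_mask.shape)          # torch.Size([2, 5, 64]) torch.Size([2, 5])
```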
- """ - if self._pooling_type == "max": - pooling = torch.nn.MaxPool1d(kernel_size=2, stride=2) - elif self._pooling_type == "avg": - pooling = torch.nn.AvgPool1d(kernel_size=2, stride=2) - - return pooling - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def supported_init_methods(self): - return ["default"] - - @property - def hidden_steps(self): - return self._hidden_steps - - @property - def hidden_blocks(self): - return self._hidden_blocks - - @property - def hidden_init_method(self): - return self._hidden_init_method - - def forward(self, encoder_states, encoder_mask): - """ - Args: - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - """ - # initialize hidden state - hidden_mask = encoder_mask - hidden_states = encoder_states - - # apply block (self-attention, max-pool) multiple times - for self_att in self.self_att_layers: - residual = hidden_states - - # self-attention over hidden - hidden_states = self_att(encoder_states=hidden_states, encoder_mask=hidden_mask) - - hidden_states += residual - - # max pool reduction if possible - if hidden_states.shape[1] >= self.hidden_steps: - # max pool hidden states - hidden_states = hidden_states.permute(0, 2, 1) - hidden_states = self.pooling(hidden_states) - hidden_states = hidden_states.permute(0, 2, 1) - - # max pool mask - hidden_mask = ( - self.pooling(hidden_mask.unsqueeze(0).type_as(hidden_states)).squeeze(0).type_as(hidden_mask) - ) - - return hidden_states, hidden_mask diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/text_generation.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/text_generation.py deleted file mode 100644 index a261e925691f291f72c16ba7d5b0d8f5ee0896f1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/text_generation.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -from typing import List, Tuple, Union - -from torch import Tensor - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - - -class LengthParam(TypedDict): - max_length: int # The maximum length of the sequence to be generated. - min_length: int # The minimum length of the sequence to be generated. - - -class SamplingParam(TypedDict): - use_greedy: bool # Whether or not to use sampling ; use greedy decoding otherwise - temperature: float # sampling temperature - top_k: int # The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p: float # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - repetition_penalty: float # The parameter for repetition penalty. 1.0 means no penalty. 
-    add_BOS: bool  # whether to add the BOS token at the beginning of the prompt
-    all_probs: bool  # whether to return the log prob for all the tokens in the vocab
-    compute_logprob: bool  # whether to compute the log prob of the input text (a special inference mode), default False
-
-
-class OutputType(TypedDict):
-    sentences: List[str]  # output sentences
-    tokens: List[List[str]]  # output sentences broken into tokens
-    logprob: List[List[float]]  # log prob of generated tokens
-    full_logprob: List[List[float]]  # log prob of all the tokens in the vocab
-    token_ids: List[List[int]]  # output sentence token ids
-    offsets: List[List[int]]  # list of token start positions in the text
-
-
-class TextGeneration:
-    """
-    Interface for all text generation models.
-    """
-
-    def generate(
-        self,
-        inputs: Union[List[str], Tuple[Tensor, Tensor], List[dict]],
-        length_params: LengthParam,
-        sampling_params: SamplingParam = None,
-    ) -> OutputType:
-        """
-        Public method to generate text.
-
-        Args:
-            inputs (Union[List[str], Tuple[Tensor, Tensor], List[dict]]):
-                Can be one of the 3 types:
-                1. List of strings. Each element of the list is an input prompt; the model will apply its tokenizer to it.
-                   E.g. ['sentence', 'sentence2', …]
-                2. Tuple of PyTorch tensors (context_tokens, context_lengths). `context_tokens` has shape (batch_size, seq_length); it holds the batched token sequences used as prompts for generation or as model inputs to the encoder.
-                   The model will skip the tokenization and padding steps. `context_lengths` has shape (batch_size,); it gives the length of the context tokens for each input sequence.
-                   E.g. (torch.tensor([[23, 5234, 23, 35, …], [223, 323, 23, 23232, 232, ...], …]), torch.tensor([20, 30, …]))
-                3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs is converted into input token embeddings for the model.
-                   E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"},
-                         {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ...]
-                   where 'prompt-tag' identifies the type of NLP task to solve.
-            length_params (LengthParam):
-                a dictionary that controls the length of the generated sequence.
-                    max_length: int, the maximum length of the sequence to be generated.
-                    min_length: int, the minimum length of the sequence to be generated.
-                If None, max_length is set to 30 and min_length is set to None.
-            sampling_params (SamplingParam):
-                a dictionary with the parameters for text sampling. It has the following keys:
-                    use_greedy: bool, whether to use greedy decoding instead of sampling.
-                    temperature: float, sampling temperature.
-                    top_k: int, the number of highest-probability vocabulary tokens to keep for top-k filtering.
-                    top_p: float, if set to a float < 1, only the most probable tokens whose probabilities add up to top_p or higher are kept for generation.
-                    repetition_penalty: float, the parameter for repetition penalty; 1.0 means no penalty.
-                    add_BOS: bool, whether to add the BOS token at the beginning of the prompt.
-                    all_probs: bool, whether to return the log prob for all the tokens in the vocab.
-                    compute_logprob: bool, whether to compute the log prob of the input text (a special inference mode), default False.
-                Default None; if None, use_greedy will be set to True.
-        Returns:
-            OutputType: the generated output as a dictionary.
It has the following keys: - sentences: List[str], output sentences - tokens: List[List[str]], output sentences borken into tokens - logprob: List[List[float]], log prob of generated tokens - full_logprob: List[List[float]], log prob of all the tokens in the vocab - token_ids: List[List[int]], output sentence token ids - offsets: List[List[int]] # list of tokens start positions in text - """ - raise NotImplementedError("please implement this method") diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer.py deleted file mode 100644 index 718448aa1c7c6a4848f35ba12208c32c5d749484..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Dict, Optional - -import torch -from omegaconf.omegaconf import MISSING - -from nemo.collections.asr.modules.transformer.decoder_module import DecoderModule -from nemo.collections.asr.modules.transformer.encoder_module import EncoderModule -from nemo.collections.asr.modules.transformer.transformer_decoders import TransformerDecoder -from nemo.collections.asr.modules.transformer.transformer_encoders import TransformerEncoder -from nemo.collections.asr.modules.transformer.transformer_modules import TransformerEmbedding -from nemo.core.classes.common import typecheck -from nemo.core.classes.exportable import Exportable -from nemo.core.neural_types import ChannelType, NeuralType - - -@dataclass -class NeMoTransformerConfig: - # must be configured by the user - hidden_size: int = MISSING - num_layers: int = MISSING - inner_size: int = MISSING - num_attention_heads: int = MISSING - - # embedding - max_sequence_length: int = 512 - num_token_types: int = 2 - embedding_dropout: float = 0.0 - learn_positional_encodings: bool = False - - # transformer - ffn_dropout: float = 0.0 - attn_score_dropout: float = 0.0 - attn_layer_dropout: float = 0.0 - hidden_act: str = 'relu' - pre_ln: bool = False - pre_ln_final_layer_norm: bool = True - - # named model arguments - library: str = 'nemo' - model_name: Optional[str] = None - pretrained: bool = False - - -@dataclass -class NeMoTransformerEncoderConfig(NeMoTransformerConfig): - mask_future: bool = False - - -@dataclass -class NeMoTransformerDecoderConfig(NeMoTransformerConfig): - r2l: bool = False - - -class TransformerEncoderNM(EncoderModule, Exportable): - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - 
mask_future: bool = False, - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - ): - super().__init__() - - self._vocab_size = vocab_size - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - - self._embedding = TransformerEmbedding( - vocab_size=self._vocab_size, - hidden_size=self._hidden_size, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ) - - self._encoder = TransformerEncoder( - hidden_size=self._hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @typecheck() - def forward(self, input_ids, encoder_mask): - embeddings = self._embedding(input_ids=input_ids) - encoder_hidden_states = self._encoder(encoder_states=embeddings, encoder_mask=encoder_mask) - return encoder_hidden_states - - @property - def hidden_size(self): - return self._hidden_size - - @property - def vocab_size(self): - return self._vocab_size - - @property - def max_sequence_length(self): - return self._max_sequence_length - - @property - def embedding(self): - return self._embedding - - @property - def encoder(self): - return self._encoder - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - sample = next(self.parameters()) - sz = (max_batch, max_dim) - input_ids = torch.randint(low=0, high=2048, size=sz, device=sample.device) - encoder_mask = torch.randint(low=0, high=1, size=sz, device=sample.device) - return tuple([input_ids, encoder_mask]) - - -class TransformerDecoderNM(DecoderModule, Exportable): - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - ): - super().__init__() - - self._vocab_size = vocab_size - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - self.num_states = num_layers + 1 - self.return_mems = False - if pre_ln_final_layer_norm: - self.num_states += 1 - - self._embedding = TransformerEmbedding( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ) - - self._decoder = TransformerDecoder( - hidden_size=self.hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - @typecheck() - def forward( - self, input_ids, decoder_mask, encoder_embeddings, encoder_mask, decoder_mems=None, - ): - start_pos = 0 - if decoder_mems is not None: - start_pos = input_ids.shape[1] - 1 - input_ids 
= input_ids[:, -1:] - decoder_mask = decoder_mask[:, -1:] - decoder_mems = torch.transpose(decoder_mems, 0, 1) - decoder_embeddings = self._embedding(input_ids=input_ids, start_pos=start_pos) - decoder_hidden_states = self._decoder( - decoder_states=decoder_embeddings, - decoder_mask=decoder_mask, - encoder_states=encoder_embeddings, - encoder_mask=encoder_mask, - decoder_mems_list=decoder_mems, - return_mems=self.return_mems, - return_mems_as_list=False, - ) - if self.return_mems: - decoder_hidden_states = torch.transpose(decoder_hidden_states, 0, 1) - return decoder_hidden_states - - @property - def hidden_size(self): - return self._hidden_size - - @property - def vocab_size(self): - return self._vocab_size - - @property - def max_sequence_length(self): - return self._max_sequence_length - - @property - def embedding(self): - return self._embedding - - @property - def decoder(self): - return self._decoder - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - sample = next(self.parameters()) - sz = (max_batch, max_dim) - input_ids = torch.randint(low=0, high=2048, size=sz, device=sample.device) - encoder_mask = torch.randint(low=0, high=1, size=sz, device=sample.device) - mem_size = [max_batch, self.num_states, max_dim - 1, self._hidden_size] - decoder_mems = torch.rand(mem_size, device=sample.device) - return tuple([input_ids, encoder_mask, self._embedding(input_ids), encoder_mask, decoder_mems]) - - def _prepare_for_export(self, **kwargs): - self._decoder.diagonal = None - self.return_mems = True - super()._prepare_for_export(**kwargs) - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - if self.return_mems: - return {"last_hidden_states": NeuralType(('B', 'D', 'T', 'D'), ChannelType())} - else: - return {"last_hidden_states": NeuralType(('B', 'T', 'D'), ChannelType())} diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_bottleneck.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_bottleneck.py deleted file mode 100644 index c463b4de1c70418c01bd944bb2d22d366f1e2df0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_bottleneck.py +++ /dev/null @@ -1,336 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
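-
-# Usage sketch (illustrative only; the class is defined further down in this file, and the
-# hyperparameter values and shapes below are assumptions chosen for the example, not taken
-# from this repository). With arch="max_pool", construction is routed to the PoolingEncoder
-# shown earlier, which halves the hidden sequence once per block while its length stays
-# >= hidden_steps:
-#
-#   import torch
-#
-#   encoder = TransformerBottleneckEncoderNM(
-#       vocab_size=1024, hidden_size=64, num_layers=2, inner_size=256,
-#       num_attention_heads=4, arch="max_pool", hidden_steps=4, hidden_blocks=2,
-#   )
-#   input_ids = torch.randint(0, 1024, (2, 16))         # (B, L_enc)
-#   encoder_mask = torch.ones(2, 16, dtype=torch.long)  # (B, L_enc)
-#   hidden, hidden_mask = encoder(input_ids=input_ids, encoder_mask=encoder_mask)
-#   # hidden: (B, L_pooled, H); hidden_mask is the correspondingly pooled mask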
- -from dataclasses import dataclass -from typing import Dict, Optional - -from nemo.collections.asr.modules.transformer.bridge_encoders import BridgeEncoder -from nemo.collections.asr.modules.transformer.perceiver_encoders import PerceiverEncoder -from nemo.collections.asr.modules.transformer.reduction_encoders import PoolingEncoder -from nemo.collections.asr.modules.transformer.transformer import ( - NeMoTransformerConfig, - TransformerDecoderNM, - TransformerEncoderNM, -) -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import MaskType, NeuralType -from nemo.core.neural_types.elements import BoolType - -__all__ = [ - "NeMoTransformerBottleneckConfig", - "NeMoTransformerBottleneckEncoderConfig", - "NeMoTransformerBottleneckDecoderConfig", - "TransformerBottleneckEncoderNM", -] - - -@dataclass -class NeMoTransformerBottleneckConfig(NeMoTransformerConfig): - # architecture details (default is no bottleneck) - arch: str = '' - hidden_steps: int = -1 - hidden_blocks: int = 1 - hidden_init_method: str = "params" - - -@dataclass -class NeMoTransformerBottleneckEncoderConfig(NeMoTransformerBottleneckConfig): - mask_future: bool = False - # change return_mask to False to return hidden states only (default for non-bottleneck encoder) - return_mask: bool = True - - -@dataclass -class NeMoTransformerBottleneckDecoderConfig(NeMoTransformerBottleneckConfig): - r2l: bool = False - - -class TransformerBottleneckEncoderNM(TransformerEncoderNM): - - _SUPPORTED_ARCH = ["seq2seq", "bridge", "perceiver", "max_pool", "avg_pool"] - - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - mask_future: bool = False, - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - arch: str = '', - hidden_steps: int = -1, - hidden_blocks: int = 1, - hidden_init_method: str = "default", - # default whether forward() method returns hidden or (hidden, mask) - return_mask=True, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - self._arch = arch - self._return_mask = return_mask - - # replace encoder - self._encoder = self._build_encoder( - arch=arch, - hidden_steps=hidden_steps, - hidden_blocks=hidden_blocks, - hidden_init_method=hidden_init_method, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - mask_future=mask_future, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - def _build_encoder(self, arch, **kwargs): - """ - Returns a decoder based on architecture arch and kwargs - """ - # default 
non-bottleneck transformer encoder - if (not arch) or (arch == "seq2seq"): - encoder = self.encoder - elif arch == "bridge": - encoder = BridgeEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - ) - elif arch == "perceiver": - encoder = PerceiverEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - ) - elif arch == "max_pool": - encoder = PoolingEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - pooling_type="max", - ) - elif arch == "avg_pool": - encoder = PoolingEncoder( - num_layers=kwargs["num_layers"], - hidden_size=kwargs["hidden_size"], - inner_size=kwargs["inner_size"], - num_attention_heads=kwargs["num_attention_heads"], - attn_score_dropout=kwargs["attn_score_dropout"], - attn_layer_dropout=kwargs["attn_layer_dropout"], - ffn_dropout=kwargs["ffn_dropout"], - hidden_act=kwargs["hidden_act"], - mask_future=kwargs["mask_future"], - pre_ln=kwargs["pre_ln"], - pre_ln_final_layer_norm=kwargs["pre_ln_final_layer_norm"], - hidden_steps=kwargs["hidden_steps"], - hidden_blocks=kwargs["hidden_blocks"], - hidden_init_method=kwargs["hidden_init_method"], - pooling_type="avg", - ) - else: - raise ValueError(f"Unknown arch = {self.arch}, supported arch = {self.supported_arch}") - - return encoder - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - input_types = super().input_types - input_types.update( - {"return_mask": NeuralType((), BoolType(), True),} - ) - - return input_types - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - output_types = super().output_types - output_types.update( - {"hidden_mask": NeuralType(('B', 'T'), MaskType(), True),} - ) - return output_types - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def arch(self): - return self._arch - - @typecheck() - def forward(self, input_ids, encoder_mask, return_mask=None): - if return_mask is None: - return_mask = 
self._return_mask - - embeddings = self._embedding(input_ids=input_ids) - - if (not self.arch) or (self.arch == "seq2seq"): - encoder_hidden_states = self._encoder(encoder_states=embeddings, encoder_mask=encoder_mask) - encoder_hidden_mask = encoder_mask - else: - encoder_hidden_states, encoder_hidden_mask = self._encoder( - encoder_states=embeddings, encoder_mask=encoder_mask, - ) - - if return_mask: - return encoder_hidden_states, encoder_hidden_mask - else: - return encoder_hidden_states - - -class TransformerBottleneckDecoderNM(TransformerDecoderNM): - _SUPPORTED_ARCH = ["seq2seq"] - - def __init__( - self, - vocab_size: int, - hidden_size: int, - num_layers: int, - inner_size: int, - num_attention_heads: int, - max_sequence_length: int = 512, - num_token_types: int = 2, - embedding_dropout: float = 0.0, - learn_positional_encodings: bool = False, - ffn_dropout: float = 0.0, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - hidden_act: str = 'relu', - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - arch='', - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - self._arch = arch - - # replace decoder - self._decoder = self._build_decoder( - arch=arch, - hidden_size=hidden_size, - num_layers=num_layers, - inner_size=inner_size, - num_attention_heads=num_attention_heads, - max_sequence_length=max_sequence_length, - num_token_types=num_token_types, - embedding_dropout=embedding_dropout, - learn_positional_encodings=learn_positional_encodings, - ffn_dropout=ffn_dropout, - attn_score_dropout=attn_score_dropout, - attn_layer_dropout=attn_layer_dropout, - hidden_act=hidden_act, - pre_ln=pre_ln, - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - ) - - def _build_decoder(self, arch, **kwargs): - """ - Returns a decoder based on architecture arch and kwargs - """ - # usual non-bottleneck transformer decoder - if (not arch) or (arch == "seq2seq"): - decoder = self.decoder - else: - raise ValueError(f"Unknown arch = {self.arch}, supported arch = {self.supported_arch}") - - return decoder - - @property - def supported_arch(self): - return self._SUPPORTED_ARCH - - @property - def arch(self): - return self._arch diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_decoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_decoders.py deleted file mode 100644 index cfe2e9229f92050af90069cb5550b12e18cc1722..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_decoders.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch -import torch.nn as nn - -from nemo.collections.asr.modules.transformer.transformer_modules import MultiHeadAttention, PositionWiseFF -from nemo.collections.common.parts import form_attention_mask - -__all__ = ["TransformerDecoder"] - - -class TransformerDecoderBlock(nn.Module): - """ - Building block of Transformer decoder. - - Args: - hidden_size: size of the embeddings in the model, also known as d_model - inner_size: number of neurons in the intermediate part of feed-forward - net, usually is (4-8 x hidden_size) in the papers - num_attention_heads: number of heads in multi-head attention - attn_score_dropout: probability of dropout applied to attention scores - attn_layer_dropout: probability of dropout applied to the output of the - attention layers, but before layer normalization - ffn_dropout: probability of dropout applied to FFN output - hidden_act: activation function used between two linear layers in FFN - """ - - def __init__( - self, - hidden_size: int, - inner_size: int, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - ): - super().__init__() - self.pre_ln = pre_ln - self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=1e-5) - self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout - ) - self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=1e-5) - self.second_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout - ) - self.layer_norm_3 = nn.LayerNorm(hidden_size, eps=1e-5) - self.third_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) - - def forward_preln(self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask): - """ - Pre-LayerNorm block - Order of operations: LN -> Self-Attn -> Residual -> LN -> Cross-Attn -> Residual -> LN -> FFN - """ - residual = decoder_query - decoder_query = self.layer_norm_1(decoder_query) - decoder_keys = self.layer_norm_1(decoder_keys) - self_attn_output = self.first_sub_layer(decoder_query, decoder_keys, decoder_keys, decoder_mask) - self_attn_output += residual - - residual = self_attn_output - self_attn_output = self.layer_norm_2(self_attn_output) - enc_dec_attn_output = self.second_sub_layer(self_attn_output, encoder_states, encoder_states, encoder_mask) - enc_dec_attn_output += residual - - residual = enc_dec_attn_output - enc_dec_attn_output = self.layer_norm_3(enc_dec_attn_output) - output_states = self.third_sub_layer(enc_dec_attn_output) - output_states += residual - - return output_states - - def forward_postln(self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask): - """ - Post-LayerNorm block - Order of operations: Self-Attn -> Residual -> LN -> Cross-Attn -> Residual -> LN -> FFN -> Residual -> LN - """ - self_attn_output = self.first_sub_layer(decoder_query, decoder_keys, decoder_keys, decoder_mask) - self_attn_output += decoder_query - self_attn_output = 
self.layer_norm_1(self_attn_output) - - enc_dec_attn_output = self.second_sub_layer(self_attn_output, encoder_states, encoder_states, encoder_mask) - enc_dec_attn_output += self_attn_output - enc_dec_attn_output = self.layer_norm_2(enc_dec_attn_output) - - output_states = self.third_sub_layer(enc_dec_attn_output) - output_states += enc_dec_attn_output - return self.layer_norm_3(output_states) - - def forward(self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask): - if self.pre_ln: - return self.forward_preln(decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask) - else: - return self.forward_postln(decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask) - - -class TransformerDecoder(nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - ): - super().__init__() - - if pre_ln and pre_ln_final_layer_norm: - self.final_layer_norm = nn.LayerNorm(hidden_size, eps=1e-5) - else: - self.final_layer_norm = None - - layer = TransformerDecoderBlock( - hidden_size, - inner_size, - num_attention_heads, - attn_score_dropout, - attn_layer_dropout, - ffn_dropout, - hidden_act, - pre_ln, - ) - self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)]) - self.diagonal = 0 - - def _get_memory_states(self, decoder_states, decoder_mems_list=None, i=0): - if decoder_mems_list is not None: - inp1 = torch.transpose(decoder_mems_list[i], 1, 2) # Putting seq_len to last dim to handle export cases - inp2 = torch.transpose(decoder_states, 1, 2) - memory_states = torch.cat((inp1, inp2), dim=2) - memory_states = torch.transpose(memory_states, 1, 2) # Transposing back - else: - memory_states = decoder_states - return memory_states - - def forward( - self, - decoder_states, - decoder_mask, - encoder_states, - encoder_mask, - decoder_mems_list=None, - return_mems=False, - return_mems_as_list=True, - ): - """ - Args: - decoder_states: output of the embedding layer (B x L_dec x H) - decoder_mask: decoder inputs mask (B x L_dec) - encoder_states: output of the encoder (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - decoder_mems_list: list of the cached decoder hidden states - for fast autoregressive generation which will be used instead - of decoder_states as keys and values if not None - return_mems: bool, whether to return outputs of all decoder layers - or the last layer only - return_mems_as_list: bool, when True, mems returned are as a list; otherwise mems are Tensor - """ - decoder_attn_mask = form_attention_mask(decoder_mask, diagonal=self.diagonal) - encoder_attn_mask = form_attention_mask(encoder_mask) - memory_states = self._get_memory_states(decoder_states, decoder_mems_list, 0) - if return_mems_as_list: - cached_mems_list = [memory_states] - else: - cached_mems_list = memory_states.unsqueeze(0) - - for i, layer in enumerate(self.layers): - decoder_states = layer(decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask) - memory_states = self._get_memory_states(decoder_states, decoder_mems_list, i + 1) - if return_mems_as_list: - cached_mems_list.append(memory_states) - else: - cached_mems_list = torch.cat((cached_mems_list, memory_states.unsqueeze(0)), dim=0) - - if self.final_layer_norm is not None: - decoder_states = 
self.final_layer_norm(decoder_states) - memory_states = self._get_memory_states(decoder_states, decoder_mems_list, i + 2) - if return_mems_as_list: - cached_mems_list.append(memory_states) - else: - cached_mems_list = torch.cat((cached_mems_list, memory_states.unsqueeze(0)), dim=0) - - if return_mems: - return cached_mems_list - else: - return cached_mems_list[-1] - - def input_example(self, max_batch=1, max_dim=256): - """ - Generates input examples for tracing etc. - Returns: - A tuple of input examples. - """ - sample = next(self.parameters()) - input_ids = torch.randint(low=0, high=2048, size=(max_batch, max_dim, 1024), device=sample.device) - encoder_mask = torch.randint(low=0, high=1, size=(max_batch, max_dim), device=sample.device) - return tuple([input_ids, encoder_mask, input_ids, encoder_mask]) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_encoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_encoders.py deleted file mode 100644 index 544d561267cffc1d32ff6f838a874f5f896a2f1e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_encoders.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch -import torch.nn as nn - -from nemo.collections.asr.modules.transformer.transformer_modules import MultiHeadAttention, PositionWiseFF -from nemo.collections.common.parts import form_attention_mask - -__all__ = ["TransformerEncoder"] - - -class TransformerEncoderBlock(nn.Module): - """ - Building block of Transformer encoder. 
- - Args: - hidden_size: size of the embeddings in the model, also known as d_model - inner_size: number of neurons in the intermediate part of feed-forward - net, usually is (4-8 x hidden_size) in the papers - num_attention_heads: number of heads in multi-head attention - attn_score_dropout: probability of dropout applied to attention scores - attn_layer_dropout: probability of dropout applied to the output of the - attention layers, but before layer normalization - ffn_dropout: probability of dropout applied to FFN output - hidden_act: activation function used between two linear layers in FFN - """ - - def __init__( - self, - hidden_size: int, - inner_size: int, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - ): - super().__init__() - self.pre_ln = pre_ln - self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=1e-5) - self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout - ) - self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=1e-5) - self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) - - def forward_preln(self, encoder_query, encoder_mask, encoder_keys): - """ - Pre-LayerNorm block - Order of operations: LN -> Self-Attn -> Residual -> LN -> Cross-Attn -> Residual -> LN -> FFN - """ - residual = encoder_query - encoder_query = self.layer_norm_1(encoder_query) - encoder_keys = self.layer_norm_1(encoder_keys) - self_attn_output = self.first_sub_layer(encoder_query, encoder_keys, encoder_keys, encoder_mask) - self_attn_output += residual - - residual = self_attn_output - self_attn_output = self.layer_norm_2(self_attn_output) - output_states = self.second_sub_layer(self_attn_output) - output_states += residual - - return output_states - - def forward_postln(self, encoder_query, encoder_mask, encoder_keys): - """ - Post-LayerNorm block - Order of operations: Self-Attn -> Residual -> LN -> Cross-Attn -> Residual -> LN -> FFN -> Residual -> LN - """ - self_attn_output = self.first_sub_layer(encoder_query, encoder_keys, encoder_keys, encoder_mask) - self_attn_output += encoder_query - self_attn_output = self.layer_norm_1(self_attn_output) - - output_states = self.second_sub_layer(self_attn_output) - output_states += self_attn_output - output_states = self.layer_norm_2(output_states) - - return output_states - - def forward(self, encoder_query, encoder_mask, encoder_keys): - if self.pre_ln: - return self.forward_preln(encoder_query, encoder_mask, encoder_keys) - else: - return self.forward_postln(encoder_query, encoder_mask, encoder_keys) - - -class TransformerEncoder(nn.Module): - def __init__( - self, - num_layers: int, - hidden_size: int, - inner_size: int, - mask_future: bool = False, - num_attention_heads: int = 1, - attn_score_dropout: float = 0.0, - attn_layer_dropout: float = 0.0, - ffn_dropout: float = 0.0, - hidden_act: str = "relu", - pre_ln: bool = False, - pre_ln_final_layer_norm: bool = True, - ): - super().__init__() - - if pre_ln and pre_ln_final_layer_norm: - self.final_layer_norm = nn.LayerNorm(hidden_size, eps=1e-5) - else: - self.final_layer_norm = None - - layer = TransformerEncoderBlock( - hidden_size, - inner_size, - num_attention_heads, - attn_score_dropout, - attn_layer_dropout, - ffn_dropout, - hidden_act, - pre_ln, - ) - self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)]) - self.diag = 0 if mask_future else None - 
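-    # `self.diag = 0` makes `form_attention_mask` build a causal mask (used when
-    # mask_future=True) so each position attends only to itself and earlier positions,
-    # while `None` leaves attention over the full sequence. `_get_memory_states` below
-    # concatenates previously cached layer outputs (`encoder_mems_list`) with the new
-    # states along the time dimension, so cached positions are reused instead of being
-    # recomputed during incremental (autoregressive) use.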
- def _get_memory_states(self, encoder_states, encoder_mems_list=None, i=0): - if encoder_mems_list is not None: - memory_states = torch.cat((encoder_mems_list[i], encoder_states), dim=1) - else: - memory_states = encoder_states - return memory_states - - def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False): - """ - Args: - encoder_states: output of the embedding_layer (B x L_enc x H) - encoder_mask: encoder inputs mask (B x L_enc) - encoder_mems_list: list of the cached encoder hidden states - for fast autoregressive generation which will be used instead - of encoder_states as keys and values if not None - return_mems: bool, whether to return outputs of all encoder layers - or the last layer only - """ - - encoder_attn_mask = form_attention_mask(encoder_mask, self.diag) - - memory_states = self._get_memory_states(encoder_states, encoder_mems_list, 0) - cached_mems_list = [memory_states] - - for i, layer in enumerate(self.layers): - encoder_states = layer(encoder_states, encoder_attn_mask, memory_states) - memory_states = self._get_memory_states(encoder_states, encoder_mems_list, i + 1) - cached_mems_list.append(memory_states) - - if self.final_layer_norm is not None: - encoder_states = self.final_layer_norm(encoder_states) - memory_states = self._get_memory_states(encoder_states, encoder_mems_list, i + 1) - cached_mems_list.append(memory_states) - - if return_mems: - return cached_mems_list - else: - return cached_mems_list[-1] diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_generators.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_generators.py deleted file mode 100644 index 6e17151dcd1b118c6ba8bb1e18cc6f2099cf5a86..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_generators.py +++ /dev/null @@ -1,906 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from contextlib import contextmanager - -import torch - -from nemo.collections.common.parts import NEG_INF, mask_padded_tokens - -__all__ = [ - "GreedySequenceGenerator", - "TopKSequenceGenerator", - "BeamSearchSequenceGenerator", - "BeamSearchSequenceGeneratorWithLanguageModel", - "EnsembleBeamSearchSequenceGenerator", -] - - -class GreedySequenceGenerator: - """ - Greedy sequence generator based on the decoder followed by log_softmax. 
- - Args: - embedding: nn.Module, transforms input_ids into vector embeddings - decoder: nn.Module, takes embeddings and produces hidden_states - log_softmax: nn.Module, takes hidden_states and produces log_probs - which correspond to probability distribution of tokens (ids) - pad: index of padding token in the vocabulary - bos: index of beginning of sequence token in the vocabulary - eos: index of end of sequence token in the vocabulary - max_sequence_length: maximum allowed length for generated sequences - max_delta_length: in case of encoder-decoder generation (e.g. NMT), - forbids generated sequences to be longer than the length of - source sequences plus max_delta_length - batch_size: size of the batch of generated sequences if neither - source nor target starting sequences are provided - """ - - def __init__( - self, - embedding, - decoder, - log_softmax, - pad=0, - bos=1, - eos=2, - max_sequence_length=512, - max_delta_length=20, - batch_size=1, - ): - super().__init__() - self.embedding = embedding - self.decoder = decoder - self.log_softmax = log_softmax - self.pad, self.bos, self.eos = pad, bos, eos - self.max_seq_length = max_sequence_length - self.max_delta_len = max_delta_length - self.batch_size = batch_size - - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - """ - One step of autoregressive output generation. - - Args: - decoder_input_ids: starting sequence of tokens to generate from; - if None, generation will start from a batch of tokens - encoder_hidden_states: output of the encoder for conditional - sequence generation; if None, generator will use unconditional - mode (e.g., language modeling) - encoder_input_mask: input mask used in the encoder - decoder_mems_list: list of size num_layers with cached activations - of sequence (x[1], ..., x[k-1]) for fast generation of x[k] - pos: starting position in positional encoding - """ - - decoder_hidden_states = self.embedding.forward(decoder_input_ids, start_pos=pos) - decoder_input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - - if encoder_hidden_states is not None: - decoder_mems_list = self.decoder.forward( - decoder_hidden_states, - decoder_input_mask, - encoder_hidden_states, - encoder_input_mask, - decoder_mems_list, - return_mems=True, - ) - else: - decoder_mems_list = self.decoder.forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True - ) - log_probs = self.log_softmax.forward(hidden_states=decoder_mems_list[-1][:, -1:]) - return log_probs, decoder_mems_list - - def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): - """ - Helper function which defines starting sequence to begin generating - with and maximum allowed number of tokens to be generated. 
- """ - - decoder_parameter = next(self.decoder.parameters()) - batch_size = self.batch_size - - # for encoder-decoder generation, maximum length of generated sequence - # is min(max_sequence_length, src_len + max_delta_length) - if encoder_hidden_states is not None: - batch_size, src_len, _ = encoder_hidden_states.size() - if self.max_delta_len >= 0: - max_seq_length = min(self.max_seq_length, src_len + self.max_delta_len) - else: - max_seq_length = self.max_seq_length - else: - max_seq_length = self.max_seq_length - - # if no input is provided, start with the batch of tokens - if decoder_input_ids is not None: - tgt = decoder_input_ids - batch_size, tgt_len = decoder_input_ids.size() - else: - tgt = torch.zeros(batch_size, 1).long().fill_(self.bos).to(decoder_parameter.device) - tgt_len = 1 - max_generation_length = max_seq_length - tgt_len - - return tgt, batch_size, max_generation_length - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - assert not return_beam_scores - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # pad profile tracks sequences ending with token to replace - # everything after with token - decoder_parameter = next(self.decoder.parameters()) - pad_profile = torch.zeros(batch_size, 1).long().to(decoder_parameter.device) - - decoder_mems_list = None - for i in range(max_generation_length): - - log_probs, decoder_mems_list = self._one_step_forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i - ) - - next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) - next_tokens = self.pad * pad_profile + next_tokens * (1 - pad_profile) - pad_profile = torch.max(pad_profile, (next_tokens == self.eos).long()) - tgt = torch.cat((tgt, next_tokens), dim=-1) - - # abort generation if all sequences end with - if pad_profile.sum() == batch_size: - break - - return tgt - - def __call__( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - with self.as_frozen(): - return self._forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, return_beam_scores=return_beam_scores - ) - - def freeze(self) -> None: - """Freeze weights of embedding, decoder, and classification layers to prevent memory leak. - """ - for param in self.embedding.parameters(): - param.requires_grad = False - self.embedding.eval() - for param in self.decoder.parameters(): - param.requires_grad = False - self.decoder.eval() - for param in self.log_softmax.parameters(): - param.requires_grad = False - self.log_softmax.eval() - - def unfreeze(self) -> None: - """Unfreeze weights of embedding, decoder, and classification layers. - """ - for param in self.embedding.parameters(): - param.requires_grad = True - self.embedding.train() - for param in self.decoder.parameters(): - param.requires_grad = True - self.decoder.train() - for param in self.log_softmax.parameters(): - param.requires_grad = True - self.log_softmax.train() - - @contextmanager - def as_frozen(self): - """ - Context manager which temporarily freezes embedding, decoder, and log_softmax modules, - yields control and finally unfreezes the modules. - """ - self.freeze() - - try: - yield - finally: - self.unfreeze() - - -class TopKSequenceGenerator(GreedySequenceGenerator): - """ - Top-k sequence generator based on the decoder followed by log_softmax. 
- - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam (parameter k in top-k) - temperature: temperature of top-k sampling, all logits are divided - by temperature before rescaling. High temperature leads to - uniform distribution, low leads to delta-like distribution. - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - def __init__(self, embedding, decoder, log_softmax, beam_size=1, temperature=1.0, **kwargs): - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.beam_size = beam_size - self.temp = temperature - - # @torch.no_grad() - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - log_probs, decoder_mems_list = super()._one_step_forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos - ) - - batch_size, seq_len, vocab_size = log_probs.size() - scores, indices = torch.topk(log_probs, self.beam_size, dim=-1) - - rescaled_logexp = torch.zeros_like(log_probs).scatter(-1, indices, scores.div(self.temp).exp()) - probs = rescaled_logexp / rescaled_logexp.norm(1, -1, keepdim=True) - - # We randomly sample next tokens from rescaled probability distribution - # over top-k candidates and return a binary tensor which indicates - # candidates that have been selected. We call this object - # `pseudo_log_probs` as genuine log_probs should have -infs instead of - # 0s and 0s instead of 1s. - ids = torch.multinomial(probs.view(-1, vocab_size), 1).view(-1, seq_len, 1) - pseudo_log_probs = torch.zeros_like(log_probs).scatter(-1, ids, 1.0) - - return pseudo_log_probs, decoder_mems_list - - -class BeamSearchSequenceGenerator(GreedySequenceGenerator): - def __init__(self, embedding, decoder, log_softmax, beam_size=1, len_pen=0, **kwargs): - """ - Beam Search sequence generator based on the decoder followed by - log_softmax. 
- - Args: - *all args of GreedySequenceGenerator class - beam_size: size of the beam - len_pen: length penalty parameter - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.beam_size = beam_size - self.len_pen = len_pen - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # generate initial buffer of beam_size prefixes-hypotheses - log_probs, decoder_mems_list = self._one_step_forward(tgt, encoder_hidden_states, encoder_input_mask, None, 0) - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = decoder_mems_list[j].repeat(self.beam_size, 1, 1) - - # repeat source sequence beam_size times for beam search - if encoder_hidden_states is not None: - _, src_length, hidden_size = encoder_hidden_states.size() - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, src_length) - encoder_hidden_states = encoder_hidden_states.repeat(1, self.beam_size, 1).view( - -1, src_length, hidden_size - ) - else: - hidden_size = decoder_mems_list[0].size(2) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): - - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - log_probs, decoder_mems_list = self._one_step_forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1 - ) - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size ** 2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size ** 2, -1) - p_len = prefixes.size(2) - prefixes_ids = 
indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = ( - decoder_mems_list[j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt - - -class EnsembleBeamSearchSequenceGenerator: - def __init__( - self, - encoders, - embeddings, - decoders, - log_softmaxes, - beam_size=1, - len_pen=0, - pad=0, - bos=1, - eos=2, - max_sequence_length=512, - max_delta_length=20, - batch_size=1, - language_model=None, - fusion_coef=None, - ): - """ - Ensemble Beam Search sequence generator based on the decoder followed by - log_softmax. Averages the probabilities of different models. - NOTE: All models must have been trained with the same BPE tokenizers. 
- - Args: - encoders: A list of encoders - embeddings: A list of decoder embedding layers - decoders: A list of decoders - log_softmaxes: A list of decoder output layers - beam_size: Beam size - len_pen: Length penalty to adjust logprob scores to favor longer sequences - pad: pad id - bos: beginning of sequence id - eos: end of sequence id - max_sequence_length: maximum sequence length - max_delta_length: maximum length difference between input and output - batch_size: batch size if not inferrable from input sequence - """ - self.encoders = encoders - self.embeddings = embeddings - self.decoders = decoders - self.log_softmaxes = log_softmaxes - self.beam_size = beam_size - self.len_pen = len_pen - self.pad, self.bos, self.eos = pad, bos, eos - self.max_seq_length = max_sequence_length - self.max_delta_len = max_delta_length - self.batch_size = batch_size - assert len(embeddings) == len(decoders) == len(log_softmaxes) == len(encoders) - self.num_models = len(encoders) - self.language_model = language_model - self.fusion_coef = fusion_coef - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _one_step_forward_lm(self, decoder_input_ids=None, lm_mems_list=None, pos=0): - input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - lm_hidden_states = self.language_model.encoder.embedding.forward(decoder_input_ids, start_pos=pos) - lm_mems_list = self.language_model.encoder.encoder.forward( - lm_hidden_states, input_mask, lm_mems_list, return_mems=True, - ) - lm_log_probs = self.language_model.log_softmax.forward(hidden_states=lm_mems_list[-1][:, -1:]) - return lm_log_probs, lm_mems_list - - def _one_step_forward( - self, - ensemble_index, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - pos=0, - ): - """ - One step of autoregressive output generation for one particular model. - - Args: - decoder_input_ids: starting sequence of tokens to generate from; - if None, generation will start from a batch of tokens - encoder_hidden_states: output of the encoder for conditional - sequence generation; if None, generator will use unconditional - mode (e.g., language modeling) - encoder_input_mask: input mask used in the encoder - decoder_mems_list: list of size num_layers with cached activations - of sequence (x[1], ..., x[k-1]) for fast generation of x[k] - pos: starting position in positional encoding - """ - - decoder_hidden_states = self.embeddings[ensemble_index].forward(decoder_input_ids, start_pos=pos) - decoder_input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - - if encoder_hidden_states is not None: - decoder_mems_list = self.decoders[ensemble_index].forward( - decoder_hidden_states, - decoder_input_mask, - encoder_hidden_states, - encoder_input_mask, - decoder_mems_list, - return_mems=True, - ) - else: - decoder_mems_list = self.decoders[ensemble_index].forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True - ) - log_probs = self.log_softmaxes[ensemble_index].forward(hidden_states=decoder_mems_list[-1][:, -1:]) - return log_probs, decoder_mems_list - - def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None): - """ - Helper function which defines starting sequence to begin generating - with and maximum allowed number of tokens to be generated. 
- """ - - decoder_parameter = next(self.decoders[0].parameters()) - batch_size = self.batch_size - - # for encoder-decoder generation, maximum length of generated sequence - # is min(max_sequence_length, src_len + max_delta_length) - if encoder_hidden_states is not None: - batch_size, src_len, _ = encoder_hidden_states.size() - if self.max_delta_len >= 0: - max_seq_length = min(self.max_seq_length, src_len + self.max_delta_len) - else: - max_seq_length = self.max_seq_length - else: - max_seq_length = self.max_seq_length - - # if no input is provided, start with the batch of tokens - if decoder_input_ids is not None: - tgt = decoder_input_ids - batch_size, tgt_len = decoder_input_ids.size() - else: - tgt = torch.zeros(batch_size, 1).long().fill_(self.bos).to(decoder_parameter.device) - tgt_len = 1 - max_generation_length = max_seq_length - tgt_len - - return tgt, batch_size, max_generation_length - - def _get_encoder_hidden_states(self, src_ids, encoder_input_mask, ensemble_index): - return self.encoders[ensemble_index](input_ids=src_ids, encoder_mask=encoder_input_mask) - - def _average_probs(self, probs_list): - probs_list = torch.stack(probs_list) - return torch.log(torch.exp(probs_list).mean(0)) - # probs = torch.stack(probs_list) # Ens x B x T x V - # return torch.log(probs.sum(0) / probs.sum(-1).sum(0).unsqueeze(-1)) - - def _forward(self, src_ids, encoder_input_mask, decoder_input_ids=None, return_beam_scores=False): - encoder_hidden_states = [ - self._get_encoder_hidden_states(src_ids, encoder_input_mask, i) for i in range(self.num_models) - ] - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states[0]) - - # generate initial buffer of beam_size prefixes-hypotheses - outputs = [ - self._one_step_forward(i, tgt, encoder_hidden_states[i], encoder_input_mask, None, 0) - for i in range(self.num_models) - ] - nmt_log_probs = self._average_probs([x[0] for x in outputs]) - decoder_mems_lists = [x[1] for x in outputs] - - if self.language_model is not None: - lm_log_probs, lm_mems_list = self._one_step_forward_lm(tgt, None, 0) - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - else: - log_probs = nmt_log_probs - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for i in range(self.num_models): - for j in range(len(decoder_mems_lists[i])): - decoder_mems_lists[i][j] = decoder_mems_lists[i][j].repeat(self.beam_size, 1, 1) - - if self.language_model is not None: - for j in range(len(lm_mems_list)): - lm_mems_list[j] = lm_mems_list[j].repeat(self.beam_size, 1, 1) - lm_hidden_size = lm_mems_list[0].size(2) - - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, encoder_input_mask.size(1)) - for i in range(self.num_models): - _, src_length, hidden_size = encoder_hidden_states[i].size() - encoder_hidden_states[i] = ( - encoder_hidden_states[i].repeat(1, self.beam_size, 1).view(-1, src_length, hidden_size) - ) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): 
- - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - outputs = [ - self._one_step_forward( - model_num, - prefixes[:, -1:], - encoder_hidden_states[model_num], - encoder_input_mask, - decoder_mems_lists[model_num], - i + 1, - ) - for model_num in range(self.num_models) - ] - nmt_log_probs = self._average_probs([x[0] for x in outputs]) - decoder_mems_lists = [x[1] for x in outputs] - - if self.language_model is not None: - lm_log_probs, lm_mems_list = self._one_step_forward_lm(prefixes[:, -1:], lm_mems_list, i + 1) - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - else: - log_probs = nmt_log_probs - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size ** 2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size ** 2, -1) - p_len = prefixes.size(2) - prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - for model_num in range(self.num_models): - hidden_size = decoder_mems_lists[model_num][0].size(2) - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_lists[model_num])): - decoder_mems_lists[model_num][j] = ( - decoder_mems_lists[model_num][j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - if self.language_model is not None: - lm_mems_ids = ( - indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, lm_hidden_size) // self.beam_size - ) - for j in range(len(lm_mems_list)): - lm_mems_list[j] = ( - lm_mems_list[j] - .view(-1, self.beam_size, p_len - 1, lm_hidden_size) - .gather(1, lm_mems_ids) - .view(-1, p_len - 1, lm_hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = 
prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt - - def __call__(self, src_ids, encoder_input_mask, decoder_input_ids=None, return_beam_scores=False): - with self.as_frozen(): - return self._forward(src_ids, encoder_input_mask, decoder_input_ids, return_beam_scores) - - def freeze(self) -> None: - """Freeze weights of embedding, decoder, and classification layers to prevent memory leak. - """ - for model_num in range(self.num_models): - for param in self.embeddings[model_num].parameters(): - param.requires_grad = False - self.embeddings[model_num].eval() - for param in self.decoders[model_num].parameters(): - param.requires_grad = False - self.decoders[model_num].eval() - for param in self.log_softmaxes[model_num].parameters(): - param.requires_grad = False - self.log_softmaxes[model_num].eval() - for param in self.encoders[model_num].parameters(): - param.requires_grad = False - self.encoders[model_num].eval() - - def unfreeze(self) -> None: - """Unfreeze weights of embedding, decoder, and classification layers. - """ - for model_num in range(self.num_models): - for param in self.embeddings[model_num].parameters(): - param.requires_grad = True - self.embeddings[model_num].train() - for param in self.decoders[model_num].parameters(): - param.requires_grad = True - self.decoders[model_num].train() - for param in self.log_softmaxes[model_num].parameters(): - param.requires_grad = True - self.log_softmaxes[model_num].train() - for param in self.encoders[model_num].parameters(): - param.requires_grad = True - self.encoders[model_num].train() - - @contextmanager - def as_frozen(self): - """ - Context manager which temporarily freezes embedding, decoder, and log_softmax modules, - yields control and finally unfreezes the modules. - """ - self.freeze() - - try: - yield - finally: - self.unfreeze() - - -class BeamSearchSequenceGeneratorWithLanguageModel(GreedySequenceGenerator): - def __init__( - self, embedding, decoder, log_softmax, language_model, beam_size=1, len_pen=0, fusion_coef=0.0, **kwargs - ): - """ - Beam Search sequence generator based on the decoder followed by log_softmax - with external language model fusion. 
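The fused score used by this generator is log P_NMT(y|x) + fusion_coef * log P_LM(y), and beam re-ranking divides cumulative scores by the GNMT length penalty ((5 + len) / 6) ** alpha. A toy sketch of that arithmetic with made-up candidate scores:

import torch

def length_penalty(lengths, alpha):
    # Same formula as compute_len_penalty: ((5 + len) / 6) ** alpha
    return ((5.0 + lengths) / 6.0).pow(alpha)

fusion_coef = 0.3
nmt_log_probs = torch.tensor([-1.2, -0.7, -2.5])  # translation-model scores for 3 candidates
lm_log_probs = torch.tensor([-2.0, -3.1, -0.4])   # language-model scores for the same candidates
fused = nmt_log_probs + fusion_coef * lm_log_probs

lengths = torch.tensor([4.0, 6.0, 5.0])
reranked = fused / length_penalty(lengths, alpha=1.0)
print(fused, reranked, torch.argmax(reranked))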
- Args: - *all args of BeamSearchSequenceGenerator class - language_model: nemo TransformerLMModel - fusion_coef: coefficient before language model score, the resulting score is - score = log P_NMT(y|x) + fusion_coef * log P_LM(y) - Kwargs: - all remaining parameters of GreedySequenceGenerator class - """ - - super().__init__(embedding, decoder, log_softmax, **kwargs) - self.language_model = language_model - self.beam_size = beam_size - self.len_pen = len_pen - self.fusion_coef = fusion_coef - - def _one_step_forward( - self, - decoder_input_ids=None, - encoder_hidden_states=None, - encoder_input_mask=None, - decoder_mems_list=None, - lm_mems_list=None, - pos=0, - ): - - nmt_log_probs, decoder_mems_list = super()._one_step_forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos, - ) - input_mask = mask_padded_tokens(decoder_input_ids, self.pad).float() - lm_hidden_states = self.language_model.encoder.embedding.forward(decoder_input_ids, start_pos=pos) - - lm_mems_list = self.language_model.encoder.encoder.forward( - lm_hidden_states, input_mask, lm_mems_list, return_mems=True, - ) - lm_log_probs = self.language_model.log_softmax.forward(hidden_states=lm_mems_list[-1][:, -1:]) - - log_probs = nmt_log_probs + self.fusion_coef * lm_log_probs - - return log_probs, decoder_mems_list, lm_mems_list - - @staticmethod - def compute_len_penalty(lengths, alpha): - """Returns length penalty according to https://arxiv.org/pdf/1609.08144.pdf""" - return ((5 + lengths) / 6).pow(alpha) - - def _forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, return_beam_scores=False - ): - - tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) - - # generate initial buffer of beam_size prefixes-hypotheses - log_probs, decoder_mems_list, lm_mems_list = self._one_step_forward( - tgt, encoder_hidden_states, encoder_input_mask, None, None, 0 - ) - scores, prefixes = torch.topk(log_probs.permute(0, 2, 1), self.beam_size, dim=1) - scores, prefixes = scores.view(-1, 1), prefixes.view(-1, 1) - - # repeat init target prefixes and cached memory states beam_size times - prefixes = torch.cat((tgt.repeat(1, self.beam_size).view(-1, 1), prefixes), dim=1) - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = decoder_mems_list[j].repeat(self.beam_size, 1, 1) - for j in range(len(lm_mems_list)): - lm_mems_list[j] = lm_mems_list[j].repeat(self.beam_size, 1, 1) - - # repeat source sequence beam_size times for beam search - if encoder_hidden_states is not None: - _, src_length, hidden_size = encoder_hidden_states.size() - encoder_input_mask = encoder_input_mask.repeat(1, self.beam_size).view(-1, src_length) - encoder_hidden_states = encoder_hidden_states.repeat(1, self.beam_size, 1).view( - -1, src_length, hidden_size - ) - else: - hidden_size = decoder_mems_list[0].size(2) - lm_hidden_size = lm_mems_list[0].size(2) - - # pad_profile tracks finished hypotheses to generate only tokens - # if or has been generated - pad_profile = torch.zeros_like(scores).long() - - # prefixes_len tracks lengths of generated hypotheses to perform - # length penalty correction - prefixes_len = torch.zeros_like(scores).fill_(prefixes.size(1) + 1) - - for i in range(max_generation_length): - - # mask all finished hypotheses to exclude them from beam - pad_mask = pad_profile.repeat(1, self.beam_size) - - # generate and score candidates for prefixes continuation - log_probs, decoder_mems_list, lm_mems_list = 
self._one_step_forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, lm_mems_list, i + 1 - ) - scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) - - # for all prefixes ending with or replace generated - # continuations with - prefixes_i = self.pad * pad_mask + prefixes_i * (1 - pad_mask) - - # force all hypotheses but one generated from already finished - # hypotheses to have extremely low score, so they will not be - # considered during beam re-ranking - pad_mask[:, 1:] = pad_mask[:, 1:] * NEG_INF - scores = scores + scores_i * (1 - pad_mask).to(scores.dtype) - - # choose top-k hypotheses with length penalty applied - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - scores, indices_i = torch.topk(scores.view(-1, self.beam_size ** 2), self.beam_size, dim=1) - scores = scores.view(-1, 1) * len_penalties - - # select prefixes which correspond to the chosen hypotheses - prefixes = prefixes.unsqueeze(1).repeat(1, self.beam_size, 1) - prefixes = torch.cat((prefixes, prefixes_i.unsqueeze(2)), dim=2) - prefixes = prefixes.view(batch_size, self.beam_size ** 2, -1) - p_len = prefixes.size(2) - prefixes_ids = indices_i.unsqueeze(2).repeat(1, 1, p_len) - prefixes = prefixes.gather(1, prefixes_ids).view(-1, p_len) - - # reshuffle cached decoder memory states to restore the order - # of hypotheses broken after top-k selection - mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, hidden_size) // self.beam_size - for j in range(len(decoder_mems_list)): - decoder_mems_list[j] = ( - decoder_mems_list[j] - .view(-1, self.beam_size, p_len - 1, hidden_size) - .gather(1, mems_ids) - .view(-1, p_len - 1, hidden_size) - ) - lm_mems_ids = indices_i.unsqueeze(2).unsqueeze(3).repeat(1, 1, p_len - 1, lm_hidden_size) // self.beam_size - for j in range(len(lm_mems_list)): - lm_mems_list[j] = ( - lm_mems_list[j] - .view(-1, self.beam_size, p_len - 1, lm_hidden_size) - .gather(1, lm_mems_ids) - .view(-1, p_len - 1, lm_hidden_size) - ) - - # update prefixes_len and pad_profile - not_eos_pad = prefixes.ne(self.eos) & prefixes.ne(self.pad) - prefixes_len = 1 + not_eos_pad.sum(dim=1, keepdim=True).to(scores.dtype) - pad_profile = (~not_eos_pad[:, -1:]).long() - - # if all hypotheses end with or , interrupt search - if pad_profile.sum() == batch_size * self.beam_size: - break - - # select best performing hypotheses in each element of the batch - len_penalties = self.compute_len_penalty(prefixes_len, self.len_pen) - scores = scores / len_penalties - best_guesses = ( - torch.argmax(scores.view(-1, self.beam_size), dim=1, keepdim=True).repeat(1, prefixes.size(1)).unsqueeze(1) - ) - tgt = prefixes.view(batch_size, self.beam_size, -1).gather(1, best_guesses).squeeze(1) - - if return_beam_scores: - return prefixes, scores * len_penalties, tgt - else: - return tgt diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_modules.py deleted file mode 100644 index 25fb781f0cd44d3e255498a20dca72f34478ad04..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_modules.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import numpy as np -import torch -from torch import nn -from torch.nn.functional import gelu - -from nemo.collections.common.parts import form_attention_mask -from nemo.utils import logging - -__all__ = ["TransformerEmbedding", "AttentionBridge"] - - -class FixedPositionalEncoding(nn.Module): - """ - Fixed positional encoding (embedding layer) from sine and cosine functions - of different frequencies according to https://arxiv.org/abs/1706.03762 - - Args: - hidden_size: size of the embeddings in the model, also known as d_model - max_sequence_length: maximum allowed length of the input sequence - """ - - def __init__(self, hidden_size, max_sequence_length=512): - super().__init__() - - self._hidden_size = hidden_size - self._max_sequence_length = max_sequence_length - self._build_pos_enc(hidden_size=self._hidden_size, max_sequence_length=self._max_sequence_length) - - def _build_pos_enc(self, hidden_size, max_sequence_length, device=None): - """ - Builds/replaces pre-computed positional encoding. - """ - pos_enc = torch.zeros(max_sequence_length, hidden_size, device=device) - position = torch.arange(0.0, max_sequence_length).unsqueeze(1) - coef = -math.log(10000.0) / hidden_size - div_term = torch.exp(coef * torch.arange(0.0, hidden_size, 2)) - pos_enc[:, 0::2] = torch.sin(position * div_term) - pos_enc[:, 1::2] = torch.cos(position * div_term) - pos_enc.div_(math.sqrt(hidden_size)) - self.register_buffer('pos_enc', pos_enc) - - def forward(self, position_ids): - max_pos_id = position_ids.max() - # update positional encoding if needed - if max_pos_id >= self._max_sequence_length: - logging.warning( - f'Max position id {max_pos_id} is greater than max sequence length {self._max_sequence_length}. Expanding position embeddings just for this batch. This is not expected to work very well. Consider chunking your input into smaller sequences.' - ) - self._build_pos_enc( - hidden_size=self._hidden_size, max_sequence_length=max_pos_id + 1, device=position_ids.device, - ) - - embeddings = torch.embedding(self.pos_enc, position_ids) - - # Revert expansion of position embeddings since this wall checkpoint size mismatches. - if max_pos_id >= self._max_sequence_length: - self._build_pos_enc( - hidden_size=self._hidden_size, - max_sequence_length=self._max_sequence_length, - device=position_ids.device, - ) - return embeddings - - -class TransformerEmbedding(nn.Module): - """ - Embedding from token and position embeddings. - Optionally add token_type embedding (e.g. type of the sentence in BERT). - - Args: - vocab_size: size of the vocabulary - hidden_size: size of the embeddings in the model, also known as d_model - max_sequence_length: maximum allowed length of the input sequence - num_token_types: number of different token types - (e.g. 
tokens of sentence A and tokens of sentence B in BERT) - embedding_dropout: probability of dropout applied to embeddings - learn_positional_encodings: whether to learn positional encodings or - use fixed (sine-cosine) ones - """ - - def __init__( - self, - vocab_size, - hidden_size, - max_sequence_length=512, - num_token_types=2, - embedding_dropout=0.0, - learn_positional_encodings=False, - ): - super().__init__() - - self.max_sequence_length = max_sequence_length - self.learn_positional_encodings = learn_positional_encodings - self.token_embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0) - if learn_positional_encodings: - self.position_embedding = nn.Embedding(max_sequence_length, hidden_size) - else: - self.position_embedding = FixedPositionalEncoding(hidden_size, max_sequence_length) - if num_token_types > 0: - self.token_type_embedding = nn.Embedding(num_token_types, hidden_size) - self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-5) - self.dropout = nn.Dropout(embedding_dropout) - - def forward(self, input_ids, token_type_ids=None, start_pos=0): - seq_length = input_ids.size(1) - # we fail here only with parametric positional embedding. FixedPositionalEncoding automatically extends. - if self.learn_positional_encodings and (seq_length > self.max_sequence_length): - raise ValueError( - f"Input sequence is longer than maximum allowed sequence length for positional encoding. " - f"Got {seq_length} and {self.max_sequence_length}" - ) - position_ids = torch.arange( - start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device - ) - position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) - - token_embeddings = self.token_embedding(input_ids) - position_embeddings = self.position_embedding(position_ids) - embeddings = token_embeddings + position_embeddings - - if token_type_ids is not None: - token_type_embeddings = self.token_type_embedding(token_type_ids) - embeddings = embeddings + token_type_embeddings - - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - - return embeddings - - -class MultiHeadAttention(nn.Module): - """ - Multi-head scaled dot-product attention layer. 
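For reference, a standalone sketch of the sine/cosine table that FixedPositionalEncoding above precomputes, with hypothetical small sizes chosen only for illustration:

import math
import torch

def build_pos_enc(hidden_size, max_sequence_length):
    pos_enc = torch.zeros(max_sequence_length, hidden_size)
    position = torch.arange(0.0, max_sequence_length).unsqueeze(1)
    div_term = torch.exp(-math.log(10000.0) / hidden_size * torch.arange(0.0, hidden_size, 2))
    pos_enc[:, 0::2] = torch.sin(position * div_term)  # even channels: sine
    pos_enc[:, 1::2] = torch.cos(position * div_term)  # odd channels: cosine
    return pos_enc / math.sqrt(hidden_size)            # same scaling as the module above

table = build_pos_enc(hidden_size=8, max_sequence_length=16)
print(table.shape)  # torch.Size([16, 8])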
- - Args: - hidden_size: size of the embeddings in the model, also known as d_model - num_attention_heads: number of heads in multi-head attention - attn_score_dropout: probability of dropout applied to attention scores - attn_layer_dropout: probability of dropout applied to the output of the - whole layer, but before layer normalization - """ - - def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): - super().__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number " - "of attention heads (%d)" % (hidden_size, num_attention_heads) - ) - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.attn_head_size = int(hidden_size / num_attention_heads) - self.attn_scale = math.sqrt(math.sqrt(self.attn_head_size)) - - self.query_net = nn.Linear(hidden_size, hidden_size) - self.key_net = nn.Linear(hidden_size, hidden_size) - self.value_net = nn.Linear(hidden_size, hidden_size) - self.out_projection = nn.Linear(hidden_size, hidden_size) - - self.attn_dropout = nn.Dropout(attn_score_dropout) - self.layer_dropout = nn.Dropout(attn_layer_dropout) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, queries, keys, values, attention_mask): - - # attention_mask is needed to hide the tokens which correspond to [PAD] - # in the case of BERT, or to hide the future tokens in the case of - # vanilla language modeling and translation - query = self.query_net(queries) - key = self.key_net(keys) - value = self.value_net(values) - query = self.transpose_for_scores(query) / self.attn_scale - key = self.transpose_for_scores(key) / self.attn_scale - value = self.transpose_for_scores(value) - - # for numerical stability we pre-divide query and key by sqrt(sqrt(d)) - attention_scores = torch.matmul(query, key.transpose(-1, -2)) - if attention_mask is not None: - attention_scores = attention_scores + attention_mask.to(attention_scores.dtype) - attention_probs = torch.softmax(attention_scores, dim=-1) - attention_probs = self.attn_dropout(attention_probs) - - context = torch.matmul(attention_probs, value) - context = context.permute(0, 2, 1, 3).contiguous() - new_context_shape = context.size()[:-2] + (self.hidden_size,) - context = context.view(*new_context_shape) - - # output projection - output_states = self.out_projection(context) - output_states = self.layer_dropout(output_states) - return output_states - - -class PositionWiseFF(nn.Module): - """ - Position-wise feed-forward network of Transformer block. 
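MultiHeadAttention above pre-divides both query and key by sqrt(sqrt(d_head)) for numerical stability; a quick numerical check (hypothetical shapes) that this is equivalent to the usual softmax(QK^T / sqrt(d_head)) scaling:

import math
import torch

d_head = 64
q = torch.randn(2, 5, d_head)  # hypothetical [batch, time, head_dim]
k = torch.randn(2, 5, d_head)

scale = math.sqrt(math.sqrt(d_head))
scores_prescaled = torch.matmul(q / scale, (k / scale).transpose(-1, -2))
scores_standard = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_head)
print(torch.allclose(scores_prescaled, scores_standard, atol=1e-5))  # True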
- - Args: - hidden_size: size of the embeddings in the model, also known as d_model - inner_size: number of neurons in the intermediate part of feed-forward - net, usually is (4-8 x hidden_size) in the papers - ffn_dropout: probability of dropout applied to net output - hidden_act: activation function used between two linear layers - """ - - def __init__(self, hidden_size, inner_size, ffn_dropout=0.0, hidden_act="relu"): - super().__init__() - self.dense_in = nn.Linear(hidden_size, inner_size) - self.dense_out = nn.Linear(inner_size, hidden_size) - self.layer_dropout = nn.Dropout(ffn_dropout) - ACT2FN = {"gelu": gelu, "relu": torch.relu} - self.act_fn = ACT2FN[hidden_act] - - def forward(self, hidden_states): - output_states = self.dense_in(hidden_states) - output_states = self.act_fn(output_states) - output_states = self.dense_out(output_states) - output_states = self.layer_dropout(output_states) - return output_states - - -class AttentionBridge(torch.nn.Module): - """ - A multi-head attention bridge to project a variable-size hidden states - to k hidden states (per attention head). - - Code is based on the paper https://arxiv.org/pdf/1703.03130.pdf - """ - - def __init__(self, hidden_size, k, bridge_size): - """ - hidden_size - size of input hidden state - k - number of attention heads - bridge_size - size of internal feed forward weights (i.e., attention head size) - """ - super().__init__() - - self.hidden_size = hidden_size - self.k = k - self.bridge_size = bridge_size - - self.attn_scale = np.sqrt(np.sqrt(self.bridge_size)) - - # build model - - self.W1 = torch.nn.Linear(hidden_size, bridge_size, bias=False) - self.W2 = torch.nn.Linear(bridge_size, k, bias=False) - self.act = torch.nn.ReLU() - - def forward(self, hidden, hidden_mask=None, return_ortho_loss=False): - """ - Project hidden [B x N x H] to fixed-size [B x k x H] - - return_ortho_loss - if True returns loss term to encourage - orthogonal attention vectors - """ - - attention_scores = self.W2(self.act(self.W1(hidden) / self.attn_scale) / self.attn_scale).transpose(-1, -2) - - attention_mask = form_attention_mask(hidden_mask) - if attention_mask is not None: - attention_mask.squeeze_(1) - attention_scores = attention_scores + attention_mask.to(attention_scores.dtype) - - A = torch.softmax(attention_scores, dim=-1) - M = A @ hidden - - if return_ortho_loss: - ortho_loss = ((A @ A.transpose(-1, -2)) - torch.eye(self.k).type_as(A)).pow(2).sum() - - return M, ortho_loss - else: - return M diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_utils.py deleted file mode 100644 index a74a30368568ab096426a777d0be7a4a7d8d5c72..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/transformer/transformer_utils.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
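A compact sketch of the AttentionBridge math above (attention mask and attn_scale omitted, sizes hypothetical): variable-length states [B, N, H] are pooled into k fixed slots, with an optional penalty pushing the attention rows toward orthogonality.

import torch

B, N, H, k, bridge = 2, 7, 16, 4, 32
hidden = torch.randn(B, N, H)
W1 = torch.nn.Linear(H, bridge, bias=False)
W2 = torch.nn.Linear(bridge, k, bias=False)

scores = W2(torch.relu(W1(hidden))).transpose(-1, -2)  # [B, k, N]
A = torch.softmax(scores, dim=-1)                      # attention over time per slot
M = A @ hidden                                         # [B, k, H] fixed-size summary
ortho_loss = ((A @ A.transpose(-1, -2)) - torch.eye(k)).pow(2).sum()
print(M.shape, ortho_loss.item())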
- - -from typing import Optional, Union - -from omegaconf.dictconfig import DictConfig - -from nemo.collections.asr.modules.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM -from nemo.collections.asr.modules.transformer.transformer_bottleneck import TransformerBottleneckEncoderNM - - -def get_nemo_transformer( - model_name: Optional[str] = None, - pretrained: bool = False, - config_dict: Optional[Union[dict, DictConfig]] = None, - encoder: bool = True, - pre_ln_final_layer_norm: bool = True, -) -> Union[TransformerEncoderNM, TransformerDecoderNM]: - """Returns NeMo transformer. - The following configurations are mandatory: - vocab_size: int - hidden_size: int - num_layers: int - inner_size: int - and must be specified if using config_dict. - - Args: - model_name (Optional[str]): model name to download from NGC - pretrained: (bool): False will instantiate the named model architecture with random weights. - config_dict (Optional[dict], optional): model configuration parameters. Defaults to None. - config_file (Optional[str], optional): path to json file containing model configuration. Defaults to None. - checkpoint_file (Optional[str], optional): load weights from path to local checkpoint. Defaults to None. - encoder (bool, optional): True will use EncoderTransformerNM, False will use DecoderTransformerNM. Defaults to True. - """ - if model_name is not None: - raise ValueError(f'NeMo transformers cannot be loaded from NGC yet. model_name should be None') - - if pretrained: - raise ValueError(f'NeMo transformers cannot be loaded from NGC yet. pretrained should be False') - - cfg = None - - if not pretrained: - assert ( - config_dict.get('vocab_size') is not None - and config_dict.get('hidden_size') is not None - and config_dict.get('num_layers') is not None - and config_dict.get('inner_size') is not None - ), f'Using config_dict: {config_dict}. 
vocab_size, hidden_size, num_layers, and inner_size must are mandatory arguments' - - cfg = config_dict - - if encoder: - # if arch exists in cfg we return TransformerBottleneckEncoderNM - arch = cfg.get('arch', '') - if not arch: - model = TransformerEncoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - mask_future=cfg.get('mask_future', True), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - ) - elif arch in TransformerBottleneckEncoderNM._SUPPORTED_ARCH: - model = TransformerBottleneckEncoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - mask_future=cfg.get('mask_future', False), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - arch=cfg.get('arch', 'full'), - hidden_steps=cfg.get('hidden_steps', -1), - hidden_blocks=cfg.get('hidden_blocks', 1), - hidden_init_method=cfg.get('hidden_init_method', 'default'), - return_mask=cfg.get('return_mask', True), - ) - else: - raise ValueError(f"Unknown arch = {arch}") - else: - model = TransformerDecoderNM( - vocab_size=cfg.get('vocab_size'), - hidden_size=cfg.get('hidden_size'), - num_layers=cfg.get('num_layers'), - inner_size=cfg.get('inner_size'), - max_sequence_length=cfg.get('max_sequence_length', 512), - embedding_dropout=cfg.get('embedding_dropout', 0.0), - learn_positional_encodings=cfg.get('learn_positional_encodings', False), - num_attention_heads=cfg.get('num_attention_heads'), - ffn_dropout=cfg.get('ffn_dropout', 0.0), - attn_score_dropout=cfg.get('attn_score_dropout', 0.0), - attn_layer_dropout=cfg.get('attn_layer_dropout', 0.0), - hidden_act=cfg.get('hidden_act', 'relu'), - pre_ln=cfg.get('pre_ln', False), - pre_ln_final_layer_norm=pre_ln_final_layer_norm, - num_token_types=cfg.get('num_token_types', 2), - ) - - return model - - -# def get_huggingface_transformer( -# model_name: Optional[str] = None, -# pretrained: bool = False, -# config_dict: Optional[Union[dict, DictConfig]] = None, -# encoder: bool = True, -# ) -> Union[HuggingFaceEncoderModule, HuggingFaceDecoderModule]: - -# if encoder: -# model = HuggingFaceEncoderModule(model_name, pretrained, config_dict) -# else: -# model = HuggingFaceDecoderModule(model_name, pretrained, config_dict) - -# return model - - -def get_megatron_transformer( - model_name: Optional[str] = None, - pretrained: bool = True, - config_dict: 
Optional[Union[dict, DictConfig]] = None, - encoder: bool = True, - checkpoint_file: str = None, -) -> None: - - raise ValueError( - "megatron-lm bert encoders are deprecated in NeMo 1.5.0. Please use NeMo 1.4.0 until megatron bert support is added again." - ) - - # vocab_file = config_dict.pop('vocab_file', None) - # if encoder: - # model = MegatronEncoderModule( - # model_name=model_name, - # pretrained=pretrained, - # config_dict=config_dict, - # checkpoint_file=checkpoint_file, - # vocab_file=vocab_file, - # ) - # else: - # raise ValueError('Megatron decoders are not currently supported.') - - # return model diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/modules/wav2vec_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/modules/wav2vec_modules.py deleted file mode 100644 index e82d5a665a921ad69734b590e99a4c543ced21f3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/modules/wav2vec_modules.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -import random -from typing import Dict, List, Tuple - -import torch -from omegaconf import DictConfig -from omegaconf.dictconfig import DictConfig -from torch import nn -from torch.nn import functional as F - -from nemo.collections.common.parts import form_attention_mask, transformer_weights_init -from nemo.collections.nlp.modules.common.transformer import TransformerEncoder -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import AcousticEncodedRepresentation, AudioSignal, LengthsType, NeuralType, SpectrogramType - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -class TransposeLast(torch.nn.Module): - """ - Transposes last dimension. Useful for adding to a sequential block. - """ - - def forward(self, x): - return x.transpose(-2, -1) - - -class SamePad(torch.nn.Module): - def __init__(self, kernel_size): - super().__init__() - self.remove = kernel_size % 2 == 0 - - def forward(self, x): - if self.remove: - x = x[:, :, :-1] - return x - - -class ConvFeatureEncoder(NeuralModule): - """ - Encoder used to isolate features in raw audio for Wav2Vec style training. - Treated as preprocessor module in NeMo ASR training. Defaults values are - for base model found in Baeski et al (https://arxiv.org/abs/2006.11477), - save for use of layer normalization as default schema. (Chosen for stability.) - """ - - @property - def input_types(self): - """Returns definitions of module input ports. 
- input_signal: - 0: AxisType(BatchTag) - 1: AxisType(TimeTag) - input_signal_length: - 0: AxisType(BatchTag) - Note: length is in number of samples, not seconds - """ - return { - "input_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - For compatibility, processed features are treated as Spectrogram types - processed_signal: - 0: AxisType(BatchTag) - 1: AxisType(ChannelTag) - 2: AxisType(ProcessedTimeTag) - processed_signal_length: - 0: AxisType(BatchTag) - """ - return { - "processed_signal": NeuralType(('B', 'C', 'T'), SpectrogramType()), - "processed_signal_length": NeuralType(tuple('B'), LengthsType()), - } - - def __init__( - self, - conv_layers: List[Dict[str, int]], - extractor_mode: str = "layer_norm", - conv_bias: bool = False, - feature_grad_mult=1.0, - normalize_audio=True, - embedding_dim=768, - ): - super().__init__() - - self.grad_mult = feature_grad_mult - self.normalize_input = normalize_audio - - def block( - n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False, - ): - def make_conv(): - conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) - nn.init.kaiming_normal_(conv.weight) - return conv - - assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive" - - if is_layer_norm: - return nn.Sequential( - make_conv(), - nn.Sequential(TransposeLast(), nn.LayerNorm(dim, elementwise_affine=True), TransposeLast()), - nn.GELU(), - ) - elif is_group_norm: - return nn.Sequential(make_conv(), nn.GroupNorm(dim, dim, affine=True), nn.GELU(),) - else: - return nn.Sequential(make_conv(), nn.GELU()) - - in_d = 1 - self.layer_cfg = conv_layers - self.conv_layers = nn.ModuleList() - self.mode = extractor_mode - for i, cl in enumerate(conv_layers): - assert len(cl) == 3, "invalid conv definition: " + str(cl) - dim, k, stride = cl["emb_dim"], cl["kernel_size"], cl["stride"] - - self.conv_layers.append( - block( - in_d, - dim, - k, - stride, - is_layer_norm=self.mode == "layer_norm", - is_group_norm=self.mode == "group_norm" and i == 0, # applied to first layer only - conv_bias=conv_bias, - ) - ) - in_d = dim - - # Model Layers - final_conv_dim = self.layer_cfg[-1]["emb_dim"] # Select last conv output layer dimension - self.post_extract_proj = ( # To project feature encodings to transformer - nn.Linear(final_conv_dim, embedding_dim) if final_conv_dim != embedding_dim else None - ) - self.layer_norm = nn.LayerNorm(embedding_dim) - - def apply_layers(self, x): - for conv in self.conv_layers: - x = conv(x) - return x - - def normalize(self, source, lengths): - with torch.no_grad(): # Normalizes audio source - for i in range(lengths.size(0)): - orig = source[i, : lengths[i]] - norm = F.layer_norm(orig, orig.shape) - source[i, : lengths[i]] = norm - return source - - def forward(self, input_signal, length): - if self.normalize_input: - input_signal = self.normalize(input_signal, length) - - # BxT -> BxCxT - processed_signal = input_signal.unsqueeze(1) - - # Applies grad mult scaling - if self.grad_mult > 0: - processed_signal = self.apply_layers(processed_signal) - if self.grad_mult != 1.0: - processed_signal = GradMultiply.apply(processed_signal, self.grad_mult) - else: - with torch.no_grad(): # 0 indicates frozen feature encoder - processed_signal = self.apply_layers(processed_signal) - - processed_signal = processed_signal.transpose(1, 2) # B,T,C - # Project 
to embedding - if self.post_extract_proj is not None: - processed_signal = self.post_extract_proj(processed_signal) - - # Adding normalization for output - if self.mode == "layer_norm": - processed_signal = self.layer_norm(processed_signal) - - processed_signal = processed_signal.transpose(1, 2) # B,C,T - - # Feature lengths will have been changed through convolutions - processed_signal_length = self.get_lengths(audio_lengths=length) - - return processed_signal, processed_signal_length - - def get_lengths(self, audio_lengths): - # converts audio lengths to timestep lengths - for conv in self.layer_cfg: - kernel = conv["kernel_size"] - stride = conv["stride"] - audio_lengths = ( - torch.div(audio_lengths - kernel, stride, rounding_mode='floor') + 1 - ) # from pytorch documentation - return audio_lengths - - -class Wav2VecTransformerEncoder(TransformerEncoder): - """ - Encoder module following Transformer encoder paradigm - as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec - style encoding of context vectors as described by in Baeski et al (https://arxiv.org/abs/2006.11477). - Takes convolutional encodings of all time steps and adds to features before applying series - of self-attention layers. - - Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec - - Args: - layer_drop: Floating point value specifying proportion of module for layer dropout (See Fan et al. https://arxiv.org/pdf/1909.11556.pdf). - If non-zero, each layer will draw from uniform probability to determine if applied in current forward call. - Occurs only during training step - pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding - to maintain number of time steps - Must contain following: - embedding_dim: Depth/number of channels of each time step from feature encoding - conv_pos: Kernel size for convolution - conv_pos_groups: Number of groups for convolution - transformer: Config for transformer encoder. 
Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer - Must contain followign: - num_layers: Number of attention layers - hidden_size: Expected input depth (embedding size between model layers) - inner_size: Depth of embeddings within feed-forward sections of encoder layers - num_attention_heads: Number of attention heads - attn_score_dropout: Probability of dropout applied to attention scores - attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization) - ffn_dropout: Probability of dropout applied to feed-forward modules - hidden_act: Activation function for hidden layers - """ - - def __init__(self, pos_embed: DictConfig, transformer: DictConfig, layer_drop: float = 0.0): - super().__init__(**transformer) # see nlp.collections - - # positional convolutional embeddings - emb_dim = pos_embed.embedding_dim - self.pos_conv = nn.Conv1d( - emb_dim, - emb_dim, - kernel_size=pos_embed.conv_pos, - padding=pos_embed.conv_pos // 2, # Padding size preserves time step length - groups=pos_embed.conv_pos_groups, - ) - - self.layer_drop = layer_drop - - self.dropout = transformer.attn_layer_dropout # He initialization - std = math.sqrt((4 * (1.0 - self.dropout)) / (pos_embed.conv_pos * pos_embed.embedding_dim)) - nn.init.normal_(self.pos_conv.weight, mean=0, std=std) - nn.init.constant_(self.pos_conv.bias, 0) - - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) - self.pos_conv = nn.Sequential(self.pos_conv, SamePad(pos_embed.conv_pos), nn.GELU()) - - self.layer_norm = nn.LayerNorm(emb_dim) - self.apply(lambda x: transformer_weights_init(x, xavier=False)) - - @property - def input_types(self): - """Returns definitions of module output ports. - We treat features as SpectrogramType for Nemo compatibility - audio_signal: - 0: AxisType(BatchTag) - 1: AxisType(ChannelTag) - 2: AxisType(ProcessedTimeTag) - length: - 0: AxisType(BatchTag) - """ - return { - "audio_signal": NeuralType(('B', 'C', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - We're using SpectrogramType for now to keep things Nemo safe - processed_signal: - 0: AxisType(BatchTag) - 1: AxisType(ChannelTag) - 2: AxisType(ProcessedTimeTag) - processed_length: - 0: AxisType(BatchTag) - """ - return { - "processed_signal": NeuralType(('B', 'C', 'T'), AcousticEncodedRepresentation()), - "processed_length": NeuralType(tuple('B'), LengthsType()), - } - - def forward(self, audio_signal, length): - - # Padding mask needed for transformer - padding_mask = self.create_padding_mask(length) - - # Applying padding before convolution - for idx, len in enumerate(length): - audio_signal[idx, :, len:] = 0.0 - - signal_conv = self.pos_conv(audio_signal) # B, C, T - audio_signal = audio_signal + signal_conv - - audio_signal = audio_signal.transpose(1, 2) # B, C, T -> B, T, C - audio_signal = self.layer_norm(audio_signal) - - context_emb = self.apply_transformer(audio_signal, padding_mask=padding_mask) - - context_emb = context_emb.transpose(1, 2) # B, T, C -> B, C, T - - return context_emb, length # Returning length for NeMo compatibility - - def apply_transformer(self, x, padding_mask=None): - encoder_attn_mask = form_attention_mask(padding_mask) - if ( - self.layer_drop and self.training - ): # Stochastic layer drop as in: Huang et al. 
https://arxiv.org/pdf/1603.09382.pdf - for _, layer in enumerate(self.layers): - p = random.random() - if p > self.layer_drop: - x = layer(x, encoder_attn_mask, x) - else: - for _, layer in enumerate(self.layers): - x = layer(x, encoder_attn_mask, x) - return x - - def create_padding_mask(self, length): - # Broadcast to vectorize creating the padding mask - max_len = max(length) - padding_mask = torch.arange(max_len, device=DEVICE) - - # Switch to binary for transformer, 1 for valid tokens, 0 for padding - padding_mask = (padding_mask.expand(len(length), max_len) < length.unsqueeze(1)).type(torch.uint8) - - return padding_mask - - -class GradMultiply(torch.autograd.Function): - @staticmethod - def forward(ctx, x, scale): - ctx.scale = scale - res = x.new(x) - return res - - @staticmethod - def backward(ctx, grad): - return grad * ctx.scale, None diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/__init__.py deleted file mode 100644 index 9e3250071955216f6abc505e6181fb59931baa8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 45e7d09d692077eef08f436b9b4a64a3e69af788..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/features.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/features.py deleted file mode 100644 index 06665c413a45676e67b7950287cc5cd8636ed19d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/features.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
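The wav2vec feature encoder above shrinks raw-audio lengths layer by layer as floor((len - kernel) / stride) + 1, and the transformer then builds a binary padding mask to the longest result. A sketch with a hypothetical three-layer configuration:

import torch

conv_layers = [
    {"emb_dim": 512, "kernel_size": 10, "stride": 5},
    {"emb_dim": 512, "kernel_size": 3, "stride": 2},
    {"emb_dim": 512, "kernel_size": 3, "stride": 2},
]

def get_output_lengths(audio_lengths, layer_cfg):
    # Same recurrence as ConvFeatureEncoder.get_lengths above.
    for conv in layer_cfg:
        audio_lengths = torch.div(
            audio_lengths - conv["kernel_size"], conv["stride"], rounding_mode="floor"
        ) + 1
    return audio_lengths

lengths = torch.tensor([16000, 12345])  # samples per utterance
frame_lengths = get_output_lengths(lengths, conv_layers)

# Binary mask as in create_padding_mask: 1 for valid frames, 0 for padding.
max_len = int(frame_lengths.max())
mask = (torch.arange(max_len).expand(len(frame_lengths), max_len) < frame_lengths.unsqueeze(1)).to(torch.uint8)
print(frame_lengths, mask.shape)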
-# -# Copyright (c) 2018 Ryan Leary -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# This file contains code artifacts adapted from https://github.com/ryanleary/patter - -""" -ALIAS FILE for backward compatibility -""" -from nemo.collections.asr.parts.preprocessing.features import * diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__init__.py deleted file mode 100644 index 2db92b2574167fb3436c079fd7941450ae0c316b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 5ec440293680dc318ee8fc9812c77a98fc0f20f4..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/classes.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/classes.cpython-310.pyc deleted file mode 100644 index e67794c7453f8dc077f7af4b68408fac58d6ec36..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/classes.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/graph_transducer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/graph_transducer.cpython-310.pyc deleted file mode 100644 index 85d038817cedfb3892838381b3e22fa4f062c730..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/__pycache__/graph_transducer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/classes.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/classes.py deleted file mode 100644 index d4c498f32a2dd88888636574b37982b372d55180..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/classes.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from dataclasses import dataclass, field -from typing import Any, Optional, Tuple - -import torch -from omegaconf import DictConfig - -from nemo.utils import logging - - -@dataclass -class GraphIntersectDenseConfig: - """Graph dense intersection config. - """ - - search_beam: float = 20.0 - output_beam: float = 10.0 - min_active_states: int = 30 - max_active_states: int = 10000 - - -@dataclass -class GraphModuleConfig: - """Config for graph modules. - Typically used with graph losses and decoders. - """ - - topo_type: str = "default" - topo_with_self_loops: bool = True - token_lm: Optional[Any] = None - intersect_pruned: bool = False - intersect_conf: GraphIntersectDenseConfig = field(default_factory=lambda: GraphIntersectDenseConfig()) - boost_coeff: float = 0.0 - predictor_window_size: int = 0 - predictor_step_size: int = 1 - - -class ASRK2Mixin(ABC): - """k2 Mixin class that simplifies the construction of various models with k2-based losses. - - It does the following: - - Sets up the graph loss and decoder (methods _init_k2 and update_k2_modules). - - Registers external graphs, if needed. - - Augments forward(...) with optional graph decoding to get accurate predictions. 
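A minimal sketch of assembling the graph-module config defined above; the field values are illustrative defaults, not recommendations, and the import path follows this module's location in the tree:

from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig, GraphModuleConfig

cfg = GraphModuleConfig(
    topo_type="default",
    topo_with_self_loops=True,
    intersect_pruned=False,
    intersect_conf=GraphIntersectDenseConfig(search_beam=20.0, output_beam=10.0),
    boost_coeff=0.0,
)
print(cfg)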
- """ - - def _init_k2(self): - """ - k2-related initialization implementation. - - This method is expected to run after the __init__ which sets self._cfg - self._cfg is expected to have the attribute graph_module_cfg - """ - if not hasattr(self, "_cfg"): - raise ValueError("self._cfg must be set before calling _init_k2().") - if not hasattr(self._cfg, "graph_module_cfg") or self._cfg.graph_module_cfg is None: - raise ValueError("self._cfg.graph_module_cfg must be set and cannot be None.") - self.graph_module_cfg = self._cfg.graph_module_cfg - - # register token_lm for MAPLoss - criterion_type = self.graph_module_cfg.get("criterion_type", "ml") - self.use_graph_lm = criterion_type == "map" - if self.use_graph_lm: - token_lm_path = self.graph_module_cfg.backend_cfg.get("token_lm", None) - if token_lm_path is None: - raise ValueError( - f"graph_module_cfg.backend_cfg.token_lm is empty. It must be set for criterion_type == `{criterion_type}`" - ) - token_lm_path = self.register_artifact('graph_module_cfg.backend_cfg.token_lm', token_lm_path) - self.graph_module_cfg.backend_cfg["token_lm"] = token_lm_path - - self.update_k2_modules(self.graph_module_cfg) - - def update_k2_modules(self, input_cfg: DictConfig): - """ - Helper function to initialize or update k2 loss and transcribe_decoder. - - Args: - input_cfg: DictConfig to take new parameters from. Schema is expected as in - nemo.collections.asr.models.configs.k2_sequence_models_config.GraphModuleConfig - """ - del self.loss - if hasattr(self, "transcribe_decoder"): - del self.transcribe_decoder - - if hasattr(self, "joint"): - # RNNT - num_classes = self.joint.num_classes_with_blank - 1 - else: - # CTC, MMI, ... - num_classes = self.decoder.num_classes_with_blank - 1 - remove_consecutive = input_cfg.backend_cfg.get("topo_with_self_loops", True) and input_cfg.backend_cfg.get( - "topo_type", "default" - ) not in ["forced_blank", "identity",] - self._wer.remove_consecutive = remove_consecutive - - from nemo.collections.asr.losses.lattice_losses import LatticeLoss - - self.loss = LatticeLoss( - num_classes=num_classes, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - backend="k2", - criterion_type=input_cfg.get("criterion_type", "ml"), - loss_type=input_cfg.get("loss_type", "ctc"), - split_batch_size=input_cfg.get("split_batch_size", 0), - graph_module_cfg=input_cfg.backend_cfg, - ) - - criterion_type = self.loss.criterion_type - self.use_graph_lm = criterion_type == "map" - transcribe_training = input_cfg.get("transcribe_training", False) - if transcribe_training and criterion_type == "ml": - logging.warning( - f"""You do not need to use transcribe_training=`{transcribe_training}` - with criterion_type=`{criterion_type}`. 
-                transcribe_training will be set to False."""
-            )
-            transcribe_training = False
-        self.transcribe_training = transcribe_training
-        if self.use_graph_lm:
-            from nemo.collections.asr.modules.graph_decoder import ViterbiDecoderWithGraph
-
-            self.transcribe_decoder = ViterbiDecoderWithGraph(
-                num_classes=num_classes,
-                backend="k2",
-                dec_type="token_lm",
-                return_type="1best",
-                return_ilabels=True,
-                output_aligned=True,
-                split_batch_size=input_cfg.get("split_batch_size", 0),
-                graph_module_cfg=input_cfg.backend_cfg,
-            )
-
-    def _forward_k2_post_processing(
-        self, log_probs: torch.Tensor, encoded_length: torch.Tensor, greedy_predictions: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        k2-related post-processing part of .forward()
-
-        Args:
-            log_probs: The log probabilities tensor of shape [B, T, D].
-            encoded_length: The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
-            greedy_predictions: The greedy token predictions of the model of shape [B, T]
-
-        Returns:
-            A tuple of 3 elements -
-            1) The log probabilities tensor of shape [B, T, D].
-            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
-            3) The greedy token predictions of the model of shape [B, T] (via argmax)
-        """
-        # greedy_predictions from .forward() are incorrect for criterion_type=`map`
-        # getting correct greedy_predictions, if needed
-        if self.use_graph_lm and (not self.training or self.transcribe_training):
-            greedy_predictions, encoded_length, _ = self.transcribe_decoder.forward(
-                log_probs=log_probs, log_probs_length=encoded_length
-            )
-        return log_probs, encoded_length, greedy_predictions
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/grad_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/grad_utils.py
deleted file mode 100644
index 6278fb9c86caf28f7ae26e82250d632935eef0dd..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/grad_utils.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-from nemo.collections.asr.parts.k2.utils import make_non_pad_mask
-
-
-class GradExpNormalize(torch.autograd.Function):
-    """Function for fast gradient normalization.
-    Typical use case is normalization for mle loss.
- """ - - @staticmethod - def forward( - ctx, log_probs: torch.Tensor, input_lengths: torch.Tensor, reduction: str = "mean", - ): - mask = make_non_pad_mask(input_lengths, log_probs.shape[1]) - probs = log_probs.exp() - norm_probs = torch.zeros_like(log_probs) - norm_probs[mask] += probs[mask] - if reduction == "mean": - norm_probs /= norm_probs.shape[0] - ctx.save_for_backward(norm_probs) - return log_probs - - @staticmethod - def backward(ctx, grad_output: torch.Tensor): - return grad_output - grad_output.sum(-1).unsqueeze(-1) * ctx.saved_tensors[0], None, None - - -class GradInsert(torch.autograd.Function): - """Function to attach a pre-computed gradient to a tensor. - Typical use case is gradient computation before calling loss.backward(). - """ - - @staticmethod - def forward( - ctx, input_tensor: torch.Tensor, output_tensor: torch.Tensor, grad: torch.Tensor, mask: torch.Tensor, - ): - assert input_tensor.requires_grad - assert not output_tensor.requires_grad and not grad.requires_grad - - ctx.save_for_backward(grad, mask) - return output_tensor - - @staticmethod - def backward(ctx, grad_output: torch.Tensor): - saved_grad, mask = ctx.saved_tensors - # TODO (alaptev): make it work for grad_output with arbitrary shape - padded_grad_output = torch.zeros(saved_grad.shape[0], dtype=grad_output.dtype, device=grad_output.device) - padded_grad_output[mask] = grad_output - return (padded_grad_output * saved_grad.T).T, None, None, None - - -class PartialGrad(torch.nn.Module): - """Module for partial gradient computation. - Useful when computing loss on batch splits to save memory. - """ - - def __init__(self, func: torch.nn.Module): - super().__init__() - self.func = func - - def forward( - self, - input_tensor: torch.Tensor, - targets: torch.Tensor, - input_lengths: torch.Tensor, - target_lengths: torch.Tensor, - ): - # break the gradient chain - loc_tensor = input_tensor.detach() - loc_tensor.requires_grad_(True) - - new_tensor, mask = self.func(loc_tensor, targets, input_lengths, target_lengths) - loc_new_tensor = new_tensor.detach() - - new_tensor.sum().backward() - grad = loc_tensor.grad - - return GradInsert.apply(input_tensor, loc_new_tensor, grad, mask), mask diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_compilers.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_compilers.py deleted file mode 100644 index 8b82dcf5cc0f46d17d2c21c7591b4b4f75aed019..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_compilers.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2020, Xiaomi CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple - -import torch - -from nemo.collections.asr.parts.k2.utils import add_self_loops, compose_with_self_loops, intersect_with_self_loops - -from nemo.core.utils.k2_guard import k2 # import k2 from guard module - - -class CtcTopologyCompiler(object): - """Default graph compiler. - It applies its topology to the input token sequence to compile the supervision graph. - - Based on https://github.com/k2-fsa/snowfall/blob/master/snowfall/training/ctc_graph.py - """ - - def __init__( - self, - num_classes: int, - blank: int, - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - ): - self.topo_type = topo_type - self.device = device - from nemo.collections.asr.parts.k2.topologies import build_topo - - self.base_graph = k2.arc_sort(build_topo(topo_type, list(range(num_classes)), blank, topo_with_self_loops)).to( - self.device - ) - self.ctc_topo_inv = k2.arc_sort(self.base_graph.invert()) - - def to(self, device: torch.device): - self.ctc_topo_inv = self.ctc_topo_inv.to(device) - if self.base_graph is not None: - self.base_graph = self.base_graph.to(device) - self.device = device - - def compile(self, targets: torch.Tensor, target_lengths: torch.Tensor) -> 'k2.Fsa': - token_ids_list = [t[:l].tolist() for t, l in zip(targets, target_lengths)] - label_graph = k2.linear_fsa(token_ids_list).to(self.device) - label_graph.aux_labels = label_graph.labels.clone() - supervision_graphs = compose_with_self_loops(self.base_graph, label_graph) - supervision_graphs = k2.arc_sort(supervision_graphs).to(self.device) - - # make sure the gradient is not accumulated - supervision_graphs.requires_grad_(False) - return supervision_graphs - - -class CtcNumGraphCompiler(CtcTopologyCompiler): - """Graph compiler with auxiliary graph to compose with the topology. - The supervision graph contains the auxiliary graph information. - """ - - def __init__( - self, - num_classes: int, - blank: int, - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - aux_graph: Optional['k2.Fsa'] = None, - ): - super().__init__(num_classes, blank, topo_type, topo_with_self_loops, device) - if aux_graph is None: - self.decoding_graph = k2.create_fsa_vec([self.ctc_topo_inv.invert()]).to(self.device) - else: - self.base_graph = intersect_with_self_loops(self.ctc_topo_inv, aux_graph).invert_() - self.base_graph = k2.arc_sort(self.base_graph).to(self.device) - - def compile( - self, targets: torch.Tensor, target_lengths: torch.Tensor, aux_graph: Optional['k2.Fsa'] = None, - ) -> 'k2.Fsa': - if aux_graph is None and self.base_graph is None: - raise ValueError( - f"At least one of aux_graph and self.base_graph must be set: {aux_graph}, {self.base_graph}" - ) - elif aux_graph is not None: - self.base_graph = intersect_with_self_loops(self.ctc_topo_inv, aux_graph).invert() - self.base_graph = k2.arc_sort(self.base_graph).to(self.device) - return super().compile(targets, target_lengths) - - -class MmiGraphCompiler(CtcNumGraphCompiler): - """Graph compiler for MMI loss. 
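For orientation, a hedged usage sketch of the `CtcTopologyCompiler` defined above; it assumes k2 is installed and that the modules removed by this diff are still importable, and the toy targets are made up:

```python
# Hedged sketch: compile per-utterance supervision graphs with CtcTopologyCompiler.
import torch
from nemo.collections.asr.parts.k2.graph_compilers import CtcTopologyCompiler

compiler = CtcTopologyCompiler(num_classes=5, blank=0, topo_type="default")

targets = torch.tensor([[1, 2, 3], [2, 4, 0]], dtype=torch.int32)  # toy label IDs, padded
target_lengths = torch.tensor([3, 2], dtype=torch.int32)           # second utterance uses 2 labels

supervision_graphs = compiler.compile(targets, target_lengths)     # k2 FsaVec, one graph per utterance
```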
- The decoding graph is a composition of the auxiliary graph and the topology. - It is returned along with the supervision graph on every compile() call. - """ - - def __init__( - self, - num_classes: int, - blank: int, - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - aux_graph: Optional['k2.Fsa'] = None, - ): - super().__init__(num_classes, blank, topo_type, topo_with_self_loops, device, aux_graph) - if aux_graph is None: - self.decoding_graph = k2.create_fsa_vec([self.ctc_topo_inv.invert()]).to(self.device) - else: - self.decoding_graph = k2.create_fsa_vec([self.base_graph.detach()]).to(self.device) - - def to(self, device: torch.device): - if self.decoding_graph is not None: - self.decoding_graph = self.decoding_graph.to(device) - super().to(device) - - def compile( - self, targets: torch.Tensor, target_lengths: torch.Tensor, aux_graph: Optional['k2.Fsa'] = None, - ) -> Tuple['k2.Fsa', 'k2.Fsa']: - supervision_graphs = super().compile(targets, target_lengths, aux_graph) - if aux_graph is None and self.decoding_graph is None: - raise ValueError( - f"At least one of aux_graph and self.decoding_graph must be set: {aux_graph}, {self.decoding_graph}" - ) - elif aux_graph is not None: - self.decoding_graph = k2.create_fsa_vec([self.base_graph.detach()]).to(self.device) - return supervision_graphs, self.decoding_graph - - -class RnntTopologyCompiler(CtcTopologyCompiler): - """Default graph compiler for RNNT loss. - Each supervision graph is composed with the corresponding RNNT emission adapter. - - If max_adapter_length is provided, the maximum adapter length is limited. - - Note: - The actual number of classes is `num_classes` + 1 with as the class 0. - - Warning: - It is currently not recommended to use topologies other than "minimal". - """ - - def __init__( - self, - num_classes: int, - blank: int, - topo_type: str = "minimal", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - max_adapter_length: int = 0, - ): - if topo_type == "compact": - raise NotImplementedError(f"This compiler does not support topo_type==`compact`.") - super().__init__(num_classes, blank, topo_type, topo_with_self_loops, device) - from nemo.collections.asr.parts.k2.topologies import RnntEmissionAdapterBuilder - - self.max_adapter_length = max_adapter_length - self._builder = RnntEmissionAdapterBuilder(list(range(num_classes)), blank, num_classes) - - def compile(self, targets: torch.Tensor, target_lengths: torch.Tensor) -> 'k2.Fsa': - supervision_graphs = add_self_loops(super().compile(targets, target_lengths), self._builder.eps_num, "input") - - adapters = self._builder( - torch.where(target_lengths > self.max_adapter_length, self.max_adapter_length, target_lengths) - if self.max_adapter_length > 0 and self.max_adapter_length < target_lengths.max() - else target_lengths - ).to(device=self.device) - return k2.intersect(adapters, supervision_graphs, treat_epsilons_specially=False) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_decoders.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_decoders.py deleted file mode 100644 index 33218588b79f0900965439cdc62037119299eee9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_decoders.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import abstractmethod -from typing import List, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig -from nemo.collections.asr.parts.k2.loss_mixins import CtcK2Mixin, RnntK2Mixin -from nemo.collections.asr.parts.k2.utils import invert_permutation, load_graph -from nemo.utils import logging - - -class BaseDecoder(object): - """Base graph decoder with topology for decoding graph. - Typically uses the same parameters as for the corresponding loss function. - - Can do decoding and forced alignment. - - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. - """ - - @abstractmethod - def __init__( - self, - num_classes: int, - blank: int, - cfg: Optional[DictConfig] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - ): - - if cfg is not None: - intersect_pruned = cfg.get("intersect_pruned", intersect_pruned) - intersect_conf = cfg.get("intersect_conf", intersect_conf) - topo_type = cfg.get("topo_type", topo_type) - topo_with_self_loops = cfg.get("topo_with_self_loops", topo_with_self_loops) - - self.num_classes = num_classes - self.blank = blank - self.intersect_pruned = intersect_pruned - self.device = device - self.topo_type = topo_type - self.topo_with_self_loops = topo_with_self_loops - self.pad_fsavec = self.topo_type == "ctc_compact" - self.intersect_conf = intersect_conf - self.graph_compiler = None # expected to be initialized in child classes - self.base_graph = None # expected to be initialized in child classes - self.decoding_graph = None - - def to(self, device: torch.device): - if self.graph_compiler.device != device: - self.graph_compiler.to(device) - if self.base_graph.device != device: - self.base_graph = self.base_graph.to(device) - if self.decoding_graph is not None and self.decoding_graph.device != device: - self.decoding_graph = self.decoding_graph.to(device) - self.device = device - - def update_graph(self, graph: 'k2.Fsa'): - raise NotImplementedError - - def _decode_impl( - self, - log_probs: torch.Tensor, - supervisions: torch.Tensor, - return_lattices: bool = False, - return_ilabels: bool = False, - output_aligned: bool = True, - ) -> Union['k2.Fsa', Tuple[List[torch.Tensor], List[torch.Tensor]]]: - if self.decoding_graph is None: - self.decoding_graph = self.base_graph - - if log_probs.device != self.device: - self.to(log_probs.device) - emissions_graphs = self._prepare_emissions_graphs(log_probs, supervisions) - - if self.intersect_pruned: - lats = k2.intersect_dense_pruned( - a_fsas=self.decoding_graph, - b_fsas=emissions_graphs, - search_beam=self.intersect_conf.search_beam, - output_beam=self.intersect_conf.output_beam, - 
min_active_states=self.intersect_conf.min_active_states, - max_active_states=self.intersect_conf.max_active_states, - ) - else: - indices = torch.zeros(emissions_graphs.dim0(), dtype=torch.int32, device=self.device) - dec_graphs = ( - k2.index_fsa(self.decoding_graph, indices) - if self.decoding_graph.shape[0] == 1 - else self.decoding_graph - ) - lats = k2.intersect_dense(dec_graphs, emissions_graphs, self.intersect_conf.output_beam) - self.decoding_graph = None - - order = supervisions[:, 0] - if return_lattices: - lats = k2.index_fsa(lats, invert_permutation(order).to(device=log_probs.device)) - if self.blank != 0: - # change only ilabels - # suppose self.blank == self.num_classes - 1 - lats.labels = torch.where(lats.labels == 0, self.blank, lats.labels - 1) - return lats - else: - shortest_path_fsas = k2.index_fsa( - k2.shortest_path(lats, True), invert_permutation(order).to(device=log_probs.device), - ) - return self._extract_labels_and_probabilities(shortest_path_fsas, return_ilabels, output_aligned) - - def decode( - self, - log_probs: torch.Tensor, - log_probs_length: torch.Tensor, - return_lattices: bool = False, - return_ilabels: bool = False, - output_aligned: bool = True, - ) -> Union['k2.Fsa', Tuple[List[torch.Tensor], List[torch.Tensor]]]: - log_probs, supervisions, _, _ = self._prepare_log_probs_and_targets(log_probs, log_probs_length, None, None) - return self._decode_impl( - log_probs, - supervisions, - return_lattices=return_lattices, - return_ilabels=return_ilabels, - output_aligned=output_aligned, - ) - - def align( - self, - log_probs: torch.Tensor, - log_probs_length: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - return_lattices: bool = False, - return_ilabels: bool = False, - output_aligned: bool = True, - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - log_probs, supervisions, targets, target_lengths = self._prepare_log_probs_and_targets( - log_probs, log_probs_length, targets, target_lengths - ) - order = supervisions[:, 0].to(dtype=torch.long) - self.decoding_graph = self.graph_compiler.compile(targets[order], target_lengths[order]) - return self._decode_impl( - log_probs, - supervisions, - return_lattices=return_lattices, - return_ilabels=return_ilabels, - output_aligned=output_aligned, - ) - - -class CtcDecoder(BaseDecoder, CtcK2Mixin): - """Regular CTC graph decoder with custom topologies. - Available topologies: - - `default`, with or without self-loops - - `compact`, with or without self-loops - - `shared_blank`, with or without self-loops - - `minimal`, without self-loops - - Can do decoding and forced alignment. - """ - - def __init__( - self, - num_classes: int, - blank: int, - cfg: Optional[DictConfig] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - ): - super().__init__( - num_classes, blank, cfg, intersect_pruned, intersect_conf, topo_type, topo_with_self_loops, device - ) - from nemo.collections.asr.parts.k2.graph_compilers import CtcTopologyCompiler - - self.graph_compiler = CtcTopologyCompiler( - self.num_classes, self.blank, self.topo_type, self.topo_with_self_loops, self.device - ) - self.base_graph = k2.create_fsa_vec([self.graph_compiler.ctc_topo_inv.invert()]).to(self.device) - - -class RnntAligner(BaseDecoder, RnntK2Mixin): - """RNNT graph decoder with the `minimal` topology. 
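A hedged sketch of driving the `CtcDecoder` above on dummy acoustic log-probabilities (k2 required; shapes and the blank index are illustrative):

```python
# Hedged sketch: best-path decoding with CtcDecoder on random log-probs.
import torch
from nemo.collections.asr.parts.k2.graph_decoders import CtcDecoder

num_classes, blank = 5, 4                                   # hypothetical vocab with blank as last class
decoder = CtcDecoder(num_classes=num_classes, blank=blank, topo_type="default")

log_probs = torch.randn(2, 50, num_classes).log_softmax(dim=-1)    # [B, T, D]
log_probs_length = torch.tensor([50, 37])

labels, probs = decoder.decode(log_probs, log_probs_length, return_lattices=False)
# labels: per-utterance best-path label tensors; probs: matching per-arc probabilities
```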
- If predictor_window_size is not provided, this decoder works as a Viterbi over regular RNNT lattice. - With predictor_window_size provided, it applies uniform pruning when compiling Emission FSAs - to reduce memory and compute consumption. - - Can only do forced alignment. - """ - - def __init__( - self, - num_classes: int, - blank: int, - cfg: Optional[DictConfig] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - topo_type: str = "default", - topo_with_self_loops: bool = True, - predictor_window_size: int = 0, - predictor_step_size: int = 1, - device: torch.device = torch.device("cpu"), - ): - if cfg is not None: - topo_type = cfg.get("topo_type", topo_type) - predictor_window_size = cfg.get("predictor_window_size", predictor_window_size) - predictor_step_size = cfg.get("predictor_step_size", predictor_step_size) - if topo_type != "minimal": - raise NotImplementedError(f"Only topo_type=`minimal` is supported at the moment.") - super().__init__( - num_classes, blank, cfg, intersect_pruned, intersect_conf, topo_type, topo_with_self_loops, device - ) - self.predictor_window_size = predictor_window_size - self.predictor_step_size = predictor_step_size - from nemo.collections.asr.parts.k2.graph_compilers import RnntTopologyCompiler - - self.graph_compiler = RnntTopologyCompiler( - self.num_classes, - self.blank, - self.topo_type, - self.topo_with_self_loops, - self.device, - max_adapter_length=self.predictor_window_size, - ) - self.base_graph = self.graph_compiler.base_graph - - def decode( - self, - log_probs: torch.Tensor, - log_probs_length: torch.Tensor, - return_lattices: bool = False, - return_ilabels: bool = False, - output_aligned: bool = True, - ) -> Union['k2.Fsa', Tuple[List[torch.Tensor], List[torch.Tensor]]]: - raise NotImplementedError("RNNT decoding is not implemented. Only .align(...) method is supported.") - - def align( - self, - log_probs: torch.Tensor, - log_probs_length: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - return_lattices: bool = False, - return_ilabels: bool = False, - output_aligned: bool = True, - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - assert self.predictor_window_size == 0 or log_probs.size(2) <= self.predictor_window_size + 1 - - return super().align( - log_probs, - log_probs_length, - targets, - target_lengths, - return_lattices=return_lattices, - return_ilabels=return_ilabels, - output_aligned=output_aligned, - ) - - -class TokenLMDecoder(BaseDecoder): - """Graph decoder with token_lm-based decoding graph. - Available topologies: - - `default`, with or without self-loops - - `compact`, with or without self-loops - - `shared_blank`, with or without self-loops - - `minimal`, without self-loops - - Can do decoding and forced alignment. - - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. 
- """ - - def __init__( - self, - num_classes: int, - blank: int, - cfg: Optional[DictConfig] = None, - token_lm: Optional[Union['k2.Fsa', str]] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - topo_type: str = "default", - topo_with_self_loops: bool = True, - device: torch.device = torch.device("cpu"), - ): - super().__init__( - num_classes, blank, cfg, intersect_pruned, intersect_conf, topo_type, topo_with_self_loops, device - ) - if cfg is not None: - token_lm = cfg.get("token_lm", token_lm) - if token_lm is not None: - self.token_lm = load_graph(token_lm) if isinstance(token_lm, str) else token_lm - if self.token_lm is not None: - self.update_graph(self.token_lm) - else: - logging.warning( - f"""token_lm was set to None. Use this for debug - purposes only or call .update_graph(token_lm) before using.""" - ) - else: - logging.warning( - f"""token_lm was set to None. Use this for debug - purposes only or call .update_graph(token_lm) before using.""" - ) - self.token_lm = None - - def update_graph(self, graph: 'k2.Fsa'): - self.token_lm = graph - token_lm = self.token_lm.clone() - if hasattr(token_lm, "aux_labels"): - delattr(token_lm, "aux_labels") - labels = token_lm.labels - if labels.max() != self.num_classes - 1: - raise ValueError(f"token_lm is not compatible with the num_classes: {labels.unique()}, {self.num_classes}") - self.graph_compiler = CtcNumGraphCompiler( - self.num_classes, self.blank, self.topo_type, self.topo_with_self_loops, self.device, token_lm - ) - self.base_graph = k2.create_fsa_vec([self.graph_compiler.base_graph]).to(self.device) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_transducer.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_transducer.py deleted file mode 100644 index 5de8064224a168c7bfd7c1dd4c5ca1b8c5c9d4a8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/graph_transducer.py +++ /dev/null @@ -1,483 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc -from contextlib import nullcontext -from typing import ContextManager -import torch -import torch.nn.functional as F - -from nemo.core.classes.loss import Loss -from nemo.core.utils.k2_guard import k2 - - -def force_float32_context() -> ContextManager: - """Get context manager to force float32 precision in autocast mode.""" - if torch.is_autocast_enabled(): - return torch.cuda.amp.autocast(dtype=torch.float32) - return nullcontext() - - -class GraphTransducerLossBase(Loss): - """ - Base class for graph transducer losses. - Implementation of the approach described in "Powerful and Extensible WFST Framework for RNN-Transducer Losses" - https://ieeexplore.ieee.org/document/10096679 - - Compose-Transducer: compose the unit (target text) and temporal schemas (graphs) into lattice. - Subclass should implement `get_unit_schema` and `get_temporal_schema` methods. 
- Grid-Transducer: construct the RNN-T lattice (grid) directly in code. - Subclass should implement `get_grid` method. - """ - - def __init__( - self, use_grid_implementation: bool, connect_composed=False, double_scores=False, cast_to_float32=False - ): - """ - - Args: - use_grid_implementation: Whether to use the grid implementation (Grid-Transducer). - connect_composed: Connect graph after composing unit and temporal schemas (only for Compose-Transducer). - `connect` operation is slow, it is useful for visualization, but not necessary for loss computation. - double_scores: Use calculation of loss in double precision (float64) in the lattice. - Does not significantly affect memory usage since the lattice is ~V/2 times smaller - than the joint tensor. - cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation. - """ - super().__init__() - self.use_grid_implementation = use_grid_implementation - self.connect_composed = connect_composed - self.double_scores = double_scores - self.cast_to_float32 = cast_to_float32 - - @abc.abstractmethod - def get_unit_schema(self, units_tensor: torch.Tensor, vocab_size: int) -> "k2.Fsa": - """ - Get unit schema (target text) graph for Compose-Transducer. - - Args: - units_tensor: tensor with target text - vocab_size: number of labels (including blank). Needed to construct additional eps-arcs (in some cases). - - Returns: - unit schema graph (k2.Fsa). - Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - pass - - @abc.abstractmethod - def get_temporal_schema(self, num_frames: int, vocab_size: int, device: torch.device) -> "k2.Fsa": - """ - Get temporal schema graph for Compose-Transducer. - - Args: - num_frames: length of the sequence (in frames) - vocab_size: number of labels (including blank) - device: device for tensor to construct - - Returns: - temporal schema graph (k2.Fsa). - Labels: :. is a unit from vocab + special units (e.g., additional eps). - """ - pass - - @abc.abstractmethod - def get_grid(self, units_tensor: torch.Tensor, num_frames: int, vocab_size: int) -> "k2.Fsa": - """ - Construct the transducer lattice (grid) directly for Grid-Transducer. - - Args: - units_tensor: tensor with target text - num_frames: length of the sequence (in frames) - vocab_size: number of labels (including blank) - - Returns: - transducer lattice (k2.Fsa). - Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - pass - - def get_composed_lattice(self, units_tensor: torch.Tensor, num_frames: int, vocab_size: int) -> "k2.Fsa": - """ - Get composed lattice (unit and temporal schemas) for Compose-Transducer. Useful for visualization. - Should be equivalent to the lattice from `get_grid` method. - - Args: - units_tensor: tensor with target text - num_frames: length of the sequence (in frames) - vocab_size: vocab size (including blank) - - Returns: - composed lattice (k2.Fsa) from unit and temporal schemas - """ - fsa_text = self.get_unit_schema(units_tensor, vocab_size) - fsa_temporal = self.get_temporal_schema(num_frames, vocab_size, units_tensor.device) - composed = k2.compose(fsa_text, fsa_temporal, treat_epsilons_specially=False) - if self.connect_composed: - composed = k2.connect(composed) - return composed - - def get_graphs_batched( - self, logits_lengths: torch.Tensor, targets: torch.Tensor, target_lengths: torch.Tensor, vocab_size: int - ) -> "k2.Fsa": - """ - Get batched lattice (grid or composed) for the batch of sequences. 
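The `force_float32_context` helper defined at the top of this file is easy to reuse outside the loss; a minimal sketch:

```python
# Minimal sketch: force the log-softmax of the joint tensor back to float32 under autocast.
import torch
import torch.nn.functional as F

from nemo.collections.asr.parts.k2.graph_transducer import force_float32_context

def stable_log_softmax(joint: torch.Tensor) -> torch.Tensor:
    # nullcontext() when autocast is off, a float32 autocast region otherwise
    with force_float32_context():
        return F.log_softmax(joint, dim=-1)
```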
- - Args: - logits_lengths: tensor with lengths of logits - targets: tensor with target units - target_lengths: tensor with lengths of targets - vocab_size: vocab size (including blank) - - Returns: - batched lattice - FsaVec (k2.Fsa) - """ - batch_size = logits_lengths.shape[0] - with torch.no_grad(): - if self.use_grid_implementation: - return k2.create_fsa_vec( - [ - self.get_grid( - units_tensor=targets[i, : target_lengths[i].item()], - num_frames=logits_lengths[i].item(), - vocab_size=vocab_size, - ) - for i in range(batch_size) - ] - ) - - # composed version - text_fsas = [ - self.get_unit_schema(units_tensor=targets[i, : target_lengths[i].item()], vocab_size=vocab_size,) - for i in range(batch_size) - ] - temporal_fsas = [ - self.get_temporal_schema( - num_frames=logits_lengths[i].item(), vocab_size=vocab_size, device=targets.device - ) - for i in range(batch_size) - ] - target_fsas_vec = k2.compose( - k2.create_fsa_vec(text_fsas), k2.create_fsa_vec(temporal_fsas), treat_epsilons_specially=False - ) - if self.connect_composed: - k2.connect(target_fsas_vec) - return target_fsas_vec - - def get_logits_indices(self, target_fsas_vec: k2.Fsa, logits_shape: torch.Size) -> torch.Tensor: - """ - Get indices of flatten logits for each arc in the lattices. - - Args: - target_fsas_vec: batch of target FSAs with lattices - logits_shape: shape of the logits tensor - - Returns: - 1d tensor with indices - """ - # logits_shape: B x Time x Text+1 x Labels - batch_size = logits_shape[0] - device = target_fsas_vec.device - scores_to_batch_i = torch.repeat_interleave( - torch.arange(batch_size, device=device, dtype=torch.int64), - torch.tensor( - [target_fsas_vec.arcs.index(0, i)[0].values().shape[0] for i in range(batch_size)], device=device, - ), - ) - indices = ( - scores_to_batch_i * logits_shape[1] * logits_shape[2] * logits_shape[3] # Batch - + target_fsas_vec.aux_labels.to(torch.int64) * logits_shape[2] * logits_shape[3] # Time indices - + target_fsas_vec.unit_positions.to(torch.int64) * logits_shape[3] # Units (text) indices - + target_fsas_vec.labels.to(torch.int64) # Labels - ) - return indices - - -class GraphRnntLoss(GraphTransducerLossBase): - """ - RNN-T loss implementation based on WFST according - to "Powerful and Extensible WFST Framework for RNN-Transducer Losses" - https://ieeexplore.ieee.org/document/10096679 - """ - - def __init__( - self, - blank: int, - use_grid_implementation=True, - connect_composed=False, - double_scores=False, - cast_to_float32=False, - ): - """ - Init method - - Args: - blank: blank label index - use_grid_implementation: Whether to use the grid implementation (Grid-Transducer). - connect_composed: Connect graph after composing unit and temporal schemas (only for Compose-Transducer). - `connect` operation is slow, it is useful for visualization, but not necessary for loss computation. - double_scores: Use calculation of loss in double precision (float64) in the lattice. - Does not significantly affect memory usage since the lattice is ~V/2 times smaller than the joint tensor. - cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation. - """ - super().__init__( - use_grid_implementation=use_grid_implementation, - connect_composed=connect_composed, - double_scores=double_scores, - cast_to_float32=cast_to_float32, - ) - self.blank = blank - - def get_unit_schema(self, units_tensor: torch.Tensor, vocab_size: int) -> "k2.Fsa": - """ - Get unit schema (target text) graph for RNN-T loss (Compose-Transducer). 
- Forward arcs represent text labels. - - Example graph: text [1, 2], blank=0. - - graph:: - - 0:0:0 0:0:1 0:0:2 - +-------+ +-------+ +-------+ - v | v | v | - +-----------+ 1:1:0 +-----------+ 2:2:1 +-----------+ -1:-1:-1 #===# - | 0 | -------> | 1 | -------> | 2 | ---------> H 3 H - +-----------+ +-----------+ +-----------+ #===# - - Args: - units_tensor: 1d tensor with text units - vocab_size: number of total labels (vocab size including blank) - - Returns: - unit schema graph (k2.Fsa). - Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - - blank_id = self.blank - device = units_tensor.device - text_len = units_tensor.shape[0] - - # arcs - # text_len + 1 states, in every state - self-loops (blank) and forward (text label / last forward -1) - arcs = torch.zeros(((text_len + 1) * 2, 4), dtype=torch.int32, device=device) - text_indices = torch.arange(0, text_len + 1, dtype=torch.int32, device=device) - # blank labels - arcs[::2, 0] = text_indices # from state - arcs[::2, 1] = text_indices # to state - arcs[::2, 2] = blank_id - - # text labels - arcs[1::2, 0] = text_indices # from state - arcs[1::2, 1] = text_indices + 1 # to state - arcs[1:-1:2, 2] = units_tensor # labels: text - - arcs[-1, 2] = -1 # last transition to final state, ilabel=-1 (special for k2) - olabels = arcs[:, 2].detach().clone() # same as ilabels - - fsa_text = k2.Fsa(arcs, olabels) - fsa_text.unit_positions = text_indices.expand(2, -1).transpose(0, 1).flatten() - fsa_text.unit_positions[-1] = -1 # last transition to final state - return fsa_text - - def get_temporal_schema(self, num_frames: int, vocab_size: int, device: torch.device) -> "k2.Fsa": - """ - Get temporal schema graph for RNN-T loss (Compose-Transducer). - Forward arc - blank, self-loops - all labels excluding blank - - Example graph: blank=0, num_frames=3, vocab_size=3. - Labels: :. is a unit from vocab. - - graph:: - - 1:0 1:1 1:2 - +-----+ +-----+ +-----+ - v | v | v | - +---------+ 0:0 +---------+ 0:1 +---------+ 0:2 +---+ -1:-1 #===# - | 0 | -----> | 1 | -----> | 2 | -----> | 3 | -------> H 4 H - +---------+ +---------+ +---------+ +---+ #===# - ^ 2:0 | ^ 2:1 | ^ 2:2 | - +-----+ +-----+ +-----+ - - Args: - num_frames: length of the sequence (in frames) - vocab_size: number of labels (including blank) - device: device for tensor to construct - - Returns: - temporal schema graph (k2.Fsa). - Labels: :. is a unit from vocab. - """ - blank_id = self.blank - - fsa_temporal_arcs = torch.zeros((num_frames * vocab_size + 1, 4), dtype=torch.int32, device=device) - sequence_states = torch.arange(0, num_frames, dtype=torch.int32, device=device) - # for every state - vocab_size arcs, [0, 1, ..., vocab_size-1, 0, 1, ..., vocab_size-1, ...] 
- start_states = sequence_states.expand(vocab_size, num_frames).transpose(0, 1).flatten() - # first: make all arcs - self-loops - fsa_temporal_arcs[:-1, 0] = start_states # from - fsa_temporal_arcs[:-1, 1] = start_states # to - fsa_temporal_arcs[:-1, 2] = ( - torch.arange(0, vocab_size, dtype=torch.int32, device=device).expand(num_frames, vocab_size).flatten() - ) - - # blank-arcs: forward - fsa_temporal_arcs[blank_id:-1:vocab_size, 1] = sequence_states + 1 # blanks - - # transition to last final state - fsa_temporal_arcs[-1, :3] = torch.tensor((num_frames, num_frames + 1, -1), dtype=torch.int32, device=device) - - # output symbols: position in the sequence, same as start states for arcs - olabels = fsa_temporal_arcs[:, 0].detach().clone() - olabels[-1] = -1 # last arc to final state - - fsa_temporal = k2.Fsa(fsa_temporal_arcs, olabels) - fsa_temporal = k2.arc_sort(fsa_temporal) # need for compose - return fsa_temporal - - @staticmethod - def relabel_states(states: torch.Tensor, n: int, m: int) -> torch.Tensor: - """ - Relabel states to be in topological order: by diagonals - - Args: - states: tensor with states - n: number of rows - m: number of columns - - Returns: - tensor with relabeled states (same shape as `states`) - """ - i = states % n - j = torch.div(states, n, rounding_mode='floor') # states // n, torch.div to avoid pytorch warnings - min_mn = min(m, n) - max_mn = max(m, n) - diag = i + j - anti_diag = m + n - 1 - diag - max_idx = n * m - 1 - cur_diag_idx = i if m > n else m - j - 1 - states = ( - diag.lt(min_mn) * ((diag * (diag + 1) >> 1) + i) - + torch.logical_and(diag.ge(min_mn), diag.lt(max_mn)) - * ((min_mn * (min_mn + 1) >> 1) + (diag - min_mn) * min_mn + cur_diag_idx) - + diag.ge(max_mn) * (max_idx - (anti_diag * (anti_diag + 1) >> 1) + m - j) - ) - return states - - def get_grid(self, units_tensor: torch.Tensor, num_frames: int, vocab_size: int) -> "k2.Fsa": - """ - Construct the RNN-T lattice directly (Grid-Transducer). - - Args: - units_tensor: 1d tensor with text units - num_frames: length of the sequence (number of frames) - vocab_size: number of total labels (vocab size including blank) - - Returns: - transducer lattice (k2.Fsa). 
- Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - blank_id = self.blank - text_length = units_tensor.shape[0] - device = units_tensor.device - num_grid_states = num_frames * (text_length + 1) - num_forward_arcs = (num_frames - 1) * (text_length + 1) - num_text_arcs = text_length * num_frames - arcs = torch.zeros((num_forward_arcs + num_text_arcs + 2, 4), dtype=torch.int32, device=device) - # blank transitions - # i, i+, 0 , i / , i % - from_states = torch.arange(num_forward_arcs, device=device) - to_states = from_states + (text_length + 1) - arcs[:num_forward_arcs, 0] = from_states - arcs[:num_forward_arcs, 1] = to_states - arcs[:num_forward_arcs, 2] = blank_id - - # text arcs - from_states = ( - torch.arange(num_grid_states, dtype=torch.int32, device=device) - .reshape(num_frames, text_length + 1)[:, :-1] - .flatten() - ) - to_states = from_states + 1 - ilabels = units_tensor.expand(num_frames, -1).flatten() - arcs[num_forward_arcs:-2, 0] = from_states - arcs[num_forward_arcs:-2, 1] = to_states - arcs[num_forward_arcs:-2, 2] = ilabels - - # last 2 states - arcs[-2, :3] = torch.tensor((num_grid_states - 1, num_grid_states, blank_id), dtype=torch.int32, device=device) - arcs[-1, :3] = torch.tensor((num_grid_states, num_grid_states + 1, -1), dtype=torch.int32, device=device) - - # sequence indices, time indices - olabels = torch.div(arcs[:, 0], (text_length + 1), rounding_mode="floor") # arcs[:, 0] // (text_length + 1) - unit_positions = arcs[:, 0] % (text_length + 1) - # last state: final - olabels[-1] = -1 - unit_positions[-1] = -1 - - # relabel - # instead of using top sort (extremely expensive) k2.top_sort(rnnt_graph) - arcs[:-2, 0] = self.relabel_states(arcs[:-2, 0], text_length + 1, num_frames) - arcs[:-3, 1] = self.relabel_states(arcs[:-3, 1], text_length + 1, num_frames) - - # sort by start state - required in k2 - # TODO: maybe it is more optimal to avoid sort, construct arcs in ascending order - _, indices = torch.sort(arcs[:, 0], dim=0) - sorted_arcs = arcs[indices] - olabels = olabels[indices] - unit_positions = unit_positions[indices] - - rnnt_graph = k2.Fsa(sorted_arcs, olabels) - rnnt_graph.unit_positions = unit_positions - return rnnt_graph - - def forward( - self, acts: torch.Tensor, labels: torch.Tensor, act_lens: torch.Tensor, label_lens: torch.Tensor, - ) -> torch.Tensor: - """ - Compute forward method for RNN-T. - - Args: - acts: activations (joint tensor). 
NB: raw logits, not after log-softmax - labels: target labels - act_lens: lengths of activations - label_lens: length of labels sequences - - Returns: - batch of RNN-T scores (loss) - """ - # argument names are consistent with NeMo, see RNNTLoss.forward: - # self._loss(acts=log_probs, labels=targets, act_lens=input_lengths, label_lens=target_lengths) - logits, targets, logits_lengths, target_lengths = acts, labels, act_lens, label_lens - - # logits: B x Time x Text+1 x C - vocab_size = logits.shape[-1] - target_fsas_vec = self.get_graphs_batched(logits_lengths, targets, target_lengths, vocab_size) - - cast_context = force_float32_context() if self.cast_to_float32 else nullcontext() - with cast_context: - log_probs = F.log_softmax(logits, dim=-1) - with torch.no_grad(): - indices = self.get_logits_indices(target_fsas_vec, logits.shape) - # transition to the last state - # use 0 index (for valid index_select) and manually assign score after index_select for this case - indices[target_fsas_vec.labels == -1] = 0 - - # NB: do not assign scores -> modify, k2 will not update all scores correctly (modify -> assign) - scores = log_probs.flatten().index_select(-1, indices) - # fix weights for the arcs to the last state - scores[target_fsas_vec.labels == -1] = 0 - - target_fsas_vec.scores = scores - scores = -1 * target_fsas_vec.get_tot_scores(use_double_scores=self.double_scores, log_semiring=True) - return scores diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/loss_mixins.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/loss_mixins.py deleted file mode 100644 index ad8286e43e236c7bee8caa8e4d7a4a287d1e4d90..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/loss_mixins.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC -from typing import List, Optional, Tuple - -import torch - -from nemo.collections.asr.parts.k2.grad_utils import GradExpNormalize -from nemo.collections.asr.parts.k2.utils import ( - create_supervision, - get_arc_weights, - get_uniform_rnnt_prune_ranges, - make_non_pad_mask, - make_non_pad_mask_3d, - prep_padded_densefsavec, -) -from nemo.core.utils.k2_guard import k2 # import k2 from guard module - - -class CtcK2Mixin(ABC): - """k2 Mixin class that simplifies the construction of various k2-based CTC-like losses. - - It does the following: - - Prepares and adapts the input tensors (method _prepare_log_probs_and_targets). - - Creates Emissions graphs (method _prepare_emissions_graphs). - - Extracts the labels and probabilities of the best lattice path (method _extract_labels_and_probabilities). 
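Tying the pieces of graph_transducer.py together, a hedged end-to-end sketch of calling the `GraphRnntLoss.forward` shown above on a toy joint tensor (k2 required; all sizes are made up):

```python
# Hedged sketch: per-utterance RNN-T scores from GraphRnntLoss.
import torch
from nemo.collections.asr.parts.k2.graph_transducer import GraphRnntLoss

B, T, U, V = 2, 20, 5, 8                                   # batch, frames, target length, vocab incl. blank
loss_fn = GraphRnntLoss(blank=0)

acts = torch.randn(B, T, U + 1, V, requires_grad=True)     # raw joint logits (not log-softmaxed)
labels = torch.randint(1, V, (B, U))
act_lens = torch.tensor([T, T - 4])
label_lens = torch.tensor([U, U - 2])

per_utt_scores = loss_fn(acts, labels, act_lens, label_lens)   # shape [B]
per_utt_scores.sum().backward()
```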
- """ - - def _prepare_log_probs_and_targets( - self, - log_probs: torch.Tensor, - input_lengths: torch.Tensor, - targets: Optional[torch.Tensor] = None, - target_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: - """Creates k2-style supervisions and shifts targets by one if the number is not zero. - """ - assert log_probs.size(-1) == self.num_classes - supervisions = create_supervision(input_lengths) - # shift targets to make output epsilon ID zero - return ( - log_probs, - supervisions, - torch.where(targets < self.blank, targets + 1, targets) if targets is not None else None, - target_lengths, - ) - - def _prepare_emissions_graphs(self, log_probs: torch.Tensor, supervisions: torch.Tensor) -> 'k2.DenseFsaVec': - """Creates DenseFsaVec, padding it with frames if the topology is `compact`. - In particular, every second frame of the DenseFsaVec is the frame. - - frame is a frame with log-probability zero and every other log-probability is -inf. - """ - return ( - prep_padded_densefsavec(log_probs, supervisions) - if self.pad_fsavec - else k2.DenseFsaVec(log_probs, supervisions) - ) - - def _maybe_normalize_gradients(self, log_probs: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor: - """PyTorch is doing the log-softmax normalization as part of the CTC computation. - More: https://github.com/k2-fsa/k2/issues/575 - """ - return GradExpNormalize.apply(log_probs, input_lengths, "mean" if self.reduction != "sum" else "none") - - def _extract_labels_and_probabilities( - self, shortest_path_fsas: 'k2.Fsa', return_ilabels: bool = False, output_aligned: bool = True - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - """Extracts the labels and probabilities of the best lattice path, - dropping arcs and restoring the targets shift, if needed. - """ - shortest_paths = [] - probs = [] - # direct iterating does not work as expected - for i in range(shortest_path_fsas.shape[0]): - shortest_path_fsa = shortest_path_fsas[i] - # suppose that artificial input epsilon numbers >= self.num_classes - non_eps_mask = (shortest_path_fsa.labels != -1) & (shortest_path_fsa.labels < self.num_classes) - if return_ilabels: - labels = shortest_path_fsa.labels[non_eps_mask] - else: - labels = shortest_path_fsa.aux_labels[non_eps_mask] - if self.blank != 0: - # suppose output epsilon number == 0 - # since the input epsilons were removed, we treat all remaining epsilons as blanks - labels[labels == 0] = self.blank - labels[(labels > 0) & (labels < self.blank)] -= 1 - labels = labels.to(dtype=torch.long) - if not return_ilabels and not output_aligned: - labels = labels[labels != self.blank] - shortest_paths.append(labels) - probs.append(get_arc_weights(shortest_path_fsa)[non_eps_mask].exp().to(device=shortest_path_fsas.device)) - return shortest_paths, probs - - -class RnntK2Mixin(CtcK2Mixin): - """k2 Mixin class that simplifies the construction of various k2-based RNNT-like losses. Inherits CtcK2Mixin. - - It does the following: - - Prepares and adapts the input tensors. - - Creates Emissions graphs. - - Extracts the labels and probabilities of the best lattice path (method _extract_labels_and_probabilities). 
- """ - - def _prepare_log_probs_and_targets( - self, - log_probs: torch.Tensor, - input_lengths: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Before calling super()._prepare_log_probs_and_targets, this method reshapes the log_probs tensor - from (B, T, U+1, D) to (B, T', D) where T' = T*(U+1), shifts paddings along T and U towards the end of T', - and recomputes input_lengths. - - It also calculates indices on which steps should be applied to the log_probs tensor to emulate - arcs shift of the Emissions graph for the pruned RNNT variant. - """ - assert len(log_probs.size()) == 4 # B T U D - B, T, U, D = log_probs.size() - TU = T * U - - # save step indices if, as we assume, decoder output pruning has been applied - if self.predictor_window_size > 0 and self.predictor_window_size < target_lengths.max(): - window_size_with_blank = self.predictor_window_size + 1 - ranges_begin = get_uniform_rnnt_prune_ranges( - input_lengths, target_lengths, window_size_with_blank, self.predictor_step_size, T, True - ) - step_sizes = ranges_begin[:, 1:] - ranges_begin[:, :-1] - raw_step_indices = torch.where(step_sizes > 0) - if self.predictor_step_size > 1: - raw_step_indices = torch.repeat_interleave( - torch.stack(raw_step_indices).T, step_sizes[raw_step_indices], dim=0 - ).T - raw_step_indices = (raw_step_indices[0], raw_step_indices[1]) - unique, count = torch.unique(raw_step_indices[0], return_counts=True) - shift_mask = raw_step_indices[0].unsqueeze(0).repeat(len(unique), 1) == unique.unsqueeze(-1) - step_indices = ( - raw_step_indices[0], - ( - torch.arange(ranges_begin.size(1)).unsqueeze(0).repeat(ranges_begin.size(0), 1) - * window_size_with_blank - )[(raw_step_indices[0], raw_step_indices[1] + 1)] - + torch.cumsum(shift_mask, 1)[shift_mask] - - 1, - ) - max_count = count.max() - max_count_vec = torch.full((B,), max_count) - max_count_vec[unique] -= count - pad_indices_row = torch.repeat_interleave(torch.arange(B), max_count_vec) - pad_unique = torch.unique(pad_indices_row) - pad_shift_mask = pad_indices_row.unsqueeze(0).repeat(len(pad_unique), 1) == pad_unique.unsqueeze(-1) - pad_indices = ( - pad_indices_row, - T * window_size_with_blank + max_count - torch.cumsum(pad_shift_mask, 1)[pad_shift_mask], - ) - self.__step_indices = ( - torch.cat((step_indices[0], pad_indices[0])), - torch.cat((step_indices[1], pad_indices[1])), - ) - self.__supervisions_add = max_count - max_count_vec - else: - self.__step_indices = None - self.__supervisions_add = None - - # reshape 4D log_probs to 3D with respect to target_lengths - non_pad_mask_true = make_non_pad_mask_3d(input_lengths, target_lengths + 1, T, U).flatten(1) - input_lengths = non_pad_mask_true.sum(1) - non_pad_mask_fake = make_non_pad_mask(input_lengths, TU).flatten() - non_pad_mask_true = non_pad_mask_true.flatten() - rearranged_indices = torch.arange(TU * B, device=log_probs.device) - rearranged_indices_buffer = rearranged_indices.clone() - rearranged_indices[non_pad_mask_fake] = rearranged_indices_buffer[non_pad_mask_true] - rearranged_indices[~non_pad_mask_fake] = rearranged_indices_buffer[~non_pad_mask_true] - log_probs = log_probs.reshape(-1, D)[rearranged_indices].view(B, -1, D) - - return super()._prepare_log_probs_and_targets(log_probs, input_lengths, targets, target_lengths) - - def _prepare_emissions_graphs(self, log_probs: torch.Tensor, supervisions: torch.Tensor) -> 'k2.DenseFsaVec': - """Overrides super()._prepare_emissions_graphs. 
- Creates DenseFsaVec, adding outputs to the end of the D dimension. - - If pruning is used, this method also pads the DenseFsaVec with frames - according to the steps, calculated before. - - frame is a frame with log-probability zero and every other log-probability is -inf. - """ - if self.__step_indices is None or self.__supervisions_add is None: - log_probs_eps = torch.cat( - (log_probs, torch.zeros((log_probs.size(0), log_probs.size(1), 1), device=log_probs.device)), dim=2 - ) - else: - mask = torch.zeros( - (log_probs.size(0), log_probs.size(1) + int(len(self.__step_indices[0]) / log_probs.size(0))), - dtype=torch.bool, - ) - mask[self.__step_indices] = True - log_probs_eps = torch.zeros((mask.size(0), mask.size(1), log_probs.size(2) + 1), device=log_probs.device) - log_probs_eps[mask] = torch.tensor( - [torch.finfo(torch.float32).min] * log_probs.size(2) + [0], device=log_probs.device - ) - log_probs_eps[~mask] = torch.cat( - (log_probs, torch.zeros((log_probs.size(0), log_probs.size(1), 1), device=log_probs.device)), dim=2 - ).view(-1, log_probs.size(-1) + 1) - input_lengths = supervisions[:, -1] + self.__supervisions_add[supervisions[:, 0].to(dtype=torch.long)] - if not torch.all(input_lengths[:-1] - input_lengths[1:] >= 0): - # have to reorder supervisions inplace - order = torch.argsort(input_lengths, descending=True) - # the second column is assumed to be zero - supervisions[:, 0] = supervisions[order, 0] - supervisions[:, -1] = input_lengths[order] - else: - supervisions[:, -1] = input_lengths - self.__step_indices = None - self.__supervisions_add = None - return k2.DenseFsaVec(log_probs_eps, supervisions) - - def _maybe_normalize_gradients(self, log_probs: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor: - """Not required for RNNT. - """ - return log_probs diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/map_loss.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/map_loss.py deleted file mode 100644 index c261a4f2ef6b12af8802a0db6fc08591b1c2450e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/map_loss.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2020, Xiaomi CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
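Before moving on to map_loss.py: the padding frames that `RnntK2Mixin._prepare_emissions_graphs` above inserts are simply rows whose only non-minimal entry is the artificial last output, for example:

```python
# Illustration of a padding frame as built above: float32 min ("-inf") for every real output,
# 0.0 for the artificial extra output appended at the end of the D dimension.
import torch

vocab_size = 4
pad_frame = torch.tensor([torch.finfo(torch.float32).min] * vocab_size + [0.0])
# -> tensor([-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00])
```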
- -from abc import abstractmethod -from typing import Any, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.parts.k2.classes import GraphIntersectDenseConfig -from nemo.collections.asr.parts.k2.loss_mixins import CtcK2Mixin -from nemo.collections.asr.parts.k2.ml_loss import MLLoss -from nemo.collections.asr.parts.k2.utils import ( - create_sparse_wrapped, - get_tot_objf_and_finite_mask, - invert_permutation, - load_graph, -) -from nemo.core.utils.k2_guard import k2 # import k2 from guard module -from nemo.utils import logging - - -class MAPLoss(MLLoss): - """ - Maximum a Posteriori Probability criterion. - It implements Lattice-Free Maximum Mutual Information (LF-MMI) and LF-boosted-MMI (LF-bMMI) losses. - - Based on https://github.com/k2-fsa/snowfall/blob/master/snowfall/objectives/mmi.py - - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. - """ - - @abstractmethod - def __init__( - self, - num_classes: int, - blank: int, - reduction: str, - cfg: Optional[DictConfig] = None, - topo_type: str = "default", - topo_with_self_loops: bool = True, - token_lm: Optional[Union['k2.Fsa', str]] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - boost_coeff: float = 0.0, - ): - super().__init__( - num_classes=num_classes, - blank=blank, - reduction=reduction, - cfg=cfg, - topo_type=topo_type, - topo_with_self_loops=topo_with_self_loops, - ) - if cfg is not None: - token_lm = cfg.get("token_lm", token_lm) - intersect_pruned = cfg.get("intersect_pruned", intersect_pruned) - intersect_conf = cfg.get("intersect_conf", intersect_conf) - boost_coeff = cfg.get("boost_coeff", boost_coeff) - self.boost_coeff = boost_coeff - self._intersect_calc_scores_impl = ( - self._intersect_calc_scores_impl_pruned if intersect_pruned else self._intersect_calc_scores_impl_exact_opt - ) - self.intersect_conf = intersect_conf - self.graph_compiler = None # expected to be initialized in .update_graph(...) - if token_lm is None: - logging.warning( - f"""token_lm is empty. - Trainable token_lm is not supported yet. - Please call .update_graph(token_lm) before using.""" - ) - else: - self.lm_graph = load_graph(token_lm) if isinstance(token_lm, str) else token_lm - if self.lm_graph is None: - raise ValueError(f"""lm_graph is empty.""") - else: - self.update_graph(self.lm_graph) - - @abstractmethod - def update_graph(self, graph: 'k2.Fsa'): - # expected to be set in child classes - raise NotImplementedError - - def _intersect_calc_scores_impl_exact_opt( - self, dense_fsa_vec: 'k2.DenseFsaVec', num_graphs: 'k2.Fsa', den_graph: 'k2.Fsa', return_lats: bool = True, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional['k2.Fsa'], Optional['k2.Fsa']]: - """Inner intersection method. - Does joint (simultaneous) exact intersection of dense_fsa_vec against num_graphs and den_graph. - - Optiolally returns the numerator and the denominator lattices. - """ - device = dense_fsa_vec.device - assert device == num_graphs.device and device == den_graph.device - - num_fsas = num_graphs.shape[0] - assert dense_fsa_vec.dim0() == num_fsas - - den_graph = den_graph.clone() - num_graphs = num_graphs.clone() - - num_den_graphs = k2.cat([num_graphs, den_graph]) - - # NOTE: The a_to_b_map in k2.intersect_dense must be sorted - # so the following reorders num_den_graphs. - - # [0, 1, 2, ... 
] - num_graphs_indexes = torch.arange(num_fsas, dtype=torch.int32) - - # [num_fsas, num_fsas, num_fsas, ... ] - den_graph_indexes = torch.tensor([num_fsas] * num_fsas, dtype=torch.int32) - - # [0, num_fsas, 1, num_fsas, 2, num_fsas, ... ] - num_den_graphs_indexes = torch.stack([num_graphs_indexes, den_graph_indexes]).t().reshape(-1).to(device) - - num_den_reordered_graphs = k2.index_fsa(num_den_graphs, num_den_graphs_indexes) - - # [[0, 1, 2, ...]] - a_to_b_map = torch.arange(num_fsas, dtype=torch.int32).reshape(1, -1) - - # [[0, 1, 2, ...]] -> [0, 0, 1, 1, 2, 2, ... ] - a_to_b_map = a_to_b_map.repeat(2, 1).t().reshape(-1).to(device) - - num_den_lats = k2.intersect_dense( - a_fsas=num_den_reordered_graphs, - b_fsas=dense_fsa_vec, - output_beam=self.intersect_conf.output_beam, - a_to_b_map=a_to_b_map, - seqframe_idx_name="seqframe_idx" if return_lats else None, - ) - - num_den_tot_scores = num_den_lats.get_tot_scores(log_semiring=True, use_double_scores=False) - num_tot_scores = num_den_tot_scores[::2] - den_tot_scores = num_den_tot_scores[1::2] - - if return_lats: - lat_slice = torch.arange(num_fsas, dtype=torch.int32).to(device) * 2 - return ( - num_tot_scores, - den_tot_scores, - k2.index_fsa(num_den_lats, lat_slice), - k2.index_fsa(num_den_lats, lat_slice + 1), - ) - else: - return num_tot_scores, den_tot_scores, None, None - - def _intersect_calc_scores_impl_pruned( - self, dense_fsa_vec: 'k2.DenseFsaVec', num_graphs: 'k2.Fsa', den_graph: 'k2.Fsa', return_lats: bool = True, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional['k2.Fsa'], Optional['k2.Fsa']]: - """Inner intersection method. - Does exact intersection of dense_fsa_vec against num_graphs and pruned intersection against den_graph. - - Optiolally returns the numerator and the denominator lattices. - """ - device = dense_fsa_vec.device - assert device == num_graphs.device and device == den_graph.device - - num_fsas = num_graphs.shape[0] - assert dense_fsa_vec.dim0() == num_fsas - - num_lats = k2.intersect_dense( - a_fsas=num_graphs, - b_fsas=dense_fsa_vec, - output_beam=self.intersect_conf.output_beam, - seqframe_idx_name="seqframe_idx" if return_lats else None, - ) - den_lats = k2.intersect_dense_pruned( - a_fsas=den_graph, - b_fsas=dense_fsa_vec, - search_beam=self.intersect_conf.search_beam, - output_beam=self.intersect_conf.output_beam, - min_active_states=self.intersect_conf.min_active_states, - max_active_states=self.intersect_conf.max_active_states, - seqframe_idx_name="seqframe_idx" if return_lats else None, - ) - - num_tot_scores = num_lats.get_tot_scores(log_semiring=True, use_double_scores=False) - den_tot_scores = den_lats.get_tot_scores(log_semiring=True, use_double_scores=False) - - if return_lats: - return num_tot_scores, den_tot_scores, num_lats, den_lats - else: - return num_tot_scores, den_tot_scores, None, None - - def _intersect_calc_scores( - self, emissions_graphs: 'k2.DenseFsaVec', supervision_graphs: Any, supervisions: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Intersects emissions_graphs with supervision_graphs and calculates lattice scores. - This version implicitly assumes supervision_graphs to be a pair of the numerator and the denominator FSAs. - - It can also calculate accuracy between the numerator and the denominator lattices to use it as additional loss. - - Can be overridden. 
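The exact-intersection path above scores each utterance's numerator graph and the shared denominator graph in a single `k2.intersect_dense` call by interleaving them and pointing both members of every pair at the same supervision through `a_to_b_map`. The pure-tensor sketch below reproduces just that index bookkeeping for an assumed batch of three utterances; it does not need k2 to run.

```python
# Pure-tensor sketch of the index bookkeeping used for joint num/den scoring above.
import torch

num_fsas = 3  # illustrative batch size

# Per-utterance numerator graphs occupy positions [0, 1, 2] after k2.cat;
# the single denominator graph sits at position `num_fsas`.
num_graphs_indexes = torch.arange(num_fsas, dtype=torch.int32)
den_graph_indexes = torch.full((num_fsas,), num_fsas, dtype=torch.int32)

# Interleaved order handed to k2.index_fsa: [0, 3, 1, 3, 2, 3]
num_den_graphs_indexes = torch.stack([num_graphs_indexes, den_graph_indexes]).t().reshape(-1)

# a_to_b_map: both members of each (num_i, den) pair score supervision i: [0, 0, 1, 1, 2, 2]
a_to_b_map = torch.arange(num_fsas, dtype=torch.int32).reshape(1, -1).repeat(2, 1).t().reshape(-1)

# After intersection the total scores alternate num/den, so they are sliced apart:
fake_tot_scores = torch.arange(2 * num_fsas, dtype=torch.float32)
num_tot_scores, den_tot_scores = fake_tot_scores[::2], fake_tot_scores[1::2]
print(num_den_graphs_indexes.tolist())  # [0, 3, 1, 3, 2, 3]
print(a_to_b_map.tolist())              # [0, 0, 1, 1, 2, 2]
```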
- """ - boosted = self.boost_coeff != 0.0 - num_tot_scores, den_tot_scores, num_lats, den_lats = self._intersect_calc_scores_impl( - emissions_graphs, supervision_graphs[0], supervision_graphs[1], boosted - ) - - inverted_batch_order = invert_permutation(supervisions[:, 0].to(dtype=torch.long)) - self.__batch_order = None - tot_scores = (num_tot_scores - den_tot_scores)[inverted_batch_order] - mmi_tot_scores, mmi_valid_mask = get_tot_objf_and_finite_mask(tot_scores, self.reduction) - - if boosted: - assert num_lats is not None and den_lats is not None - - size = ( - emissions_graphs.dim0(), - emissions_graphs.scores.shape[0], - emissions_graphs.scores.shape[1] - 1, - ) - row_ids = emissions_graphs.emissions_graphs.shape().row_ids(1) - num_sparse = create_sparse_wrapped( - indices=[k2.index_select(row_ids, num_lats.seqframe_idx), num_lats.seqframe_idx, num_lats.phones,], - values=num_lats.get_arc_post(False, True).exp(), - size=size, - min_col_index=0, - ) - del num_lats - den_sparse = create_sparse_wrapped( - indices=[k2.index_select(row_ids, den_lats.seqframe_idx), den_lats.seqframe_idx, den_lats.phones,], - values=den_lats.get_arc_post(False, True).exp(), - size=size, - min_col_index=0, - ) - del den_lats - - acc_loss = torch.sparse.sum((num_sparse - den_sparse).coalesce().abs(), (1, 2)).to_dense() - del num_sparse, den_sparse - - acc_tot_scores, acc_valid_mask = get_tot_objf_and_finite_mask(acc_loss, self.reduction) - valid_mask = mmi_valid_mask & acc_valid_mask - total_loss = ( - (self.boost_coeff * acc_tot_scores[inverted_batch_order][valid_mask] - mmi_tot_scores[valid_mask]) - if self.reduction == "none" - else self.boost_coeff * acc_tot_scores - mmi_tot_scores - ) - else: - valid_mask = mmi_valid_mask - total_loss = -mmi_tot_scores[valid_mask] if self.reduction == "none" else -mmi_tot_scores - return total_loss, valid_mask - - -class CtcMmiLoss(MAPLoss, CtcK2Mixin): - """MMI loss with custom CTC topologies. - Available topologies: - - `default`, with or without self-loops - - `compact`, with or without self-loops - - `shared_blank`, with or without self-loops - - `minimal`, without self-loops - - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. 
- """ - - def __init__( - self, - num_classes: int, - blank: int, - reduction: str, - cfg: Optional[DictConfig] = None, - topo_type: str = "default", - topo_with_self_loops: bool = True, - token_lm: Optional[Union['k2.Fsa', str]] = None, - intersect_pruned: bool = False, - intersect_conf: GraphIntersectDenseConfig = GraphIntersectDenseConfig(), - boost_coeff: float = 0.0, - ): - super().__init__( - num_classes=num_classes, - blank=blank, - reduction=reduction, - cfg=cfg, - topo_type=topo_type, - topo_with_self_loops=topo_with_self_loops, - token_lm=token_lm, - intersect_pruned=intersect_pruned, - intersect_conf=intersect_conf, - boost_coeff=boost_coeff, - ) - - def update_graph(self, graph: 'k2.Fsa'): - self.lm_graph = graph - lm_graph = self.lm_graph.clone() - if hasattr(lm_graph, "aux_labels"): - delattr(lm_graph, "aux_labels") - labels = lm_graph.labels - if labels.max() != self.num_classes - 1: - raise ValueError(f"lm_graph is not compatible with the num_classes: {labels.unique()}, {self.num_classes}") - from nemo.collections.asr.parts.k2.graph_compilers import MmiGraphCompiler as compiler - - self.graph_compiler = compiler( - self.num_classes, self.blank, self.topo_type, self.topo_with_self_loops, aux_graph=lm_graph - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/ml_loss.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/ml_loss.py deleted file mode 100644 index ef916ee2f69d9c5d33906a915764a96b48a70ee1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/ml_loss.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2020, Xiaomi CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import abstractmethod -from typing import Any, Optional, Tuple - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.parts.k2.graph_compilers import CtcTopologyCompiler, RnntTopologyCompiler -from nemo.collections.asr.parts.k2.loss_mixins import CtcK2Mixin, RnntK2Mixin -from nemo.collections.asr.parts.k2.utils import get_tot_objf_and_finite_mask, invert_permutation -from nemo.core.utils.k2_guard import k2 # import k2 from guard module - - -class MLLoss(torch.nn.Module): - """ - Maximum Likelihood criterion. - It implements Connectionist Temporal Classification (CTC) loss, - but can be extended to support other loss functions (ASG, HMM, RNNT, ...). 
- - Based on https://github.com/k2-fsa/snowfall/blob/master/snowfall/objectives/ctc.py - - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. - """ - - @abstractmethod - def __init__( - self, - num_classes: int, - blank: int, - reduction: str, - cfg: Optional[DictConfig] = None, - topo_type: str = "default", - topo_with_self_loops: bool = True, - ): - super().__init__() - if cfg is not None: - topo_type = cfg.get("topo_type", topo_type) - topo_with_self_loops = cfg.get("topo_with_self_loops", topo_with_self_loops) - self.blank = blank - self.num_classes = num_classes - self.reduction = reduction - self.topo_type = topo_type - self.topo_with_self_loops = topo_with_self_loops - self.pad_fsavec = topo_type == "compact" - self.graph_compiler = None # expected to be initialized in child classes - - def _prepare_graphs_for_intersection( - self, - log_probs: torch.Tensor, - targets: torch.Tensor, - input_lengths: torch.Tensor, - target_lengths: torch.Tensor, - ) -> Tuple['k2.DenseFsaVec', Any, torch.Tensor]: - """Converts input tensors to FST graphs: - log_probs to supervision_graphs (DenseFsaVec) - targets to supervision_graphs - Can be overridden. - """ - log_probs, supervisions, targets, target_lengths = self._prepare_log_probs_and_targets( - log_probs, input_lengths, targets, target_lengths - ) - log_probs = self._maybe_normalize_gradients(log_probs, supervisions[:, -1].to(dtype=torch.long)) - emissions_graphs = self._prepare_emissions_graphs(log_probs, supervisions) - del log_probs - - if emissions_graphs.device != self.graph_compiler.device: - self.graph_compiler.to(emissions_graphs.device) - order = supervisions[:, 0].to(dtype=torch.long) - supervision_graphs = self.graph_compiler.compile(targets[order], target_lengths[order]) - - return emissions_graphs, supervision_graphs, supervisions - - def _intersect_calc_scores( - self, emissions_graphs: 'k2.DenseFsaVec', supervision_graphs: Any, supervisions: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Intersects emissions_graphs with supervision_graphs and calculates lattice scores. - Can be overridden. - """ - lats = k2.intersect_dense(supervision_graphs, emissions_graphs, torch.finfo(torch.float32).max / 10) - del emissions_graphs - - num_tot_scores = lats.get_tot_scores(log_semiring=True, use_double_scores=False) - del lats - tot_scores = num_tot_scores[invert_permutation(supervisions[:, 0].to(dtype=torch.long))] - tot_scores, valid_mask = get_tot_objf_and_finite_mask(tot_scores, self.reduction) - return -tot_scores[valid_mask] if self.reduction == "none" else -tot_scores, valid_mask - - def forward( - self, - log_probs: torch.Tensor, - targets: torch.Tensor, - input_lengths: torch.Tensor, - target_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - assert self.graph_compiler is not None - - emissions_graphs, supervision_graphs, supervisions = self._prepare_graphs_for_intersection( - log_probs, targets, input_lengths, target_lengths - ) - scores, mask = self._intersect_calc_scores(emissions_graphs, supervision_graphs, supervisions) - return scores, mask - - -class CtcLoss(MLLoss, CtcK2Mixin): - """Regular CTC loss with custom topologies. 
- Available topologies: - - `default`, with or without self-loops - - `compact`, with or without self-loops - - `shared_blank`, with or without self-loops - - `minimal`, without self-loops - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. - """ - - def __init__( - self, - num_classes: int, - blank: int, - reduction: str, - cfg: Optional[DictConfig] = None, - topo_type: str = "default", - topo_with_self_loops: bool = True, - ): - super().__init__( - num_classes=num_classes, - blank=blank, - reduction=reduction, - cfg=cfg, - topo_type=topo_type, - topo_with_self_loops=topo_with_self_loops, - ) - self.graph_compiler = CtcTopologyCompiler( - self.num_classes, self.blank, self.topo_type, self.topo_with_self_loops - ) - - -class RnntLoss(MLLoss, RnntK2Mixin): - """RNNT loss with the `minimal` topology. - If predictor_window_size is not provided, this loss works as regular RNNT. - With predictor_window_size provided, it applies uniform pruning when compiling Emission FSAs - to reduce memory and compute consumption. - cfg takes precedence over all optional parameters - We keep explicit parameter setting to be able to create an instance without the need of a config. - """ - - def __init__( - self, - num_classes: int, - blank: int, - reduction: str, - cfg: Optional[DictConfig] = None, - topo_type: str = "minimal", - topo_with_self_loops: bool = True, - predictor_window_size: int = 0, - predictor_step_size: int = 1, - ): - super().__init__( - num_classes=num_classes, - blank=blank, - reduction=reduction, - cfg=cfg, - topo_type=topo_type, - topo_with_self_loops=topo_with_self_loops, - ) - if cfg is not None: - topo_type = cfg.get("topo_type", topo_type) - predictor_window_size = cfg.get("predictor_window_size", predictor_window_size) - predictor_step_size = cfg.get("predictor_step_size", predictor_step_size) - if topo_type != "minimal": - raise NotImplementedError(f"Only topo_type=`minimal` is supported at the moment.") - self.predictor_window_size = predictor_window_size - self.predictor_step_size = predictor_step_size - self.graph_compiler = RnntTopologyCompiler( - self.num_classes, - self.blank, - self.topo_type, - self.topo_with_self_loops, - max_adapter_length=self.predictor_window_size, - ) - - def forward( - self, - log_probs: torch.Tensor, - targets: torch.Tensor, - input_lengths: torch.Tensor, - target_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - assert self.predictor_window_size == 0 or log_probs.size(2) <= self.predictor_window_size + 1 - - return super().forward( - log_probs=log_probs, targets=targets, input_lengths=input_lengths, target_lengths=target_lengths - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/topologies.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/topologies.py deleted file mode 100644 index a3b6fcf0fef71279cf8f41bd8369446215e2bbc2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/topologies.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import lru_cache -from typing import List, Optional, Union - -import torch - -from nemo.core.utils.k2_guard import k2 # import k2 from guard module - - -def build_topo(name: str, tokens: List[int], blank_num: int, with_self_loops: bool = True) -> 'k2.Fsa': - """Helper function to build a topology. - It allows to build topologies with a non-zero blank ID. - Args: - name: - The topology name. Choices: default, compact, shared_blank, minimal - tokens: - A list of tokens, e.g., phones, characters, etc. - blank_num: - Blank number. Must be in tokens - with_self_loops: - Whether to add token-to-epsilon self-loops to a topology - Returns: - Returns a topology FST. - """ - if name == "default": - ans = build_default_topo(tokens, with_self_loops) - elif name == "compact": - ans = build_compact_topo(tokens, with_self_loops) - elif name == "shared_blank": - ans = build_shared_blank_topo(tokens, with_self_loops) - elif name == "minimal": - ans = build_minimal_topo(tokens) - else: - raise ValueError(f"Unknown topo name: {name}") - if blank_num != 0: - labels = ans.labels - blank_mask = labels == 0 - labels[(labels != -1) & (labels <= blank_num)] -= 1 - labels[blank_mask] = blank_num - ans.labels = labels # force update ans.labels property to notify FSA about modifications, required by k2 - ans = k2.arc_sort(ans) - return ans - - -def build_default_topo(tokens: List[int], with_self_loops: bool = True) -> 'k2.Fsa': - """Build the default CTC topology. - Zero is assumed to be the ID of the blank symbol. - """ - assert -1 not in tokens, "We assume -1 is ID of the final transition" - assert 0 in tokens, "We assume 0 is the ID of the blank symbol" - - num_states = len(tokens) - final_state = num_states - arcs = "" if with_self_loops else f"0 0 0 0 0.0\n" - for i in range(num_states): - for j in range(num_states): - if i == j: - if with_self_loops: - arcs += f"{i} {i} {tokens[i]} 0 0.0\n" - else: - arcs += f"{i} {j} {tokens[j]} {tokens[j]} 0.0\n" - arcs += f"{i} {final_state} -1 -1 0.0\n" - arcs += f"{final_state}" - ans = k2.Fsa.from_str(arcs, num_aux_labels=1) - ans = k2.arc_sort(ans) - return ans - - -def build_compact_topo(tokens: List[int], with_self_loops: bool = True) -> 'k2.Fsa': - """Build the compact CTC topology. - Zero is assumed to be the ID of the blank symbol. 
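`build_topo` above is a thin dispatcher over the four topology builders that also renumbers labels when the blank is not token 0. A small hedged example of calling it; the token ids are illustrative and k2 must be installed.

```python
# Hedged example of the build_topo dispatcher above; token ids are illustrative and k2 is required.
from nemo.collections.asr.parts.k2.topologies import build_topo

tokens = [0, 1, 2, 3]                                            # must contain the blank id

default_topo = build_topo("default", tokens, blank_num=0, with_self_loops=True)  # k2.Fsa, arc-sorted
compact_topo = build_topo("compact", tokens, blank_num=0)        # uses an extra epsilon id (tokens[-1] + 1)

# A non-zero blank id triggers the label renumbering branch shown above:
shifted_blank_topo = build_topo("default", tokens, blank_num=3)
```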
- See https://arxiv.org/abs/2110.03098 - """ - assert -1 not in tokens, "We assume -1 is ID of the final transition" - assert 0 in tokens, "We assume 0 is the ID of the blank symbol" - - eps_num = tokens[-1] + 1 - selfloops_shift = int(with_self_loops) - num_states = len(tokens) + selfloops_shift - final_state = num_states - arcs = "" - for i in range(selfloops_shift, num_states): - arcs += f"0 {i} {tokens[i - selfloops_shift]} {tokens[i - selfloops_shift]} 0.0\n" - arcs += f"0 {final_state} -1 -1 0.0\n" - for i in range(1, num_states): - arcs += f"{i} 0 {eps_num} 0 0.0\n" - if with_self_loops: - arcs += f"{i} {i} {tokens[i - selfloops_shift]} 0 0.0\n" - arcs += f"{final_state}" - ans = k2.Fsa.from_str(arcs, num_aux_labels=1) - ans = k2.arc_sort(ans) - return ans - - -def build_shared_blank_topo(tokens: List[int], with_self_loops: bool = True) -> 'k2.Fsa': - """Build the shared blank CTC topology. - Zero is assumed to be the ID of the blank symbol. - See https://github.com/k2-fsa/k2/issues/746#issuecomment-856421616 - """ - assert -1 not in tokens, "We assume -1 is ID of the final transition" - assert 0 in tokens, "We assume 0 is the ID of the blank symbol" - - tokens = tokens.copy() - tokens.remove(0) - num_tokens = len(tokens) - start = 0 - final = num_tokens + 1 - arcs = [] - arcs.append([start, start, 0, 0, 0]) - arcs.append([start, final, -1, -1, 0]) - arcs.append([final]) - for i, p in enumerate(tokens): - i += 1 - arcs.append([start, start, p, p, 0]) - arcs.append([start, i, p, p, 0]) - arcs.append([i, start, p, 0, 0]) - if with_self_loops: - arcs.append([i, i, p, 0, 0]) - arcs = sorted(arcs, key=lambda arc: arc[0]) - arcs = [[str(i) for i in arc] for arc in arcs] - arcs = [" ".join(arc) for arc in arcs] - arcs = "\n".join(arcs) - ans = k2.Fsa.from_str(arcs, num_aux_labels=1) - ans = k2.arc_sort(ans) - return ans - - -def build_minimal_topo(tokens: List[int]) -> 'k2.Fsa': - """Build the minimal topology. - Zero is assumed to be the ID of the blank symbol. - See https://arxiv.org/abs/2110.03098 - """ - assert -1 not in tokens, "We assume -1 is ID of the final transition" - assert 0 in tokens, "We assume 0 is the ID of the blank symbol" - - num_tokens = len(tokens) - final_state = 1 - arcs = "" - for i in range(num_tokens): - arcs += f"0 0 {tokens[i]} {tokens[i]} 0.0\n" - arcs += f"0 {final_state} -1 -1 0.0\n" - arcs += f"{final_state}" - ans = k2.Fsa.from_str(arcs, num_aux_labels=1) - ans = k2.arc_sort(ans) - return ans - - -class RnntEmissionAdapterBuilder(object): - """Builder class for RNNT Emission Adapters. - - An Emission Adapter is an FSA used to emulate desired temporal Emissions FSA properties of a trivial Emissions FSA. - Temporal properties are emulated by -arcs with zero log-weight. - These additional arcs do not contribute to the lattice scores and can be easily removed from the best path. - - k2 does not have Emissions FSAs. Instead, it has DenseFsaVec, which is not a real FSA. - Thus, Emission Adapters should be composed with Supervision FSAs. - IMPOTRANT: -outputs are expected to be present in the DenseFsaVec. - - These RNNT adapters do only the re-routing (emulate hopping over U dimension). - Redundant non- are not removed by these adapters. - - At initialization, the builder expects a list of tokens, number and number. - When called, the builder returns adapters according to the provided text lengths. 
- """ - - def __init__(self, tokens: List[int], blank_num: int, eps_num: Optional[int] = None): - assert -1 not in tokens, "We assume -1 is ID of the final transition" - assert blank_num in tokens, "The blank ID must be in tokens" - assert eps_num is None or eps_num not in tokens, "The epsion ID must not be in tokens" - - self.tokens = tokens - self.blank_num = blank_num - self.eps_num = self.tokens[-1] + 1 if eps_num is None else eps_num - - def __call__(self, adapter_lengths: Union[torch.Tensor, List[int]]) -> 'k2.Fsa': - # if you don't make adapter_lengths a list beforehand, - # "i" will be implicitly converted to int, and this will always be considered a cache miss - return k2.create_fsa_vec([self._build_single_adapter(i) for i in adapter_lengths.tolist()]) - - @lru_cache(maxsize=1024) - def _build_single_adapter(self, adapter_length: int) -> 'k2.Fsa': - assert adapter_length >= 1, "`adapter_length` cannot be less than one" - - first_eps_state = adapter_length + 1 - final_state = adapter_length * 2 + 1 - arcs = "" - for i in range(adapter_length): - for j in range(len(self.tokens)): - if j != self.blank_num: - arcs += f"{i} {i + 1} {self.tokens[j]} 0.0\n" - arcs += f"{i} {first_eps_state} {self.blank_num} 0.0\n" - arcs += f"{adapter_length} {first_eps_state} {self.blank_num} 0.0\n" - for i in range(first_eps_state, final_state): - arcs += f"{i} {i + 1 if i < final_state - 1 else 0} {self.eps_num} 0.0\n" - arcs += f"{i} {final_state} -1 0.0\n" - arcs += f"{final_state}" - - return k2.arc_sort(k2.Fsa.from_str(arcs, acceptor=True)) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/utils.py deleted file mode 100644 index f55620a813567d9a826dad8f40a2eaac16d05a27..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/utils.py +++ /dev/null @@ -1,326 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2020, Xiaomi CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import struct -from pickle import UnpicklingError -from typing import List, Optional, Tuple, Union - -import torch - -from nemo.core.utils.k2_guard import k2 # import k2 from guard module -from nemo.utils import logging - - -def create_supervision(input_lengths: torch.Tensor) -> torch.Tensor: - """Creates a special supervisions tensor from input lengths. 
- These supervisions are required for some k2 methods. - """ - supervisions = torch.stack( - (torch.tensor(range(input_lengths.shape[0])), torch.zeros(input_lengths.shape[0]), input_lengths.cpu(),), 1, - ).to(dtype=torch.int32) - # the duration column has to be sorted in decreasing order - return supervisions[torch.argsort(supervisions[:, -1], descending=True)] - - -def invert_permutation(indices: torch.Tensor) -> torch.Tensor: - """Produces a tensor of reverse permutation for a given indices. - - Based on https://github.com/k2-fsa/snowfall/blob/master/snowfall/common.py - """ - ans = torch.zeros(indices.shape, device=indices.device, dtype=indices.dtype) - ans[indices.to(dtype=torch.long)] = torch.arange(0, indices.shape[0], device=indices.device, dtype=indices.dtype) - return ans - - -def make_non_pad_mask(input_lengths: torch.Tensor, seq_len: int): - """Converts input_lengths to a non-padding mask. The mask is 2D. - """ - batch_size = input_lengths.shape[0] - seq_range = torch.arange(0, seq_len, device=input_lengths.device) - seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, seq_len) - seq_length_expand = input_lengths.clone().detach().to(seq_range_expand.device).unsqueeze(-1) - mask = seq_range_expand < seq_length_expand - return mask - - -def make_non_pad_mask_3d( - lengths_x: torch.Tensor, lengths_y: torch.Tensor, max_length_x: int, max_length_y: int -) -> torch.Tensor: - """Converts two orthogonal input_lengths to a non-padding mask. The mask is 3D. - """ - assert lengths_x.size() == lengths_y.size() - return make_non_pad_mask(lengths_x, max_length_x).unsqueeze(2) & make_non_pad_mask( - lengths_y, max_length_y - ).unsqueeze(1) - - -def ragged_to_tensor_2axes_simple(rt: k2.RaggedTensor) -> Optional[torch.Tensor]: - """Converts k2.RaggedTensor to torch.Tensor if the RaggedTensor is shallow (has two axes). - """ - rt_list = rt.tolist() - result_list = [] - for e in rt_list: - if len(e) == 0: - result_list.append(0) - elif len(e) == 1: - result_list.append(e[0]) - else: - return None - return torch.tensor(result_list, dtype=torch.int32) - - -def load_graph(graph_path: str) -> 'k2.Fsa': - """Fsa graph loading helper function. Loads graphs stored in different formats. - """ - if os.path.exists(graph_path): - errors = [] - try: - graph_dict = torch.load(graph_path, map_location="cpu") - graph = k2.Fsa.from_dict(graph_dict) - return graph - except UnpicklingError as e: - errors.append(e) - with open(graph_path, "rt", encoding="utf-8") as f: - graph_txt = f.read() - # order from the most frequent case to the least - for func, acceptor in [(k2.Fsa.from_openfst, False), (k2.Fsa.from_str, True), (k2.Fsa.from_str, False)]: - try: - graph = func(graph_txt, acceptor=acceptor) - return graph - except (TypeError, ValueError, RuntimeError) as e: - errors.append(e) - raise Exception(errors) - else: - logging.warning(f"""No such file: '{graph_path}'""") - return None - - -def intersect_with_self_loops(base_graph: 'k2.Fsa', aux_graph: 'k2.Fsa') -> 'k2.Fsa': - """Intersection helper function. - """ - assert hasattr(base_graph, "aux_labels") - assert not hasattr(aux_graph, "aux_labels") - aux_graph_with_self_loops = k2.arc_sort(k2.add_epsilon_self_loops(aux_graph)).to(base_graph.device) - result = k2.intersect(k2.arc_sort(base_graph), aux_graph_with_self_loops, treat_epsilons_specially=False) - setattr(result, "phones", result.labels) - return result - - -def compose_with_self_loops(base_graph: 'k2.Fsa', aux_graph: 'k2.Fsa') -> 'k2.Fsa': - """Composition helper function. 
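Several of the helpers above are pure tensor utilities. The short demo below (values made up) shows what `invert_permutation` and `make_non_pad_mask` return, which is how lattice scores are mapped back to the original batch order after the duration-sorted supervisions.

```python
# Demo of two pure-tensor helpers defined above; values are illustrative.
import torch
from nemo.collections.asr.parts.k2.utils import invert_permutation, make_non_pad_mask

order = torch.tensor([2, 0, 1], dtype=torch.int32)   # batch order after duration sorting
print(invert_permutation(order))                     # tensor([1, 2, 0], ...) restores the original order

mask = make_non_pad_mask(torch.tensor([3, 1]), seq_len=4)
print(mask)
# tensor([[ True,  True,  True, False],
#         [ True, False, False, False]])
```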
- """ - aux_graph_with_self_loops = k2.arc_sort(k2.add_epsilon_self_loops(aux_graph)).to(base_graph.device) - return k2.compose(base_graph, aux_graph_with_self_loops, treat_epsilons_specially=False, inner_labels="phones") - - -def create_sparse_wrapped( - indices: List[torch.Tensor], - values: torch.Tensor, - size: Optional[Union[Tuple[int, int], Tuple[int, int, int]]] = None, - min_col_index: Optional[int] = None, -) -> torch.Tensor: - """Wraps up k2.create_sparse to create 2- or 3-dimensional sparse tensors. - """ - assert size is None or len(indices) == len(size) - - if len(indices) == 2: - return k2.create_sparse( - rows=indices[0], cols=indices[1], values=values, size=size, min_col_index=min_col_index, - ) - elif len(indices) == 3: - assert indices[0].ndim == indices[1].ndim == indices[2].ndim == 1 - assert indices[0].numel() == indices[1].numel() == indices[2].numel() == values.numel() - - if min_col_index is not None: - assert isinstance(min_col_index, int) - kept_indices = indices[-1] >= min_col_index - indices = [i[kept_indices] for i in indices] - values = values[kept_indices] - if size is not None: - return torch.sparse_coo_tensor( - torch.stack(indices), values, size=size, device=values.device, requires_grad=values.requires_grad, - ) - else: - return torch.sparse_coo_tensor( - torch.stack(indices), values, device=values.device, requires_grad=values.requires_grad, - ) - else: - raise ValueError(f"len(indices) = {len(indices)}") - - -def prep_padded_densefsavec(log_softmax: torch.Tensor, supervisions: torch.Tensor) -> 'k2.DenseFsaVec': - """Performs special epsilon-padding required for composition with some of the topologies. - """ - log_softmax_eps = torch.cat( - [ - log_softmax, - torch.full((log_softmax.shape[0], log_softmax.shape[1], 1), -float("inf"), device=log_softmax.device,), - ], - axis=-1, - ) - log_softmax_padded = torch.zeros( - (log_softmax_eps.shape[0], log_softmax_eps.shape[1] * 2, log_softmax_eps.shape[2],), device=log_softmax.device, - ) - log_softmax_padded[:, ::2] = log_softmax_eps - supervisions_padded = supervisions.clone() - supervisions_padded[:, 2] *= 2 - dense_log_softmax_padded = k2.DenseFsaVec(log_softmax_padded, supervisions_padded) - return dense_log_softmax_padded - - -def shift_labels_inpl(lattices: List['k2.Fsa'], shift: int): - """Shifts lattice labels and aux_labels by a given number. - This is an in-place operation, if the lattice is on GPU. - """ - for lattice in lattices: - mask = lattice.labels > 0 - lattice.labels[mask] += shift - if hasattr(lattice, "aux_labels"): - mask = lattice.aux_labels > 0 - lattice.aux_labels[mask] += shift - return reset_properties_fsa(lattices) - - -def reset_properties_fsa(graph: 'k2.Fsa'): - """Resets properties of a graph. - In-place (does not create a new graph) if the graph is on GPU. - Use this every time you alter a graph in-place. - See https://github.com/k2-fsa/k2/issues/978 for more information.""" - graph.__dict__["_properties"] = None - # CPU graphs need to be sorted e.g. for intersection - if graph.device == torch.device("cpu"): - graph = k2.arc_sort(graph) - return graph - - -def add_self_loops(graph: 'k2.Fsa', label: int = 0, mode: str = "auto"): - """Adds self-loops with given label to a graph. 
- Supported modes are ``input``, ``output``, and ``auto``, - Where ``input`` leaves aux_labels zeroes, if present, ``output`` leaves labels zeroes""" - assert mode in ("input", "output", "auto"), "Supported modes are ``input``, ``output``, and ``auto``: {mode}" - assert mode != "output" or hasattr(graph, "aux_labels"), "Graph must have aux_labels for mode ``output``" - new_graph, arc_map = k2.add_epsilon_self_loops(graph, ret_arc_map=True) - - if mode != "output": - new_graph.labels[arc_map == -1] = label - if mode != "input" and hasattr(graph, "aux_labels"): - new_graph.aux_labels[arc_map == -1] = label - return reset_properties_fsa(new_graph) - - -def get_arc_weights(graph: 'k2.Fsa') -> torch.Tensor: - """Returns 1d torch.Tensor with arc weights of a given graph. - """ - if len(graph.shape) > 2: - raise NotImplementedError("FsaVec is not supported at the moment.") - weights_int = graph.arcs.values()[:, -1].tolist() - weights_float = struct.unpack('%sf' % len(weights_int), struct.pack('%si' % len(weights_int), *weights_int)) - return torch.Tensor(weights_float) - - -def get_tot_objf_and_finite_mask(tot_scores: torch.Tensor, reduction: str) -> Tuple[torch.Tensor, torch.Tensor]: - """Figures out the total score(log-prob) over all successful supervision segments - (i.e. those for which the total score wasn't -infinity). - Args: - tot_scores: a Torch tensor of shape (num_segments,) containing total scores - from forward-backward - reduction: a reduction type ('mean', 'sum' or 'none') - Returns: - Returns a tuple of 2 scalar tensors: (tot_score, finite_mask) - where finite_mask is a tensor containing successful segment mask. - - Based on get_tot_objf_and_num_frames - from https://github.com/k2-fsa/snowfall/blob/master/snowfall/objectives/common.py - """ - finite_mask = ~torch.isnan(tot_scores) & torch.ne(tot_scores, -float("inf")) - if reduction == "mean": - tot_scores = tot_scores[finite_mask].mean() - elif reduction == "sum": - tot_scores = tot_scores[finite_mask].sum() - return tot_scores, finite_mask - - -def get_uniform_rnnt_prune_ranges( - encoded_lengths: torch.Tensor, - target_lengths: torch.Tensor, - window_size_with_blank: int, - step: int = 1, - max_seq_len: Optional[int] = None, - begin_only: bool = False, -) -> torch.Tensor: - """Creates the pruning ranges for the Encoder and Predictor of RNNT. - The ranges are similar to https://k2-fsa.github.io/k2/python_api/api.html#k2.get_rnnt_prune_ranges - but they are constructed under the assumption of the uniform distribution token activations across time frames - and without any posterior knowledge. 
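`get_tot_objf_and_finite_mask` above is what lets a batch survive a few failed lattices: non-finite scores are masked out before the reduction. A tiny demo with hand-made scores:

```python
# Tiny demo of masking non-finite lattice scores before reduction; values are made up.
import torch
from nemo.collections.asr.parts.k2.utils import get_tot_objf_and_finite_mask

tot_scores = torch.tensor([-12.3, float("-inf"), -8.9, float("nan")])
mean_score, finite_mask = get_tot_objf_and_finite_mask(tot_scores, reduction="mean")
print(finite_mask)   # tensor([ True, False,  True, False])
print(mean_score)    # mean over the two finite entries only: tensor(-10.6000)
```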
- """ - assert window_size_with_blank > 1 - assert step >= 1 - assert window_size_with_blank > step - assert len(encoded_lengths) == len(target_lengths) - ranges_begin = torch.zeros( - ( - len(encoded_lengths), - encoded_lengths.max() if max_seq_len is None else max(max_seq_len, encoded_lengths.max()), - ), - dtype=torch.long, - ) - for i in (target_lengths >= window_size_with_blank).nonzero(as_tuple=True)[0]: - encoded_len = encoded_lengths[i] - ranges_begin_raw = torch.arange(int((target_lengths[i] - window_size_with_blank) / step + 2)) * step - ranges_begin_raw[-1] = target_lengths[i] - window_size_with_blank + 1 - ranges_begin[i, :encoded_len] = torch.nn.functional.interpolate( - ranges_begin_raw.reshape(1, 1, -1).to(dtype=torch.float), encoded_len, mode="nearest-exact" - ).to(dtype=torch.long) - ranges_begin[i, encoded_len:] = ranges_begin[i, encoded_len - 1] - return ( - ranges_begin - if begin_only - else ranges_begin.unsqueeze(-1).repeat(1, 1, window_size_with_blank) + torch.arange(window_size_with_blank) - ) - - -def apply_rnnt_prune_ranges( - encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor, ranges: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - """Prepares pruned encoder and decoder outputs according to the prune ranges. - Based on k2.do_rnnt_pruning(...) - """ - B, T, window_size_with_blank = ranges.size() - D1 = encoder_outputs.size(-1) - _, U, D2 = decoder_outputs.size() - assert B == encoder_outputs.size(0) - assert T == encoder_outputs.size(1) - assert B == decoder_outputs.size(0) - encoder_outputs_pruned = encoder_outputs.unsqueeze(2).expand((B, T, window_size_with_blank, D1)) - decoder_outputs_pruned = torch.gather( - decoder_outputs.unsqueeze(1).expand((B, T, U, D2)), - dim=2, - index=ranges.reshape((B, T, window_size_with_blank, 1)).expand((B, T, window_size_with_blank, D2)), - ) - return encoder_outputs_pruned, decoder_outputs_pruned diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/w_transducer.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/w_transducer.py deleted file mode 100644 index b38a6c560fcdbec7000c5f628a0f839c9643b38e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/k2/w_transducer.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from contextlib import nullcontext -from typing import Union - -import torch -import torch.nn.functional as F - -from nemo.collections.asr.parts.k2.graph_transducer import GraphRnntLoss, force_float32_context -from nemo.core.utils.k2_guard import k2 -from nemo.utils.enum import PrettyStrEnum - - -class GraphWTransducerLoss(GraphRnntLoss): - """ - W-Transducer loss: RNN-T loss modification for training RNN-T model for the case - when some text at the beginning/end of the utterance is missing. - The resulting model behaves like the RNN-T model (no modification for decoding is required). 
- For details see "Powerful and Extensible WFST Framework for RNN-Transducer Losses" paper - https://ieeexplore.ieee.org/document/10096679 - """ - - class LastBlankMode(PrettyStrEnum): - ALLOW_IGNORE = "allow_ignore" - FORCE_FINAL = "force_final" - - def __init__( - self, - blank: int, - eps_weight: float = 0.0, - last_blank_mode: Union[LastBlankMode, str] = LastBlankMode.FORCE_FINAL, - use_grid_implementation=True, - connect_composed=False, - double_scores=False, - cast_to_float32=False, - ): - """ - Init method - - Args: - blank: blank label index - eps_weight: weight of epsilon transitions, 0 means no penalty (default) - last_blank_mode: allow to skip last blank in the prediction (default) or force it - use_grid_implementation: Whether to use the grid implementation (Grid-Transducer). - connect_composed: Connect graph after composing unit and temporal schemas - (only for Compose-Transducer). `connect` operation is slow, it is useful for visualization, - but not necessary for loss computation. - double_scores: Use calculation of loss in double precision (float64) in the lattice. - Does not significantly affect memory usage since the lattice is ~V/2 times smaller than the joint tensor. - cast_to_float32: Force cast joint tensor to float32 before log-softmax calculation. - """ - super().__init__( - blank=blank, - use_grid_implementation=use_grid_implementation, - connect_composed=connect_composed, - double_scores=double_scores, - cast_to_float32=cast_to_float32, - ) - self.eps_weight = eps_weight - self.last_blank_mode = self.LastBlankMode(last_blank_mode) - - def get_unit_schema(self, units_tensor: torch.Tensor, vocab_size: int) -> "k2.Fsa": - """ - Get unit schema (target text) graph for W-Transducer loss (Compose-Transducer). - Forward arcs represent text labels. - - Example graph: text [1, 2], blank=0. Eps ids: 3, 4. - - graph:: - - 3:3:0 0:0:1 0:0:2 - +-------+ +-------+ +-------+ - v | v | v | - +-----------+ 1:1:0 +-----------+ 2:2:1 +-----------+ -1:-1:-1 #===# - | 0 | -------> | 1 | -------> | 2 | ---------> H 3 H - +-----------+ +-----------+ +-----------+ #===# - ^ 0:0:0 | ^ 4:4:2 | - +-------+ +-------+ - - Args: - units_tensor: 1d tensor with text units - vocab_size: number of total labels (vocab size including blank) - - Returns: - unit schema graph (k2.Fsa). 
- Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - - blank_id = self.blank - start_eps_id = vocab_size - end_eps_id = vocab_size + 1 - device = units_tensor.device - text_len = units_tensor.shape[0] - - # arcs: scr, dest, label, score - arcs = torch.zeros(((text_len + 1) * 2 + 2, 4), dtype=torch.int32, device=device) - text_indices = torch.arange(0, text_len + 1, dtype=torch.int32, device=device) - # eps - arcs[0, 2] = start_eps_id - # blank labels - arcs[1:-1:2, 0] = text_indices # from state - arcs[1:-1:2, 1] = text_indices # to state - arcs[1:-1:2, 2] = blank_id - - # text labels - arcs[2:-1:2, 0] = text_indices # from state - arcs[2:-1:2, 1] = text_indices + 1 # to state - arcs[2:-2:2, 2] = units_tensor # labels: text - - arcs[-1] = arcs[-2] - arcs[-2, 1] = text_len - arcs[-2, 2] = end_eps_id - arcs[-1, 2] = -1 # last transition to final state, ilabel=-1 (special for k2) - olabels = arcs[:, 2].detach().clone() # same as ilabels - - fsa_text = k2.Fsa(arcs, olabels) - fsa_text.unit_positions = torch.zeros_like(olabels) - fsa_text.unit_positions[1:-1] = text_indices.expand(2, -1).transpose(0, 1).flatten() - fsa_text.unit_positions[-1] = -1 - return fsa_text - - def get_temporal_schema(self, num_frames: int, vocab_size: int, device: torch.device) -> "k2.Fsa": - """ - Get temporal schema graph for W-Transducer loss (Compose-Transducer). - - Example graph: blank=0, num_frames=3, vocab_size=3, last_blank_mode="force_final". - Labels: :. is a unit from vocab + special eps ids `vocab_size`, `vocab_size+1`. - - graph for force_final:: - - 4:0 - +--------------------------------------------+ - | 4:1 | - | +--------------------+ | - 1:0 | 1:1 | 1:2 | | - +-----+ | +-----+ | +-----+ | | - v | | v | | v | v v - +--------------+ 0:0 +------------+ 0:1 +------------+ 0:2 +---+ -1:-1 #===# - | 0 | ----> | 1 | -----> | 2 | -----> | 3 | -------> H 4 H - +--------------+ +------------+ +------------+ +---+ #===# - ^ 2:0 | | | ^ 2:1 | ^ ^ 2:2 | ^ - +-----+ | | +-----+ | +-----+ | - | | 3:0 | | - | +------------------+ 3:0 | - +-------------------------------------------+ - - - Args: - num_frames: length of the sequence (in frames) - vocab_size: number of labels (including blank) - device: device for tensor to construct - - Returns: - temporal schema graph (k2.Fsa). - Labels: :. is a unit from vocab + special units (e.g., additional eps). - """ - blank_id = self.blank - start_eps_id = vocab_size - end_eps_id = vocab_size + 1 - num_eps = 2 - - num_sequence_arcs = num_frames * vocab_size + (num_frames - 1) * num_eps + 1 - fsa_temporal_arcs = torch.zeros((num_sequence_arcs, 4), dtype=torch.int32, device=device) - sequence_states = torch.arange(0, num_frames, dtype=torch.int32, device=device) - sequence_states_next = sequence_states + 1 - # for every state - vocab_size+1 arcs, [0, 1, ..., vocab_size-1, eps, 0, 1, ..., vocab_size-1, eps, ...] 
- start_states = sequence_states.expand(vocab_size + num_eps, num_frames).transpose(0, 1).flatten() - - # self-loops - all, make forward arcs later - fsa_temporal_arcs[:num_sequence_arcs, 0] = start_states[:-1] # from - fsa_temporal_arcs[:num_sequence_arcs, 1] = start_states[:-1] # to - fsa_temporal_arcs[:num_sequence_arcs, 2] = ( - torch.arange(0, vocab_size + num_eps, dtype=torch.int32, device=device) - .expand(num_frames, vocab_size + num_eps) - .flatten()[:-1] - ) - # forward arcs - fsa_temporal_arcs[blank_id : num_sequence_arcs : vocab_size + num_eps, 1] = sequence_states_next # blanks - # eps arcs - fsa_temporal_arcs[start_eps_id : num_sequence_arcs : vocab_size + num_eps, 0] = 0 - fsa_temporal_arcs[start_eps_id : num_sequence_arcs : vocab_size + num_eps, 1] = sequence_states + 1 - fsa_temporal_arcs[end_eps_id : num_sequence_arcs : vocab_size + num_eps, 0] = sequence_states[:-1] - fsa_temporal_arcs[end_eps_id : num_sequence_arcs : vocab_size + num_eps, 1] = ( - num_frames - 1 if self.last_blank_mode == self.LastBlankMode.FORCE_FINAL else num_frames - ) - - # transition to last final state - fsa_temporal_arcs[-1, :3] = torch.tensor((num_frames, num_frames + 1, -1), dtype=torch.int32, device=device) - - # need to sort arcs - _, indices = torch.sort(fsa_temporal_arcs[:, 0], dim=0) - fsa_temporal_arcs = fsa_temporal_arcs[indices] - - # output symbols: position in the sequence, same as start states for arcs - olabels = fsa_temporal_arcs[:, 0].detach().clone() - olabels[-1] = -1 # transition to the last final state - - fsa_temporal = k2.Fsa(fsa_temporal_arcs, olabels) - fsa_temporal = k2.arc_sort(fsa_temporal) # need for compose - return fsa_temporal - - def get_grid(self, units_tensor: torch.Tensor, num_frames: int, vocab_size: int) -> "k2.Fsa": - """ - Construct W-Transducer lattice directly (Grid-Transducer). - - Args: - units_tensor: 1d tensor with text units - num_frames: length of the sequence (number of frames) - vocab_size: number of total labels (vocab size including blank) - - Returns: - transducer lattice (k2.Fsa). 
- Labels: :: (k2.Fsa: labels, aux_labels, unit_positions) - """ - blank_id = self.blank - eps_id = vocab_size # beyond vocabulary - text_length = units_tensor.shape[0] - device = units_tensor.device - num_grid_states = num_frames * (text_length + 1) - num_forward_arcs_base = (num_frames - 1) * (text_length + 1) - num_forward_arcs_additional = (num_frames - 1) * 2 - num_forward_arcs = num_forward_arcs_base + num_forward_arcs_additional - num_text_arcs = text_length * num_frames - arcs = torch.zeros((num_forward_arcs + num_text_arcs + 2, 4), dtype=torch.int32, device=device) - # blank transitions - # i, i+, 0 , i / , i % - from_states = torch.arange(num_forward_arcs_base, device=device) - to_states = from_states + (text_length + 1) - arcs[:num_forward_arcs_base, 0] = from_states - arcs[:num_forward_arcs_base, 1] = to_states - arcs[:num_forward_arcs_base, 2] = blank_id - - from_states = torch.cat( - [ - torch.arange(num_frames - 1, device=device) * (text_length + 1), - text_length + torch.arange(num_frames - 1, device=device) * (text_length + 1), - ] - ) - to_states = from_states + (text_length + 1) - arcs[num_forward_arcs_base : num_forward_arcs_base + (num_frames - 1) * 2, 0] = from_states - arcs[num_forward_arcs_base : num_forward_arcs_base + (num_frames - 1) * 2, 1] = to_states - arcs[num_forward_arcs_base : num_forward_arcs_base + (num_frames - 1), 2] = eps_id - arcs[num_forward_arcs_base + (num_frames - 1) : num_forward_arcs_base + (num_frames - 1) * 2, 2] = eps_id + 1 - - arcs[num_forward_arcs_base : num_forward_arcs_base + (num_frames - 1), 0] = 0 - arcs[num_forward_arcs_base + (num_frames - 1) : num_forward_arcs_base + (num_frames - 1) * 2, 1] = ( - num_grid_states - 1 - ) # if other mode - fix later - # last eps ark - after relabel - - # text arcs - from_states = ( - torch.arange(num_grid_states, dtype=torch.int32, device=device) - .reshape(num_frames, text_length + 1)[:, :-1] - .flatten() - ) - to_states = from_states + 1 - ilabels = units_tensor.expand(num_frames, -1).flatten() - arcs[num_forward_arcs:-2, 0] = from_states - arcs[num_forward_arcs:-2, 1] = to_states - arcs[num_forward_arcs:-2, 2] = ilabels - - # last 2 states - arcs[-2, :3] = torch.tensor((num_grid_states - 1, num_grid_states, blank_id), dtype=torch.int32, device=device) - arcs[-1, :3] = torch.tensor((num_grid_states, num_grid_states + 1, -1), dtype=torch.int32, device=device) - - # sequence indices, time indices - olabels = torch.div(arcs[:, 0], (text_length + 1), rounding_mode="floor") # arcs[:, 0] // (text_length + 1) - unit_positions = arcs[:, 0] % (text_length + 1) - # last state: final - olabels[-1] = -1 - unit_positions[-1] = -1 - - # relabel - # instead of using top sort (extremely expensive) k2.top_sort(rnnt_graph) - arcs[:-2, 0] = self.relabel_states(arcs[:-2, 0], text_length + 1, num_frames) - arcs[:-3, 1] = self.relabel_states(arcs[:-3, 1], text_length + 1, num_frames) - - if self.last_blank_mode == self.LastBlankMode.ALLOW_IGNORE: - arcs[ - num_forward_arcs_base + (num_frames - 1) : num_forward_arcs_base + (num_frames - 1) * 2, 1 - ] = num_grid_states - - # sort by start state - required in k2 - # TODO: maybe it is more optimal to avoid sort, construct arcs in ascending order - _, indices = torch.sort(arcs[:, 0], dim=0) - arcs = arcs[indices] - olabels = olabels[indices] - unit_positions = unit_positions[indices] - - rnnt_graph = k2.Fsa(arcs, olabels) - rnnt_graph.unit_positions = unit_positions - return rnnt_graph - - def forward( - self, acts: torch.Tensor, labels: torch.Tensor, act_lens: 
torch.Tensor, label_lens: torch.Tensor, - ): - """ - Forward method is similar to RNN-T Graph-Transducer forward method, - but we need to assign eps weight to eps-transitions. - """ - # argument names are consistent with NeMo, see RNNTLoss.forward: - # self._loss(acts=log_probs, labels=targets, act_lens=input_lengths, label_lens=target_lengths) - logits, targets, logits_lengths, target_lengths = acts, labels, act_lens, label_lens - - # logits: B x Time x Text+1 x C - vocab_size = logits.shape[-1] - target_fsas_vec = self.get_graphs_batched(logits_lengths, targets, target_lengths, vocab_size) - - cast_context = force_float32_context() if self.cast_to_float32 else nullcontext() - with cast_context: - log_probs = F.log_softmax(logits, dim=-1) - with torch.no_grad(): - indices = self.get_logits_indices(target_fsas_vec, logits.shape) - # transition to the last state + eps-transitions - # use 0 index (for valid index_select) and manually assign score after index_select for this case - indices[target_fsas_vec.labels == -1] = 0 - indices[target_fsas_vec.labels >= vocab_size] = 0 # eps - - # NB: do not assign scores -> modify, k2 will not update all scores correctly (modify -> assign) - scores = log_probs.flatten().index_select(-1, indices) - # fix weights for the arcs to the last state + eps-transitions - scores[target_fsas_vec.labels == -1] = 0 - scores[target_fsas_vec.labels >= vocab_size] = self.eps_weight # eps - - target_fsas_vec.scores = scores - scores = -1 * target_fsas_vec.get_tot_scores(use_double_scores=self.double_scores, log_semiring=True) - return scores diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__init__.py deleted file mode 100644 index 2a952085dceb7a111063db550d26985aaab2984c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
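End to end, `GraphWTransducerLoss` above is constructed once and then called with the same `(acts, labels, act_lens, label_lens)` convention as NeMo's RNNT loss wrappers. A hedged sketch with made-up shapes; k2 is required and all ids and sizes are illustrative.

```python
# Hedged usage sketch of GraphWTransducerLoss; shapes and ids are illustrative and k2 is required.
import torch
from nemo.collections.asr.parts.k2.w_transducer import GraphWTransducerLoss

VOCAB, BLANK = 28, 0
loss_fn = GraphWTransducerLoss(
    blank=BLANK,
    eps_weight=0.0,                    # no penalty on the skip-connection arcs
    last_blank_mode="allow_ignore",    # or "force_final" (the default)
)

B, T, U = 2, 40, 10
logits = torch.randn(B, T, U + 1, VOCAB, requires_grad=True)   # joint network output
targets = torch.randint(1, VOCAB, (B, U))
input_lengths = torch.tensor([40, 33])
target_lengths = torch.tensor([10, 7])

per_utt_scores = loss_fn(logits, targets, input_lengths, target_lengths)
per_utt_scores.sum().backward()
```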
- -from nemo.collections.asr.parts.mixins.asr_adapter_mixins import ASRAdapterModelMixin -from nemo.collections.asr.parts.mixins.interctc_mixin import InterCTCMixin -from nemo.collections.asr.parts.mixins.mixins import ( - ASRAdapterModelMixin, - ASRBPEMixin, - ASRModuleMixin, - DiarizationMixin, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index cae1f578d6f700a419e221060e5457ba40b4b956..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/asr_adapter_mixins.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/asr_adapter_mixins.cpython-310.pyc deleted file mode 100644 index 7c11a5e810b502b706f4f100a5944279b352cf28..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/asr_adapter_mixins.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/interctc_mixin.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/interctc_mixin.cpython-310.pyc deleted file mode 100644 index 461b04caa3120148186b5097fc8da31fbd07697e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/interctc_mixin.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/mixins.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/mixins.cpython-310.pyc deleted file mode 100644 index ba3ba23e5b7a04af2ccbe26909b1bc85dd3a8aad..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/mixins.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/streaming.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/streaming.cpython-310.pyc deleted file mode 100644 index d279ba63c7c34e837a8bd6393722b9aab6a2c5b4..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/__pycache__/streaming.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/asr_adapter_mixins.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/asr_adapter_mixins.py deleted file mode 100644 index f452acd19847e69f97c2d004e26f8e9aeb7a8dc1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/asr_adapter_mixins.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple - -from omegaconf import DictConfig, open_dict - -from nemo.core.classes.mixins.adapter_mixins import AdapterModelPTMixin, AdapterModuleMixin -from nemo.utils import logging, logging_mode - - -class ASRAdapterModelMixin(AdapterModelPTMixin): - """ ASR Adapter Mixin that can augment any Encoder module with Adapter module support. - - This mixin class should be used only with a top level ModelPT subclass, that includes an `encoder` submodule. - This mixin class adds several utility methods which are propagated to the `encoder`. - - An Adapter module is any Pytorch nn.Module that possess a few properties : - - - It's input and output dimension are the same, while the hidden dimension need not be the same. - - The final layer of the Adapter module is zero-initialized, so that the residual connection to the adapter - yields the original output. - - This mixin adds the following instance variables to the class this inherits it: - - - `adapter_layer`: A torch.nn.ModuleDict(), whose keys are the names of the adapter (globally unique), - and values are the Adapter nn.Module(). - - `adapter_cfg`: A OmegaConf DictConfig object that holds the config of the adapters that are initialized. - - `adapter_global_cfg_key`: A str representing a key in the model config that can be provided by the user. - The value resolves to `global_cfg`, and can be overridden via `model.cfg.adapters.global_cfg.*`. - - **Note**: This module **is** responsible for maintaining its config. At the ModelPT level, it will access and - write Adapter config information to `self.cfg.adapters`. - """ - - def setup_adapters(self): - """ - Utility method that is called in the ASR ModelPT-implementation constructor, so as to restore any - adapters that were previously added. - - This method should be called just once at constructor time. - """ - supports_adapters = False - - # At least the encoder must extend AdapterModuleMixin - if hasattr(self, 'encoder') and isinstance(self.encoder, AdapterModuleMixin): - supports_adapters |= True - - if hasattr(self, 'decoder') and isinstance(self.decoder, AdapterModuleMixin): - supports_adapters |= True - - if hasattr(self, 'joint') and isinstance(self.joint, AdapterModuleMixin): - supports_adapters |= True - - # If adapters are supported, setup the adapter config + any modules (pre-existing adapter modules) - if supports_adapters: - super().setup_adapters() - - def add_adapter(self, name: str, cfg: DictConfig): - """ - Add an Adapter module to this model. - - Args: - name: A globally unique name for the adapter. Will be used to access, enable and disable adapters. - cfg: A DictConfig that contains at the bare minimum `__target__` to instantiate a new Adapter module. - """ - # setup the config for adapters - super().add_adapter(name=name, cfg=cfg) - - # Resolve module name and adapter name - module_name, _ = self.resolve_adapter_module_name_(name) - - # Use + as a splitter, in order to share one name across multiple modules - if '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - # Update the model.cfg with information about the new adapter from cfg - with open_dict(self.cfg): - for module_name in module_names: - # Check if encoder adapters should be added - if module_name in ('', 'encoder'): - # Dispatch the call to the encoder. 
- self.encoder.add_adapter(name=name, cfg=cfg) - - # Check if decoder adapters should be added - if module_name == 'decoder': - # Dispatch call to the decoder. - self.decoder.add_adapter(name=name, cfg=cfg) - - # Check if joint adapters should be added; - # Note: We need additional check if joint even exists in model (for CTC models) - if hasattr(self, 'joint') and module_name == 'joint': - # Dispatch call to the joint. - self.joint.add_adapter(name=name, cfg=cfg) - - def is_adapter_available(self) -> bool: - """ - Checks if any Adapter module has been instantiated. - - Returns: - bool, determining if any Adapter module has been instantiated. Returns true even if the adapters are - enabled or disabled, false only if no adapters exist. - """ - config_contains_adapter = super().is_adapter_available() - - # Forward the method call to the individual modules - if hasattr(self, 'encoder') and isinstance(self.encoder, AdapterModuleMixin): - config_contains_adapter |= self.encoder.is_adapter_available() - - if hasattr(self, 'decoder') and isinstance(self.decoder, AdapterModuleMixin): - config_contains_adapter |= self.decoder.is_adapter_available() - - if hasattr(self, 'joint') and isinstance(self.joint, AdapterModuleMixin): - config_contains_adapter |= self.joint.is_adapter_available() - - return config_contains_adapter - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - """ - Updated the internal adapter config, determining if an adapter (or all adapters) are either - enabled or disabled. - - A common user pattern would be to disable all adapters (either after adding them, or restoring a model - with pre-existing adapters) and then simply enable one of the adapters. - - .. code:: - - model.set_enabled_adapters(enabled=False) - model.set_enabled_adapters(name=, enabled=True) - - Args: - name: Optional str. If a str name is given, the config will be updated to the value of `enabled`. - If no name is given, then all adapters will be enabled/disabled. - enabled: Bool, determines if the adapter(s) will be enabled/disabled. - """ - super().set_enabled_adapters(name=name, enabled=enabled) - - # Resolve the module name and adapter name - if name is not None: - module_name, _ = self.resolve_adapter_module_name_(name) - else: - module_name = None - - # Use + as a splitter, in order to share one name across multiple modules - if module_name is not None and '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - for module_name in module_names: - # Check if encoder adapters should be used - # Dispatch the call to the encoder. - if name is None or module_name in ('', 'encoder'): - if self.encoder.is_adapter_available(): - self.encoder.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the decoder. - if name is None or module_name == 'decoder': - if self.decoder.is_adapter_available(): - self.decoder.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the joint. - # Note: We need additional check for joint, since it may not exist (CTC models). - if name is None or module_name == 'joint': - if hasattr(self, 'joint') and self.joint.is_adapter_available(): - self.joint.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - """ - Returns a list of all enabled adapters. - - Returns: - A list of str names of each enabled adapter(s). 
- """ - enabled_adapters = super().get_enabled_adapters() - - # Check if encoder adapters should be used or are enabled - if hasattr(self, 'encoder') and isinstance(self.encoder, AdapterModuleMixin): - enabled_adapters.extend(self.encoder.get_enabled_adapters()) - - if hasattr(self, 'decoder') and isinstance(self.decoder, AdapterModuleMixin): - enabled_adapters.extend(self.decoder.get_enabled_adapters()) - - if hasattr(self, 'joint') and isinstance(self.joint, AdapterModuleMixin): - enabled_adapters.extend(self.joint.get_enabled_adapters()) - - enabled_adapters = list(sorted(list(set(enabled_adapters)))) - - return enabled_adapters - - def check_valid_model_with_adapter_support_(self): - """ - Utility method to test if the subclass of this mixin is an appropriate subclass of ModelPT itself. - """ - # Obtain the global adapter config if possible, otherwise use sensible defaults. - global_cfg = self._get_global_cfg() - - # Test whether the encoder supports adapters - use_encoder_adapter = global_cfg.get('check_encoder_adapter', True) - if use_encoder_adapter: - if not hasattr(self, 'encoder'): - logging.warning( - "Cannot add adapter to this object as it does not have an `encoder` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self, 'encoder') and not isinstance(self.encoder, AdapterModuleMixin): - logging.warning( - f'{self.encoder.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the decoder supports adapters - use_decoder_adapter = global_cfg.get('check_decoder_adapter', True) - if use_decoder_adapter: - if not hasattr(self, 'decoder'): - logging.warning( - "Cannot add adapter to this object as it does not have an `decoder` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self, 'decoder') and not isinstance(self.decoder, AdapterModuleMixin): - logging.warning( - f'{self.decoder.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the joint supports adapters - use_joint_adapter = global_cfg.get('check_joint_adapter', True) - if use_joint_adapter: - # Joint is only for RNNT models, skip assertion that it must always exist. - if hasattr(self, 'joint') and not isinstance(self.joint, AdapterModuleMixin): - logging.warning( - f'{self.joint.__class__.__name__} does not implement `AdapterModuleMixin`', mode=logging_mode.ONCE - ) - - def resolve_adapter_module_name_(self, name: str) -> Tuple[str, str]: - """ - Utility method to resolve a given global/module adapter name to its components. - Always returns a tuple representing (module_name, adapter_name). ":" is used as the - delimiter for denoting the module name vs the adapter name. - - Will attempt to also resolve a given adapter_name alone back to (module_name, adapter_name) - if the metadata config exists for access. - - Args: - name: A global adapter, or a module adapter name (with structure module_name:adapter_name). - - Returns: - A tuple representing (module_name, adapter_name). If a global adapter is provided, - module_name is set to ''. 
- """ - module_name, adapter_name = super().resolve_adapter_module_name_(name) - - # Use + as a splitter, in order to share one name across multiple modules - if '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - # resolve name and module only for valid modules - valid_module_names = self.adapter_module_names - - for mod_name in module_names: - if mod_name not in valid_module_names: - raise ValueError(f"Provided module name `{mod_name}` is not in valid list : {valid_module_names}") - - return (module_name, adapter_name) - - def _get_global_cfg(self): - """ - Utility method, to either extract or construct the global config inside adapters config. - """ - global_config = DictConfig({}) - if 'adapters' in self.cfg and self.adapter_global_cfg_key in self.cfg.adapters: - global_config = self.adapter_cfg[self.adapter_global_cfg_key] - return global_config - - @property - def adapter_module_names(self) -> List[str]: - valid_module_names = ['', 'encoder', 'decoder', 'joint'] - return valid_module_names diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/interctc_mixin.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/interctc_mixin.py deleted file mode 100644 index 8b2bf64aa79607c97bfe2641e744c0bfdf2ca298..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/interctc_mixin.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Dict, List, Optional, Tuple - -import torch - -from nemo.core.classes.mixins import AccessMixin - - -class InterCTCMixin: - """Adds utilities for computing interCTC loss from https://arxiv.org/abs/2102.03216. - - To use, make sure encoder accesses ``interctc['capture_layers']`` - property in the AccessMixin and registers ``interctc/layer_output_X`` and - ``interctc/layer_length_X`` for all layers that we want to get loss from. - Additionally, specify the following config parameters to set up loss:: - - interctc: - # can use different values - loss_weights: [0.3] - apply_at_layers: [8] - - Then call - - * ``self.setup_interctc(ctc_decoder_name, ctc_loss_name, ctc_wer_name)`` - in the init method - * ``self.add_interctc_losses`` after computing regular loss. - * ``self.finalize_interctc_metrics(metrics, outputs, prefix="val_")`` - in the `multi_validation_epoch_end` method. - * ``self.finalize_interctc_metrics(metrics, outputs, prefix="test_")`` - in the `multi_test_epoch_end` method. - """ - - def _process_config_values(self, loss_weights: List[float], apply_at_layers: List[int]): - self.set_interctc_param('intermediate_loss_weights', loss_weights) - self.set_interctc_param('apply_at_layers', apply_at_layers) - self.set_interctc_param('main_loss_weight', 1.0 - sum(loss_weights)) - if self.get_interctc_param('main_loss_weight') <= 0.0: - raise ValueError( - "Make sure that sum of intermediate loss weights is < 1.0. 
" - "Note that we don't do any normalization and assign " - "remaining weight to the regular model loss. " - "E.g., if interctc.loss_weights = [0.1, 0.3], regular " - "loss will have weight of 0.6" - ) - self.set_interctc_param('enabled', len(loss_weights) > 0) - - if len(apply_at_layers) != len(loss_weights): - raise ValueError('Length of interctc.apply_at_layers has to match interctc.loss_weights') - - # setting up config for AccessMixin that will be checked in encoders to - # log the layers we need - AccessMixin.update_access_cfg({'interctc': {'capture_layers': apply_at_layers}}) - - def setup_interctc(self, decoder_name, loss_name, wer_name): - """Sets up all interctc-specific parameters and checks config consistency. - - Caller has to specify names of attributes to perform CTC-specific WER, - decoder and loss computation. They will be looked up in the class - state with ``getattr``. - - The reason we get the names and look up object later is because those - objects might change without re-calling the setup of this class. So - we always want to look up the most up-to-date object instead of - "caching" it here. - """ - # registering all parameters in a dictionary to avoid conflicts with - # main class's names - self._interctc_params = {} - interctc_config = self.cfg.get("interctc") - if interctc_config is not None: - # if interctc is in the config, we want to check that it indeed defines - # the required keys and nothing else - that's automatically done by - # matching with keyword arguments in self._process_config_values - self._process_config_values(**interctc_config) - self._interctc_params['decoder_name'] = decoder_name - self._interctc_params['loss_name'] = loss_name - self._interctc_params['wer_name'] = wer_name - else: - self.set_interctc_param('enabled', False) - - def get_interctc_param(self, param_name): - """Either directly get parameter from ``self._interctc_params`` or - call getattr with the corresponding name. - """ - if param_name in ['decoder', 'loss', 'wer']: - return getattr(self, self._interctc_params[param_name + "_name"]) - return self._interctc_params[param_name] - - def set_interctc_param(self, param_name, param_value): - """Setting the parameter to the ``self._interctc_params`` dictionary. - - Raises an error if trying to set decoder, loss or wer as those should - always come from the main class. - """ - if param_name in ['decoder', 'loss', 'wer']: - raise ValueError( - 'Cannot set "decoder", "loss" or "wer" as parameters. ' - 'They are always looked up in the main class state.' - ) - self._interctc_params[param_name] = param_value - - def _verify_setup_was_called(self): - """Can be used to verify if setup_interctc was called.""" - if not hasattr(self, '_interctc_params'): - raise RuntimeError( - 'self.setup_interctc(ctc_decoder_name, ctc_loss_name, ctc_wer_name) has to be ' - 'called before InterCTC loss can be used!' - ) - - def is_interctc_enabled(self) -> bool: - """Returns whether interCTC loss is enabled.""" - self._verify_setup_was_called() - return self.get_interctc_param('enabled') - - def set_interctc_enabled(self, enabled: bool): - """Can be used to enable/disable InterCTC manually.""" - self._verify_setup_was_called() - if enabled: # checking if proper config parameters were specified - if len(self.get_interctc_param('intermediate_loss_weights')) == 0: - raise RuntimeError( - 'InterCTC cannot be enabled since interctc.loss_weights was not specified in the config.' 
- ) - if len(self.get_interctc_param('apply_at_layers')) != len( - self.get_interctc_param('intermediate_loss_weights') - ): - raise RuntimeError( - 'InterCTC cannot be enabled, since length of "loss_weights" does not match "apply_at_layers".' - ) - self.set_interctc_param('enabled', enabled) - - def finalize_interctc_metrics(self, metrics: Dict, outputs: List[Dict], prefix: str): - """Finalizes InterCTC WER and loss metrics for logging purposes. - - Should be called inside ``multi_validation_epoch_end`` (with ``prefix="val_"``) or - ``multi_test_epoch_end`` (with ``prefix="test_"``). - - Note that ``metrics`` dictionary is going to be updated in-place. - """ - if self.is_interctc_enabled(): - for layer_idx in self.get_interctc_param('apply_at_layers'): - # assuming that if the first batch logged the metrics, then all batches did - if f"{prefix}inter_ctc_loss_l{layer_idx}" in outputs[0]: - loss = torch.stack([x[f"{prefix}inter_ctc_loss_l{layer_idx}"] for x in outputs]).mean() - metrics["log"][f"{prefix}inter_ctc_loss_l{layer_idx}"] = loss - - if f"{prefix}inter_wer_num_l{layer_idx}" in outputs[0]: - wer_num = torch.stack([x[f"{prefix}inter_wer_num_l{layer_idx}"] for x in outputs]).sum() - wer_denom = torch.stack([x[f"{prefix}inter_wer_denom_l{layer_idx}"] for x in outputs]).sum() - metrics["log"][f"{prefix}inter_wer_l{layer_idx}"] = wer_num / wer_denom - - if f"{prefix}final_loss" in outputs[0]: - metrics["log"][f"{prefix}final_loss"] = torch.stack([x[f"{prefix}final_loss"] for x in outputs]).mean() - - def get_captured_interctc_tensors(self) -> List[Tuple[torch.Tensor, torch.Tensor]]: - """Returns a list of captured tensors from encoder: tuples of (output, length). - - Will additionally apply ``ctc_decoder`` to the outputs. - """ - if not self.is_interctc_enabled(): - return [] - - # note that we have a loop here, because tensors can be defined from - # submodules of encoder (e.g., that's the case in Jasper) - total_registry = {} - for module_registry in AccessMixin.get_module_registry(self.encoder).values(): - for key in module_registry: - if key.startswith("interctc/") and key in total_registry: - raise RuntimeError(f"layer {key} has been logged multiple times!") - total_registry.update(module_registry) - # if intermediate_loss_weights was set, the encoder has to register - # interctc/layer_output_X and interctc/layer_length_X tensors. - # We need to apply decoder to each of them and compute CTC loss. - captured_tensors = [] - for layer_idx in self.get_interctc_param('apply_at_layers'): - try: - layer_outputs = total_registry[f"interctc/layer_output_{layer_idx}"] - layer_lengths = total_registry[f"interctc/layer_length_{layer_idx}"] - except KeyError: - raise RuntimeError( - f"Intermediate layer {layer_idx} was not captured! " - "Check if length of model.encoder.captured_layer_outputs matches " - "length of model.intermediate_loss_weights properties." - ) - if len(layer_outputs) > 1 or len(layer_lengths) > 1: - raise RuntimeError( - "Make sure encoder.forward is called exactly one time before interCTC loss is computed." - ) - captured_tensors.append( - (self.get_interctc_param('decoder')(encoder_output=layer_outputs[0]), layer_lengths[0]) - ) - return captured_tensors - - def add_interctc_losses( - self, - loss_value: torch.Tensor, - transcript: torch.Tensor, - transcript_len: torch.Tensor, - compute_wer: bool, - compute_loss: bool = True, - log_wer_num_denom: bool = False, - log_prefix: str = "", - ) -> Tuple[Optional[torch.Tensor], Dict]: - """Adding interCTC losses if required. 
- - Will also register loss/wer metrics in the returned dictionary. - - Args: - loss_value (torch.Tensor): regular loss tensor (will add interCTC loss to it). - transcript (torch.Tensor): current utterance transcript. - transcript_len (torch.Tensor): current utterance transcript length. - compute_wer (bool): whether to compute WER for the current utterance. - Should typically be True for validation/test and only True for - training if current batch WER should be logged. - compute_loss (bool): whether to compute loss for the current utterance. - Should always be True in training and almost always True in - validation, unless all other losses are disabled as well. - Defaults to True. - log_wer_num_denom (bool): if True, will additionally log WER num/denom - in the returned metrics dictionary. Should always be True for - validation/test to allow correct metrics aggregation. Should - always be False for training. Defaults to False. - log_prefix (str): prefix added to all log values. Should be ``""`` for - training and ``"val_"`` for validation. Defaults to "". - - Returns: - tuple[Optional[torch.Tensor], Dict]: tuple of new loss tensor and dictionary with logged metrics. - """ - if not self.is_interctc_enabled() or not AccessMixin.is_access_enabled(): - return loss_value, {} - metrics = {} - if compute_loss: - metrics[f"{log_prefix}final_loss"] = loss_value - else: - loss_value = None - captured_tensors = self.get_captured_interctc_tensors() - - if compute_loss: - loss_value *= self.get_interctc_param('main_loss_weight') - - for layer_idx, intermediate_result, loss_weight in zip( - self.get_interctc_param('apply_at_layers'), - captured_tensors, - self.get_interctc_param('intermediate_loss_weights'), - ): - if compute_loss: - inter_loss_value = self.get_interctc_param('loss')( - log_probs=intermediate_result[0], - targets=transcript, - target_lengths=transcript_len, - input_lengths=intermediate_result[1], - ) - metrics[f"{log_prefix}inter_ctc_loss_l{layer_idx}"] = inter_loss_value.detach() - loss_value += inter_loss_value * loss_weight - if compute_wer: - self.get_interctc_param('wer').update( - predictions=intermediate_result[0], - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=intermediate_result[1], - ) - wer, wer_num, wer_denom = self.get_interctc_param('wer').compute() - self.get_interctc_param('wer').reset() - metrics.update({f'{log_prefix}inter_wer_l{layer_idx}': wer}) - if log_wer_num_denom: - metrics.update( - { - f'{log_prefix}inter_wer_num_l{layer_idx}': wer_num, - f'{log_prefix}inter_wer_denom_l{layer_idx}': wer_denom, - } - ) - - # return total loss and dictionary of metrics - return loss_value, metrics diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/mixins.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/mixins.py deleted file mode 100644 index 4c43960ac9d2d24fba602d8efe6b0bfdc7b7df44..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/mixins.py +++ /dev/null @@ -1,754 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
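# Illustrative sketch (not part of the original sources): the InterCTC config shape
# that _process_config_values() above expects, and the weight bookkeeping it performs.
# In a model, setup_interctc() is called once in __init__, add_interctc_losses() right
# after the regular CTC loss, and finalize_interctc_metrics() in the
# multi_validation_epoch_end / multi_test_epoch_end hooks, as the mixin docstring describes.
from omegaconf import OmegaConf

# Intermediate losses from encoder layers 6 and 8, weighted 0.1 and 0.3.
cfg = OmegaConf.create({"interctc": {"loss_weights": [0.1, 0.3], "apply_at_layers": [6, 8]}})

# The mixin assigns the remaining weight to the regular CTC loss and rejects sums >= 1.0.
main_loss_weight = 1.0 - sum(cfg.interctc.loss_weights)
assert main_loss_weight > 0.0
print(main_loss_weight)  # 0.6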
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from abc import ABC, abstractmethod
-from typing import List
-
-import torch
-from omegaconf import DictConfig, OmegaConf, open_dict
-
-import nemo.collections.asr.models as asr_models
-from nemo.collections.asr.parts.mixins.asr_adapter_mixins import ASRAdapterModelMixin
-from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder
-from nemo.collections.asr.parts.utils import asr_module_utils
-from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
-from nemo.collections.common import tokenizers
-from nemo.utils import logging
-
-
-class ASRBPEMixin(ABC):
-    """ ASR BPE Mixin class that sets up a Tokenizer via a config
-
-    This mixin class adds the method `_setup_tokenizer(...)`, which can be used by ASR models
-    which depend on subword tokenization.
-
-    The setup_tokenizer method adds the following parameters to the class -
-        -   tokenizer_cfg: The resolved config supplied to the tokenizer (with `dir` and `type` arguments).
-        -   tokenizer_dir: The directory path to the tokenizer vocabulary + additional metadata.
-        -   tokenizer_type: The type of the tokenizer. Currently supports `bpe` and `wpe`, as well as `agg`.
-        -   vocab_path: Resolved path to the vocabulary text file.
-
-    In addition to these variables, the method will also instantiate and preserve a tokenizer
-    (subclass of TokenizerSpec) if successful, and assign it to self.tokenizer.
-
-    The mixin also supports aggregate tokenizers, which consist of ordinary, monolingual tokenizers.
-    If a conversion between a monolingual and an aggregate tokenizer (or vice versa) is detected,
-    all registered artifacts will be cleaned up.
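# Illustrative sketch (not part of the original sources): the two tokenizer config
# layouts accepted by ASRBPEMixin._setup_tokenizer() described above. Directory paths
# are placeholders; the "langs" key matches AGGREGATE_TOKENIZERS_DICT_PREFIX and each
# entry under it holds an ordinary monolingual tokenizer config.
from omegaconf import OmegaConf

# Monolingual SentencePiece tokenizer ("wpe" would select the BERT WordPiece path instead).
monolingual_tokenizer_cfg = OmegaConf.create({"dir": "/path/to/tokenizer_dir", "type": "bpe"})

# Aggregate tokenizer built from per-language monolingual tokenizers.
aggregate_tokenizer_cfg = OmegaConf.create(
    {
        "type": "agg",
        "langs": {
            "en": {"dir": "/path/to/en_tokenizer_dir", "type": "bpe"},
            "es": {"dir": "/path/to/es_tokenizer_dir", "type": "bpe"},
        },
    }
)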
- """ - - # this will be used in configs and nemo artifacts - AGGREGATE_TOKENIZERS_DICT_PREFIX = 'langs' - - def _setup_tokenizer(self, tokenizer_cfg: DictConfig): - tokenizer_type = tokenizer_cfg.get('type') - if tokenizer_type is None: - raise ValueError("`tokenizer.type` cannot be None") - elif tokenizer_type.lower() == 'agg': - self._setup_aggregate_tokenizer(tokenizer_cfg) - else: - self._setup_monolingual_tokenizer(tokenizer_cfg) - - def _setup_monolingual_tokenizer(self, tokenizer_cfg: DictConfig): - # Prevent tokenizer parallelism (unless user has explicitly set it) - if 'TOKENIZERS_PARALLELISM' not in os.environ: - os.environ['TOKENIZERS_PARALLELISM'] = 'false' - - self.tokenizer_cfg = OmegaConf.to_container(tokenizer_cfg, resolve=True) # type: dict - self.tokenizer_dir = self.tokenizer_cfg.pop('dir') # Remove tokenizer directory - self.tokenizer_type = self.tokenizer_cfg.pop('type').lower() # Remove tokenizer_type - - self.hf_tokenizer_kwargs = self.tokenizer_cfg.pop("hf_kwargs", {}) # Remove HF tokenizer kwargs - - # just in case the previous tokenizer was an aggregate - self._cleanup_aggregate_config_and_artifacts_if_needed() - - # Preserve config - if hasattr(self, 'cfg') and 'tokenizer' in self.cfg: - self.cfg.tokenizer.dir = self.tokenizer_dir - self.cfg.tokenizer.type = self.tokenizer_type - - if 'hf_kwargs' in tokenizer_cfg: - with open_dict(self.cfg.tokenizer): - self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs') - - if self.tokenizer_type not in ['bpe', 'wpe']: - raise ValueError( - "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or " - "`wpe` for BERT based tokenizer" - ) - - if self.tokenizer_type == 'bpe': - # This is a BPE Tokenizer - if 'model_path' in self.tokenizer_cfg: - model_path = self.tokenizer_cfg.get('model_path') - else: - model_path = os.path.join(self.tokenizer_dir, 'tokenizer.model') - model_path = self.register_artifact('tokenizer.model_path', model_path) - self.model_path = model_path - - if 'special_tokens' in self.tokenizer_cfg: - special_tokens = self.tokenizer_cfg['special_tokens'] - - if special_tokens is not None: - raise ValueError("`special_tokens` are no longer supported for SentencePiece based tokenizers.") - - # Update special tokens - self.tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path) - - if 'vocab_path' in self.tokenizer_cfg: - vocab_path = self.tokenizer_cfg.get('vocab_path') - else: - vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt') - vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path) - self.vocab_path = vocab_path - - try: - if 'spe_tokenizer_vocab' in self.tokenizer_cfg: - spe_vocab_path = self.tokenizer_cfg.get('spe_tokenizer_vocab') - else: - spe_vocab_path = os.path.join(self.tokenizer_dir, 'tokenizer.vocab') - spe_vocab_path = self.register_artifact('tokenizer.spe_tokenizer_vocab', spe_vocab_path) - self.spe_vocab_path = spe_vocab_path - except FileNotFoundError: - # fallback case for older checkpoints that did not preserve the tokenizer.vocab - self.spe_vocab_path = None - - vocabulary = {} - for i in range(self.tokenizer.vocab_size): - piece = self.tokenizer.ids_to_tokens([i]) - piece = piece[0] - vocabulary[piece] = i + 1 - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - # attach utility values to the tokenizer wrapper - self.tokenizer.tokenizer.vocab_size = len(vocabulary) - self.tokenizer.tokenizer.get_vocab = get_vocab - self.tokenizer.tokenizer.all_special_tokens = 
self.tokenizer.special_token_to_id - - else: - # This is a WPE Tokenizer - # If path from previous registration exists, remove it - if 'vocab_path' in self.tokenizer_cfg: - vocab_path = self.tokenizer_cfg.get('vocab_path') - else: - vocab_path = os.path.join(self.tokenizer_dir, 'vocab.txt') - vocab_path = self.register_artifact('tokenizer.vocab_path', vocab_path) - self.vocab_path = vocab_path - - # If path from previous registration exists, remove it - if 'vocab_path' in self.tokenizer_cfg: - self.tokenizer_cfg.pop('vocab_path') - - self.tokenizer = tokenizers.AutoTokenizer( - pretrained_model_name='bert-base-cased', - vocab_file=self.vocab_path, - mask_token=self.hf_tokenizer_kwargs.get('mask_token', None), - bos_token=self.hf_tokenizer_kwargs.get('bos_token', None), - eos_token=self.hf_tokenizer_kwargs.get('eos_token', None), - pad_token=self.hf_tokenizer_kwargs.get('pad_token', None), - sep_token=self.hf_tokenizer_kwargs.get('sep_token', None), - cls_token=self.hf_tokenizer_kwargs.get('cls_token', None), - unk_token=self.hf_tokenizer_kwargs.get('unk_token', None), - use_fast=self.hf_tokenizer_kwargs.get('use_fast', False), - ) - - logging.info( - "Tokenizer {} initialized with {} tokens".format( - self.tokenizer.__class__.__name__, self.tokenizer.vocab_size - ) - ) - - def _setup_aggregate_tokenizer(self, tokenizer_cfg: DictConfig): - # Prevent tokenizer parallelism (unless user has explicitly set it) - if 'TOKENIZERS_PARALLELISM' not in os.environ: - os.environ['TOKENIZERS_PARALLELISM'] = 'false' - - self.tokenizer_cfg = OmegaConf.to_container(tokenizer_cfg, resolve=True) # type: dict - - # the aggregate tokenizer does not have one tokenizer_dir but multiple ones - self.tokenizer_dir = None - - self.tokenizer_cfg.pop('dir', None) # Remove tokenizer directory, if any - # Remove tokenizer_type -- obviously if we are here, the type is 'agg' - self.tokenizer_type = self.tokenizer_cfg.pop('type').lower() - - # the aggregate tokenizer should not have these - self.hf_tokenizer_kwargs = {} - self.tokenizer_cfg.pop("hf_kwargs", {}) # Remove HF tokenizer kwargs, if any - - logging.info('_setup_tokenizer: detected an aggregate tokenizer') - # need to de-register any monolingual config items if they exist - self._cleanup_monolingual_and_aggregate_config_and_artifacts_if_needed() - - # overwrite tokenizer type - if hasattr(self, 'cfg') and 'tokenizer' in self.cfg: - self.cfg.tokenizer.type = self.tokenizer_type - - tokenizers_dict = {} - # init each of the monolingual tokenizers found in the config and assemble into AggregateTokenizer - for lang, tokenizer_config in self.tokenizer_cfg[self.AGGREGATE_TOKENIZERS_DICT_PREFIX].items(): - (tokenizer, model_path, vocab_path, spe_vocab_path,) = self._make_tokenizer(tokenizer_config, lang) - - tokenizers_dict[lang] = tokenizer - if hasattr(self, 'cfg'): - with open_dict(self.cfg.tokenizer): - self.cfg.tokenizer[self.AGGREGATE_TOKENIZERS_DICT_PREFIX][lang]['dir'] = self.tokenizer_cfg[ - self.AGGREGATE_TOKENIZERS_DICT_PREFIX - ][lang]['dir'] - self.cfg.tokenizer[self.AGGREGATE_TOKENIZERS_DICT_PREFIX][lang]['type'] = self.tokenizer_cfg[ - self.AGGREGATE_TOKENIZERS_DICT_PREFIX - ][lang]['type'] - - self.tokenizer = tokenizers.AggregateTokenizer(tokenizers_dict) - - def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): - - tokenizer_type = tokenizer_cfg.get('type').lower() - tokenizer_dir = tokenizer_cfg.get('dir') - - if tokenizer_type not in ['bpe', 'wpe']: - raise ValueError( - '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer 
or' '`wpe` for BERT based tokenizer' - ) - - # defaults - model_path = None - vocab_path = None - spe_vocab_path = None - - if tokenizer_type == 'bpe': - # This is a BPE Tokenizer - if 'model_path' in tokenizer_cfg: - model_path = tokenizer_cfg.get('model_path') - else: - model_path = os.path.join(tokenizer_dir, 'tokenizer.model') - - model_path = self.register_artifact( - 'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.model_path', model_path - ) - - if 'special_tokens' in tokenizer_cfg: - special_tokens = tokenizer_cfg['special_tokens'] - if special_tokens is not None: - raise ValueError('`special_tokens` are no longer supported for SentencePiece based tokenizers.') - - # Update special tokens - tokenizer = tokenizers.SentencePieceTokenizer(model_path=model_path) - - if 'vocab_path' in tokenizer_cfg: - vocab_path = tokenizer_cfg.get('vocab_path') - else: - vocab_path = os.path.join(tokenizer_dir, 'vocab.txt') - - vocab_path = self.register_artifact( - 'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.vocab_path', vocab_path - ) - - try: - if 'spe_tokenizer_vocab' in tokenizer_cfg: - spe_vocab_path = tokenizer_cfg.get('spe_tokenizer_vocab') - else: - spe_vocab_path = os.path.join(tokenizer_dir, 'tokenizer.vocab') - - spe_vocab_path = self.register_artifact( - 'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.spe_tokenizer_vocab', - spe_vocab_path, - ) - - except FileNotFoundError: - # fallback case for older checkpoints that did not preserve the tokenizer.vocab - spe_vocab_path = None - - vocabulary = {} - for i in range(tokenizer.vocab_size): - piece = tokenizer.ids_to_tokens([i]) - piece = piece[0] - vocabulary[piece] = i + 1 - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - # attach utility values to the tokenizer wrapper - tokenizer.tokenizer.vocab_size = len(vocabulary) - tokenizer.tokenizer.get_vocab = get_vocab - tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id - - else: - # This is a WPE Tokenizer - # If path from previous registration exists, remove it - if 'vocab_path' in tokenizer_cfg: - vocab_path = tokenizer_cfg.get('vocab_path') - else: - vocab_path = os.path.join(tokenizer_dir, 'vocab.txt') - - vocab_path = self.register_artifact( - 'tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.' + lang + '.vocab_path', vocab_path - ) - - # If path from previous registration exists, remove it - if 'vocab_path' in tokenizer_cfg: - tokenizer_cfg.pop('vocab_path') - - hf_tokenizer_kwargs = tokenizer_cfg.get('hf_kwargs', {}) - tokenizer = tokenizers.AutoTokenizer( - pretrained_model_name='bert-base-cased', - vocab_file=vocab_path, - mask_token=hf_tokenizer_kwargs.get('mask_token', None), - bos_token=hf_tokenizer_kwargs.get('bos_token', None), - eos_token=hf_tokenizer_kwargs.get('eos_token', None), - pad_token=hf_tokenizer_kwargs.get('pad_token', None), - sep_token=hf_tokenizer_kwargs.get('sep_token', None), - cls_token=hf_tokenizer_kwargs.get('cls_token', None), - unk_token=hf_tokenizer_kwargs.get('unk_token', None), - use_fast=hf_tokenizer_kwargs.get('use_fast', False), - ) - - logging.info( - 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size) - ) - - return tokenizer, model_path, vocab_path, spe_vocab_path - - def _cleanup_monolingual_and_aggregate_config_and_artifacts_if_needed(self): - """ - Clean ups any monolingual and some aggregate config items and artifacts. 
- We need to do this when we switch from a monolingual tokenizer to an aggregate one - or go between aggregate tokenizers which could have a different number of languages - """ - if hasattr(self, 'cfg'): - with open_dict(self.cfg.tokenizer): - self.cfg.tokenizer.pop('dir', None) - self.cfg.tokenizer.pop('model_path', None) - self.cfg.tokenizer.pop('vocab_path', None) - self.cfg.tokenizer.pop('spe_tokenizer_vocab', None) - self.cfg.tokenizer.pop('hf_kwargs', None) - - # need to de-register any monolingual artifacts if they exist - if hasattr(self, 'artifacts'): - self.artifacts.pop('tokenizer.model_path', None) - self.artifacts.pop('tokenizer.vocab_path', None) - self.artifacts.pop('tokenizer.spe_tokenizer_vocab', None) - - # just in case we are replacing one aggregate tokenizer with another one, we better - # clean up the old aggregate artifacts as well - for akey in list(self.artifacts.keys()): - if akey.startswith('tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.'): - self.artifacts.pop(akey) - - def _cleanup_aggregate_config_and_artifacts_if_needed(self): - """ - Clean ups any aggregate config items and artifacts. - We need to do this when we switch from an aggregate tokenizer to a monolingual one - """ - if hasattr(self, 'cfg'): - with open_dict(self.cfg.tokenizer): - self.cfg.tokenizer.pop(self.AGGREGATE_TOKENIZERS_DICT_PREFIX, None) - - # clean up the old aggregate artifacts as well - if hasattr(self, 'artifacts'): - for akey in list(self.artifacts.keys()): - if akey.startswith('tokenizer.' + self.AGGREGATE_TOKENIZERS_DICT_PREFIX + '.'): - self.artifacts.pop(akey) - - -class ASRModuleMixin(ASRAdapterModelMixin): - """ - ASRModuleMixin is a mixin class added to ASR models in order to add methods that are specific - to a particular instantiation of a module inside of an ASRModel. - - Each method should first check that the module is present within the subclass, and support additional - functionality if the corresponding module is present. - """ - - def change_conv_asr_se_context_window(self, context_window: int, update_config: bool = True): - """ - Update the context window of the SqueezeExcitation module if the provided model contains an - `encoder` which is an instance of `ConvASREncoder`. - - Args: - context_window: An integer representing the number of input timeframes that will be used - to compute the context. Each timeframe corresponds to a single window stride of the - STFT features. - - Say the window_stride = 0.01s, then a context window of 128 represents 128 * 0.01 s - of context to compute the Squeeze step. - update_config: Whether to update the config or not with the new context window. - """ - asr_module_utils.change_conv_asr_se_context_window( - self, context_window=context_window, update_config=update_config - ) - - def change_attention_model( - self, self_attention_model: str = None, att_context_size: List[int] = None, update_config: bool = True - ): - """ - Update the self_attention_model if function is available in encoder. - - Args: - self_attention_model (str): type of the attention layer and positional encoding - 'rel_pos': relative positional embedding and Transformer-XL - 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - overlapping windows. Attention context is determined by att_context_size parameter. - 'abs_pos': absolute positional embedding and Transformer - If None is provided, the self_attention_model isn't changed. Defauts to None. 
-            att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes,
-                or None to keep as it is. Defaults to None.
-            update_config (bool): Whether to update the config or not with the new attention model.
-                Defaults to True.
-        """
-        if self_attention_model is None and att_context_size is None:
-            return
-
-        if not hasattr(self, 'encoder'):
-            logging.info(
-                "Could not change the self_attention_model in encoder "
-                "since the model provided does not contain an `encoder` module in its config."
-            )
-            return
-
-        if not hasattr(self.encoder, "change_attention_model"):
-            logging.info("Model encoder doesn't have a change_attention_model method ")
-            return
-
-        self.encoder.change_attention_model(self_attention_model, att_context_size, update_config, self.device)
-        if update_config:
-            with open_dict(self.cfg):
-                self.cfg.encoder.self_attention_model = self_attention_model
-                self.cfg.encoder.att_context_size = att_context_size
-
-    def change_subsampling_conv_chunking_factor(
-        self, subsampling_conv_chunking_factor: int, update_config: bool = True
-    ):
-        """
-        Update the conv_chunking_factor (int) if the function is available in the encoder.
-        Default is 1 (auto).
-        Set it to -1 (disabled) or to a specific value (power of 2) if you run out of memory in the conv subsampling layers.
-
-        Args:
-            subsampling_conv_chunking_factor (int)
-        """
-
-        if not hasattr(self, 'encoder'):
-            logging.info(
-                "Could not call the change_subsampling_conv_chunking_factor method in encoder "
-                "since the model provided does not contain an `encoder` module in its config."
-            )
-            return
-
-        if not hasattr(self.encoder, "change_subsampling_conv_chunking_factor"):
-            logging.info("Model encoder doesn't have a change_subsampling_conv_chunking_factor method ")
-            return
-
-        self.encoder.change_subsampling_conv_chunking_factor(subsampling_conv_chunking_factor)
-        if update_config:
-            with open_dict(self.cfg):
-                self.cfg.encoder.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
-
-    def conformer_stream_step(
-        self,
-        processed_signal: torch.Tensor,
-        processed_signal_length: torch.Tensor = None,
-        cache_last_channel: torch.Tensor = None,
-        cache_last_time: torch.Tensor = None,
-        cache_last_channel_len: torch.Tensor = None,
-        keep_all_outputs: bool = True,
-        previous_hypotheses: List[Hypothesis] = None,
-        previous_pred_out: torch.Tensor = None,
-        drop_extra_pre_encoded: int = None,
-        return_transcription: bool = True,
-        return_log_probs: bool = False,
-    ):
-        """
-        It simulates a forward step with caching for streaming purposes.
-        It supports ASR models whose encoder supports streaming, such as Conformer.
-        Args:
-            processed_signal: the input audio signals
-            processed_signal_length: the length of the audios
-            cache_last_channel: the cache tensor for last channel layers like MHA
-            cache_last_channel_len: lengths for cache_last_channel
-            cache_last_time: the cache tensor for last time layers like convolutions
-            keep_all_outputs: if set to True, would not drop the extra outputs specified by encoder.streaming_cfg.valid_out_len
-            previous_hypotheses: the hypotheses from the previous step for RNNT models
-            previous_pred_out: the predicted outputs from the previous step for CTC models
-            drop_extra_pre_encoded: number of steps to drop from the beginning of the outputs after the downsampling module. This can be used if extra paddings are added on the left side of the input.
-            return_transcription: whether to decode and return the transcriptions. It cannot be disabled for Transducer models.
- return_log_probs: whether to return the log probs, only valid for ctc model - - Returns: - greedy_predictions: the greedy predictions from the decoder - all_hyp_or_transcribed_texts: the decoder hypotheses for Transducer models and the transcriptions for CTC models - cache_last_channel_next: the updated tensor cache for last channel layers to be used for next streaming step - cache_last_time_next: the updated tensor cache for last time layers to be used for next streaming step - cache_last_channel_next_len: the updated lengths for cache_last_channel - best_hyp: the best hypotheses for the Transducer models - - log_probs: the logits tensor of current streaming chunk, only returned when return_log_probs=True - encoded_len: the length of the output log_probs + history chunk log_probs, only returned when return_log_probs=True - """ - if not isinstance(self, asr_models.EncDecRNNTModel) and not isinstance(self, asr_models.EncDecCTCModel): - raise NotImplementedError(f"stream_step does not support {type(self)}!") - - if not isinstance(self.encoder, StreamingEncoder): - raise NotImplementedError(f"Encoder of this model does not support streaming!") - - if isinstance(self, asr_models.EncDecRNNTModel) and return_transcription is False: - logging.info( - "return_transcription can not be False for Transducer models as decoder returns the transcriptions too." - ) - - if not isinstance(self, asr_models.EncDecCTCModel) and return_log_probs is True: - logging.info("return_log_probs can only be True for CTC models.") - - ( - encoded, - encoded_len, - cache_last_channel_next, - cache_last_time_next, - cache_last_channel_next_len, - ) = self.encoder.cache_aware_stream_step( - processed_signal=processed_signal, - processed_signal_length=processed_signal_length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - keep_all_outputs=keep_all_outputs, - drop_extra_pre_encoded=drop_extra_pre_encoded, - ) - - if isinstance(self, asr_models.EncDecCTCModel) or ( - isinstance(self, asr_models.EncDecHybridRNNTCTCModel) and self.cur_decoder == "ctc" - ): - if hasattr(self, "ctc_decoder"): - decoding = self.ctc_decoding - decoder = self.ctc_decoder - else: - decoding = self.decoding - decoder = self.decoder - - log_probs = decoder(encoder_output=encoded) - predictions_tensor = log_probs.argmax(dim=-1, keepdim=False) - - # Concatenate the previous predictions with the current one to have the full predictions. - # We drop the extra predictions for each sample by using the lengths returned by the encoder (encoded_len) - # Then create a list of the predictions for the batch. The predictions can have different lengths because of the paddings. 
- greedy_predictions = [] - if return_transcription: - all_hyp_or_transcribed_texts = [] - else: - all_hyp_or_transcribed_texts = None - for preds_idx, preds in enumerate(predictions_tensor): - if encoded_len is None: - preds_cur = predictions_tensor[preds_idx] - else: - preds_cur = predictions_tensor[preds_idx, : encoded_len[preds_idx]] - if previous_pred_out is not None: - greedy_predictions_concat = torch.cat((previous_pred_out[preds_idx], preds_cur), dim=-1) - encoded_len[preds_idx] += len(previous_pred_out[preds_idx]) - else: - greedy_predictions_concat = preds_cur - greedy_predictions.append(greedy_predictions_concat) - - # TODO: make decoding more efficient by avoiding the decoding process from the beginning - if return_transcription: - decoded_out = decoding.ctc_decoder_predictions_tensor( - decoder_outputs=greedy_predictions_concat.unsqueeze(0), - decoder_lengths=encoded_len[preds_idx : preds_idx + 1], - return_hypotheses=False, - ) - all_hyp_or_transcribed_texts.append(decoded_out[0][0]) - best_hyp = None - else: - best_hyp, all_hyp_or_transcribed_texts = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, - encoded_lengths=encoded_len, - return_hypotheses=True, - partial_hypotheses=previous_hypotheses, - ) - greedy_predictions = [hyp.y_sequence for hyp in best_hyp] - - if all_hyp_or_transcribed_texts is None: - all_hyp_or_transcribed_texts = best_hyp - - result = [ - greedy_predictions, - all_hyp_or_transcribed_texts, - cache_last_channel_next, - cache_last_time_next, - cache_last_channel_next_len, - best_hyp, - ] - if return_log_probs: - result.append(log_probs) - result.append(encoded_len) - - return tuple(result) - - @torch.no_grad() - def transcribe_simulate_cache_aware_streaming( - self, - paths2audio_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - online_normalization: bool = False, - ): - """ - Args: - paths2audio_files: (a list) of paths to audio files. - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of transcripts. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - online_normalization: (bool) Perform normalization on the run per chunk. - Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files - """ - if paths2audio_files is None or len(paths2audio_files) == 0: - return {} - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." 
- ) - - if not isinstance(self, asr_models.EncDecCTCModel): - raise NotImplementedError(f"simulate streaming does not support {type(self)}!") - - if not isinstance(self.encoder, StreamingEncoder): - raise NotImplementedError(f"Encoder of this model does not support streaming!") - - data_loader = self._setup_streaming_transcribe_dataloader(paths2audio_files, batch_size, online_normalization) - - total_log_probs = [] - total_texts = [] - - for streaming_buffer in data_loader: - streaming_buffer_iter = iter(streaming_buffer) - batch_size = len(streaming_buffer.streams_length) - cache_last_channel, cache_last_time, cache_last_channel_len = self.encoder.get_initial_cache_state( - batch_size=batch_size - ) - previous_hypotheses = None - pred_out_stream = None - encoded_len = None - transcribed_texts = None - batch_log_probs = [] - - for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer_iter): - drop_extra_pre_encoded = self.encoder.streaming_cfg.drop_extra_pre_encoded if step_num != 0 else 0 - with torch.inference_mode(): - result = self.conformer_stream_step( - processed_signal=chunk_audio, - processed_signal_length=chunk_lengths, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - keep_all_outputs=streaming_buffer.is_buffer_empty(), - previous_hypotheses=previous_hypotheses, - previous_pred_out=pred_out_stream, - drop_extra_pre_encoded=drop_extra_pre_encoded, - return_transcription=True, - return_log_probs=logprobs or return_hypotheses, - ) - if logprobs or return_hypotheses: - ( - pred_out_stream, - transcribed_texts, - cache_last_channel, - cache_last_time, - cache_last_channel_len, - previous_hypotheses, - cur_chunk_log_probs, - encoded_len, - ) = result - batch_log_probs.append(cur_chunk_log_probs.cpu()) - else: - ( - pred_out_stream, - transcribed_texts, - cache_last_channel, - cache_last_time, - cache_last_channel_len, - previous_hypotheses, - ) = result - - if logprobs or return_hypotheses: - # concatenate chunk log probs on T dim - batch_log_probs = torch.cat(batch_log_probs, axis=1) - for log_probs, log_prob_len in zip(batch_log_probs, encoded_len): - total_log_probs.append(log_probs[0:log_prob_len]) - - if transcribed_texts is None: - total_texts += [''] * batch_size - else: - total_texts += transcribed_texts - - if logprobs: - return total_log_probs - - if not return_hypotheses: - return total_texts - - hyps = [] - for log_probs, text in zip(total_log_probs, total_texts): - hyps.append(Hypothesis(y_sequence=log_probs, text=text, score=0.0, dec_state=None)) - return hyps - - def _setup_streaming_transcribe_dataloader( - self, paths2audio_files: List[str], batch_size: int, online_normalization=False - ): - """ - Setup function for a temporary data loader which wraps the provided audio file. - - Args: - paths2audio_files: (a list) of paths to audio files. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. 
- online_normalization: whether to do online normalization - Returns: - a new batch streaming buffer - """ - from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer - - streaming_buffer = CacheAwareStreamingAudioBuffer(model=self, online_normalization=online_normalization) - for sample_idx, sample in enumerate(paths2audio_files): - processed_signal, processed_signal_length, stream_id = streaming_buffer.append_audio_file( - sample, stream_id=-1 - ) - logging.info(f'Added this sample to the buffer: {sample}') - if (sample_idx + 1) % batch_size == 0 or sample_idx == len(paths2audio_files) - 1: - logging.info(f"Starting to stream samples {sample_idx - len(streaming_buffer) + 1} to {sample_idx}...") - yield streaming_buffer - streaming_buffer.reset_buffer() - - -class DiarizationMixin(ABC): - @abstractmethod - def diarize(self, paths2audio_files: List[str], batch_size: int = 1) -> List[str]: - """ - Takes paths to audio files and returns speaker labels - Args: - paths2audio_files: paths to audio fragment to be transcribed - - Returns: - Speaker labels - """ - pass diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/streaming.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/streaming.py deleted file mode 100644 index d6fd0b9b354ba3a551c4061cdbf3946d167e0ff8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/mixins/streaming.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod - -import torch - - -class StreamingEncoder(ABC): - @abstractmethod - def setup_streaming_params( - self, max_look_ahead: int = 10000, - ): - """ - This function sets the needed values and parameters to perform streaming. The configuration (CacheAwareStreamingConfig) need to be stored in self.streaming_cfg. - The streaming configuration is needed to simulate streaming inference. 
It would set the following - """ - pass - - @abstractmethod - def get_initial_cache_state(self, batch_size, dtype, device, max_dim): - pass - - @staticmethod - def to_numpy(tensor): - if tensor is None: - return None - return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() - - def cache_aware_stream_step( - self, - processed_signal, - processed_signal_length=None, - cache_last_channel=None, - cache_last_time=None, - cache_last_channel_len=None, - keep_all_outputs=True, - drop_extra_pre_encoded=None, - ): - if self.streaming_cfg is None: - self.setup_streaming_params() - if drop_extra_pre_encoded is not None: - prev_drop_extra_pre_encoded = self.streaming_cfg.drop_extra_pre_encoded - self.streaming_cfg.drop_extra_pre_encoded = drop_extra_pre_encoded - else: - prev_drop_extra_pre_encoded = None - - if processed_signal_length is None: - processed_signal_length = processed_signal.new_full(processed_signal.size(0), processed_signal.size(-1)) - - encoder_output = self( - audio_signal=processed_signal, - length=processed_signal_length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - encoder_output = self.streaming_post_process(encoder_output, keep_all_outputs=keep_all_outputs) - - if prev_drop_extra_pre_encoded is not None: - self.streaming_cfg.drop_extra_pre_encoded = prev_drop_extra_pre_encoded - - return encoder_output diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__init__.py deleted file mode 100644 index 77a23cf78c022fac041754632f0dfee3b7defa82..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import RNNTLossNumba diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index de88ecf51ae9f391d5ec70bb1f6e99f1aed22325..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py deleted file mode 100644 index 055d7aeb5fd9caf2960b4a70b76be664671e1bad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
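# Illustrative sketch (not part of the original sources): simulated cache-aware
# streaming inference through the mixin methods above. The checkpoint name and audio
# paths are placeholders; the model must be an EncDecCTCModel whose encoder implements
# StreamingEncoder (a cache-aware Conformer/FastConformer encoder), otherwise
# transcribe_simulate_cache_aware_streaming() raises NotImplementedError.
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("some_cache_aware_streaming_ctc_model")  # placeholder
model.eval()

transcripts = model.transcribe_simulate_cache_aware_streaming(
    paths2audio_files=["audio_0.wav", "audio_1.wav"],  # placeholder paths
    batch_size=2,
    online_normalization=True,   # normalize features chunk by chunk instead of over the full file
    return_hypotheses=False,     # plain text; logprobs=True would return per-utterance log-prob tensors
)
print(transcripts)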
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.parts.numba.rnnt_loss.rnnt import rnnt_loss_cpu, rnnt_loss_gpu -from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import ( - MultiblankRNNTLossNumba, - RNNTLossNumba, - TDTLossNumba, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 5f605ba41d10bbd956e8ae27477fbdd5a035d3ab..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt.cpython-310.pyc deleted file mode 100644 index 48ce29c6fa2d9ee7c50f7da15e5599e8817e3bbb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt_pytorch.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt_pytorch.cpython-310.pyc deleted file mode 100644 index e097a4aeb36d20ad895a6d6d24064e9d1bbd5e91..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/__pycache__/rnnt_pytorch.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py deleted file mode 100644 index 046aea425e2050d30726a87f8f7369469ba04dfc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt.py +++ /dev/null @@ -1,483 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing - -import torch -from numba import cuda - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants, rnnt_helper -from nemo.collections.asr.parts.numba.rnnt_loss.utils.cpu_utils import cpu_rnnt -from nemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils import gpu_rnnt - - -def rnnt_loss_cpu( - acts: torch.Tensor, - labels: torch.Tensor, - input_lengths: torch.Tensor, - label_lengths: torch.Tensor, - costs: torch.Tensor, - grads: torch.Tensor, - blank_label: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, -): - """ - Wrapper method for accessing CPU RNNT loss. - - CPU implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). - - Args: - acts: Activation tensor of shape [B, T, U, V+1]. - labels: Ground truth labels of shape [B, U]. - input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. - label_lengths: Lengths of the target sequence as a vector of ints [B]. - costs: Zero vector of length [B] in which costs will be set. - grads: Zero tensor of shape [B, T, U, V+1] where the gradient will be set. - blank_label: Index of the blank token in the vocabulary. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of threads for OpenMP. 
- """ - # aliases - log_probs = acts - flat_labels = labels - - minibatch_size = log_probs.shape[0] - maxT = log_probs.shape[1] - maxU = log_probs.shape[2] - alphabet_size = log_probs.shape[3] - - if num_threads < 0: - num_threads = multiprocessing.cpu_count() - - num_threads = max(1, num_threads) # have to use at least 1 thread - - gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=False) - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Invalid parameter passed when calculating working space memory") - - cpu_workspace = torch.zeros(gpu_size, device=log_probs.device, dtype=log_probs.dtype, requires_grad=False) - - ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### - log_probs, acts_shape = rnnt_helper.flatten_tensor(log_probs) - flat_labels, labels_shape = rnnt_helper.flatten_tensor(flat_labels) - - wrapper = cpu_rnnt.CPURNNT( - minibatch=minibatch_size, - maxT=maxT, - maxU=maxU, - alphabet_size=alphabet_size, - workspace=cpu_workspace, - blank=blank_label, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - num_threads=num_threads, - batch_first=True, - ) - - if grads is None: - status = wrapper.score_forward( - log_probs=log_probs.data, - costs=costs, - flat_labels=flat_labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - else: - ### FLATTEN GRAD TENSOR ### - grads, grads_shape = rnnt_helper.flatten_tensor(grads) - - status = wrapper.cost_and_grad( - log_probs=log_probs.data, - grads=grads.data, - costs=costs, - flat_labels=flat_labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - del cpu_workspace, wrapper - return True - - -def rnnt_loss_gpu( - acts: torch.Tensor, - labels: torch.Tensor, - input_lengths: torch.Tensor, - label_lengths: torch.Tensor, - costs: torch.Tensor, - grads: torch.Tensor, - blank_label: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, -): - """ - Wrapper method for accessing GPU RNNT loss. - - CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). - - Args: - acts: Activation tensor of shape [B, T, U, V+1]. - labels: Ground truth labels of shape [B, U]. - input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. - label_lengths: Lengths of the target sequence as a vector of ints [B]. - costs: Zero vector of length [B] in which costs will be set. - grads: Zero tensor of shape [B, T, U, V+1] where the gradient will be set. - blank_label: Index of the blank token in the vocabulary. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of threads for OpenMP. 
- """ - minibatch_size = acts.shape[0] - maxT = acts.shape[1] - maxU = acts.shape[2] - alphabet_size = acts.shape[3] - - if hasattr(cuda, 'external_stream'): - stream = cuda.external_stream(torch.cuda.current_stream(acts.device).cuda_stream) - else: - stream = cuda.default_stream() - - if num_threads < 0: - num_threads = multiprocessing.cpu_count() - - num_threads = max(1, num_threads) # have to use at least 1 thread - - gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=True) - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Invalid parameter passed when calculating working space memory") - - # Select GPU index - cuda.select_device(acts.device.index) - gpu_workspace = torch.zeros(gpu_size, device=acts.device, dtype=torch.float32, requires_grad=False) - - ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### - acts, acts_shape = rnnt_helper.flatten_tensor(acts) - - wrapper = gpu_rnnt.GPURNNT( - minibatch=minibatch_size, - maxT=maxT, - maxU=maxU, - alphabet_size=alphabet_size, - workspace=gpu_workspace, - blank=blank_label, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - num_threads=num_threads, - stream=stream, - ) - - if grads is None: - status = wrapper.score_forward( - acts=acts.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - else: - ### FLATTEN GRAD TENSOR ### - grads, grads_shape = rnnt_helper.flatten_tensor(grads) - - status = wrapper.cost_and_grad( - acts=acts.data, - grads=grads.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - del gpu_workspace, wrapper - return True - - -def tdt_loss_gpu( - label_acts: torch.Tensor, - duration_acts: torch.Tensor, - labels: torch.Tensor, - input_lengths: torch.Tensor, - label_lengths: torch.Tensor, - costs: torch.Tensor, - label_grads: torch.Tensor, - duration_grads: torch.Tensor, - blank_label: int, - durations: list, - fastemit_lambda: float, - clamp: float, - num_threads: int, - sigma: float, - omega: float, -): - """ - Wrapper method for accessing GPU TDT loss (https://arxiv.org/abs/2304.06795). - - CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). - - Args: - label_acts: Activation tensor of shape [B, T, U, V], where V includes the blank symbol. - duration_acts: Activation tensor of shape [B, T, U, D], where D is the number of durations. - labels: Ground truth labels of shape [B, U]. - input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. - label_lengths: Lengths of the target sequence as a vector of ints [B]. - costs: Zero vector of length [B] in which costs will be set. - label_grads: Zero tensor of shape [B, T, U, V] where the gradient to label_acts will be set. - duration_grads: Zero tensor of shape [B, T, U, D] where the gradient to duration_acts will be set. - blank_label: Index of the standard blank token in the vocabulary. - durations: A list of supported durations for TDT. Must include 0 and 1. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. 
When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of threads for OpenMP. - sigma: logit-undernormalization weight used in the multi-blank model. Refer to - the multi-blank paper https://arxiv.org/abs/2304.06795 for detailed explanations. - omega: weight for regular RNN-T loss - """ - minibatch_size = label_acts.shape[0] - maxT = label_acts.shape[1] - maxU = label_acts.shape[2] - alphabet_size = label_acts.shape[3] - - if hasattr(cuda, 'external_stream'): - stream = cuda.external_stream(torch.cuda.current_stream(label_acts.device).cuda_stream) - else: - stream = cuda.default_stream() - - if num_threads < 0: - num_threads = multiprocessing.cpu_count() - - num_threads = max(1, num_threads) # have to use at least 1 thread - - gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=True) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Invalid parameter passed when calculating working space memory") - - # Select GPU index - cuda.select_device(label_acts.device.index) - gpu_workspace = torch.zeros(gpu_size, device=label_acts.device, dtype=label_acts.dtype, requires_grad=False) - - tdt_workspace = torch.zeros(len(durations), device=label_acts.device, dtype=torch.long, requires_grad=False) - - for i in range(0, len(durations)): - tdt_workspace[i] = durations[i] - - ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### - label_acts, label_acts_shape = rnnt_helper.flatten_tensor(label_acts) - duration_acts, duration_acts_shape = rnnt_helper.flatten_tensor(duration_acts) - - wrapper = gpu_rnnt.GPUTDT( - minibatch=minibatch_size, - maxT=maxT, - maxU=maxU, - alphabet_size=alphabet_size, - workspace=gpu_workspace, - tdt_workspace=tdt_workspace, - num_durations=len(durations), - blank=blank_label, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - num_threads=num_threads, - stream=stream, - sigma=sigma, - omega=omega, - ) - - if label_grads is None: - status = wrapper.score_forward( - label_acts=label_acts.data, - duration_acts=duration_acts.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - else: - ### FLATTEN GRAD TENSOR ### - label_grads, label_grads_shape = rnnt_helper.flatten_tensor(label_grads) - duration_grads, duration_grads_shape = rnnt_helper.flatten_tensor(duration_grads) - - status = wrapper.cost_and_grad( - label_acts=label_acts.data, - duration_acts=duration_acts.data, - label_grads=label_grads.data, - duration_grads=duration_grads.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - del gpu_workspace, tdt_workspace, wrapper - return True - - -def multiblank_rnnt_loss_gpu( - acts: torch.Tensor, - labels: torch.Tensor, - input_lengths: torch.Tensor, - label_lengths: torch.Tensor, - costs: torch.Tensor, - grads: torch.Tensor, - blank_label: int, - big_blank_durations: list, - fastemit_lambda: float, - clamp: float, - num_threads: int, - sigma: float, -): - """ - Wrapper method for accessing GPU Multi-blank RNNT loss (https://arxiv.org/pdf/2211.03541.pdf). - - CUDA implementation ported from [HawkAaron/warp-transducer](https://github.com/HawkAaron/warp-transducer). 
- - Args: - acts: Activation tensor of shape [B, T, U, V + num_big_blanks + 1]. - labels: Ground truth labels of shape [B, U]. - input_lengths: Lengths of the acoustic sequence as a vector of ints [B]. - label_lengths: Lengths of the target sequence as a vector of ints [B]. - costs: Zero vector of length [B] in which costs will be set. - grads: Zero tensor of shape [B, T, U, V + num_big_blanks + 1] where the gradient will be set. - blank_label: Index of the standard blank token in the vocabulary. - big_blank_durations: A list of supported durations for big blank symbols - in the model, e.g. [2, 4, 8]. Note we only include durations for ``big - blanks'' here and it should not include 1 for the standard blank. - Those big blanks have vocabulary indices after the standard blank index. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of threads for OpenMP. - sigma: logit-undernormalization weight used in the multi-blank model. Refer to - the multi-blank paper https://arxiv.org/pdf/2211.03541 for detailed explanations. - """ - minibatch_size = acts.shape[0] - maxT = acts.shape[1] - maxU = acts.shape[2] - alphabet_size = acts.shape[3] - - if hasattr(cuda, 'external_stream'): - stream = cuda.external_stream(torch.cuda.current_stream(acts.device).cuda_stream) - else: - stream = cuda.default_stream() - - if num_threads < 0: - num_threads = multiprocessing.cpu_count() - - num_threads = max(1, num_threads) # have to use at least 1 thread - - gpu_size, status = rnnt_helper.get_workspace_size(maxT, maxU, minibatch_size, gpu=True) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Invalid parameter passed when calculating working space memory") - - # Select GPU index - cuda.select_device(acts.device.index) - gpu_workspace = torch.zeros(gpu_size, device=acts.device, dtype=acts.dtype, requires_grad=False) - - big_blank_workspace = torch.zeros( - len(big_blank_durations), device=acts.device, dtype=torch.long, requires_grad=False - ) - - for i in range(0, len(big_blank_durations)): - big_blank_workspace[i] = big_blank_durations[i] - - ### VIEW TENSORS AS VECTORS FOR POINTER INDEXING ### - acts, acts_shape = rnnt_helper.flatten_tensor(acts) - - wrapper = gpu_rnnt.MultiblankGPURNNT( - minibatch=minibatch_size, - maxT=maxT, - maxU=maxU, - alphabet_size=alphabet_size, - workspace=gpu_workspace, - big_blank_workspace=big_blank_workspace, - num_big_blanks=len(big_blank_durations), - blank=blank_label, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - num_threads=num_threads, - stream=stream, - sigma=sigma, - ) - - if grads is None: - status = wrapper.score_forward( - acts=acts.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - else: - ### FLATTEN GRAD TENSOR ### - grads, grads_shape = rnnt_helper.flatten_tensor(grads) - - status = wrapper.cost_and_grad( - acts=acts.data, - grads=grads.data, - costs=costs.data, - pad_labels=labels.data, - label_lengths=label_lengths.data, - input_lengths=input_lengths.data, - ) - - if status != global_constants.RNNTStatus.RNNT_STATUS_SUCCESS: - raise RuntimeError("Could not calculate forward scores") - - del 
gpu_workspace, big_blank_workspace, wrapper - return True diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py deleted file mode 100644 index 58508970aa830b53211157354d6d66606fa857ac..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_numpy.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np -import torch -from torch.autograd import Function, Variable -from torch.nn import Module - - -def check_type(var, t, name): - if var.dtype is not t: - raise TypeError("{} must be {}".format(name, t)) - - -def check_contiguous(var, name): - if not var.is_contiguous(): - raise ValueError("{} must be contiguous".format(name)) - - -def check_dim(var, dim, name): - if len(var.shape) != dim: - raise ValueError("{} must be {}D".format(name, dim)) - - -def certify_inputs(log_probs, labels, lengths, label_lengths): - # check_type(log_probs, torch.float32, "log_probs") - check_type(labels, torch.int64, "labels") - check_type(label_lengths, torch.int64, "label_lengths") - check_type(lengths, torch.int64, "lengths") - check_contiguous(log_probs, "log_probs") - check_contiguous(labels, "labels") - check_contiguous(label_lengths, "label_lengths") - check_contiguous(lengths, "lengths") - - if lengths.shape[0] != log_probs.shape[0]: - raise ValueError( - f"Must have a length per example. " - f"Given lengths dim: {lengths.shape[0]}, " - f"Log probs dim : {log_probs.shape[0]}" - ) - if label_lengths.shape[0] != log_probs.shape[0]: - raise ValueError( - "Must have a label length per example. " - f"Given label lengths dim : {label_lengths.shape[0]}, " - f"Log probs dim : {log_probs.shape[0]}" - ) - - check_dim(log_probs, 4, "log_probs") - check_dim(labels, 2, "labels") - check_dim(lengths, 1, "lenghts") - check_dim(label_lengths, 1, "label_lenghts") - max_T = torch.max(lengths) - max_U = torch.max(label_lengths) - T, U = log_probs.shape[1:3] - if T != max_T: - raise ValueError(f"Input length mismatch! Given T: {T}, Expected max T from input lengths: {max_T}") - if U != max_U + 1: - raise ValueError(f"Output length mismatch! 
Given U: {U}, Expected max U from target lengths: {max_U} + 1") - - -def _assert_no_grad(tensor): - assert not tensor.requires_grad, ( - "gradients only computed for log_probs - please " "mark other tensors as not requiring gradients" - ) - - -class LogSoftmaxGradModification(Function): - @staticmethod - def forward(ctx, acts, clamp): - if clamp < 0: - raise ValueError("`clamp` must be 0.0 or positive float.") - - res = acts.new(acts) - ctx.clamp = clamp - return res - - @staticmethod - def backward(ctx, grad_output): - grad_output = torch.clamp(grad_output, -ctx.clamp, ctx.clamp) - return ( - grad_output, - None, - ) - - -def forward_pass(log_probs, labels, blank): - """ - Computes probability of the forward variable alpha. - - Args: - log_probs: Tensor of shape [T, U, V+1] - labels: Labels of shape [B, U] - blank: Index of the blank token. - - Returns: - A tuple of the forward variable probabilities - alpha of shape [T, U] - and the log likelihood of this forward step. - """ - T, U, _ = log_probs.shape - alphas = np.zeros((T, U), dtype='f') - - for t in range(1, T): - alphas[t, 0] = alphas[t - 1, 0] + log_probs[t - 1, 0, blank] - - for u in range(1, U): - alphas[0, u] = alphas[0, u - 1] + log_probs[0, u - 1, labels[u - 1]] - for t in range(1, T): - for u in range(1, U): - no_emit = alphas[t - 1, u] + log_probs[t - 1, u, blank] - emit = alphas[t, u - 1] + log_probs[t, u - 1, labels[u - 1]] - alphas[t, u] = np.logaddexp(emit, no_emit) - - loglike = alphas[T - 1, U - 1] + log_probs[T - 1, U - 1, blank] - return alphas, loglike - - -def backward_pass(log_probs, labels, blank): - """ - Computes probability of the backward variable beta. - - Args: - log_probs: Tensor of shape [T, U, V+1] - labels: Labels of shape [B, U] - blank: Index of the blank token. - - Returns: - A tuple of the backward variable probabilities - beta of shape [T, U] - and the log likelihood of this backward step. - """ - T, U, _ = log_probs.shape - betas = np.zeros((T, U), dtype='f') - betas[T - 1, U - 1] = log_probs[T - 1, U - 1, blank] - - for t in reversed(range(T - 1)): - betas[t, U - 1] = betas[t + 1, U - 1] + log_probs[t, U - 1, blank] - - for u in reversed(range(U - 1)): - betas[T - 1, u] = betas[T - 1, u + 1] + log_probs[T - 1, u, labels[u]] - - for t in reversed(range(T - 1)): - for u in reversed(range(U - 1)): - no_emit = betas[t + 1, u] + log_probs[t, u, blank] - emit = betas[t, u + 1] + log_probs[t, u, labels[u]] - betas[t, u] = np.logaddexp(emit, no_emit) - - return betas, betas[0, 0] - - -def compute_gradient(log_probs, alphas, betas, labels, blank, fastemit_lambda): - """ - Computes the gradients of the log_probs with respect to the log probability of this step occuring. - - Args: - Args: - log_probs: Tensor of shape [T, U, V+1] - alphas: Tensor of shape [T, U] which represents the forward variable. - betas: Tensor of shape [T, U] which represents the backward variable. - labels: Labels of shape [B, U] - blank: Index of the blank token. 
- - Returns: - Gradients of shape [T, U, V+1] with respect to the forward log probability - """ - T, U, _ = log_probs.shape - grads = np.full(log_probs.shape, -float("inf")) - log_like = betas[0, 0] # == alphas[T - 1, U - 1] + betas[T - 1, U - 1] - - # // grad to last blank transition - grads[T - 1, U - 1, blank] = alphas[T - 1, U - 1] - grads[: T - 1, :, blank] = alphas[: T - 1, :] + betas[1:, :] - - # // grad to label transition - for u, l in enumerate(labels): - grads[:, u, l] = alphas[:, u] + betas[:, u + 1] - - grads = -np.exp(grads + log_probs - log_like) - - if fastemit_lambda > 0.0: - for u, l in enumerate(labels): - grads[:, u, l] = (1.0 + fastemit_lambda) * grads[:, u, l] - - return grads - - -def fastemit_regularization(log_probs, labels, alphas, betas, blank, fastemit_lambda): - """ - Describes the computation of FastEmit regularization from the paper - - [FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization](https://arxiv.org/abs/2010.11148) - - Args: - log_probs: Tensor of shape [T, U, V+1] - labels: Unused. Labels of shape [B, U] - alphas: Tensor of shape [T, U] which represents the forward variable. - betas: Unused. Tensor of shape [T, U] which represents the backward variable. - blank: Index of the blank token. - fastemit_lambda: Float scaling factor for FastEmit regularization. - - Returns: - The regularized negative log likelihood - lambda * P˜(At, u|x) - """ - # General calculation of the fastemit regularization alignments - T, U, _ = log_probs.shape - # alignment = np.zeros((T, U), dtype='float32') - # - # for t in range(0, T): - # alignment[t, U - 1] = alphas[t, U - 1] + betas[t, U - 1] - # - # for t in range(0, T): - # for u in range(0, U - 1): - # emit = alphas[t, u] + log_probs[t, u, labels[u]] + betas[t, u + 1] - # alignment[t, u] = emit - # reg = fastemit_lambda * (alignment[T - 1, U - 1]) - - # The above is equivalent to below, without need of computing above - # reg = fastemit_lambda * (alphas[T - 1, U - 1] + betas[T - 1, U - 1]) - - # The above is also equivalent to below, without need of computing the betas alignment matrix - reg = fastemit_lambda * (alphas[T - 1, U - 1] + log_probs[T - 1, U - 1, blank]) - return -reg - - -def transduce(log_probs, labels, blank=0, fastemit_lambda=0.0): - """ - Args: - log_probs: 3D array with shape - [input len, output len + 1, vocab size] - labels: 1D array with shape [output time steps] - blank: Index of the blank token. - fastemit_lambda: Float scaling factor for FastEmit regularization. - - Returns: - float: The negative log-likelihood - 3D array: Gradients with respect to the - unnormalized input actications - 2d arrays: Alphas matrix (TxU) - 2d array: Betas matrix (TxU) - """ - alphas, ll_forward = forward_pass(log_probs, labels, blank) - betas, ll_backward = backward_pass(log_probs, labels, blank) - grads = compute_gradient(log_probs, alphas, betas, labels, blank, fastemit_lambda) - return -ll_forward, grads, alphas, betas - - -def transduce_batch(log_probs, labels, flen, glen, blank=0, fastemit_lambda=0.0): - """ - Compute the transducer loss of the batch. - - Args: - log_probs: [B, T, U, V+1]. Activation matrix normalized with log-softmax. - labels: [B, U+1] - ground truth labels with padded as blank token in the beginning. - flen: Length vector of the acoustic sequence. - glen: Length vector of the target sequence. - blank: Id of the blank token. - fastemit_lambda: Float scaling factor for FastEmit regularization. 
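# Illustrative sketch (not part of the deleted rnnt_numpy.py): a tiny check of
# the reference implementation above. It verifies the invariant the gradient
# code relies on: the forward log-likelihood alphas[T-1, U-1] + log P(blank)
# equals the backward log-likelihood betas[0, 0]. Sizes are made-up examples.
import numpy as np
import torch
from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_numpy import transduce

T, U, V = 5, 3, 4                      # T frames, U = target length + 1, vocab incl. blank
log_probs = torch.log_softmax(torch.randn(T, U, V), dim=-1).numpy()
labels = np.array([1, 2])              # U - 1 non-blank targets, blank index is 0

neg_ll, grads, alphas, betas = transduce(log_probs, labels, blank=0)
assert np.allclose(-neg_ll, betas[0, 0], atol=1e-3)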
- - Returns: - Batch of transducer forward log probabilities (loss) and the gradients of the activation matrix. - """ - grads = np.zeros_like(log_probs) - costs = [] - for b in range(log_probs.shape[0]): - t = int(flen[b]) - u = int(glen[b]) + 1 - - ll, g, alphas, betas = transduce(log_probs[b, :t, :u, :], labels[b, : u - 1], blank, fastemit_lambda) - grads[b, :t, :u, :] = g - - reg = fastemit_regularization( - log_probs[b, :t, :u, :], labels[b, : u - 1], alphas, betas, blank, fastemit_lambda - ) - ll += reg - costs.append(ll) - return costs, grads - - -class _RNNT(Function): - @staticmethod - def forward(ctx, acts, labels, act_lens, label_lens, blank, fastemit_lambda): - costs, grads = transduce_batch( - acts.detach().cpu().numpy(), - labels.cpu().numpy(), - act_lens.cpu().numpy(), - label_lens.cpu().numpy(), - blank, - fastemit_lambda, - ) - - costs = torch.FloatTensor([sum(costs)]) - grads = torch.Tensor(grads).to(acts) - - ctx.grads = grads - return costs - - @staticmethod - def backward(ctx, grad_output): - grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads) - return ctx.grads.mul(grad_output), None, None, None, None, None - - -class RNNTLoss(Module): - """ - Parameters: - `blank_label` (int): default 0 - label index of blank token - fastemit_lambda: Float scaling factor for FastEmit regularization. - """ - - def __init__(self, blank: int = 0, fastemit_lambda: float = 0.0, clamp: float = -1.0): - super(RNNTLoss, self).__init__() - self.blank = blank - self.fastemit_lambda = fastemit_lambda - self.clamp = float(clamp) if clamp > 0 else 0.0 - self.rnnt = _RNNT.apply - - def forward(self, acts, labels, act_lens, label_lens): - assert len(labels.size()) == 2 - _assert_no_grad(labels) - _assert_no_grad(act_lens) - _assert_no_grad(label_lens) - certify_inputs(acts, labels, act_lens, label_lens) - - # CPU Patch for fp16 - force cast to fp32 - if not acts.is_cuda and acts.dtype == torch.float16: - acts = acts.float() - - if self.clamp > 0.0: - acts = LogSoftmaxGradModification.apply(acts, self.clamp) - - acts = torch.nn.functional.log_softmax(acts, -1) - - return self.rnnt(acts, labels, act_lens, label_lens, self.blank, self.fastemit_lambda) - - -if __name__ == '__main__': - loss = RNNTLoss(fastemit_lambda=0.01) - - torch.manual_seed(0) - - acts = torch.randn(1, 2, 5, 3) - labels = torch.tensor([[0, 2, 1, 2]], dtype=torch.int64) - act_lens = torch.tensor([2], dtype=torch.int64) - label_lens = torch.tensor([len(labels[0])], dtype=torch.int64) - - loss_val = loss(acts, labels, act_lens, label_lens) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py deleted file mode 100644 index 5960d5ab6b1892eba894465220d4464c448986e3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/rnnt_pytorch.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch -from torch.autograd import Function -from torch.nn import Module - -from nemo.collections.asr.parts.numba.rnnt_loss import rnnt -from nemo.collections.asr.parts.numba.rnnt_loss.utils.cpu_utils import cpu_rnnt - -__all__ = ['rnnt_loss', 'RNNTLossNumba', 'MultiblankRNNTLossNumba', 'TDTLossNumba'] - - -class _RNNTNumba(Function): - @staticmethod - def forward(ctx, acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda, clamp): - """ - log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - """ - is_cuda = acts.is_cuda - - certify_inputs(acts, labels, act_lens, label_lens) - if clamp < 0: - raise ValueError("`clamp` must be 0.0 or positive float value.") - - loss_func = rnnt.rnnt_loss_gpu if is_cuda else rnnt.rnnt_loss_cpu - grads = torch.zeros_like(acts) if acts.requires_grad else None - minibatch_size = acts.size(0) - costs = torch.zeros(minibatch_size, device=acts.device, dtype=torch.float32) - - loss_func( - acts, - labels=labels, - input_lengths=act_lens, - label_lengths=label_lens, - costs=costs, - grads=grads, - blank_label=blank, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - num_threads=0, - ) - - if reduction in ['sum', 'mean']: - costs = costs.sum().unsqueeze_(-1) - if reduction == 'mean': - costs /= minibatch_size - - if grads is not None: - grads /= minibatch_size - - ctx.grads = grads - - return costs - - @staticmethod - def backward(ctx, grad_output): - if grad_output is not None and ctx.grads is not None: - grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads) - return ctx.grads.mul_(grad_output), None, None, None, None, None, None, None - - -class _TDTNumba(Function): - """ - Numba class for Token-and-Duration Transducer (TDT) loss (https://arxiv.org/abs/2304.06795) - """ - - @staticmethod - def forward( - ctx, - label_acts, - duration_acts, - labels, - act_lens, - label_lens, - blank, - durations, - reduction, - fastemit_lambda, - clamp, - sigma, - omega, - ): - """ - log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - fastemit_lambda: Float scaling factor for FastEmit regularization. 
Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - durations: list of durations for TDT model, must include 0 and 1, e.g. - [0, 1, 2, 3, 4]. - sigma: hyper-parameter for logit under-normalization method for training - TDT models. Recommended value 0.05. - omega: probability for sampling the standard RNN-T loss. - Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for - the above parameters; - """ - is_cuda = label_acts.is_cuda - - certify_inputs(label_acts, labels, act_lens, label_lens) - if clamp < 0: - raise ValueError("`clamp` must be 0.0 or positive float value.") - - if is_cuda: - loss_func = rnnt.tdt_loss_gpu - else: - raise ValueError("TDT is not yet implemented for non CUDA computation.") - - label_grads = torch.zeros_like(label_acts) if label_acts.requires_grad else None - duration_grads = torch.zeros_like(duration_acts) if duration_acts.requires_grad else None - minibatch_size = label_acts.size(0) - costs = torch.zeros(minibatch_size, device=label_acts.device, dtype=label_acts.dtype) - - loss_func( - label_acts, - duration_acts, - labels=labels, - input_lengths=act_lens, - label_lengths=label_lens, - costs=costs, - label_grads=label_grads, - duration_grads=duration_grads, - blank_label=blank, - durations=durations, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - sigma=sigma, - omega=omega, - num_threads=0, - ) - - if reduction in ['sum', 'mean']: - costs = costs.sum().unsqueeze_(-1) - if reduction == 'mean': - costs /= minibatch_size - - if label_grads is not None: - label_grads /= minibatch_size - duration_grads /= minibatch_size - - ctx.label_grads = label_grads - ctx.duration_grads = duration_grads - - return costs - - @staticmethod - def backward(ctx, grad_output): - if grad_output is not None and ctx.label_grads is not None: - grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.label_grads) - return ( - ctx.label_grads.mul_(grad_output), - ctx.duration_grads.mul_(grad_output), - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - ) - - -class _MultiblankRNNTNumba(Function): - """ - Numba class for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541.pdf) - """ - - @staticmethod - def forward( - ctx, acts, labels, act_lens, label_lens, blank, big_blank_durations, reduction, fastemit_lambda, clamp, sigma - ): - """ - big_blank_durations: list of durations for multi-blank transducer, e.g. - [2, 4, 8]. - sigma: hyper-parameter for logit under-normalization method for training - multi-blank transducers. Recommended value 0.05. 
- Refer to https://arxiv.org/pdf/2211.03541 for detailed explanations for - the above parameters; - For other parameters for this class, refer to comment for class _RNNTNumba - """ - is_cuda = acts.is_cuda - - certify_inputs(acts, labels, act_lens, label_lens) - if clamp < 0: - raise ValueError("`clamp` must be 0.0 or positive float value.") - - if is_cuda: - loss_func = rnnt.multiblank_rnnt_loss_gpu - else: - raise NotImplementedError() - - grads = torch.zeros_like(acts) if acts.requires_grad else None - minibatch_size = acts.size(0) - costs = torch.zeros(minibatch_size, device=acts.device, dtype=acts.dtype) - - loss_func( - acts, - labels=labels, - input_lengths=act_lens, - label_lengths=label_lens, - costs=costs, - grads=grads, - blank_label=blank, - big_blank_durations=big_blank_durations, - fastemit_lambda=fastemit_lambda, - clamp=clamp, - sigma=sigma, - num_threads=0, - ) - - if reduction in ['sum', 'mean']: - costs = costs.sum().unsqueeze_(-1) - if reduction == 'mean': - costs /= minibatch_size - - if grads is not None: - grads /= minibatch_size - - ctx.grads = grads - - return costs - - @staticmethod - def backward(ctx, grad_output): - if grad_output is not None and ctx.grads is not None: - grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads) - return ctx.grads.mul_(grad_output), None, None, None, None, None, None, None, None, None, None - - -def rnnt_loss( - acts, labels, act_lens, label_lens, blank=0, reduction='mean', fastemit_lambda: float = 0.0, clamp: float = 0.0 -): - """RNN Transducer Loss (functional form) - Args: - acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - blank (int, optional): blank label. Default: 0. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. Default: 'mean' - """ - if not acts.is_cuda: - # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping - # *after* we have obtained the gradients of loss(logsoftmax()). - # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. - # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore - # can inplace clamp the gradient. - if clamp > 0.0: - acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, clamp) - - # NOTE manually done log_softmax for CPU version, - # log_softmax is computed within GPU version. 
- acts = torch.nn.functional.log_softmax(acts, -1) - - return _RNNTNumba.apply(acts, labels, act_lens, label_lens, blank, reduction, fastemit_lambda, clamp) - - -def multiblank_rnnt_loss( - acts, - labels, - act_lens, - label_lens, - blank, - big_blank_durations=[], - reduction='mean', - fastemit_lambda: float = 0.0, - clamp: float = 0.0, -): - """ - Multi-blank RNN Transducer (https://arxiv.org/pdf/2211.03541.pdf) Loss (functional form) - Args: - acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - blank (int): standard blank label. - big_blank_durations: list of durations for multi-blank transducer, e.g. - [2, 4, 8]. - sigma: hyper-parameter for logit under-normalization method for training - multi-blank transducers. Recommended value 0.05. - Refer to https://arxiv.org/pdf/2211.03541 for detailed explanations for - the last two params. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. Default: 'mean' - """ - if not acts.is_cuda: - # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping - # *after* we have obtained the gradients of loss(logsoftmax()). - # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. - # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore - # can inplace clamp the gradient. - if clamp > 0.0: - acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, clamp) - - # NOTE manually done log_softmax for CPU version, - # log_softmax is computed within GPU version. - acts = torch.nn.functional.log_softmax(acts, -1) - - return _MultiblankRNNTNumba.apply( - acts, labels, act_lens, label_lens, blank, big_blank_durations, reduction, fastemit_lambda, clamp - ) - - -def tdt_loss( - acts, - labels, - act_lens, - label_lens, - blank, - durations=[], - reduction='mean', - fastemit_lambda: float = 0.0, - clamp: float = 0.0, -): - """ - TDT RNN Transducer (https://arxiv.org/abs/2304.06795) Loss (functional form) - Args: - acts: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - blank (int): standard blank label. - durations: list of durations for TDT model, e.g. - [0,1,2,3,4]. - sigma: hyper-parameter for logit under-normalization method for training - multi-blank transducers. Recommended value 0.05. - Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for - the last two params. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. 
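# Illustrative sketch (not part of the deleted rnnt_pytorch.py): minimal CPU
# usage of the functional rnnt_loss defined above. Raw joint-network logits go
# in; on the CPU branch the function applies log_softmax itself before handing
# the tensors to the Numba kernels, and the custom autograd Function returns the
# cached gradients on backward. The tensor sizes here are made-up examples.
import torch
from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import rnnt_loss

B, T, U, V = 2, 8, 4, 10
acts = torch.randn(B, T, U, V + 1, requires_grad=True)     # joint-network logits
labels = torch.randint(1, V, (B, U - 1), dtype=torch.int64)
act_lens = torch.full((B,), T, dtype=torch.int64)
label_lens = torch.full((B,), U - 1, dtype=torch.int64)

loss = rnnt_loss(acts, labels, act_lens, label_lens, blank=0, reduction='mean')
loss.backward()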
Default: 'mean' - """ - if not acts.is_cuda: - # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping - # *after* we have obtained the gradients of loss(logsoftmax()). - # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. - # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore - # can inplace clamp the gradient. - if clamp > 0.0: - acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, clamp) - - # NOTE manually done log_softmax for CPU version, - # log_softmax is computed within GPU version. - acts = torch.nn.functional.log_softmax(acts, -1) - - return _TDTNumba.apply(acts, labels, act_lens, label_lens, blank, durations, reduction, fastemit_lambda, clamp) - - -class RNNTLossNumba(Module): - """ - Parameters: - blank (int, optional): blank label. Default: 0. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. Default: 'mean' - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - """ - - def __init__(self, blank=0, reduction='mean', fastemit_lambda: float = 0.0, clamp: float = -1): - super(RNNTLossNumba, self).__init__() - self.blank = blank - self.fastemit_lambda = fastemit_lambda - self.clamp = float(clamp) if clamp > 0 else 0.0 - self.reduction = reduction - self.loss = _RNNTNumba.apply - - def forward(self, acts, labels, act_lens, label_lens): - """ - log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - """ - if not acts.is_cuda: - # Force FP32 until log_softmax() is implemented for fp16 on CPU - if acts.dtype == torch.float16: - acts = acts.float() - - # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping - # *after* we have obtained the gradients of loss(logsoftmax()). - # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. - # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore - # can inplace clamp the gradient. - if self.clamp > 0.0: - acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, self.clamp) - - # NOTE manually done log_softmax for CPU version, - # log_softmax is computed within GPU version. - acts = torch.nn.functional.log_softmax(acts, -1) - - return self.loss( - acts, labels, act_lens, label_lens, self.blank, self.reduction, self.fastemit_lambda, self.clamp - ) - - -class MultiblankRNNTLossNumba(Module): - """ - Parameters: - blank (int): standard blank label. - big_blank_durations: list of durations for multi-blank transducer, e.g. - [2, 4, 8]. - sigma: hyper-parameter for logit under-normalization method for training - multi-blank transducers. Recommended value 0.05. 
- Refer to https://arxiv.org/pdf/2211.03541 for detailed explanations for - the above parameters; - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. Default: 'mean' - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - """ - - def __init__( - self, - blank, - big_blank_durations, - reduction='mean', - fastemit_lambda: float = 0.0, - clamp: float = -1, - sigma: float = 0.0, - ): - super(MultiblankRNNTLossNumba, self).__init__() - self.blank = blank - self.big_blank_durations = big_blank_durations - self.fastemit_lambda = fastemit_lambda - self.clamp = float(clamp) if clamp > 0 else 0.0 - self.reduction = reduction - self.loss = _MultiblankRNNTNumba.apply - self.sigma = sigma - - def forward(self, acts, labels, act_lens, label_lens): - """ - log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - """ - if not acts.is_cuda: - # Since CPU requires log_softmax to be computed explicitly, we need to perform grad clipping - # *after* we have obtained the gradients of loss(logsoftmax()). - # This is highly wasteful since it requires a copy of the entire joint tensor which is expensive. - # CUDA version is much more efficient since it performs an inplace logsoftmax, and therefore - # can inplace clamp the gradient. - if self.clamp > 0.0: - acts = cpu_rnnt.LogSoftmaxGradModification.apply(acts, self.clamp) - - # NOTE manually done log_softmax for CPU version, - # log_softmax is computed within GPU version. - acts = torch.nn.functional.log_softmax(acts, -1) - - return self.loss( - acts, - labels, - act_lens, - label_lens, - self.blank, - self.big_blank_durations, - self.reduction, - self.fastemit_lambda, - self.clamp, - self.sigma, - ) - - -class TDTLossNumba(Module): - """ - Parameters: - blank (int): standard blank label. - durations: list of durations for TDT model, e.g. - [0, 1, 2, 3, 4]. - sigma: hyper-parameter for logit under-normalization method for training - TDT. Recommended value 0.05. - omega: hyper-parameter for RNN-T loss for loss combination. - Refer to https://arxiv.org/abs/2304.06795 for detailed explanations for - the above parameters; - - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, - 'mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. Default: 'mean' - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. 
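# Illustrative sketch (not part of the deleted rnnt_pytorch.py): the Module-style
# API defined above used as an ordinary criterion. MultiblankRNNTLossNumba and
# TDTLossNumba follow the same calling pattern with their additional duration
# arguments. The tensor sizes here are made-up examples.
import torch
from nemo.collections.asr.parts.numba.rnnt_loss.rnnt_pytorch import RNNTLossNumba

B, T, U, V = 2, 8, 4, 10
criterion = RNNTLossNumba(blank=0, reduction='mean', fastemit_lambda=0.0, clamp=-1.0)

acts = torch.randn(B, T, U, V + 1, requires_grad=True)     # joint-network logits
labels = torch.randint(1, V, (B, U - 1), dtype=torch.int64)
act_lens = torch.full((B,), T, dtype=torch.int64)
label_lens = torch.full((B,), U - 1, dtype=torch.int64)

loss = criterion(acts, labels, act_lens, label_lens)
loss.backward()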
- """ - - def __init__( - self, - blank, - durations=None, - reduction='mean', - fastemit_lambda: float = 0.0, - clamp: float = -1, - sigma: float = 0.0, - omega: float = 0.0, - ): - super(TDTLossNumba, self).__init__() - self.blank = blank - self.durations = durations if durations is not None else [] - self.fastemit_lambda = fastemit_lambda - self.clamp = float(clamp) if clamp > 0 else 0.0 - self.reduction = reduction - self.loss = _TDTNumba.apply - self.sigma = sigma - self.omega = omega - - def forward(self, acts, labels, act_lens, label_lens): - """ - log_probs: Tensor of (batch x seqLength x labelLength x outputDim) containing output from network - labels: 2 dimensional Tensor containing all the targets of the batch with zero padded - act_lens: Tensor of size (batch) containing size of each output sequence from the network - label_lens: Tensor of (batch) containing label length of each example - """ - - # TODO(hainan): in the future, we could further optimize this so that we don't need to - # make contiguous copies of the acts tensor. - label_acts, duration_acts = torch.split( - acts, [acts.shape[-1] - len(self.durations), len(self.durations)], dim=-1 - ) - label_acts = label_acts.contiguous() - duration_acts = torch.nn.functional.log_softmax(duration_acts, dim=-1).contiguous() - - return self.loss( - label_acts, - duration_acts, - labels, - act_lens, - label_lens, - self.blank, - self.durations, - self.reduction, - self.fastemit_lambda, - self.clamp, - self.sigma, - self.omega, - ) - - -def check_type(var, t, name): - if var.dtype is not t: - raise TypeError("{} must be {}".format(name, t)) - - -def check_contiguous(var, name): - if not var.is_contiguous(): - raise ValueError("{} must be contiguous".format(name)) - - -def check_dim(var, dim, name): - if len(var.shape) != dim: - raise ValueError("{} must be {}D".format(name, dim)) - - -def certify_inputs(log_probs, labels, lengths, label_lengths): - # check_type(log_probs, torch.float32, "log_probs") - check_type(labels, torch.int64, "labels") - check_type(label_lengths, torch.int64, "label_lengths") - check_type(lengths, torch.int64, "lengths") - check_contiguous(log_probs, "log_probs") - check_contiguous(labels, "labels") - check_contiguous(label_lengths, "label_lengths") - check_contiguous(lengths, "lengths") - - if lengths.shape[0] != log_probs.shape[0]: - raise ValueError( - f"Must have a length per example. " - f"Given lengths dim: {lengths.shape[0]}, " - f"Log probs dim : {log_probs.shape[0]}" - ) - if label_lengths.shape[0] != log_probs.shape[0]: - raise ValueError( - "Must have a label length per example. " - f"Given label lengths dim : {label_lengths.shape[0]}, " - f"Log probs dim : {log_probs.shape[0]}" - ) - - check_dim(log_probs, 4, "log_probs") - check_dim(labels, 2, "labels") - check_dim(lengths, 1, "lenghts") - check_dim(label_lengths, 1, "label_lenghts") - max_T = torch.max(lengths) - max_U = torch.max(label_lengths) - T, U = log_probs.shape[1:3] - if T != max_T: - raise ValueError(f"Input length mismatch! Given T: {T}, Expected max T from input lengths: {max_T}") - if U != max_U + 1: - raise ValueError(f"Output length mismatch! 
Given U: {U}, Expected max U from target lengths: {max_U} + 1") diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__init__.py deleted file mode 100644 index bc443be41c4c3fcf0764eb9fd3e1828d63f8438c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 51f3f903abed992ae128a93019eaf672c4e41e67..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/global_constants.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/global_constants.cpython-310.pyc deleted file mode 100644 index 98ef1654862ab0b5330bbecc4a3920f384d72592..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/global_constants.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/rnnt_helper.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/rnnt_helper.cpython-310.pyc deleted file mode 100644 index a6694cd3d9fd0e4b7f7fc13e5bc5824ad68eadc0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/__pycache__/rnnt_helper.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__init__.py deleted file mode 100644 index 1b4bbd40dff2c64244d18a0f4f0051cafe59b48d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index d71a63590a9ff00f2a615010f89f43e35f856cb7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/cpu_rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/cpu_rnnt.cpython-310.pyc deleted file mode 100644 index 4cb8831501c1c3b8594c2fae7ef89e240193678c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/__pycache__/cpu_rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py deleted file mode 100644 index 3feb7b513a50b964ee2f746106ba08b4df02739a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cpu_utils/cpu_rnnt.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
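# Illustrative sketch (not part of the deleted files): the activation split that
# TDTLossNumba.forward performs above. The joint output carries V+1 token logits
# plus one logit per supported duration, and the duration slice is log-softmaxed
# separately before the loss kernels see it. Sizes here are made-up examples.
import torch

B, T, U, V = 2, 8, 4, 10
durations = [0, 1, 2, 3, 4]                        # TDT durations must include 0 and 1
acts = torch.randn(B, T, U, V + 1 + len(durations))

label_acts, duration_acts = torch.split(
    acts, [acts.shape[-1] - len(durations), len(durations)], dim=-1
)
label_acts = label_acts.contiguous()                                  # [B, T, U, V+1]
duration_acts = torch.nn.functional.log_softmax(duration_acts, dim=-1).contiguous()  # [B, T, U, D]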
- -import math -import multiprocessing -from typing import Optional - -import numba -import torch -from torch.autograd import Function - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants - - -def log_sum_exp(a: torch.Tensor, b: torch.Tensor): - """ - Logsumexp with safety checks for infs. - """ - if torch.isinf(a): - return b - - if torch.isinf(b): - return a - - if a > b: - return math.log1p(math.exp(b - a)) + a - else: - return math.log1p(math.exp(a - b)) + b - - -class CpuRNNT_index: - def __init__(self, U: int, maxU: int, minibatch: int, alphabet_size: int, batch_first: bool): - """ - A placeholder Index computation class that emits the resolved index in a flattened tensor, - mimicing pointer indexing in CUDA kernels on the CPU. - - Args: - U: Length of the current target sample (without padding). - maxU: Max Length of the padded target samples. - minibatch: Minibatch index - alphabet_size: Size of the vocabulary including RNNT blank - V+1. - batch_first: Bool flag determining if batch index is first or third. - """ - super(CpuRNNT_index, self).__init__() - self.U = U - self.maxU = maxU - self.minibatch = minibatch - self.alphabet_size = alphabet_size - self.batch_first = batch_first - - def __call__(self, t: int, u: int, v: Optional[int] = None): - # if indexing all the values of the vocabulary, then only t, u are provided - if v is None: - return t * self.U + u - else: - # otherwise, t, u, v are provided to index particular value in the vocabulary. - if self.batch_first: - return (t * self.maxU + u) * self.alphabet_size + v - else: - return (t * self.maxU + u) * self.minibatch * self.alphabet_size + v - - -class CpuRNNT_metadata: - def __init__( - self, - T: int, - U: int, - workspace: torch.Tensor, - bytes_used: int, - blank: int, - labels: torch.Tensor, - log_probs: torch.Tensor, - idx: CpuRNNT_index, - ): - """ - Metadata for CPU based RNNT loss calculation. Holds the working space memory. - - Args: - T: Length of the acoustic sequence (without padding). - U: Length of the target sequence (without padding). - workspace: Working space memory for the CPU. - bytes_used: Number of bytes currently used for indexing the working space memory. Generally 0. - blank: Index of the blank token in the vocabulary. - labels: Ground truth padded labels matrix of shape [B, U] - log_probs: Log probs / activation matrix of flattented shape [B, T, U, V+1] - idx: - """ - super(CpuRNNT_metadata, self).__init__() - - self.alphas = workspace[bytes_used : bytes_used + T * U] - bytes_used += T * U - - self.betas = workspace[bytes_used : bytes_used + T * U] - bytes_used += T * U - - self.log_probs2 = workspace[bytes_used : bytes_used + T * U * 2] # // only store blank & label - bytes_used += T * U * 2 - - self.bytes_used = bytes_used - - self.setup_probs(T, U, labels, blank, log_probs, idx) - - def setup_probs( - self, T: int, U: int, labels: torch.Tensor, blank: int, log_probs: torch.Tensor, idx: CpuRNNT_index - ): - # initialize the log probs memory for blank and label token. - for t in range(T): - for u in range(U): - offset = (t * U + u) * 2 # mult with 2 is for selecting either blank or label token. Odd idx is blank. 
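The `log_sum_exp` helper above is the numerically stable primitive behind every alpha/beta update in this file. A stand-alone sketch (plain floats instead of the 0-d tensors used above) checking the same guard-then-`log1p` pattern against `torch.logaddexp`; it is an illustration, not part of the original file:

```python
import math

import torch


def log_sum_exp(a: float, b: float) -> float:
    # Guard the infinities first, then factor out the larger operand before log1p/exp.
    if math.isinf(a):
        return b
    if math.isinf(b):
        return a
    if a > b:
        return math.log1p(math.exp(b - a)) + a
    return math.log1p(math.exp(a - b)) + b


a, b = -1.5, -3.2
expected = torch.logaddexp(torch.tensor(a), torch.tensor(b)).item()
assert abs(log_sum_exp(a, b) - expected) < 1e-6
assert log_sum_exp(float("-inf"), b) == b
```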
- self.log_probs2[offset] = log_probs[idx(t, u, blank)] - # // labels do not have first blank - if u < U - 1: - self.log_probs2[offset + 1] = log_probs[idx(t, u, labels[u])] - - -class LogSoftmaxGradModification(Function): - @staticmethod - def forward(ctx, acts, clamp): - if clamp < 0: - raise ValueError("`clamp` must be 0.0 or positive float.") - - # This is needed for correctness (inplace is problematic), - # but it wastes a log of memory. - res = acts.new(acts) - ctx.clamp = clamp - return res - - @staticmethod - def backward(ctx, grad_output): - # Clamp the gradients of loss(logsoftmax(...)) - # CPU computes logsoftmax explicitly, so we need to override t - grad_output = torch.clamp(grad_output, -ctx.clamp, ctx.clamp) - return ( - grad_output, - None, - ) - - -class CPURNNT: - def __init__( - self, - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - workspace: torch.Tensor, - blank: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, - batch_first: bool, - ): - """ - Helper class to compute the Transducer Loss on CPU. - - Args: - minibatch: Size of the minibatch b. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory. - blank: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of OMP threads to launch. - batch_first: Bool that decides if batch dimension is first or third. 
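`LogSoftmaxGradModification` above is an identity-like autograd `Function` whose only purpose is to clamp the incoming gradient during the backward pass. A minimal, hypothetical re-creation of that pattern (the class and variable names below are illustrative, not from the file):

```python
import torch
from torch.autograd import Function


class ClampGrad(Function):
    """Identity in forward; clamps the incoming gradient to [-clamp, clamp] in backward."""

    @staticmethod
    def forward(ctx, acts, clamp):
        if clamp < 0:
            raise ValueError("`clamp` must be 0.0 or a positive float.")
        ctx.clamp = clamp
        return acts.clone()

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward input; `clamp` is a non-tensor, so return None for it.
        return torch.clamp(grad_output, -ctx.clamp, ctx.clamp), None


x = torch.randn(4, requires_grad=True)
loss = (100.0 * ClampGrad.apply(x, 0.1)).sum()
loss.backward()
print(x.grad)  # every entry is clipped to 0.1
```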
- """ - self.minibatch_ = minibatch - self.maxT_ = maxT - self.maxU_ = maxU - self.alphabet_size_ = alphabet_size - self.workspace = workspace # a flat vector of floatX numbers that represents allocated memory slices - self.blank_ = blank - self.fastemit_lambda_ = fastemit_lambda - self.clamp_ = abs(clamp) - self.num_threads_ = num_threads - self.batch_first = batch_first - - if num_threads > 0: - numba.set_num_threads(min(multiprocessing.cpu_count(), num_threads)) - else: - self.num_threads_ = numba.get_num_threads() - - def cost_and_grad_kernel( - self, - log_probs: torch.Tensor, - grad: torch.Tensor, - labels: torch.Tensor, - mb: int, - T: int, - U: int, - bytes_used: int, - ): - idx = CpuRNNT_index(U, self.maxU_, self.minibatch_, self.alphabet_size_, self.batch_first) - rnntm = CpuRNNT_metadata(T, U, self.workspace, bytes_used, self.blank_, labels, log_probs, idx) - - if self.batch_first: - # zero grads - grad *= 0.0 - - llForward = self.compute_alphas(rnntm.log_probs2, T, U, rnntm.alphas) - llBackward = self.compute_betas_and_grads( - grad, rnntm.log_probs2, T, U, rnntm.alphas, rnntm.betas, labels, llForward - ) - - # Scale llForward by FastEmit lambda - llForward += llForward * self.fastemit_lambda_ - llBackward += llBackward * self.fastemit_lambda_ - - diff = (llForward - llBackward).abs() - if diff > 0.1: - print(f"WARNING: Forward backward likelihood mismatch : {diff}") - - return -llForward - - def compute_alphas(self, log_probs: torch.Tensor, T: int, U: int, alphas: torch.Tensor): - """ - Compute the probability of the forward variable alpha. - - Args: - log_probs: Flattened tensor [B, T, U, V+1] - T: Length of the acoustic sequence T (not padded). - U: Length of the target sequence U (not padded). - alphas: Working space memory for alpha of shape [B, T, U]. - - Returns: - Loglikelihood of the forward variable alpha. - """ - idx = CpuRNNT_index(U, self.maxU_, self.minibatch_, self.alphabet_size_, self.batch_first) - - alphas[0] = 0 - for t in range(T): - for u in range(U): - if u == 0 and t > 0: - alphas[idx(t, 0)] = alphas[idx(t - 1, 0)] + log_probs[idx(t - 1, 0) * 2] - - if t == 0 and u > 0: - alphas[idx(0, u)] = alphas[idx(0, u - 1)] + log_probs[idx(0, u - 1) * 2 + 1] - - if t > 0 and u > 0: - no_emit = alphas[idx(t - 1, u)] + log_probs[idx(t - 1, u) * 2] - emit = alphas[idx(t, u - 1)] + log_probs[idx(t, u - 1) * 2 + 1] - alphas[idx(t, u)] = log_sum_exp(emit, no_emit) - - loglike = alphas[idx(T - 1, U - 1)] + log_probs[idx(T - 1, U - 1) * 2] - return loglike - - def compute_betas_and_grads( - self, - grad: torch.Tensor, - log_probs: torch.Tensor, - T: int, - U: int, - alphas: torch.Tensor, - betas: torch.Tensor, - labels: torch.Tensor, - logll: torch.Tensor, - ): - """ - Compute backward variable beta as well as gradients of the activation matrix wrt loglikelihood - of forward variable. - - Args: - grad: Working space memory of flattened shape [B, T, U, V+1] - log_probs: Activatio tensor of flattented shape [B, T, U, V+1] - T: Length of the acoustic sequence T (not padded). - U: Length of the target sequence U (not padded). - alphas: Working space memory for alpha of shape [B, T, U]. - betas: Working space memory for alpha of shape [B, T, U]. - labels: Ground truth label of shape [B, U] - logll: Loglikelihood of the forward variable. - - Returns: - Loglikelihood of the forward variable and inplace updates the grad tensor. 
- """ - # Patch for CPU + fp16 - if log_probs.dtype == torch.float16 and not log_probs.is_cuda: - log_probs = log_probs.float() - - idx = CpuRNNT_index(U, self.maxU_, self.minibatch_, self.alphabet_size_, self.batch_first) - betas[idx(T - 1, U - 1)] = log_probs[idx(T - 1, U - 1) * 2] - - for t in range(T - 1, -1, -1): - for u in range(U - 1, -1, -1): - if (u == U - 1) and (t < T - 1): - betas[idx(t, U - 1)] = betas[idx(t + 1, U - 1)] + log_probs[idx(t, U - 1) * 2] - - if (t == T - 1) and (u < U - 1): - betas[idx(T - 1, u)] = betas[idx(T - 1, u + 1)] + log_probs[idx(T - 1, u) * 2 + 1] - - if (t < T - 1) and (u < U - 1): - no_emit = betas[idx(t + 1, u)] + log_probs[idx(t, u) * 2] - emit = betas[idx(t, u + 1)] + log_probs[idx(t, u) * 2 + 1] - betas[idx(t, u)] = log_sum_exp(emit, no_emit) - - loglike = betas[0] - # // Gradients w.r.t. log probabilities - for t in range(T): - for u in range(U): - if t < T - 1: - g = alphas[idx(t, u)] + betas[idx(t + 1, u)] - grad[idx(t, u, self.blank_)] = -torch.exp(log_probs[idx(t, u) * 2] + g - loglike) - - if u < U - 1: - g = alphas[idx(t, u)] + betas[idx(t, u + 1)] - grad[idx(t, u, labels[u])] = -torch.exp( - math.log1p(self.fastemit_lambda_) + log_probs[idx(t, u) * 2 + 1] + g - loglike - ) - - # // gradient to the last blank transition - grad[idx(T - 1, U - 1, self.blank_)] = -torch.exp( - log_probs[idx(T - 1, U - 1) * 2] + alphas[idx(T - 1, U - 1)] - loglike - ) - - return loglike - - def cost_and_grad( - self, - log_probs: torch.Tensor, - grads: torch.Tensor, - costs: torch.Tensor, - flat_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ) -> global_constants.RNNTStatus: - # // per minibatch memory - per_minibatch_bytes = 0 - - # // alphas & betas - per_minibatch_bytes += self.maxT_ * self.maxU_ * 2 - - # // blank & label log probability cache - per_minibatch_bytes += self.maxT_ * self.maxU_ * 2 - - for mb in range(self.minibatch_): - T = input_lengths[mb] # // Length of utterance (time) - U = label_lengths[mb] + 1 # // Number of labels in transcription - batch_size = self.alphabet_size_ - if self.batch_first: - batch_size = self.maxT_ * self.maxU_ * self.alphabet_size_ - - costs[mb] = self.cost_and_grad_kernel( - log_probs[(mb * batch_size) :], - grads[(mb * batch_size) :], - flat_labels[(mb * (self.maxU_ - 1)) :], - mb, - T, - U, - mb * per_minibatch_bytes, - ) - - return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS - - def score_forward( - self, - log_probs: torch.Tensor, - costs: torch.Tensor, - flat_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - # // per minibatch memory - per_minibatch_bytes = 0 - - # // alphas & betas - per_minibatch_bytes += self.maxT_ * self.maxU_ * 2 - - # // blank & label log probability cache - per_minibatch_bytes += self.maxT_ * self.maxU_ * 2 - - for mb in range(self.minibatch_): - T = input_lengths[mb] # // Length of utterance (time) - U = label_lengths[mb] + 1 # // Number of labels in transcription - batch_size = self.alphabet_size_ - if self.batch_first: - batch_size = self.maxT_ * self.maxU_ * self.alphabet_size_ - - idx = CpuRNNT_index(U, self.maxU_, self.minibatch_, self.alphabet_size_, self.batch_first) - rnntm = CpuRNNT_metadata( - T, - U, - self.workspace, - mb * per_minibatch_bytes, - self.blank_, - flat_labels[(mb * (self.maxU_ - 1)) :], - log_probs[(mb * batch_size) :], - idx, - ) - - costs[mb] = -self.compute_alphas(rnntm.log_probs2, T, U, rnntm.alphas) - - return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS diff --git 
a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__init__.py deleted file mode 100644 index 1b4bbd40dff2c64244d18a0f4f0051cafe59b48d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 2059ee26a6646e1695ac54ab8d72cf01be74ae08..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt.cpython-310.pyc deleted file mode 100644 index 8690b40841951aadea8c09f73e9b2bb18e827f28..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt_kernel.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt_kernel.cpython-310.pyc deleted file mode 100644 index 90b4e7bd28ef6598aeb1baf1c4c76c55fa1a0834..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/gpu_rnnt_kernel.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/reduce.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/reduce.cpython-310.pyc deleted file mode 100644 index 
a433624b4e37a64babde015079e85bd773981011..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/__pycache__/reduce.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py deleted file mode 100644 index 70ffb459cb971016775527836913d12a5726762e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt.py +++ /dev/null @@ -1,805 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing -import random -from typing import Optional, Tuple - -import numba -import torch -from numba import cuda - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants, rnnt_helper -from nemo.collections.asr.parts.numba.rnnt_loss.utils.cuda_utils import gpu_rnnt_kernel, reduce - - -class GPURNNT: - def __init__( - self, - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - workspace, - blank: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, - stream, - ): - """ - Helper class to launch the CUDA Kernels to compute the Transducer Loss. - - Args: - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory. - blank: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of OMP threads to launch. - stream: Numba Cuda Stream. 
- """ - self.minibatch_ = minibatch - self.maxT_ = maxT - self.maxU_ = maxU - self.alphabet_size_ = alphabet_size - self.gpu_workspace = cuda.as_cuda_array( - workspace - ) # a flat vector of floatX numbers that represents allocated memory slices - self.blank_ = blank - self.fastemit_lambda_ = fastemit_lambda - self.clamp_ = abs(clamp) - self.num_threads_ = num_threads - self.stream_ = stream # type: cuda.cudadrv.driver.Stream - - if num_threads > 0: - numba.set_num_threads(min(multiprocessing.cpu_count(), num_threads)) - self.num_threads_ = numba.get_num_threads() - else: - self.num_threads_ = numba.get_num_threads() - - def log_softmax(self, acts: torch.Tensor, denom: torch.Tensor): - """ - Computes the log softmax denominator of the input activation tensor - and stores the result in denom. - - Args: - acts: Activation tensor of shape [B, T, U, V+1]. The input must be represented as a flat tensor - of shape [B * T * U * (V+1)] to allow pointer indexing. - denom: A zero tensor of same shape as acts. - - Updates: - This kernel inplace updates the `denom` tensor - """ - # // trans_acts + pred_acts -> log_softmax denominator - reduce.reduce_max( - acts, - denom, - rows=self.alphabet_size_, - cols=self.minibatch_ * self.maxT_ * self.maxU_, - minus=False, - stream=self.stream_, - ) - - reduce.reduce_exp( - acts, - denom, - rows=self.alphabet_size_, - cols=self.minibatch_ * self.maxT_ * self.maxU_, - minus=True, - stream=self.stream_, - ) - - def compute_cost_and_score( - self, - acts: torch.Tensor, - grads: Optional[torch.Tensor], - costs: torch.Tensor, - labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ) -> global_constants.RNNTStatus: - """ - Compute both the loss and the gradients. - - Args: - acts: A flattened tensor of shape [B, T, U, V+1] representing the activation matrix. - grad: A flattented zero tensor of same shape as acts. - costs: A zero vector of length B which will be updated inplace with the log probability costs. - flat_labels: A flattened matrix of labels of shape [B, U] - label_lengths: A vector of length B that contains the original lengths of the acoustic sequence. - input_lengths: A vector of length B that contains the original lengths of the target sequence. - - Updates: - This will launch kernels that will update inline the following variables: - - grads: Gradients of the activation matrix wrt the costs vector. - - costs: Negative log likelihood of the forward variable. - - Returns: - An enum that either represents a successful RNNT operation or failure. 
- """ - training = grads is not None - - if training: - grads *= 0.0 # zero grads - - used_offset, (denom, alphas, betas, llForward, llBackward) = self._prepare_workspace() - - ######## START EXECUTION ######## - self.log_softmax(acts, denom) - - # Compute alphas - gpu_rnnt_kernel.compute_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - acts, - denom, - alphas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - ) - - if training: - # Compute betas - gpu_rnnt_kernel.compute_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - acts, - denom, - betas, - llBackward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - ) - - # Compute gradient - grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ - grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE - gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0]( - grads, - acts, - denom, - alphas, - betas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - self.fastemit_lambda_, - self.clamp_, - ) - - # // cost copy, negate (for log likelihood) and update with additional regularizers - # This needs to be done via CUDA, because we used temporary memory llForward - # passed to alpha, which was updated with log likelihoods. - # But copying this data into a pytorch pointer is more difficult (numba api is one way) - # Therefore launch a pointwise CUDA kernel to update the costs inplace from data of llForward - # Then negate to compute the loglikelihood. - threadsperblock = min(costs.shape[0], 32) - blockspergrid = (costs.shape[0] + (threadsperblock - 1)) // threadsperblock - rnnt_helper.compute_costs_data[blockspergrid, threadsperblock, self.stream_, 0]( - llForward, costs, self.fastemit_lambda_ - ) - self.stream_.synchronize() - - return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS - - def cost_and_grad( - self, - acts: torch.Tensor, - grads: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if ( - acts is None - or grads is None - or costs is None - or pad_labels is None - or label_lengths is None - or input_lengths is None - ): - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score(acts, grads, costs, pad_labels, label_lengths, input_lengths) - - def score_forward( - self, - acts: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if acts is None or costs is None or pad_labels is None or label_lengths is None or input_lengths is None: - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score(acts, None, costs, pad_labels, label_lengths, input_lengths) - - def _prepare_workspace(self) -> Tuple[int, Tuple[torch.Tensor, ...]]: - """ - Helper method that uses the workspace and constructs slices of it that can be used. - - Returns: - An int, representing the offset of the used workspace (practically, the slice of the workspace consumed) - A tuple of tensors representing the shared workspace. 
- """ - used_offset = 0 - - # // denom - denom = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ - - # // alphas & betas - alphas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ - betas = self.gpu_workspace[used_offset : used_offset + self.maxT_ * self.maxU_ * self.minibatch_] - used_offset += self.maxT_ * self.maxU_ * self.minibatch_ - - # // logllh - llForward = self.gpu_workspace[used_offset : used_offset + self.minibatch_] - used_offset += self.minibatch_ - llBackward = self.gpu_workspace[used_offset : used_offset + self.minibatch_] - used_offset += self.minibatch_ - - return used_offset, (denom, alphas, betas, llForward, llBackward) - - -class MultiblankGPURNNT(GPURNNT): - def __init__( - self, - sigma: float, - num_big_blanks: int, - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - workspace, - big_blank_workspace, - blank: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, - stream, - ): - """ - Helper class to launch the CUDA Kernels to compute Multi-blank Transducer Loss (https://arxiv.org/pdf/2211.03541). - - Args: - sigma: Hyper-parameter related to the logit-normalization method in training multi-blank transducers. - num_big_blanks: Number of big blank symbols the model has. This should not include the standard blank symbol. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V + 1 + num-big-blanks - workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory. - big_blank_workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory specifically for the multi-blank related computations. - blank: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of OMP threads to launch. - stream: Numba Cuda Stream. - """ - super().__init__( - minibatch, maxT, maxU, alphabet_size, workspace, blank, fastemit_lambda, clamp, num_threads, stream - ) - self.big_blank_workspace = cuda.as_cuda_array( - big_blank_workspace - ) # a flat vector of integer numbers that represents allocated memory slices - - self.num_big_blanks = num_big_blanks - self.sigma = sigma - - def compute_cost_and_score( - self, - acts: torch.Tensor, - grads: Optional[torch.Tensor], - costs: torch.Tensor, - labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ) -> global_constants.RNNTStatus: - """ - Compute both the loss and the gradients. - - Args: - acts: A flattened tensor of shape [B, T, U, V+1] representing the activation matrix. - grad: A flattented zero tensor of same shape as acts. - costs: A zero vector of length B which will be updated inplace with the log probability costs. 
- flat_labels: A flattened matrix of labels of shape [B, U] - label_lengths: A vector of length B that contains the original lengths of the acoustic sequence. - input_lengths: A vector of length B that contains the original lengths of the target sequence. - - Updates: - This will launch kernels that will update inline the following variables: - - grads: Gradients of the activation matrix wrt the costs vector. - - costs: Negative log likelihood of the forward variable. - - Returns: - An enum that either represents a successful RNNT operation or failure. - """ - training = grads is not None - - if training: - grads *= 0.0 # zero grads - - _, (denom, alphas, betas, llForward, llBackward, bigblank_durations) = self._prepare_workspace() - - ######## START EXECUTION ######## - self.log_softmax(acts, denom) - - # Compute alphas - gpu_rnnt_kernel.compute_multiblank_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - acts, - denom, - self.sigma, - alphas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - bigblank_durations, - self.num_big_blanks, - ) - - if training: - # Compute betas - gpu_rnnt_kernel.compute_multiblank_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - acts, - denom, - self.sigma, - betas, - llBackward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - bigblank_durations, - self.num_big_blanks, - ) - - # Compute gradient - grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ - grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE - gpu_rnnt_kernel.compute_multiblank_grad_kernel[ - grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0 - ]( - grads, - acts, - denom, - self.sigma, - alphas, - betas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - bigblank_durations, - self.num_big_blanks, - self.fastemit_lambda_, - self.clamp_, - ) - - # // cost copy, negate (for log likelihood) and update with additional regularizers - # This needs to be done via CUDA, because we used temporary memory llForward - # passed to alpha, which was updated with log likelihoods. - # But copying this data into a pytorch pointer is more difficult (numba api is one way) - # Therefore launch a pointwise CUDA kernel to update the costs inplace from data of llForward - # Then negate to compute the loglikelihood. 
- threadsperblock = min(costs.shape[0], 32) - blockspergrid = (costs.shape[0] + (threadsperblock - 1)) // threadsperblock - rnnt_helper.compute_costs_data[blockspergrid, threadsperblock, self.stream_, 0]( - llForward, costs, self.fastemit_lambda_ - ) - self.stream_.synchronize() - - return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS - - def cost_and_grad( - self, - acts: torch.Tensor, - grads: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if ( - acts is None - or grads is None - or costs is None - or pad_labels is None - or label_lengths is None - or input_lengths is None - ): - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score(acts, grads, costs, pad_labels, label_lengths, input_lengths) - - def score_forward( - self, - acts: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if acts is None or costs is None or pad_labels is None or label_lengths is None or input_lengths is None: - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score(acts, None, costs, pad_labels, label_lengths, input_lengths) - - def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]): - """ - Helper method that uses the workspace and constructs slices of it that can be used. - - Returns: - An int, representing the offset of the used workspace (practically, the slice of the workspace consumed) - A tuple of tensors representing the shared workspace. - """ - used_offset, (denom, alphas, betas, llForward, llBackward) = super()._prepare_workspace() - - bigblank_durations = self.big_blank_workspace[: self.num_big_blanks] - - return used_offset, (denom, alphas, betas, llForward, llBackward, bigblank_durations) - - -class GPUTDT(GPURNNT): - def __init__( - self, - sigma: float, - omega: float, - num_durations: int, - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - workspace, - tdt_workspace, - blank: int, - fastemit_lambda: float, - clamp: float, - num_threads: int, - stream, - ): - """ - Helper class to launch the CUDA Kernels to compute TDT Loss (https://arxiv.org/pdf/2211.03541). - - Args: - sigma: Hyper-parameter related to the logit-normalization method in training tdt transducers. - omega: Hyper-parameter related to the sampled training. - num_durations: Number of durations the model supports. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V + 1 + num-big-blanks - workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory. - tdt_workspace: An allocated chunk of memory that will be sliced off and reshaped into required - blocks used as working memory specifically for the tdt related computations. - blank: Index of the blank token in the vocabulary. Must be the last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - num_threads: Number of OMP threads to launch. - stream: Numba Cuda Stream. 
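A TDT joint network scores both the next token and how many acoustic frames to advance, which is why the class above works with separate label and duration activation tensors. One plausible way the two parts could be produced from a single joint output (the split and normalization below are an assumption for illustration; the real preprocessing lives in the deleted loss wrapper, not in this class):

```python
import torch

B, T, U, V, D = 2, 8, 5, 10, 3      # illustrative sizes; D = number of supported durations
joint = torch.randn(B, T, U, V + 1 + D)

# Token scores and duration scores for every (t, u) position, flattened the way the
# kernels in this file index them (vocabulary / durations as the innermost axis).
label_acts = joint[..., : V + 1].reshape(-1)                                # [B*T*U*(V+1)]
duration_acts = torch.log_softmax(joint[..., V + 1 :], dim=-1).reshape(-1)  # [B*T*U*D]
```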
- """ - super().__init__( - minibatch, maxT, maxU, alphabet_size, workspace, blank, fastemit_lambda, clamp, num_threads, stream - ) - self.tdt_workspace = cuda.as_cuda_array( - tdt_workspace - ) # a flat vector of integer numbers that represents allocated memory slices - - self.num_durations = num_durations - self.sigma = sigma - self.omega = omega - - def compute_cost_and_score( - self, - label_acts: torch.Tensor, - duration_acts: torch.Tensor, - label_grads: Optional[torch.Tensor], - duration_grads: Optional[torch.Tensor], - costs: torch.Tensor, - labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ) -> global_constants.RNNTStatus: - """ - Compute both the loss and the gradients. - - Args: - label_acts: A flattened tensor of shape [B, T, U, V] representing the activation matrix for tokens. - duration_acts: A flattened tensor of shape [B, T, U, D] representing the activation matrix for durations. - label_grad: A flattented zero tensor of same shape as label_acts. - duration_grad: A flattented zero tensor of same shape as duration_acts. - costs: A zero vector of length B which will be updated inplace with the log probability costs. - flat_labels: A flattened matrix of labels of shape [B, U] - label_lengths: A vector of length B that contains the original lengths of the acoustic sequence. - input_lengths: A vector of length B that contains the original lengths of the target sequence. - - Updates: - This will launch kernels that will update inline the following variables: - - *_grads: Gradients of the activation matrix wrt the costs vector. - - costs: Negative log likelihood of the forward variable. - - Returns: - An enum that either represents a successful RNNT operation or failure. - """ - training = label_grads is not None - - if training: - label_grads *= 0.0 # zero grads - duration_grads *= 0.0 # zero grads - - _, (denom, alphas, betas, llForward, llBackward, durations) = self._prepare_workspace() - - ######## START EXECUTION ######## - self.log_softmax(label_acts, denom) - - r = random.uniform(0, 1) - if r < self.omega: - # Compute alphas - gpu_rnnt_kernel.compute_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - label_acts, - denom, - alphas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - ) - else: - # Compute alphas - gpu_rnnt_kernel.compute_tdt_alphas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - label_acts, - duration_acts, - denom, - self.sigma, - alphas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - durations, - self.num_durations, - ) - - if training: - # Compute betas - if r < self.omega: - gpu_rnnt_kernel.compute_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - label_acts, - denom, - betas, - llBackward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - ) - - # Compute gradient - grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ - grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE - gpu_rnnt_kernel.compute_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0]( - label_grads, - label_acts, - denom, - alphas, - betas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - self.fastemit_lambda_, - 
self.clamp_, - ) - else: - gpu_rnnt_kernel.compute_tdt_betas_kernel[self.minibatch_, self.maxU_, self.stream_, 0]( - label_acts, - duration_acts, - denom, - self.sigma, - betas, - llBackward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - durations, - self.num_durations, - ) - - # Compute gradient - grad_blocks_per_grid = self.minibatch_ * self.maxT_ * self.maxU_ - grad_threads_per_block = gpu_rnnt_kernel.GPU_RNNT_THREAD_SIZE - gpu_rnnt_kernel.compute_tdt_grad_kernel[grad_blocks_per_grid, grad_threads_per_block, self.stream_, 0]( - label_grads, - duration_grads, - label_acts, - duration_acts, - denom, - self.sigma, - alphas, - betas, - llForward, - input_lengths, - label_lengths, - labels, - self.minibatch_, - self.maxT_, - self.maxU_, - self.alphabet_size_, - self.blank_, - durations, - self.num_durations, - self.fastemit_lambda_, - self.clamp_, - ) - - # // cost copy, negate (for log likelihood) and update with additional regularizers - # This needs to be done via CUDA, because we used temporary memory llForward - # passed to alpha, which was updated with log likelihoods. - # But copying this data into a pytorch pointer is more difficult (numba api is one way) - # Therefore launch a pointwise CUDA kernel to update the costs inplace from data of llForward - # Then negate to compute the loglikelihood. - threadsperblock = min(costs.shape[0], 32) - blockspergrid = (costs.shape[0] + (threadsperblock - 1)) // threadsperblock - rnnt_helper.compute_costs_data[blockspergrid, threadsperblock, self.stream_, 0]( - llForward, costs, self.fastemit_lambda_ - ) - self.stream_.synchronize() - - return global_constants.RNNTStatus.RNNT_STATUS_SUCCESS - - def cost_and_grad( - self, - label_acts: torch.Tensor, - duration_acts: torch.Tensor, - label_grads: torch.Tensor, - duration_grads: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if ( - duration_acts is None - or label_acts is None - or label_grads is None - or duration_grads is None - or costs is None - or pad_labels is None - or label_lengths is None - or input_lengths is None - ): - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score( - label_acts, duration_acts, label_grads, duration_grads, costs, pad_labels, label_lengths, input_lengths - ) - - def score_forward( - self, - label_acts: torch.Tensor, - duration_acts: torch.Tensor, - costs: torch.Tensor, - pad_labels: torch.Tensor, - label_lengths: torch.Tensor, - input_lengths: torch.Tensor, - ): - if ( - label_acts is None - or duration_acts is None - or costs is None - or pad_labels is None - or label_lengths is None - or input_lengths is None - ): - return global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE - - return self.compute_cost_and_score( - label_acts, duration_acts, None, None, costs, pad_labels, label_lengths, input_lengths - ) - - def _prepare_workspace(self) -> (int, Tuple[torch.Tensor]): - """ - Helper method that uses the workspace and constructs slices of it that can be used. - - Returns: - An int, representing the offset of the used workspace (practically, the slice of the workspace consumed) - A tuple of tensors representing the shared workspace. 
- """ - used_offset, (denom, alphas, betas, llForward, llBackward) = super()._prepare_workspace() - - durations = self.tdt_workspace[: self.num_durations] - - return used_offset, (denom, alphas, betas, llForward, llBackward, durations) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py deleted file mode 100644 index 4153af060941afeff1c7afb41c53ab1d53077f59..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/gpu_rnnt_kernel.py +++ /dev/null @@ -1,1408 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import torch -from numba import cuda - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import rnnt_helper - -GPU_RNNT_THREAD_SIZE = 256 - -INF = 10000.0 - - -@cuda.jit(device=True, inline=True) -def logp( - denom: torch.Tensor, acts: torch.Tensor, maxT: int, maxU: int, alphabet_size: int, mb: int, t: int, u: int, v: int -): - """ - Compute the sum of log probability from the activation tensor and its denominator. - - Args: - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - acts: Tensor of shape [B, T, U, V+1] flattened. Represents the logprobs activation tensor. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - mb: Batch indexer. - t: Acoustic sequence timestep indexer. - u: Target sequence timestep indexer. - v: Vocabulary token indexer. 
- - Returns: - The sum of logprobs[mb, t, u, v] + denom[mb, t, u] - """ - col = (mb * maxT + t) * maxU + u - return denom[col] + acts[col * alphabet_size + v] - - -@cuda.jit(device=True, inline=True) -def logp_duration(acts: torch.Tensor, maxT: int, maxU: int, num_durations: int, mb: int, t: int, u: int, v: int): - col = (mb * maxT + t) * maxU + u - return acts[col * num_durations + v] - - -@cuda.jit() -def compute_alphas_kernel( - acts: torch.Tensor, - denom: torch.Tensor, - alphas: torch.Tensor, - llForward: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, -): - """ - Compute alpha (forward variable) probabilities over the transduction step. - - Args: - acts: Tensor of shape [B, T, U, V+1] flattened. Represents the logprobs activation tensor. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - alphas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the forward variable - probabilities. - llForward: Zero tensor of shape [B]. Represents the log-likelihood of the forward pass. - Returned as the forward pass loss that is reduced by the optimizer. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - - Updates: - Kernel inplace updates the following inputs: - - alphas: forward variable scores. - - llForward: log-likelihood of forward variable. 
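`logp` above reproduces pointer arithmetic on flattened buffers: a fused `(b, t, u)` row index `col`, with the vocabulary as the innermost axis. A NumPy sketch verifying that the same arithmetic recovers the entry of the unflattened `[B, T, U, V+1]` tensor:

```python
import numpy as np

B, maxT, maxU, V1 = 2, 3, 4, 6        # illustrative sizes; V1 = alphabet_size
acts = np.random.randn(B, maxT, maxU, V1).astype(np.float32)
denom = np.random.randn(B, maxT, maxU).astype(np.float32)
acts_flat, denom_flat = acts.reshape(-1), denom.reshape(-1)


def logp(mb, t, u, v):
    # Same arithmetic as the device function above, applied to host memory.
    col = (mb * maxT + t) * maxU + u
    return denom_flat[col] + acts_flat[col * V1 + v]


assert np.isclose(logp(1, 2, 3, 5), denom[1, 2, 3] + acts[1, 2, 3, 5])
```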
- """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # alphas += offset # pointer offset, ignored since we explicitly add offset - - # Initilize alpha[b, t=0, u=0] for all b in B - if u == 0: - alphas[offset] = 0 - - # sync until all alphas are initialized - cuda.syncthreads() - - # Ordinary alpha calculations, broadcast across B=b and U=u - # Look up forward variable calculation from rnnt_numpy.forward_pass() - for n in range(1, T + U - 1): - t = n - u - - if u == 0: - # for t in range(1, T) step to initialize alphas[b, t, 0] - if t > 0 and t < T: - alphas[offset + t * maxU + u] = alphas[offset + (t - 1) * maxU + u] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - 1, 0, blank_ - ) - elif u < U: - # for u in range(1, U) step to initialize alphas[b, 0, u] - if t == 0: - alphas[offset + u] = alphas[offset + u - 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, 0, u - 1, labels[u - 1] - ) - - # for t in range(1, T) for u in range(1, U) step to compute alphas[b, t, u] - elif t > 0 and t < T: - no_emit = alphas[offset + (t - 1) * maxU + u] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - 1, u, blank_ - ) - emit = alphas[offset + t * maxU + u - 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1] - ) - - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, alphas[b, T-1, U - 1] + logprobs[b, T-1, U-1, blank] + denom[b, T-1, U-1] gives - # log-likelihood of forward pass. - if u == 0: - loglike = alphas[offset + (T - 1) * maxU + U - 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_ - ) - llForward[b] = loglike - - -@cuda.jit() -def compute_betas_kernel( - acts: torch.Tensor, - denom: torch.Tensor, - betas: torch.Tensor, - llBackward: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, -): - """ - Compute beta (backward variable) probabilities over the transduction step. - - Args: - acts: Tensor of shape [B, T, U, V+1] flattened. Represents the logprobs activation tensor. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - betas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the backward variable - probabilities. - llBackward: Zero tensor of shape [B]. Represents the log-likelihood of the backward pass. - Returned as the backward pass loss that is reduced by the optimizer. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. 
- maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - - Updates: - Kernel inplace updates the following inputs: - - betas: backward variable scores. - - llBackward: log-likelihood of backward variable. - """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # betas += offset # pointer offset, ignored since we explicitly add offset - - # Initilize beta[b, t=T-1, u=U-1] for all b in B with log_probs[b, t=T-1, u=U-1, blank] - if u == 0: - betas[offset + (T - 1) * maxU + U - 1] = logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - # sync until all betas are initialized - cuda.syncthreads() - - # Ordinary beta calculations, broadcast across B=b and U=u - # Look up backward variable calculation from rnnt_numpy.backward_pass() - for n in range(T + U - 2, -1, -1): - t = n - u - - if u == (U - 1): - # for t in reversed(range(T - 1)) step to initialize betas[b, t, U-1] - if t >= 0 and t < (T - 1): - betas[offset + t * maxU + U - 1] = betas[offset + (t + 1) * maxU + U - 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_ - ) - elif u < U: - if t == T - 1: - # for u in reversed(range(U - 1)) step to initialize betas[b, T-1, u] - betas[offset + (T - 1) * maxU + u] = betas[offset + (T - 1) * maxU + u + 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u] - ) - elif (t >= 0) and (t < T - 1): - # for t in reversed(range(T - 1)) for u in reversed(range(U - 1)) step to compute betas[b, t, u] - no_emit = betas[offset + (t + 1) * maxU + u] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, u, blank_ - ) - emit = betas[offset + t * maxU + u + 1] + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, u, labels[u] - ) - betas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, betas[b, 0, 0] gives - # log-likelihood of backward pass. - if u == 0: - llBackward[b] = betas[offset] - - -@cuda.jit() -def compute_grad_kernel( - grads: torch.Tensor, - acts: torch.Tensor, - denom: torch.Tensor, - alphas: torch.Tensor, - betas: torch.Tensor, - logll: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, - fastemit_lambda: float, - clamp: float, -): - """ - Compute gradients over the transduction step. - - Args: - grads: Zero Tensor of shape [B, T, U, V+1]. Is updated by this kernel to contain the gradients - of this batch of samples. - acts: Tensor of shape [B, T, U, V+1] flattened. Represents the logprobs activation tensor. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - alphas: Alpha variable, contains forward probabilities. A tensor of shape [B, T, U]. - betas: Beta varoable, contains backward probabilities. A tensor of shape [B, T, U]. 
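`compute_betas_kernel` above runs the mirror-image recursion, and `betas[b, 0, 0]` ends up equal to the full log-likelihood, which is what makes the forward/backward consistency check possible. A dense NumPy reference for a single utterance (illustrative names):

```python
import numpy as np


def rnnt_backward_loglik(log_blank: np.ndarray, log_label: np.ndarray) -> float:
    """log_blank: [T, U], log_label: [T, U-1]; returns betas[0, 0], the total log-likelihood."""
    T, U = log_blank.shape
    betas = np.full((T, U), -np.inf)
    betas[T - 1, U - 1] = log_blank[T - 1, U - 1]        # final blank emission
    for t in range(T - 1, -1, -1):
        for u in range(U - 1, -1, -1):
            if t < T - 1:                                # continue by emitting blank at (t, u)
                betas[t, u] = np.logaddexp(betas[t, u], betas[t + 1, u] + log_blank[t, u])
            if u < U - 1:                                # continue by emitting label u+1 at (t, u)
                betas[t, u] = np.logaddexp(betas[t, u], betas[t, u + 1] + log_label[t, u])
    return betas[0, 0]
```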
- logll: Log-likelihood of the forward variable, represented as a vector of shape [B]. - Represents the log-likelihood of the forward pass. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - - Updates: - Kernel inplace updates the following inputs: - - grads: Gradients with respect to the log likelihood (logll). - """ - # Kernel call: - # blocks_per_grid = minibatch (b) * maxT (t) * maxU (u) - # threads_per_block = constant buffer size of parallel threads (v :: Constant) - tid = cuda.threadIdx.x # represents v, taking steps of some constant size - idx = tid # index of v < V+1; in steps of constant buffer size - col = cuda.blockIdx.x # represents a fused index of b * t * u - - # Decompose original indices from fused `col` - u = col % maxU # (b * t * u) % u = u - bt = (col - u) // maxU # (b * t * u - u) // U = b * t - t = bt % maxT # (b * t) % t = t - mb = (bt - t) // maxT # (b * t - t) // T = b - - # constants - T = xlen[mb] # select AM length of current sample - U = ylen[mb] + 1 # select target length of current sample, +1 for the blank token - labels: torch.Tensor = mlabels[mb] # labels = mlabels + mb * (maxU - 1); - - # Buffered gradient calculations, broadcast across B=b, T=t and U=u, looped over V with some constant stride. - # Look up gradient calculation from rnnt_numpy.compute_gradient() - if t < T and u < U: - # For cuda kernels, maximum number of threads per block is limited to some value. - # However, it may be the case that vocabulary size is larger than this limit - # To work around this, an arbitrary thread buffer size is chosen such that, - # 1) each element within the thread pool operates independently of the other - # 2) An inner while loop moves the index of each buffer element by the size of the buffer itself, - # such that all elements of the vocabulary size are covered in (V + 1 // thread_buffer) number of steps. - # As such, each thread will perform the while loop at least (V + 1 // thread_buffer) number of times - while idx < alphabet_size: - # remember, `col` represents the tri-index [b, t, u] - # therefore; logpk = denom[b, t, u] + acts[b, t, u, v] - logpk = denom[col] + acts[col * alphabet_size + idx] - # initialize the grad of the sample acts[b, t, u, v] - grad = math.exp(alphas[col] + betas[col] + logpk - logll[mb]) - - # If FastEmit regularization is enabled, calculate the gradeint of probability of predicting the next label - # at the current timestep. 
-            # The formula for this is Equation 9 in https://arxiv.org/abs/2010.11148, multiplied by the log probability
-            # of the current step (t, u), normalized by the total log likelihood.
-            # Once the gradient has been calculated, scale it by `fastemit_lambda`, as in Equation 10.
-            if fastemit_lambda > 0.0 and u < U - 1:
-                fastemit_grad = fastemit_lambda * math.exp(
-                    alphas[col]  # alphas(t, u)
-                    + (denom[col] + acts[col * alphabet_size + labels[u]])  # y_hat(t, u)
-                    + betas[col + 1]  # betas(t, u+1)
-                    + logpk  # log Pr(k|t, u)
-                    - logll[mb]  # total log likelihood for normalization
-                )
-            else:
-                fastemit_grad = 0.0
-
-            # Update the gradient of act[b, t, u, v] with the gradient from FastEmit regularization
-            grad = grad + fastemit_grad
-
-            # // grad to last blank transition
-            # grad[b, T-1, U-1, v=blank] -= exp(alphas[b, t, u] + logpk - logll[b])
-            if (idx == blank_) and (t == T - 1) and (u == U - 1):
-                grad -= math.exp(alphas[col] + logpk - logll[mb])
-
-            # grad of blank across t < T;
-            # grad[b, t < T-1, u, v=blank] -= exp(alphas[b, t, u] + logpk - logll[b] + betas[b, t+1, u])
-            if (idx == blank_) and (t < T - 1):
-                grad -= math.exp(alphas[col] + logpk - logll[mb] + betas[col + maxU])
-
-            # grad of correct token across u < U;
-            # grad[b, t, u < U-1, v=label[u]] -= exp(alphas[b, t, u] + logpk - logll[b] + betas[b, t, u+1])
-            if (u < U - 1) and (idx == labels[u]):
-                grad -= math.exp(alphas[col] + logpk - logll[mb] + betas[col + 1])
-
-            # update grads[b, t, u, v] with the computed gradient
-            grads[col * alphabet_size + idx] = grad
-
-            # clamp gradient (if needed)
-            if clamp > 0.0:
-                g = grads[col * alphabet_size + idx]
-                g = min(g, clamp)
-                g = max(g, -clamp)
-                grads[col * alphabet_size + idx] = g
-
-            # update internal index through the thread_buffer;
-            # until idx < V + 1, such that entire vocabulary has been updated.
-            idx += GPU_RNNT_THREAD_SIZE
-
-
-@cuda.jit()
-def compute_multiblank_alphas_kernel(
-    acts: torch.Tensor,
-    denom: torch.Tensor,
-    sigma: float,
-    alphas: torch.Tensor,
-    llForward: torch.Tensor,
-    xlen: torch.Tensor,
-    ylen: torch.Tensor,
-    mlabels: torch.Tensor,
-    minibatch: int,
-    maxT: int,
-    maxU: int,
-    alphabet_size: int,
-    blank_: int,
-    big_blank_duration: torch.Tensor,
-    num_big_blanks: int,
-):
-    """
-    Compute alpha (forward variable) probabilities for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541).
-
-    Args:
-        acts: Tensor of shape [B, T, U, V + 1 + num_big_blanks] flattened. Represents the logprobs activation tensor.
-        denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor
-            across entire vocabulary.
-        sigma: Hyper-parameter for logit-undernormalization technique for training multi-blank transducers.
-        alphas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the forward variable
-            probabilities.
-        llForward: Zero tensor of shape [B]. Represents the log-likelihood of the forward pass.
-            Returned as the forward pass loss that is reduced by the optimizer.
-        xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded
-            activation tensor.
-        ylen: Vector of length B which contains the actual target sequence lengths in the padded
-            activation tensor.
-        mlabels: Matrix of shape [B, U+1] (+1 here is due to <SOS> token - usually the RNNT blank).
-            The matrix contains the padded target transcription that must be predicted.
-        minibatch: Int representing the batch size.
-        maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor.
-        maxU: The maximum possible target sequence length. Represents U in the logprobs tensor.
-        alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank).
-        blank_: Index of the RNNT standard blank token in the vocabulary.
-        big_blank_durations: Vector of supported big blank durations of the model.
-        num_big_blanks: Number of big blanks of the model.
-
-    Updates:
-        Kernel inplace updates the following inputs:
-        -   alphas: forward variable scores.
-        -   llForward: log-likelihood of forward variable.
- """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # Initilize alpha[b, t=0, u=0] for all b in B - if u == 0: - alphas[offset] = 0 - - # sync until all alphas are initialized - cuda.syncthreads() - - # Ordinary alpha calculations, broadcast across B=b and U=u - # Look up forward variable calculation from rnnt_numpy.forward_pass() - # Note: because of the logit under-normalization, everytime logp() is called, - # it is always followed by a `-sigma` term. - for n in range(1, T + U - 1): - t = n - u - - if u == 0: - # for t in range(1, T) step to initialize alphas[b, t, 0] - if t > 0 and t < T: - alphas[offset + t * maxU + u] = ( - alphas[offset + (t - 1) * maxU + u] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t - 1, 0, blank_) - - sigma - ) - - # Now add the weights for big blanks. - for i in range(num_big_blanks): - if t >= big_blank_duration[i]: - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( - alphas[offset + t * maxU + u], - alphas[offset + (t - big_blank_duration[i]) * maxU + u] - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - big_blank_duration[i], 0, blank_ - 1 - i - ) - - sigma, - ) - - elif u < U: - # for u in range(1, U) step to initialize alphas[b, 0, u] - if t == 0: - alphas[offset + u] = ( - alphas[offset + u - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, 0, u - 1, labels[u - 1]) - - sigma - ) - - # for t in range(1, T) for u in range(1, U) step to compute alphas[b, t, u] - elif t > 0 and t < T: - no_emit = ( - alphas[offset + (t - 1) * maxU + u] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t - 1, u, blank_) - - sigma - ) - emit = ( - alphas[offset + t * maxU + u - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1]) - - sigma - ) - - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # Now add the weights for big blanks. - for i in range(num_big_blanks): - if t >= big_blank_duration[i]: - # big-blank weight here is - # alpha(t - duration, u) * p(big-blank | t - duration, u) / exp(sigma), in log domain - # do this all all big-blanks if the above condition is met - big_blank_no_emit = ( - alphas[offset + (t - big_blank_duration[i]) * maxU + u] - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - big_blank_duration[i], u, blank_ - 1 - i - ) - - sigma - ) - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( - alphas[offset + t * maxU + u], big_blank_no_emit - ) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, alphas[b, T-1, U - 1] + logprobs[b, T-1, U-1, blank] + denom[b, T-1, U-1] gives - # log-likelihood of forward pass. - if u == 0: - loglike = ( - alphas[offset + (T - 1) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - sigma - ) - - # Now add the weights for big blanks for the final weight computation. 
- for i in range(num_big_blanks): - if T >= big_blank_duration[i]: - big_blank_loglike = ( - alphas[offset + (T - big_blank_duration[i]) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - big_blank_duration[i], U - 1, blank_ - 1 - i) - - sigma - ) - loglike = rnnt_helper.log_sum_exp(loglike, big_blank_loglike) - - llForward[b] = loglike - - -@cuda.jit() -def compute_multiblank_betas_kernel( - acts: torch.Tensor, - denom: torch.Tensor, - sigma: float, - betas: torch.Tensor, - llBackward: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, - big_blank_duration: torch.Tensor, - num_big_blanks: int, -): - """ - Compute beta (backward variable) probabilities for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541). - - Args: - acts: Tensor of shape [B, T, U, V + 1 + num-big-blanks] flattened. Represents the logprobs activation tensor. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - sigma: Hyper-parameter for logit-undernormalization technique for training multi-blank transducers. - betas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the backward variable - probabilities. - llBackward: Zero tensor of shape [B]. Represents the log-likelihood of the backward pass. - Returned as the backward pass loss that is reduced by the optimizer. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT standard blank token in the vocabulary. - big_blank_durations: Vector of supported big blank durations of the model. - num_big_blanks: Number of big blanks of the model. - - Updates: - Kernel inplace updates the following inputs: - - betas: backward variable scores. - - llBackward: log-likelihood of backward variable. - """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # Note: just like the alphas, because of the logit under-normalization, everytime - # logp() is called, it is always followed by a `-sigma` term. 
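For intuition, the `- sigma` term referenced in the comment above simply divides every emission probability by exp(sigma) before it enters the log-space recursion. Below is a minimal CPU-side sketch of a single backward-variable update, using a pure-Python stand-in for `rnnt_helper.log_sum_exp`; the numeric values and the names `log_p_blank`, `log_p_label` and `sigma` are illustrative assumptions only, not part of the kernel's API.

```python
import math

def log_sum_exp(a: float, b: float) -> float:
    # Numerically stable log(exp(a) + exp(b)), mirroring rnnt_helper.log_sum_exp.
    if a == float("-inf"):
        return b
    if b == float("-inf"):
        return a
    hi, lo = (a, b) if a > b else (b, a)
    return hi + math.log1p(math.exp(lo - hi))

# One beta-style update for a single (t, u) lattice cell, in log space.
# Illustrative inputs only -- in the kernel these come from denom + acts.
beta_t1_u = -1.2      # beta(t+1, u)
beta_t_u1 = -0.7      # beta(t, u+1)
log_p_blank = -0.5    # log Pr(blank | t, u)
log_p_label = -1.0    # log Pr(label[u] | t, u)
sigma = 0.05          # logit under-normalization constant

no_emit = beta_t1_u + log_p_blank - sigma
emit = beta_t_u1 + log_p_label - sigma
beta_t_u = log_sum_exp(emit, no_emit)
print(beta_t_u)
```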
- - # Initilize beta[b, t=T-1, u=U-1] for all b in B with log_probs[b, t=T-1, u=U-1, blank] - if u == 0: - betas[offset + (T - 1) * maxU + U - 1] = ( - logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - sigma - ) - - # sync until all betas are initialized - cuda.syncthreads() - - # Ordinary beta calculations, broadcast across B=b and U=u - # Look up backward variable calculation from rnnt_numpy.backward_pass() - for n in range(T + U - 2, -1, -1): - t = n - u - - if u == (U - 1): - # for t in reversed(range(T - 1)) step to initialize betas[b, t, U-1] - if t >= 0 and t < (T - 1): - # beta[t, U - 1] = beta[t + 1, U - 1] * p(blank | t, U - 1) / exp(sigma) - # this part is the same as regular RNN-T. - betas[offset + t * maxU + U - 1] = ( - betas[offset + (t + 1) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_) - - sigma - ) - - # now add the weights from big blanks - for i in range(num_big_blanks): - if t + big_blank_duration[i] < T: - # adding to beta[t, U - 1] of weight (in log domain), - # beta[t + duration, U - 1] * p(big-blank | t, U - 1) / exp(sigma) - betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( - betas[offset + t * maxU + U - 1], - betas[offset + (t + big_blank_duration[i]) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_ - 1 - i) - - sigma, - ) - elif t + big_blank_duration[i] == T and big_blank_duration[i] != 1: - # adding to beta[T - duration, U - 1] of weight (in log domain), - # p(big-blank | T - duration, U - 1) / exp(sigma) - betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( - betas[offset + t * maxU + U - 1], - logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_ - 1 - i) - sigma, - ) - - elif u < U: - if t == T - 1: - # for u in reversed(range(U - 1)) step to initialize betas[b, T-1, u] - betas[offset + (T - 1) * maxU + u] = ( - betas[offset + (T - 1) * maxU + u + 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u]) - - sigma - ) - elif (t >= 0) and (t < T - 1): - # for t in reversed(range(T - 1)) for u in reversed(range(U - 1)) step to compute betas[b, t, u] - no_emit = ( - betas[offset + (t + 1) * maxU + u] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, blank_) - - sigma - ) - emit = ( - betas[offset + t * maxU + u + 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, labels[u]) - - sigma - ) - betas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # now add the weights from big blanks - for i in range(num_big_blanks): - if t < T - big_blank_duration[i]: - # added weight for the big-blank, - # beta[t + duration, u] * p(big-blank | t, u) / exp(sigma) - big_blank_no_emit = ( - betas[offset + (t + big_blank_duration[i]) * maxU + u] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, blank_ - 1 - i) - - sigma - ) - betas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( - betas[offset + t * maxU + u], big_blank_no_emit - ) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, betas[b, 0, 0] gives - # log-likelihood of backward pass. 
- if u == 0: - llBackward[b] = betas[offset] - - -@cuda.jit() -def compute_multiblank_grad_kernel( - grads: torch.Tensor, - acts: torch.Tensor, - denom: torch.Tensor, - sigma: float, - alphas: torch.Tensor, - betas: torch.Tensor, - logll: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, - big_blank_duration: torch.Tensor, - num_big_blanks: int, - fastemit_lambda: float, - clamp: float, -): - """ - Compute gradients for multi-blank transducer loss (https://arxiv.org/pdf/2211.03541). - - Args: - grads: Zero Tensor of shape [B, T, U, V + 1 + num_big_blanks]. Is updated by this kernel to contain the gradients - of this batch of samples. - acts: Tensor of shape [B, T, U, V + 1 + num_big_blanks] flattened. Represents the logprobs activation tensor. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - sigma: Hyper-parameter for logit-undernormalization technique for training multi-blank transducers. - alphas: Alpha variable, contains forward probabilities. A tensor of shape [B, T, U]. - betas: Beta varoable, contains backward probabilities. A tensor of shape [B, T, U]. - logll: Log-likelihood of the forward variable, represented as a vector of shape [B]. - Represents the log-likelihood of the forward pass. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. - big_blank_durations: Vector of supported big blank durations of the model. - num_big_blanks: Number of big blanks of the model. - - Updates: - Kernel inplace updates the following inputs: - - grads: Gradients with respect to the log likelihood (logll). 
- """ - # Kernel call: - # blocks_per_grid = minibatch (b) * maxT (t) * maxU (u) - # threads_per_block = constant buffer size of parallel threads (v :: Constant) - tid = cuda.threadIdx.x # represents v, taking steps of some constant size - idx = tid # index of v < V+1; in steps of constant buffer size - col = cuda.blockIdx.x # represents a fused index of b * t * u - - # Decompose original indices from fused `col` - u = col % maxU # (b * t * u) % u = u - bt = (col - u) // maxU # (b * t * u - u) // U = b * t - t = bt % maxT # (b * t) % t = t - mb = (bt - t) // maxT # (b * t - t) // T = b - - # constants - T = xlen[mb] # select AM length of current sample - U = ylen[mb] + 1 # select target length of current sample, +1 for the blank token - labels: torch.Tensor = mlabels[mb] # labels = mlabels + mb * (maxU - 1); - - # Buffered gradient calculations, broadcast across B=b, T=t and U=u, looped over V with some constant stride. - # Look up gradient calculation from rnnt_numpy.compute_gradient() - if t < T and u < U: - # For cuda kernels, maximum number of threads per block is limited to some value. - # However, it may be the case that vocabulary size is larger than this limit - # To work around this, an arbitrary thread buffer size is chosen such that, - # 1) each element within the thread pool operates independently of the other - # 2) An inner while loop moves the index of each buffer element by the size of the buffer itself, - # such that all elements of the vocabulary size are covered in (V + 1 // thread_buffer) number of steps. - # As such, each thread will perform the while loop at least (V + 1 // thread_buffer) number of times - while idx < alphabet_size: - # remember, `col` represents the tri-index [b, t, u] - # therefore; logpk = denom[b, t, u] + acts[b, t, u, v] - logpk = denom[col] + acts[col * alphabet_size + idx] - # initialize the grad of the sample acts[b, t, u, v] - grad = math.exp(alphas[col] + betas[col] + logpk - logll[mb]) - - # In all of the following computation, whenever logpk is used, we - # need to subtract sigma based on our derivation of the gradient of - # the logit under-normalization method. - - # If FastEmit regularization is enabled, calculate the gradeint of probability of predicting the next label - # at the current timestep. - # The formula for this is Equation 9 in https://arxiv.org/abs/2010.11148, multiplied by the log probability - # of the current step (t, u), normalized by the total log likelihood. - # Once the gradient has been calculated, scale it by `fastemit_lambda`, as in Equation 10. 
-            if fastemit_lambda > 0.0 and u < U - 1:
-                fastemit_grad = fastemit_lambda * math.exp(
-                    alphas[col]  # alphas(t, u)
-                    + (denom[col] + acts[col * alphabet_size + labels[u]])
-                    + betas[col + 1]  # betas(t, u+1)
-                    + logpk  # log Pr(k|t, u)
-                    - sigma
-                    - logll[mb]  # total log likelihood for normalization
-                )
-            else:
-                fastemit_grad = 0.0
-
-            # Update the gradient of act[b, t, u, v] with the gradient from FastEmit regularization
-            grad = grad + fastemit_grad
-
-            # grad to last blank transition
-            # grad[b, T-1, U-1, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b])
-            if (idx == blank_) and (t == T - 1) and (u == U - 1):
-                grad -= math.exp(alphas[col] + logpk - sigma - logll[mb])
-            else:
-                # this is one difference of the multi-blank gradient from standard RNN-T
-                # gradient -- basically, wherever the blank_ symbol is addressed in the
-                # original code, we need to do similar things to big blanks, and we need
-                # to change the if conditions to match the duration of the big-blank.
-                # grad[b, T-duration, U-1, v=big-blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b])
-                for i in range(num_big_blanks):
-                    if (idx == blank_ - 1 - i) and (t == T - big_blank_duration[i]) and (u == U - 1):
-                        grad -= math.exp(alphas[col] + logpk - sigma - logll[mb])
-
-            # grad of blank across t < T;
-            # grad[b, t < T-1, u, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + betas[b, t+1, u])
-            if (idx == blank_) and (t < T - 1):
-                grad -= math.exp(alphas[col] + logpk - sigma - logll[mb] + betas[col + maxU])
-            else:
-                # same treatment for the big blanks: the condition and the beta index
-                # are shifted by the duration of each big blank.
-                # grad[b, t < T-duration, u, v=big-blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + betas[b, t+duration, u])
-                for i in range(num_big_blanks):
-                    if (idx == blank_ - 1 - i) and (t < T - big_blank_duration[i]):
-                        grad -= math.exp(
-                            alphas[col] + logpk - sigma - logll[mb] + betas[col + big_blank_duration[i] * maxU]
-                        )
-
-            # grad of correct token across u < U;
-            # grad[b, t, u < U-1, v=label[u]] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + betas[b, t, u+1])
-            if (u < U - 1) and (idx == labels[u]):
-                grad -= math.exp(alphas[col] + logpk - sigma - logll[mb] + betas[col + 1])
-
-            # update grads[b, t, u, v] with the computed gradient
-            grads[col * alphabet_size + idx] = grad
-
-            # clamp gradient (if needed)
-            if clamp > 0.0:
-                g = grads[col * alphabet_size + idx]
-                g = min(g, clamp)
-                g = max(g, -clamp)
-                grads[col * alphabet_size + idx] = g
-
-            # update internal index through the thread_buffer;
-            # until idx < V + 1, such that entire vocabulary has been updated.
-            idx += GPU_RNNT_THREAD_SIZE
-
-
-@cuda.jit()
-def compute_tdt_alphas_kernel(
-    acts: torch.Tensor,
-    duration_acts: torch.Tensor,
-    denom: torch.Tensor,
-    sigma: float,
-    alphas: torch.Tensor,
-    llForward: torch.Tensor,
-    xlen: torch.Tensor,
-    ylen: torch.Tensor,
-    mlabels: torch.Tensor,  # [B]
-    minibatch: int,
-    maxT: int,
-    maxU: int,
-    alphabet_size: int,
-    blank_: int,
-    durations: torch.Tensor,
-    num_durations: int,
-):
-    """
-    Compute alpha (forward variable) probabilities over the transduction step.
-
-    Args:
-        acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens.
-        duration_acts: Tensor of shape [B, T, U, D] flattened. Represents the logprobs activation tensor for durations.
-        denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor for tokens.
-
-        alphas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the forward variable
-            probabilities.
-        llForward: Zero tensor of shape [B]. Represents the log-likelihood of the forward pass.
-            Returned as the forward pass loss that is reduced by the optimizer.
-        xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded
-            activation tensor.
-        ylen: Vector of length B which contains the actual target sequence lengths in the padded
-            activation tensor.
-        mlabels: Matrix of shape [B, U+1] (+1 here is due to <SOS> token - usually the RNNT blank).
-            The matrix contains the padded target transcription that must be predicted.
-        minibatch: Int representing the batch size.
-        maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor.
-        maxU: The maximum possible target sequence length. Represents U in the logprobs tensor.
-        alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank).
-        blank_: Index of the TDT blank token in the vocabulary. Must be the last token in the vocab.
- - Updates: - Kernel inplace updates the following inputs: - - alphas: forward variable scores. - - llForward: log-likelihood of forward variable. - """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # alphas += offset # pointer offset, ignored since we explicitly add offset - - # Initilize alpha[b, t=0, u=0] for all b in B - if u == 0: - alphas[offset] = 0 - - # sync until all alphas are initialized - cuda.syncthreads() - - # Ordinary alpha calculations, broadcast across B=b and U=u - # Look up forward variable calculation from rnnt_numpy.forward_pass() - for n in range(1, T + U - 1): - t = n - u - - if u == 0: - # when u == 0, we only consider blank emissions. - if t > 0 and t < T: - alphas[offset + t * maxU + u] = -INF - - for i in range(1, num_durations): # skip 0 since blank emission has to advance by at least one - if t >= durations[i]: - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp( - alphas[offset + t * maxU + u], # the current alpha value - alphas[offset + (t - durations[i]) * maxU + u] # alpha(t - duration, u) - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u, blank_ - ) # logp of blank emission - - sigma # logit under-normalization - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t - durations[i], u, i - ), # logp of duration - ) - else: - break # since durations are in ascending order, when we encounter a duration that is too large, then - # there is no need to check larger durations after that. - - elif u < U: - # when t == 0, we only consider the non-blank emission. - if t == 0: - alphas[offset + u] = ( - alphas[offset + u - 1] # alpha(t, u - 1) - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t, u - 1, labels[u - 1] - ) # logp of token emission - - sigma # logit under-normalization - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t, u - 1, 0 - ) # t = 0, so it must be duration = 0. Therefore the last argument passed to logp_duration() is 0. - ) - - # now we have t != 0 and u != 0, and we need to consider both non-blank and blank emissions. - elif t > 0 and t < T: - no_emit = -INF # no_emit stores the score for all blank emissions. - for i in range(1, num_durations): - if t >= durations[i]: - no_emit = rnnt_helper.log_sum_exp( - no_emit, # current score - alphas[offset + (t - durations[i]) * maxU + u] # alpha(t - duration, u) - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u, blank_ - ) # logp of blank emission - - sigma # logit under-normalization - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t - durations[i], u, i - ), # logp of duration - ) - else: - break # we can exit the loop early here, same as the case for u == 0 above. - - emit = -INF # emit stores the score for non-blank emissions. 
- for i in range(0, num_durations): - if t >= durations[i]: - emit = rnnt_helper.log_sum_exp( - emit, # current score - alphas[offset + (t - durations[i]) * maxU + u - 1] # alpha(t - duration, u - 1) - + logp( - denom, acts, maxT, maxU, alphabet_size, b, t - durations[i], u - 1, labels[u - 1] - ) # logp of non-blank emission - - sigma # logit under-normalization - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t - durations[i], u - 1, i - ), # logp of duration - ) - else: - break # we can exit the loop early here, same as the case for u == 0 above. - - # combining blank and non-blank emissions. - alphas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, the forward log-likelihood can be computed as the summataion of - # alpha(T - duration, U - 1) + logp(blank, duration | t - duration, U - 1), over different durations. - if u == 0: - # first we consider duration = 1 - loglike = ( - alphas[offset + (T - 1) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - sigma - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) - ) - - # then we add the scores for duration > 1, if such durations are possible given the audio lengths. - for i in range(2, num_durations): - if T >= durations[i]: - big_blank_loglike = ( - alphas[offset + (T - durations[i]) * maxU + U - 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - durations[i], U - 1, blank_) - - sigma - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - durations[i], U - 1, i) - ) - loglike = rnnt_helper.log_sum_exp(loglike, big_blank_loglike) - else: - break - - llForward[b] = loglike - - -@cuda.jit() -def compute_tdt_betas_kernel( - acts: torch.Tensor, - duration_acts: torch.Tensor, - denom: torch.Tensor, - sigma: float, - betas: torch.Tensor, - llBackward: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, - durations: torch.Tensor, - num_durations: int, -): - """ - Compute beta (backward variable) probabilities over the transduction step. - - Args: - acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens. - duration_acts: Tensor of shape [B, T, U, D] flattened. Represents the logprobs activation tensor for duations. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - betas: Zero tensor of shape [B, T, U]. Will be updated inside the kernel with the backward variable - probabilities. - llBackward: Zero tensor of shape [B]. Represents the log-likelihood of the backward pass. - Returned as the backward pass loss that is reduced by the optimizer. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. 
- alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - - Updates: - Kernel inplace updates the following inputs: - - betas: backward variable scores. - - llBackward: log-likelihood of backward variable. - """ - # // launch B blocks, each block has U threads - b = cuda.blockIdx.x # // batch id - u = cuda.threadIdx.x # label id, u - T = xlen[b] # select AM length of current sample - U = ylen[b] + 1 # select target length of current sample, +1 for the blank token - - labels: torch.Tensor = mlabels[b] # mb label start point, equivalent to mlabels + b * (maxU - 1) - offset = b * maxT * maxU # pointer indexing offset - - # betas += offset # pointer offset, ignored since we explicitly add offset - - # Initilize beta[b, t=T-1, u=U-1] for all b in B with log_probs[b, t=T-1, u=U-1, blank] - if u == 0: - betas[offset + (T - 1) * maxU + U - 1] = ( - logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, U - 1, blank_) - - sigma - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, U - 1, 1) - ) - - # sync until all betas are initialized - cuda.syncthreads() - - # Ordinary beta calculations, broadcast across B=b and U=u - # Look up backward variable calculation from rnnt_numpy.backward_pass() - for n in range(T + U - 2, -1, -1): - t = n - u - - if u == U - 1: - # u == U - 1, we only consider blank emissions. - if t >= 0 and t + 1 < T: - betas[offset + t * maxU + U - 1] = -INF - for i in range(1, num_durations): - # although similar, the computation for beta's is slightly more complex for boundary cases. - # the following two cases correspond to whether t is exactly certain duration away from T. - # and they have slightly different update rules. - - if t + durations[i] < T: - betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( - betas[offset + t * maxU + U - 1], - betas[ - offset + (t + durations[i]) * maxU + U - 1 - ] # beta[t, U - 1] depends on the value beta[t + duration, U - 1] here. - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_) # log prob of blank - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t, U - 1, i - ) # log prob of duration (durations[i]) - - sigma, # for logit undernormalization - ) - elif t + durations[i] == T: - betas[offset + t * maxU + U - 1] = rnnt_helper.log_sum_exp( - betas[offset + t * maxU + U - 1], - # here we have one fewer term than the "if" block above. This could be seen as having "0" here since - # beta[t + duration, U - 1] isn't defined because t + duration is out of bound. - logp(denom, acts, maxT, maxU, alphabet_size, b, t, U - 1, blank_) # log prob of blank - + logp_duration( - duration_acts, maxT, maxU, num_durations, b, t, U - 1, i - ) # log prob of duration (durations[i]) - - sigma, # for logit undernormalization. Basically every time sigma shows up is because of logit undernormalization. - ) - - elif u < U - 1: - if t == T - 1: - # t == T - 1, so we only consider non-blank with duration 0. (Note, we can't have blank emissions with duration = 0) - betas[offset + (T - 1) * maxU + u] = ( - betas[offset + (T - 1) * maxU + u + 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, T - 1, u, labels[u]) # non-blank log prob - + logp_duration(duration_acts, maxT, maxU, num_durations, b, T - 1, u, 0) # log prob of duration 0 - - sigma - ) - - elif t >= 0 and t < T - 1: - # now we need to consider both blank andnon-blanks. 
Similar to alphas, we first compute them separately with no_emit and emit. - no_emit = -INF - for i in range(1, num_durations): - if t + durations[i] < T: - no_emit = rnnt_helper.log_sum_exp( - no_emit, - betas[offset + (t + durations[i]) * maxU + u] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, blank_) - + logp_duration(duration_acts, maxT, maxU, num_durations, b, t, u, i) - - sigma, - ) - - emit = -INF - for i in range(0, num_durations): - if t + durations[i] < T: - emit = rnnt_helper.log_sum_exp( - emit, - betas[offset + (t + durations[i]) * maxU + u + 1] - + logp(denom, acts, maxT, maxU, alphabet_size, b, t, u, labels[u]) - + logp_duration(duration_acts, maxT, maxU, num_durations, b, t, u, i) - - sigma, - ) - - # combining all blank emissions and all non-blank emissions. - betas[offset + t * maxU + u] = rnnt_helper.log_sum_exp(emit, no_emit) - - # sync across all B=b and U=u - cuda.syncthreads() - - # After final sync, betas[b, 0, 0] gives log-likelihood of backward pass, same with conventional Transducers. - if u == 0: - llBackward[b] = betas[offset] - - -@cuda.jit() -def compute_tdt_grad_kernel( - label_grads: torch.Tensor, - duration_grads: torch.Tensor, - acts: torch.Tensor, - duration_acts: torch.Tensor, - denom: torch.Tensor, - sigma: float, - alphas: torch.Tensor, - betas: torch.Tensor, - logll: torch.Tensor, - xlen: torch.Tensor, - ylen: torch.Tensor, - mlabels: torch.Tensor, # [B, U] - minibatch: int, - maxT: int, - maxU: int, - alphabet_size: int, - blank_: int, - durations: torch.Tensor, - num_durations: int, - fastemit_lambda: float, - clamp: float, -): - """ - Compute gradients over the transduction step. - - Args: - grads: Zero Tensor of shape [B, T, U, V] to store gradients for tokens. - duration_grads: Zero Tensor of shape [B, T, U, D] to store gradients for durations. - - acts: Tensor of shape [B, T, U, V] flattened. Represents the logprobs activation tensor for tokens. - duration_acts: Tensor of shape [B, T, U, D] flattened. Represents the logprobs activation tensor for durations. - denom: Tensor of shape [B, T, U] flattened. Represents the denominator of the logprobs activation tensor - across entire vocabulary. - alphas: Alpha variable, contains forward probabilities. A tensor of shape [B, T, U]. - betas: Beta varoable, contains backward probabilities. A tensor of shape [B, T, U]. - logll: Log-likelihood of the forward variable, represented as a vector of shape [B]. - Represents the log-likelihood of the forward pass. - xlen: Vector of length B which contains the actual acoustic sequence lengths in the padded - activation tensor. - ylen: Vector of length B which contains the actual target sequence lengths in the padded - activation tensor. - mlabels: Matrix of shape [B, U+1] (+1 here is due to token - usually the RNNT blank). - The matrix contains the padded target transcription that must be predicted. - minibatch: Int representing the batch size. - maxT: The maximum possible acoustic sequence length. Represents T in the logprobs tensor. - maxU: The maximum possible target sequence length. Represents U in the logprobs tensor. - alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank). - blank_: Index of the RNNT blank token in the vocabulary. Generally the first or last token in the vocab. - fastemit_lambda: Float scaling factor for FastEmit regularization. Refer to - FastEmit: Low-latency Streaming ASR with Sequence-level Emission Regularization. - clamp: Float value. When set to value >= 0.0, will clamp the gradient to [-clamp, clamp]. 
- - Updates: - Kernel inplace updates the following inputs: - - grads: Gradients with respect to the log likelihood (logll). - """ - # Kernel call: - # blocks_per_grid = minibatch (b) * maxT (t) * maxU (u) - # threads_per_block = constant buffer size of parallel threads (v :: Constant) - tid = cuda.threadIdx.x # represents v, taking steps of some constant size - idx = tid # index of v < V+1; in steps of constant buffer size - col = cuda.blockIdx.x # represents a fused index of b * t * u - - # Decompose original indices from fused `col` - u = col % maxU # (b * t * u) % u = u - bt = (col - u) // maxU # (b * t * u - u) // U = b * t - t = bt % maxT # (b * t) % t = t - mb = (bt - t) // maxT # (b * t - t) // T = b - - # constants - T = xlen[mb] # select AM length of current sample - U = ylen[mb] + 1 # select target length of current sample, +1 for the blank token - labels: torch.Tensor = mlabels[mb] # labels = mlabels + mb * (maxU - 1); - - # Buffered gradient calculations, broadcast across B=b, T=t and U=u, looped over V with some constant stride. - # Look up gradient calculation from rnnt_numpy.compute_gradient() - - if t < T and u < U: - logpk_blank = ( - denom[col] + acts[col * alphabet_size + blank_] - sigma - ) # whenever sigma is used, it is for logit under-normalization. - - if idx < num_durations: - grad = 0.0 - if t + durations[idx] < T and u < U - 1: # for label - logpk_label = denom[col] + acts[col * alphabet_size + labels[u]] - sigma - grad -= math.exp(alphas[col] + betas[col + 1 + durations[idx] * maxU] + logpk_label - logll[mb]) - - if t + durations[idx] < T and idx > 0: # for blank in the middle - grad -= math.exp(alphas[col] + betas[col + durations[idx] * maxU] + logpk_blank - logll[mb]) - - if t + durations[idx] == T and idx >= 1 and u == U - 1: # for blank as the last symbol - grad -= math.exp(alphas[col] + logpk_blank - logll[mb]) - - grad = grad * math.exp(duration_acts[col * num_durations + idx]) - duration_grads[col * num_durations + idx] = grad - - # For cuda kernels, maximum number of threads per block is limited to some value. - # However, it may be the case that vocabulary size is larger than this limit - # To work around this, an arbitrary thread buffer size is chosen such that, - # 1) each element within the thread pool operates independently of the other - # 2) An inner while loop moves the index of each buffer element by the size of the buffer itself, - # such that all elements of the vocabulary size are covered in (V + 1 // thread_buffer) number of steps. - # As such, each thread will perform the while loop at least (V + 1 // thread_buffer) number of times - while idx < alphabet_size: - # remember, `col` represents the tri-index [b, t, u] - # therefore; logpk = denom[b, t, u] + acts[b, t, u, v] - logpk = denom[col] + acts[col * alphabet_size + idx] - # initialize the grad of the sample acts[b, t, u, v] - grad = math.exp(alphas[col] + betas[col] + logpk - logll[mb]) - - # If FastEmit regularization is enabled, calculate the gradeint of probability of predicting the next label - # at the current timestep. - # The formula for this is Equation 9 in https://arxiv.org/abs/2010.11148, multiplied by the log probability - # of the current step (t, u), normalized by the total log likelihood. - # Once the gradient has been calculated, scale it by `fastemit_lambda`, as in Equation 10. 
-            if fastemit_lambda > 0.0 and u < U - 1:
-                fastemit_grad = 0.0
-
-                for i in range(0, num_durations):
-                    if t + durations[i] < T:
-                        fastemit_grad += fastemit_lambda * math.exp(
-                            alphas[col]  # alphas(t, u)
-                            + (denom[col] + acts[col * alphabet_size + labels[u]])  # log prob of token emission
-                            + duration_acts[col * num_durations + i]  # duration log-prob
-                            + betas[col + 1 + durations[i] * maxU]  # betas(t, u+1)
-                            + logpk  # log Pr(k|t, u)
-                            - sigma  # for logit under-normalization
-                            - logll[mb]  # total log likelihood for normalization
-                        )
-            else:
-                fastemit_grad = 0.0
-
-            # Update the gradient of act[b, t, u, v] with the gradient from FastEmit regularization
-            grad = grad + fastemit_grad
-
-            # grad to last blank transition
-            # grad[b, T-1, U-1, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + logp(duration)) for all possible non-zero durations.
-            if idx == blank_ and u == U - 1:
-                for i in range(1, num_durations):
-                    if t == T - durations[i]:
-                        grad -= math.exp(
-                            alphas[col] + logpk - sigma - logll[mb] + duration_acts[col * num_durations + i]
-                        )
-
-            # grad of blank across t < T;
-            # grad[b, t < T-1, u, v=blank] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + logp(duration) + betas[b, t+duration, u])
-            if idx == blank_ and t < T - 1:
-                for i in range(1, num_durations):
-                    if t + durations[i] < T:
-                        grad -= math.exp(
-                            alphas[col]
-                            + logpk
-                            - sigma
-                            - logll[mb]
-                            + betas[col + durations[i] * maxU]
-                            + duration_acts[col * num_durations + i]
-                        )
-
-            # grad of correct token across u < U;
-            # grad[b, t, u < U-1, v=label[u]] -= exp(alphas[b, t, u] + logpk - sigma - logll[b] + logp(duration) + betas[b, t+duration, u+1])
-            if u < U - 1 and idx == labels[u]:
-                for i in range(0, num_durations):
-                    if t + durations[i] < T:
-                        grad -= math.exp(
-                            alphas[col]
-                            + logpk
-                            - sigma
-                            - logll[mb]
-                            + betas[col + 1 + durations[i] * maxU]
-                            + duration_acts[col * num_durations + i]
-                        )
-
-            # update label_grads[b, t, u, v] with the computed gradient
-            label_grads[col * alphabet_size + idx] = grad
-
-            # clamp gradient (if needed)
-            if clamp > 0.0:
-                g = label_grads[col * alphabet_size + idx]
-                g = min(g, clamp)
-                g = max(g, -clamp)
-                label_grads[col * alphabet_size + idx] = g
-
-            # update internal index through the thread_buffer;
-            # until idx < V + 1, such that entire vocabulary has been updated.
-            idx += GPU_RNNT_THREAD_SIZE
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/reduce.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/reduce.py
deleted file mode 100644
index 3e7fe2cb782823987a6e3de0f74ef919fc2d4026..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/cuda_utils/reduce.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Copyright 2018-2019, Mingkun Huang
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
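The reduction kernels in this deleted file exist to fill `denom`: conceptually, a max pass followed by an exp-sum pass, so that `denom[b, t, u] + acts[b, t, u, v]` behaves as a log-softmax over the vocabulary. The NumPy snippet below is only a CPU-side reference sketch of that quantity, not the CUDA implementation; the function name and the flattened [B*T*U, V+1] layout are assumptions made for illustration.

```python
import numpy as np

def reference_denom(acts: np.ndarray) -> np.ndarray:
    """CPU reference for the softmax denominator the reduction kernels compute.

    acts: float array of shape [B*T*U, V+1] (one row per fused (b, t, u) index).
    Returns denom of shape [B*T*U] such that denom[i] + acts[i, v] equals the
    log-softmax log Pr(v | b, t, u).
    """
    row_max = acts.max(axis=1)                              # first pass: max per row
    sum_exp = np.exp(acts - row_max[:, None]).sum(axis=1)   # second pass: exp-sum of shifted logits
    return -(row_max + np.log(sum_exp))                     # denom = -logsumexp(acts, axis=1)

# Tiny usage example with random logits.
rng = np.random.default_rng(0)
acts = rng.standard_normal((4, 7)).astype(np.float32)
denom = reference_denom(acts)
log_softmax = denom[:, None] + acts
assert np.allclose(np.exp(log_softmax).sum(axis=1), 1.0, atol=1e-5)
```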
- -import enum -import math - -import torch -from numba import cuda - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants, rnnt_helper - -warp_size = global_constants.warp_size() -dtype = global_constants.dtype() - -CTA_REDUCE_SIZE = 128 - - -class I_Op(enum.Enum): - """ - Represents an operation that is performed on the input tensor - """ - - EXPONENTIAL = 0 - IDENTITY = 1 - - -class R_Op(enum.Enum): - """ - Represents a reduction operation performed on the input tensor - """ - - ADD = 0 - MAXIMUM = 1 - - -@cuda.jit(device=True) -def CTAReduce(tid: int, x, storage, count: int, R_opid: int): - """ - CUDA Warp reduction kernel. - - It is a device kernel to be called by other kernels. - - The data will be read from the right segement recursively, and reduced (ROP) onto the left half. - Operation continues while warp size is larger than a given offset. - Beyond this offset, warp reduction is performed via `shfl_down_sync`, which halves the reduction - space and sums the two halves at each call. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - tid: CUDA thread index - x: activation. Single float. - storage: shared memory of size CTA_REDUCE_SIZE used for reduction in parallel threads. - count: equivalent to num_rows, which is equivalent to alphabet_size (V+1) - R_opid: Operator ID for reduction. See R_Op for more information. - """ - storage[tid] = x - - cuda.syncthreads() - - # Fold the data in half with each pass - offset = CTA_REDUCE_SIZE // 2 - while offset >= warp_size: - if (tid + offset) < count and tid < offset: - # Read from the right half and store to the left half. - if R_opid == 0: - x = rnnt_helper.add(x, storage[offset + tid]) - else: - x = rnnt_helper.maximum(x, storage[offset + tid]) - - storage[tid] = x - - cuda.syncthreads() - offset = offset // 2 - - offset = warp_size // 2 - while offset > 0: - # warp reduction and sync - shuff = cuda.shfl_down_sync(0xFFFFFFFF, x, offset) - - if (tid + offset < count) and (tid < offset): - if R_opid == 0: - x = rnnt_helper.add(x, shuff) - else: - x = rnnt_helper.maximum(x, shuff) - - offset = offset // 2 - - return x - - -@cuda.jit() -def _reduce_rows(I_opid: int, R_opid: int, acts, output, num_rows: int): - """ - CUDA Warp reduction kernel which reduces via the R_Op.Maximum - - Reduces the input data such that I_Op = Identity and R_op = Maximum. - The result is stored in the blockIdx, and is stored as an identity op. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - I_opid: Operator ID for input. See I_Op for more information. For this kernel, - the Identity op is chosen in general, and therefore the input is reduced in place - without scaling. - R_opid: Operator ID for reduction. See R_Op for more information. - For this kernel, generally Maximum op is chosen. It reduces the kernel via max. - acts: Flatened activation matrix of shape [B * T * U * (V+1)]. - output: Flatened output matrix of shape [B * T * U * (V+1)]. Data will be overwritten. - num_rows: Vocabulary size (including blank token) - V+1. 
- """ - tid = cuda.threadIdx.x - idx = tid - col = cuda.blockIdx.x - - # allocate shared thread memory - storage = cuda.shared.array(shape=(CTA_REDUCE_SIZE,), dtype=acts.dtype) - - max = output[col] - - # // Each block works on a column - if idx < num_rows: - curr = acts[col * num_rows + idx] - max - if I_opid == 0: - curr = rnnt_helper.exponential(curr) - else: - curr = rnnt_helper.identity(curr) - - idx += CTA_REDUCE_SIZE - - while idx < num_rows: - activation_ = acts[col * num_rows + idx] - max - - if I_opid == 0 and R_opid == 0: - curr = rnnt_helper.add(curr, rnnt_helper.exponential(activation_)) - elif I_opid == 0 and R_opid == 1: - curr = rnnt_helper.maximum(curr, rnnt_helper.exponential(activation_)) - elif I_opid == 1 and R_opid == 0: - curr = rnnt_helper.add(curr, rnnt_helper.identity(activation_)) - else: - curr = rnnt_helper.maximum(curr, rnnt_helper.identity(activation_)) - - idx += CTA_REDUCE_SIZE - - # // Sum thread-totals over the CTA. - curr = CTAReduce(tid, curr, storage, num_rows, R_opid) - - # // Store result in out (inplace, I_op: identity) - if tid == 0: - output[col] = curr - - -@cuda.jit() -def _reduce_minus(I_opid: int, R_opid: int, acts, output, num_rows: int): - """ - CUDA Warp reduction kernel which reduces via the R_Op.Add - - Reduces the input data such that I_Op = Exponential and R_op = Add. - The result is stored in the blockIdx, and is stored as an exp op. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - I_opid: Operator ID for input. See I_Op for more information. For this kernel, - the Exponential op is chosen in general, and therefore the input is reduced in place - with scaling. - R_opid: Operator ID for reduction. See R_Op for more information. - For this kernel, generally Add op is chosen. It reduces the kernel via summation. - acts: Flatened activation matrix of shape [B * T * U * (V+1)]. - output: Flatened output matrix of shape [B * T * U * (V+1)]. Data will be overwritten. - num_rows: Vocabulary size (including blank token) - V+1. - """ - tid = cuda.threadIdx.x - idx = tid - col = cuda.blockIdx.x - - # allocate shared thread memory - storage = cuda.shared.array(shape=(CTA_REDUCE_SIZE,), dtype=acts.dtype) - - max = output[col] - - # // Each block works on a column - if idx < num_rows: - curr = acts[col * num_rows + idx] - max - if I_opid == 0: - curr = rnnt_helper.exponential(curr) - else: - curr = rnnt_helper.identity(curr) - - idx += CTA_REDUCE_SIZE - - while idx < num_rows: - activation_ = acts[col * num_rows + idx] - max - - if I_opid == 0 and R_opid == 0: - curr = rnnt_helper.add(curr, rnnt_helper.exponential(activation_)) - elif I_opid == 0 and R_opid == 1: - curr = rnnt_helper.maximum(curr, rnnt_helper.exponential(activation_)) - elif I_opid == 1 and R_opid == 0: - curr = rnnt_helper.add(curr, rnnt_helper.identity(activation_)) - else: - curr = rnnt_helper.maximum(curr, rnnt_helper.identity(activation_)) - - idx += CTA_REDUCE_SIZE - - # // Sum thread-totals over the CTA. 
- curr = CTAReduce(tid, curr, storage, num_rows, R_opid) - - # // Store result in out (inplace, I_op: exponential) - if tid == 0: - output[col] = -max - math.log(curr) - - -def ReduceHelper( - I_opid: int, - R_opid: int, - acts: torch.Tensor, - output: torch.Tensor, - num_rows: int, - num_cols: int, - minus: bool, - stream, -): - """ - CUDA Warp reduction kernel helper which reduces via the R_Op.Add and writes - the result to `output` according to I_op id. - - The result is stored in the blockIdx. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - I_opid: Operator ID for input. See I_Op for more information. - R_opid: Operator ID for reduction. See R_Op for more information. - acts: Flatened activation matrix of shape [B * T * U * (V+1)]. - output: Flatened output matrix of shape [B * T * U * (V+1)]. Data will be overwritten. - num_rows: Vocabulary size (including blank token) - V+1. - Represents the number of threads per block. - num_cols: Flattened shape of activation matrix, without vocabulary dimension (B * T * U). - Represents number of blocks per grid. - minus: Bool flag whether to add or subtract as reduction. - If minus is set; calls _reduce_minus, else calls _reduce_rows kernel. - stream: CUDA Stream. - """ - if minus: - grid_size = num_cols - # call kernel - _reduce_minus[grid_size, CTA_REDUCE_SIZE, stream, 0](I_opid, R_opid, acts, output, num_rows) - - else: - grid_size = num_cols - # call kernel - _reduce_rows[grid_size, CTA_REDUCE_SIZE, stream, 0](I_opid, R_opid, acts, output, num_rows) - - return True - - -def reduce_exp(acts: torch.Tensor, denom, rows: int, cols: int, minus: bool, stream): - """ - Helper method to call the Warp Reduction Kernel to perform `exp` reduction. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - acts: Flatened activation matrix of shape [B * T * U * (V+1)]. - output: Flatened output matrix of shape [B * T * U * (V+1)]. Data will be overwritten. - rows: Vocabulary size (including blank token) - V+1. - Represents the number of threads per block. - cols: Flattened shape of activation matrix, without vocabulary dimension (B * T * U). - Represents number of blocks per grid. - minus: Bool flag whether to add or subtract as reduction. - If minus is set; calls _reduce_minus, else calls _reduce_rows kernel. - stream: CUDA Stream. - """ - return ReduceHelper( - I_opid=I_Op.EXPONENTIAL.value, - R_opid=R_Op.ADD.value, - acts=acts, - output=denom, - num_rows=rows, - num_cols=cols, - minus=minus, - stream=stream, - ) - - -def reduce_max(acts: torch.Tensor, denom, rows: int, cols: int, minus: bool, stream): - """ - Helper method to call the Warp Reduction Kernel to perform `max` reduction. - - Note: - Efficient warp occurs at input shapes of 2 ^ K. - - References: - - Warp Primitives [https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/] - - Args: - acts: Flatened activation matrix of shape [B * T * U * (V+1)]. - output: Flatened output matrix of shape [B * T * U * (V+1)]. Data will be overwritten. - rows: Vocabulary size (including blank token) - V+1. - Represents the number of threads per block. - cols: Flattened shape of activation matrix, without vocabulary dimension (B * T * U). - Represents number of blocks per grid. - minus: Bool flag whether to add or subtract as reduction. 
- If minus is set; calls _reduce_minus, else calls _reduce_rows kernel. - stream: CUDA Stream. - """ - return ReduceHelper( - I_opid=I_Op.IDENTITY.value, - R_opid=R_Op.MAXIMUM.value, - acts=acts, - output=denom, - num_rows=rows, - num_cols=cols, - minus=minus, - stream=stream, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/global_constants.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/global_constants.py deleted file mode 100644 index cc304753034ce70adf0f38df80d42d8e8f2ecdcf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/global_constants.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import enum - -import numpy as np -from numba import float32 - -# Internal globals -_THREADS_PER_BLOCK = 32 -_WARP_SIZE = 32 -_DTYPE = float32 - -# Constants -FP32_INF = np.inf -FP32_NEG_INF = -np.inf -THRESHOLD = 1e-1 - -""" -Getters -""" - - -def threads_per_block(): - global _THREADS_PER_BLOCK - return _THREADS_PER_BLOCK - - -def warp_size(): - global _WARP_SIZE - return _WARP_SIZE - - -def dtype(): - global _DTYPE - return _DTYPE - - -# RNNT STATUS -class RNNTStatus(enum.Enum): - RNNT_STATUS_SUCCESS = 0 - RNNT_STATUS_INVALID_VALUE = 1 diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py deleted file mode 100644 index 6ca7cd23726492c1bdb7504c6240c0c701866b2b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/rnnt_loss/utils/rnnt_helper.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Copyright 2018-2019, Mingkun Huang -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math -from typing import Optional, Tuple - -import numba -import torch -from numba import cuda - -from nemo.collections.asr.parts.numba.rnnt_loss.utils import global_constants - -threshold = global_constants.THRESHOLD - - -@cuda.jit(device=True, inline=True) -def log_sum_exp(a: float, b: float): - if a == global_constants.FP32_NEG_INF: - return b - - if b == global_constants.FP32_NEG_INF: - return a - - if a > b: - return math.log1p(math.exp(b - a)) + a - else: - return math.log1p(math.exp(a - b)) + b - - -@cuda.jit(device=True, inline=True) -def div_up(x: int, y: int): - return (x + y - 1) // y - - -@cuda.jit(device=True) -def maximum(x, y): - if x < y: - return y - else: - return x - - -@cuda.jit(device=True) -def add(x, y): - return x + y - - -@cuda.jit(device=True) -def identity(x): - return x - - -@cuda.jit(device=True) -def negate(x): - return -x - - -@cuda.jit(device=True) -def exponential(x): - return math.exp(x) - - -@cuda.jit(device=True) -def log_plus(p1: float, p2: float): - if p1 == global_constants.FP32_NEG_INF: - return p2 - - if p2 == global_constants.FP32_NEG_INF: - return p1 - - result = math.log1p(math.exp(-math.fabs(p1 - p2))) + maximum(p1, p2) - return result - - -@cuda.jit(device=True, inline=True) -def copy_data_1d(source: torch.Tensor, dest: torch.Tensor, idx: int): - dest[idx] = source[idx] - - -@cuda.jit() -def compute_costs_data(source: torch.Tensor, dest: torch.Tensor, fastemit_lambda: float): - block = cuda.blockIdx.x - tid = cuda.threadIdx.x - idx = block * cuda.blockDim.x + tid - length = source.shape[0] - - if idx < length: - copy_data_1d(source, dest, idx) - dest[idx] *= -1.0 - dest[idx] *= numba.float32(1.0 + fastemit_lambda) - - -def get_workspace_size( - maxT: int, maxU: int, minibatch: int, gpu: bool -) -> Tuple[Optional[int], global_constants.RNNTStatus]: - - if minibatch <= 0 or maxT <= 0 or maxU <= 0: - return (None, global_constants.RNNTStatus.RNNT_STATUS_INVALID_VALUE) - - # per minibatch memory - per_minibatch_size = 0 - - # alphas & betas - per_minibatch_size += maxT * maxU * 2 - - if not gpu: - # // blank & label log probability cache - per_minibatch_size += maxT * maxU * 2 - else: - # // softmax denominator - per_minibatch_size += maxT * maxU - # // forward - backward loglikelihood - per_minibatch_size += 2 - - size = per_minibatch_size * minibatch - return (size, global_constants.RNNTStatus.RNNT_STATUS_SUCCESS) - - -def flatten_tensor(x: torch.Tensor): - original_shape = x.shape - x = x.view([-1]) - return x, original_shape diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__init__.py deleted file mode 100644 index 17a22fcf81889ff15d8cedd8c306bd0d383dcf58..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2021, 
NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.parts.numba.spec_augment.spec_aug_numba import ( - SpecAugmentNumba, - spec_augment_launch_heuristics, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index a769576f2b740e0b2f23eb6b0af3b99f127a8547..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/spec_aug_numba.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/spec_aug_numba.cpython-310.pyc deleted file mode 100644 index 11a1da07bb23236c18bf3db838bbf33f23fbacf9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/__pycache__/spec_aug_numba.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py deleted file mode 100644 index fcf5d5cebfebe432008b6d7c136129c79afb80f7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/numba/spec_augment/spec_aug_numba.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -from numba import cuda - -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType -from nemo.utils import logging - -MAX_THREAD_BUFFER = 512 - - -@cuda.jit() -def spec_augment_kernel( - x: torch.Tensor, - x_len: torch.Tensor, - freq_starts: torch.Tensor, - freq_widths: torch.Tensor, - time_starts: torch.Tensor, - time_widths: torch.Tensor, - mask_value: float, -): - """ - Numba CUDA kernel to perform SpecAugment in-place on the GPU. - Parallelize over freq and time axis, parallel threads over batch. - Sequential over masks (adaptive in time). - - Args: - x: Pytorch tensor of shape [B, F, T] with the acoustic features. 
- x_len: Pytorch tensor of shape [B] with the lengths of the padded sequence. - freq_starts: Pytorch tensor of shape [B, M_f] with the start indices of freq masks. - freq_widths: Pytorch tensor of shape [B, M_f] with the width of freq masks. - time_starts: Pytorch tensor of shape [B, M_t] with the start indices of time masks. - time_widths: Pytorch tensor of shape [B, M_t] with the width of time masks. - mask_value: Float value that will be used as mask value. - """ - f = cuda.blockIdx.x # indexes the Freq dim - t = cuda.blockIdx.y # indexes the Time dim - tid = cuda.threadIdx.x # index of the current mask - threads_per_block = cuda.blockDim.x - - # Compute the number of masks over freq axis - len_f = freq_starts.shape[1] - # For all samples in the batch, apply the freq mask - for bidx in range(0, x.shape[0], threads_per_block): - # Resolve the index of the batch (case where more masks than MAX_THREAD_BUFFER) - bm_idx = bidx + tid - - # Access mask only if valid sample id in batch - if bm_idx < x.shape[0]: - # For `len_f` number of freq masks that must be applied - for fidx in range(0, len_f): - # Access the start index and width of this freq mask - f_start = freq_starts[bm_idx, fidx] - f_width = freq_widths[bm_idx, fidx] - - # If block idx `f` >= start and < (start + width) of this freq mask - if f >= f_start and f < (f_start + f_width): - x[bm_idx, f, t] = mask_value - - # Compute the number of masks over time axis - len_t = time_starts.shape[1] - # For all samples in the batch, apply the time mask - for b_idx in range(0, x.shape[0], threads_per_block): - # Resolve the index of the batch (case where more masks than MAX_THREAD_BUFFER) - bm_idx = b_idx + tid - - # Access mask only if valid sample id in batch - if bm_idx < x.shape[0]: - # For `len_t` number of freq masks that must be applied - for tidx in range(0, len_t): - # Access the start index and width of this time mask - t_start = time_starts[bm_idx, tidx] - t_width = time_widths[bm_idx, tidx] - - # If block idx `t` >= start and < (start + width) of this time mask - if t >= t_start and t < (t_start + t_width): - # Current block idx `t` < current seq length x_len[b] - # This ensure that we mask only upto the length of that sample - # Everything after that index is padded value so unnecessary to mask - if t < x_len[bm_idx]: - x[bm_idx, f, t] = mask_value - - -def spec_augment_launch_heuristics(x: torch.Tensor, length: torch.Tensor): - """ - Heuristics to determins whether pytorch implementation or numba implementation is selected. - Assumes numba cuda is supported. - - Args: - x: Torch tensor of shape [B, F, T] - length: Optional, Torch of tensor of shape [B] - containing lengths of the tensor. - - Returns: - True if numba kernel should be selected, else False - """ - if not x.is_cuda: - return False - - if length is None: - return False - - if x.shape[0] < 8: - return False - - return True - - -def launch_spec_augment_kernel( - x: torch.Tensor, - x_len: torch.Tensor, - freq_starts: torch.Tensor, - freq_lengths: torch.Tensor, - time_starts: torch.Tensor, - time_lengths: torch.Tensor, - freq_masks: int, - time_masks: int, - mask_value: float, -): - """ - Helper method to launch the SpecAugment kernel - - Args: - x: Pytorch tensor of shape [B, F, T] with the acoustic features. - x_len: Pytorch tensor of shape [B] with the lengths of the padded sequence. - freq_starts: Pytorch tensor of shape [B, M_f] with the start indices of freq masks. - freq_widths: Pytorch tensor of shape [B, M_f] with the width of freq masks. 
- time_starts: Pytorch tensor of shape [B, M_t] with the start indices of time masks. - time_widths: Pytorch tensor of shape [B, M_t] with the width of time masks. - freq_masks: Int value that determines the number of time masks. - time_masks: Int value that determines the number of freq masks. - mask_value: Float value that will be used as mask value. - - Returns: - The spec augmented tensor 'x' - """ - # Setup CUDA stream - sh = x.shape - stream = cuda.external_stream(torch.cuda.current_stream(x.device).cuda_stream) - - if time_masks > 0 or freq_masks > 0: - # Parallelize over freq and time axis, parallel threads over batch - # Sequential over masks (adaptive in time). - blocks_per_grid = tuple([sh[1], sh[2]]) - # threads_per_block = min(MAX_THREAD_BUFFER, max(freq_masks, time_masks)) - threads_per_block = min(MAX_THREAD_BUFFER, x.shape[0]) - - # Numba does not support fp16, force cast to fp32 temporarily at the expense of memory - original_dtype = x.dtype - cast_x = False - if x.dtype == torch.float16: - x = x.float() - cast_x = True - - # Launch CUDA kernel - spec_augment_kernel[blocks_per_grid, threads_per_block, stream, 0]( - x, x_len, freq_starts, freq_lengths, time_starts, time_lengths, mask_value - ) - torch.cuda.synchronize() - - # Recast back to original dtype if earlier cast was performed - if cast_x: - x = x.to(dtype=original_dtype) - - return x - - -class SpecAugmentNumba(nn.Module, Typing): - """ - Zeroes out(cuts) random continuous horisontal or - vertical segments of the spectrogram as described in - SpecAugment (https://arxiv.org/abs/1904.08779). - - Utilizes a Numba CUDA kernel to perform inplace edit of the input without loops. - Parallelize over freq and time axis, parallel threads over batch. - Sequential over masks (adaptive in time). - - Args: - freq_masks - how many frequency segments should be cut - time_masks - how many time segments should be cut - freq_width - maximum number of frequencies to be cut in one segment - time_width - maximum number of time steps to be cut in one segment. - Can be a positive integer or a float value in the range [0, 1]. - If positive integer value, defines maximum number of time steps - to be cut in one segment. - If a float value, defines maximum percentage of timesteps that - are cut adaptively. - rng: Ignored. 
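The `spec_augment_launch_heuristics` function above only routes a batch to the Numba kernel when the input already lives on the GPU, lengths are supplied, and the batch holds at least eight samples; otherwise the plain PyTorch implementation is expected to be cheaper. A small standalone sketch of the same decision (the helper name `use_numba_path` is made up for illustration, and the three checks are mirrored inline so the snippet also runs on CPU-only machines):

```python
# Minimal sketch of the launch heuristics above: CUDA input, lengths provided,
# and batch size >= 8 are all required before the Numba kernel is selected.
import torch


def use_numba_path(x: torch.Tensor, length) -> bool:
    return x.is_cuda and length is not None and x.shape[0] >= 8


spec = torch.randn(16, 80, 400)              # [B, F, T] on CPU
lens = torch.full((16,), 400)
print(use_numba_path(spec, lens))            # False: input is not on CUDA
print(use_numba_path(spec, None))            # False: no lengths provided
if torch.cuda.is_available():
    print(use_numba_path(spec.cuda(), lens.cuda()))  # True: CUDA, lengths, B >= 8
```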
- """ - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return {"augmented_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - def __init__( - self, freq_masks=0, time_masks=0, freq_width=10, time_width=0.1, rng=None, mask_value=0.0, - ): - super().__init__() - # Message to mention that numba specaugment kernel will be available - # if input device is CUDA and lengths are provided - logging.debug("Numba SpecAugment kernel is available") - - self.freq_masks = freq_masks - self.time_masks = time_masks - - self.freq_width = freq_width - self.time_width = time_width - - self.mask_value = mask_value - - # Unused - self.rng = rng - if self.rng is not None: - logging.warning("`rng` was supplied to SpecAugmentNumba, but it is not used.") - - if isinstance(time_width, int): - self.adaptive_temporal_width = False - else: - if time_width > 1.0 or time_width < 0.0: - raise ValueError('If `time_width` is a float value, must be in range [0, 1]') - - self.adaptive_temporal_width = True - - @typecheck() - @torch.no_grad() - def forward(self, input_spec, length): - sh = input_spec.shape - bs = sh[0] - - # Construct the freq and time masks as well as start positions - if self.freq_masks > 0: - freq_starts = torch.randint( - 0, sh[1] - self.freq_width + 1, size=[bs, self.freq_masks], device=input_spec.device - ) - freq_lengths = torch.randint(0, self.freq_width + 1, size=[bs, self.freq_masks], device=input_spec.device) - else: - freq_starts = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device) - freq_lengths = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device) - - if self.time_masks > 0: - if self.adaptive_temporal_width: - time_width = (length * self.time_width).int().clamp(min=1) - else: - time_width = ( - torch.tensor(self.time_width, dtype=torch.int32, device=input_spec.device) - .unsqueeze(0) - .repeat(sh[0]) - ) - - time_starts = [] - time_lengths = [] - for idx in range(sh[0]): - time_starts.append( - torch.randint( - 0, max(1, length[idx] - time_width[idx]), size=[1, self.time_masks], device=input_spec.device - ) - ) - time_lengths.append( - torch.randint(0, time_width[idx] + 1, size=[1, self.time_masks], device=input_spec.device) - ) - - time_starts = torch.cat(time_starts, 0) - time_lengths = torch.cat(time_lengths, 0) - - else: - time_starts = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device) - time_lengths = torch.zeros([bs, 1], dtype=torch.int64, device=input_spec.device) - - x = launch_spec_augment_kernel( - input_spec, - length, - freq_starts=freq_starts, - freq_lengths=freq_lengths, - time_starts=time_starts, - time_lengths=time_lengths, - freq_masks=self.freq_masks, - time_masks=self.time_masks, - mask_value=self.mask_value, - ) - - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__init__.py deleted file mode 100644 index a0785c56bf2aaa09b2ebd1716d70bc59b5a0acdc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.parts.preprocessing.feature_loader import ExternalFeatureLoader -from nemo.collections.asr.parts.preprocessing.features import FeaturizerFactory, FilterbankFeatures, WaveformFeaturizer -from nemo.collections.asr.parts.preprocessing.perturb import ( - AudioAugmentor, - AugmentationDataset, - GainPerturbation, - ImpulsePerturbation, - NoisePerturbation, - NoisePerturbationWithNormalization, - Perturbation, - RirAndNoisePerturbation, - ShiftPerturbation, - SilencePerturbation, - SpeedPerturbation, - TimeStretchPerturbation, - TranscodePerturbation, - WhiteNoisePerturbation, - perturbation_types, - process_augmentations, - register_perturbation, -) -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8b7cbe60c62b202114b442b8e8a80914e3ee7562..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/feature_loader.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/feature_loader.cpython-310.pyc deleted file mode 100644 index 81dc43ffe01072f88b0d3d247035cdaf36a2b3e1..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/feature_loader.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/features.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/features.cpython-310.pyc deleted file mode 100644 index 2c94784a3cb1c6e28569db1bd0321bfaf29c190c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/features.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/perturb.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/perturb.cpython-310.pyc deleted file mode 100644 index ebb9072b40af193c7fe61a53a6707deaf5033756..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/perturb.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/segment.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/segment.cpython-310.pyc deleted file mode 100644 index 7a34fe451b63249f30610dbdbece3dfc891d5770..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/__pycache__/segment.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/feature_loader.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/feature_loader.py deleted file mode 100644 index 8c629cf4cfd408f6497055961f5128d7d4470640..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/feature_loader.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional - -import numpy as np -import torch - - -class ExternalFeatureLoader(object): - """Feature loader that load external features store in certain format. - Currently support pickle, npy and npz format. - """ - - def __init__( - self, augmentor: Optional["nemo.collections.asr.parts.perturb.FeatureAugmentor"] = None, - ): - """ - Feature loader - """ - self.augmentor = augmentor - - def load_feature_from_file(self, file_path: str): - """Load samples from file_path and convert it to be of type float32 - file_path (str) is the path of the file that stores feature/sample. - """ - - if file_path.endswith(".pt") or file_path.endswith(".pth"): - samples = torch.load(file_path, map_location="cpu").float().numpy() - return samples - else: - # load pickle/npy/npz file - samples = np.load(file_path, allow_pickle=True) - return self._convert_samples_to_float32(samples) - # TODO load other type of files such as kaldi io ark - - @staticmethod - def _convert_samples_to_float32(samples: np.ndarray) -> np.ndarray: - """Convert sample type to float32. - Integers will be scaled to [-1, 1] in float32. - """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples - - def process(self, file_path: str) -> torch.Tensor: - features = self.load_feature_from_file(file_path) - features = self.process_segment(features) - return features - - def process_segment(self, feature_segment): - if self.augmentor: - # augmentor for external features. 
Here possible augmentor for external embedding feature is Diaconis Augmentation and might be implemented later - self.augmentor.perturb(feature_segment) - return torch.tensor(feature_segment, dtype=torch.float) - - return torch.tensor(feature_segment, dtype=torch.float) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/features.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/features.py deleted file mode 100644 index 67813f3e66d20d7596c17db6a9f3a90eb24baf06..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/features.py +++ /dev/null @@ -1,655 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright (c) 2018 Ryan Leary -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# This file contains code artifacts adapted from https://github.com/ryanleary/patter -import math -import random -from typing import Optional, Tuple, Union - -import librosa -import numpy as np -import torch -import torch.nn as nn - -from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.utils import logging - -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - - -CONSTANT = 1e-5 - - -def normalize_batch(x, seq_len, normalize_type): - x_mean = None - x_std = None - if normalize_type == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - for i in range(x.shape[0]): - if x[i, :, : seq_len[i]].shape[1] == 1: - raise ValueError( - "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " - "in torch.std() returning nan. Make sure your audio length has enough samples for a single " - "feature (ex. 
at least `hop_length` for Mel Spectrograms)." - ) - x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) - # make sure x_std is not zero - x_std += CONSTANT - return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std - elif normalize_type == "all_features": - x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) - x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) - for i in range(x.shape[0]): - x_mean[i] = x[i, :, : seq_len[i].item()].mean() - x_std[i] = x[i, :, : seq_len[i].item()].std() - # make sure x_std is not zero - x_std += CONSTANT - return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std - elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type: - x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device) - x_std = torch.tensor(normalize_type["fixed_std"], device=x.device) - return ( - (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2), - x_mean, - x_std, - ) - else: - return x, x_mean, x_std - - -def clean_spectrogram_batch(spectrogram: torch.Tensor, spectrogram_len: torch.Tensor, fill_value=0.0) -> torch.Tensor: - """ - Fill spectrogram values outside the length with `fill_value` - - Args: - spectrogram: Tensor with shape [B, C, L] containing batched spectrograms - spectrogram_len: Tensor with shape [B] containing the sequence length of each batch element - fill_value: value to fill with, 0.0 by default - - Returns: - cleaned spectrogram, tensor with shape equal to `spectrogram` - """ - device = spectrogram.device - batch_size, _, max_len = spectrogram.shape - mask = torch.arange(max_len, device=device)[None, :] >= spectrogram_len[:, None] - mask = mask.unsqueeze(1).expand_as(spectrogram) - return spectrogram.masked_fill(mask, fill_value) - - -def splice_frames(x, frame_splicing): - """ Stacks frames together across feature dim - - input is batch_size, feature_dim, num_frames - output is batch_size, feature_dim*frame_splicing, num_frames - - """ - seq = [x] - for n in range(1, frame_splicing): - seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2)) - return torch.cat(seq, dim=1) - - -@torch.jit.script_if_tracing -def make_seq_mask_like( - lengths: torch.Tensor, like: torch.Tensor, time_dim: int = -1, valid_ones: bool = True -) -> torch.Tensor: - """ - - Args: - lengths: Tensor with shape [B] containing the sequence length of each batch element - like: The mask will contain the same number of dimensions as this Tensor, and will have the same max - length in the time dimension of this Tensor. - time_dim: Time dimension of the `shape_tensor` and the resulting mask. Zero-based. - valid_ones: If True, valid tokens will contain value `1` and padding will be `0`. Else, invert. - - Returns: - A :class:`torch.Tensor` containing 1's and 0's for valid and invalid tokens, respectively, if `valid_ones`, else - vice-versa. Mask will have the same number of dimensions as `like`. Batch and time dimensions will match - the `like`. All other dimensions will be singletons. E.g., if `like.shape == [3, 4, 5]` and - `time_dim == -1', mask will have shape `[3, 1, 5]`. 
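The docstring example just above (`like.shape == [3, 4, 5]`, `time_dim == -1`, mask of shape `[3, 1, 5]`) can be checked with a toy batch. The snippet below mirrors the same length-mask construction used by `clean_spectrogram_batch` so it stands alone; it is a sanity-check sketch, not part of the module.

```python
# Toy check of the length-mask semantics described above: positions at or
# beyond each sequence length are marked invalid (True here) and filled with
# a pad value, exactly as clean_spectrogram_batch does.
import torch

spec = torch.ones(3, 4, 5)                     # [B, C, T]
lengths = torch.tensor([5, 3, 1])

mask = torch.arange(5)[None, :] >= lengths[:, None]   # [B, T], True = padding
cleaned = spec.masked_fill(mask.unsqueeze(1).expand_as(spec), 0.0)

print(mask.int())
# tensor([[0, 0, 0, 0, 0],
#         [0, 0, 0, 1, 1],
#         [0, 1, 1, 1, 1]])
print(cleaned[1, 0])   # tensor([1., 1., 1., 0., 0.])
```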
- """ - # Mask with shape [B, T] - mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.view(-1, 1)) - # [B, T] -> [B, *, T] where * is any number of singleton dimensions to expand to like tensor - for _ in range(like.dim() - mask.dim()): - mask = mask.unsqueeze(1) - # If needed, transpose time dim - if time_dim != -1 and time_dim != mask.dim() - 1: - mask = mask.transpose(-1, time_dim) - # Maybe invert the padded vs. valid token values - if not valid_ones: - mask = ~mask - return mask - - -class WaveformFeaturizer(object): - def __init__(self, sample_rate=16000, int_values=False, augmentor=None): - self.augmentor = augmentor if augmentor is not None else AudioAugmentor() - self.sample_rate = sample_rate - self.int_values = int_values - - def max_augmentation_length(self, length): - return self.augmentor.max_augmentation_length(length) - - def process( - self, - file_path, - offset=0, - duration=0, - trim=False, - trim_ref=np.max, - trim_top_db=60, - trim_frame_length=2048, - trim_hop_length=512, - orig_sr=None, - channel_selector=None, - normalize_db=None, - ): - audio = AudioSegment.from_file( - file_path, - target_sr=self.sample_rate, - int_values=self.int_values, - offset=offset, - duration=duration, - trim=trim, - trim_ref=trim_ref, - trim_top_db=trim_top_db, - trim_frame_length=trim_frame_length, - trim_hop_length=trim_hop_length, - orig_sr=orig_sr, - channel_selector=channel_selector, - normalize_db=normalize_db, - ) - return self.process_segment(audio) - - def process_segment(self, audio_segment): - self.augmentor.perturb(audio_segment) - return torch.tensor(audio_segment.samples, dtype=torch.float) - - @classmethod - def from_config(cls, input_config, perturbation_configs=None): - if perturbation_configs is not None: - aa = AudioAugmentor.from_config(perturbation_configs) - else: - aa = None - - sample_rate = input_config.get("sample_rate", 16000) - int_values = input_config.get("int_values", False) - - return cls(sample_rate=sample_rate, int_values=int_values, augmentor=aa) - - -class FeaturizerFactory(object): - def __init__(self): - pass - - @classmethod - def from_config(cls, input_cfg, perturbation_configs=None): - return WaveformFeaturizer.from_config(input_cfg, perturbation_configs=perturbation_configs) - - -class FilterbankFeatures(nn.Module): - """Featurizer that converts wavs to Mel Spectrograms. - See AudioToMelSpectrogramPreprocessor for args. - """ - - def __init__( - self, - sample_rate=16000, - n_window_size=320, - n_window_stride=160, - window="hann", - normalize="per_feature", - n_fft=None, - preemph=0.97, - nfilt=64, - lowfreq=0, - highfreq=None, - log=True, - log_zero_guard_type="add", - log_zero_guard_value=2 ** -24, - dither=CONSTANT, - pad_to=16, - max_duration=16.7, - frame_splicing=1, - exact_pad=False, - pad_value=0, - mag_power=2.0, - use_grads=False, - rng=None, - nb_augmentation_prob=0.0, - nb_max_freq=4000, - mel_norm="slaney", - stft_exact_pad=False, # Deprecated arguments; kept for config compatibility - stft_conv=False, # Deprecated arguments; kept for config compatibility - ): - super().__init__() - if stft_conv or stft_exact_pad: - logging.warning( - "Using torch_stft is deprecated and has been removed. The values have been forcibly set to False " - "for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True " - "as needed." - ) - if exact_pad and n_window_stride % 2 == 1: - raise NotImplementedError( - f"{self} received exact_pad == True, but hop_size was odd. 
If audio_length % hop_size == 0. Then the " - "returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size." - ) - self.log_zero_guard_value = log_zero_guard_value - if ( - n_window_size is None - or n_window_stride is None - or not isinstance(n_window_size, int) - or not isinstance(n_window_stride, int) - or n_window_size <= 0 - or n_window_stride <= 0 - ): - raise ValueError( - f"{self} got an invalid value for either n_window_size or " - f"n_window_stride. Both must be positive ints." - ) - logging.info(f"PADDING: {pad_to}") - - self.win_length = n_window_size - self.hop_length = n_window_stride - self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length)) - self.stft_pad_amount = (self.n_fft - self.hop_length) // 2 if exact_pad else None - - if exact_pad: - logging.info("STFT using exact pad") - torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'none': None, - } - window_fn = torch_windows.get(window, None) - window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None - self.register_buffer("window", window_tensor) - self.stft = lambda x: torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=self.win_length, - center=False if exact_pad else True, - window=self.window.to(dtype=torch.float), - return_complex=True, - ) - - self.normalize = normalize - self.log = log - self.dither = dither - self.frame_splicing = frame_splicing - self.nfilt = nfilt - self.preemph = preemph - self.pad_to = pad_to - highfreq = highfreq or sample_rate / 2 - - filterbanks = torch.tensor( - librosa.filters.mel( - sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq, norm=mel_norm - ), - dtype=torch.float, - ).unsqueeze(0) - self.register_buffer("fb", filterbanks) - - # Calculate maximum sequence length - max_length = self.get_seq_len(torch.tensor(max_duration * sample_rate, dtype=torch.float)) - max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0 - self.max_length = max_length + max_pad - self.pad_value = pad_value - self.mag_power = mag_power - - # We want to avoid taking the log of zero - # There are two options: either adding or clamping to a small value - if log_zero_guard_type not in ["add", "clamp"]: - raise ValueError( - f"{self} received {log_zero_guard_type} for the " - f"log_zero_guard_type parameter. It must be either 'add' or " - f"'clamp'." 
- ) - - self.use_grads = use_grads - if not use_grads: - self.forward = torch.no_grad()(self.forward) - self._rng = random.Random() if rng is None else rng - self.nb_augmentation_prob = nb_augmentation_prob - if self.nb_augmentation_prob > 0.0: - if nb_max_freq >= sample_rate / 2: - self.nb_augmentation_prob = 0.0 - else: - self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * n_fft) - - # log_zero_guard_value is the the small we want to use, we support - # an actual number, or "tiny", or "eps" - self.log_zero_guard_type = log_zero_guard_type - logging.debug(f"sr: {sample_rate}") - logging.debug(f"n_fft: {self.n_fft}") - logging.debug(f"win_length: {self.win_length}") - logging.debug(f"hop_length: {self.hop_length}") - logging.debug(f"n_mels: {nfilt}") - logging.debug(f"fmin: {lowfreq}") - logging.debug(f"fmax: {highfreq}") - logging.debug(f"using grads: {use_grads}") - logging.debug(f"nb_augmentation_prob: {nb_augmentation_prob}") - - def log_zero_guard_value_fn(self, x): - if isinstance(self.log_zero_guard_value, str): - if self.log_zero_guard_value == "tiny": - return torch.finfo(x.dtype).tiny - elif self.log_zero_guard_value == "eps": - return torch.finfo(x.dtype).eps - else: - raise ValueError( - f"{self} received {self.log_zero_guard_value} for the " - f"log_zero_guard_type parameter. It must be either a " - f"number, 'tiny', or 'eps'" - ) - else: - return self.log_zero_guard_value - - def get_seq_len(self, seq_len): - # Assuming that center is True is stft_pad_amount = 0 - pad_amount = self.stft_pad_amount * 2 if self.stft_pad_amount is not None else self.n_fft // 2 * 2 - seq_len = torch.floor_divide((seq_len + pad_amount - self.n_fft), self.hop_length) + 1 - return seq_len.to(dtype=torch.long) - - @property - def filter_banks(self): - return self.fb - - def forward(self, x, seq_len, linear_spec=False): - seq_len = self.get_seq_len(seq_len) - - if self.stft_pad_amount is not None: - x = torch.nn.functional.pad( - x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "reflect" - ).squeeze(1) - - # dither (only in training mode for eval determinism) - if self.training and self.dither > 0: - x += self.dither * torch.randn_like(x) - - # do preemphasis - if self.preemph is not None: - x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1) - - # disable autocast to get full range of stft values - with torch.cuda.amp.autocast(enabled=False): - x = self.stft(x) - - # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude - # guard is needed for sqrt if grads are passed through - guard = 0 if not self.use_grads else CONSTANT - x = torch.view_as_real(x) - x = torch.sqrt(x.pow(2).sum(-1) + guard) - - if self.training and self.nb_augmentation_prob > 0.0: - for idx in range(x.shape[0]): - if self._rng.random() < self.nb_augmentation_prob: - x[idx, self._nb_max_fft_bin :, :] = 0.0 - - # get power spectrum - if self.mag_power != 1.0: - x = x.pow(self.mag_power) - - # return plain spectrogram if required - if linear_spec: - return x, seq_len - - # dot with filterbank energies - x = torch.matmul(self.fb.to(x.dtype), x) - # log features if required - if self.log: - if self.log_zero_guard_type == "add": - x = torch.log(x + self.log_zero_guard_value_fn(x)) - elif self.log_zero_guard_type == "clamp": - x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x))) - else: - raise ValueError("log_zero_guard_type was not understood") - - # frame splicing if required - if self.frame_splicing > 1: - x = splice_frames(x, self.frame_splicing) - 
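For a concrete sense of the frame arithmetic in `get_seq_len` above, here is a small worked example under assumed defaults: 16 kHz audio, `n_window_size=320` (so `n_fft` rounds up to 512), `n_window_stride=160`, and `exact_pad=False` so the centre padding is `n_fft // 2 * 2` samples. The numbers are illustrative only.

```python
# Worked example of the frame-count formula in get_seq_len above, assuming
# n_fft=512, hop_length=160 and center padding of n_fft // 2 * 2 samples.
n_fft, hop_length = 512, 160
num_samples = 16000                       # one second of 16 kHz audio

pad_amount = n_fft // 2 * 2               # 512, the center=True padding
frames = (num_samples + pad_amount - n_fft) // hop_length + 1
print(frames)                             # 101 feature frames
```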
- # normalize if required - if self.normalize: - x, _, _ = normalize_batch(x, seq_len, normalize_type=self.normalize) - - # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) - max_len = x.size(-1) - mask = torch.arange(max_len).to(x.device) - mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1) - x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value) - del mask - pad_to = self.pad_to - if pad_to == "max": - x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value) - elif pad_to > 0: - pad_amt = x.size(-1) % pad_to - if pad_amt != 0: - x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value) - return x, seq_len - - -class FilterbankFeaturesTA(nn.Module): - """ - Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction. - - See `AudioToMelSpectrogramPreprocessor` for args. - - """ - - def __init__( - self, - sample_rate: int = 16000, - n_window_size: int = 320, - n_window_stride: int = 160, - normalize: Optional[str] = "per_feature", - nfilt: int = 64, - n_fft: Optional[int] = None, - preemph: float = 0.97, - lowfreq: float = 0, - highfreq: Optional[float] = None, - log: bool = True, - log_zero_guard_type: str = "add", - log_zero_guard_value: Union[float, str] = 2 ** -24, - dither: float = 1e-5, - window: str = "hann", - pad_to: int = 0, - pad_value: float = 0.0, - mel_norm="slaney", - # Seems like no one uses these options anymore. Don't convolute the code by supporting thm. - use_grads: bool = False, # Deprecated arguments; kept for config compatibility - max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility - frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility - exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility - nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility - mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility - rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility - stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility - stft_conv: bool = False, # Deprecated arguments; kept for config compatibility - ): - super().__init__() - if not HAVE_TORCHAUDIO: - raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}") - - # Make sure log zero guard is supported, if given as a string - supported_log_zero_guard_strings = {"eps", "tiny"} - if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings: - raise ValueError( - f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}" - ) - - # Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class - self.torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'ones': torch.ones, - None: torch.ones, - } - - # Ensure we can look up the window function - if window not in self.torch_windows: - raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}") - - self.win_length = n_window_size - self.hop_length = n_window_stride - self._sample_rate = sample_rate - self._normalize_strategy = normalize - self._use_log = log - 
self._preemphasis_value = preemph - self.log_zero_guard_type = log_zero_guard_type - self.log_zero_guard_value: Union[str, float] = log_zero_guard_value - self.dither = dither - self.pad_to = pad_to - self.pad_value = pad_value - self.n_fft = n_fft - self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram( - sample_rate=self._sample_rate, - win_length=self.win_length, - hop_length=self.hop_length, - n_mels=nfilt, - window_fn=self.torch_windows[window], - mel_scale="slaney", - norm=mel_norm, - n_fft=n_fft, - f_max=highfreq, - f_min=lowfreq, - wkwargs={"periodic": False}, - ) - - @property - def filter_banks(self): - """ Matches the analogous class """ - return self._mel_spec_extractor.mel_scale.fb - - def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float: - if isinstance(self.log_zero_guard_value, float): - return self.log_zero_guard_value - return getattr(torch.finfo(dtype), self.log_zero_guard_value) - - def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor: - if self.training and self.dither > 0.0: - noise = torch.randn_like(signals) * self.dither - signals = signals + noise - return signals - - def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor: - if self._preemphasis_value is not None: - padded = torch.nn.functional.pad(signals, (1, 0)) - signals = signals - self._preemphasis_value * padded[:, :-1] - return signals - - def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor: - out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long() - return out_lengths - - def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor: - # Only apply during training; else need to capture dynamic shape for exported models - if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0: - return features - pad_length = self.pad_to - (features.shape[-1] % self.pad_to) - return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value) - - def _apply_log(self, features: torch.Tensor) -> torch.Tensor: - if self._use_log: - zero_guard = self._resolve_log_zero_guard_value(features.dtype) - if self.log_zero_guard_type == "add": - features = features + zero_guard - elif self.log_zero_guard_type == "clamp": - features = features.clamp(min=zero_guard) - else: - raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'") - features = features.log() - return features - - def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor: - # Complex FFT needs to be done in single precision - with torch.cuda.amp.autocast(enabled=False): - features = self._mel_spec_extractor(waveform=signals) - return features - - def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor: - # For consistency, this function always does a masked fill even if not normalizing. 
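The pre-emphasis step above computes `y[t] = x[t] - preemph * x[t-1]`, with the left pad standing in for a zero sample before the first one. A tiny numeric check, independent of the class:

```python
# Numeric check of the pre-emphasis filter above: y[t] = x[t] - 0.97 * x[t-1],
# with x[-1] taken as 0 thanks to the (1, 0) left pad.
import torch

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
preemph = 0.97

padded = torch.nn.functional.pad(x, (1, 0))     # prepend one zero sample
y = x - preemph * padded[:, :-1]
print(y)   # tensor([[1.0000, 1.0300, 1.0600, 1.0900]])
```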
- mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False) - features = features.masked_fill(mask, 0.0) - # Maybe don't normalize - if self._normalize_strategy is None: - return features - # Use the log zero guard for the sqrt zero guard - guard_value = self._resolve_log_zero_guard_value(features.dtype) - if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features": - # 'all_features' reduces over each sample; 'per_feature' reduces over each channel - reduce_dim = 2 - if self._normalize_strategy == "all_features": - reduce_dim = [1, 2] - # [B, D, T] -> [B, D, 1] or [B, 1, 1] - means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1)) - stds = ( - features.sub(means) - .masked_fill(mask, 0.0) - .pow(2.0) - .sum(dim=reduce_dim, keepdim=True) # [B, D, T] -> [B, D, 1] or [B, 1, 1] - .div(lengths.view(-1, 1, 1) - 1) # assume biased estimator - .clamp(min=guard_value) # avoid sqrt(0) - .sqrt() - ) - features = (features - means) / (stds + eps) - else: - # Deprecating constant std/mean - raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}") - features = features.masked_fill(mask, 0.0) - return features - - def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - feature_lengths = self._compute_output_lengths(input_lengths=length) - signals = self._apply_dithering(signals=input_signal) - signals = self._apply_preemphasis(signals=signals) - features = self._extract_spectrograms(signals=signals) - features = self._apply_log(features=features) - features = self._apply_normalization(features=features, lengths=feature_lengths) - features = self._apply_pad_to(features=features) - return features, feature_lengths diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/perturb.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/perturb.py deleted file mode 100644 index 62f1b151abe9785fb02618c22f0490af082152b1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/perturb.py +++ /dev/null @@ -1,1338 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright (c) 2018 Ryan Leary -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
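With the torchaudio-based featurizer above now complete, a minimal usage sketch would be the following. It assumes torchaudio and the pre-removal NeMo tree are installed, passes `n_fft` explicitly, and uses illustrative shapes.

```python
# Illustrative only: extract log-mel features with the class shown above.
import torch
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeaturesTA

featurizer = FilterbankFeaturesTA(
    sample_rate=16000, n_window_size=320, n_window_stride=160, nfilt=64, n_fft=512
)
audio = torch.randn(2, 16000)                 # [B, samples]: two 1 s clips
lengths = torch.tensor([16000, 12000])        # second clip is partly padding

mels, mel_lengths = featurizer(input_signal=audio, length=lengths)
print(mels.shape)        # torch.Size([2, 64, 101])
print(mel_lengths)       # tensor([101, 76]) = length // hop_length + 1
```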
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# This file contains code artifacts adapted from https://github.com/ryanleary/patter -import copy -import inspect -import io -import os -import random -import subprocess -from tempfile import NamedTemporaryFile -from typing import Any, List, Optional, Union - -import librosa -import numpy as np -import soundfile as sf -from scipy import signal - -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.collections.common.parts.preprocessing import collections, parsers -from nemo.core.classes import IterableDataset -from nemo.utils import logging - -# TODO @blisc: Perhaps refactor instead of import guarding -HAVE_OMEGACONG_WEBDATASET = True -try: - import webdataset as wd - from omegaconf import DictConfig, OmegaConf -except ModuleNotFoundError: - from nemo.utils.exceptions import LightningNotInstalledException - - HAVE_OMEGACONG_WEBDATASET = False - - -try: - from nemo.collections.asr.parts.utils import numba_utils - - HAVE_NUMBA = True -except (ImportError, ModuleNotFoundError): - HAVE_NUMBA = False - - -def read_one_audiosegment(manifest, target_sr, tarred_audio=False, audio_dataset=None): - if tarred_audio: - if audio_dataset is None: - raise TypeError("Expected augmentation dataset but got None") - audio_file, file_id, manifest_entry = next(audio_dataset) - - offset = 0 if manifest_entry.offset is None else manifest_entry.offset - duration = 0 if manifest_entry.duration is None else manifest_entry.duration - - else: - audio_record = random.sample(manifest.data, 1)[0] - audio_file = audio_record.audio_file - offset = 0 if audio_record.offset is None else audio_record.offset - duration = 0 if audio_record.duration is None else audio_record.duration - - return AudioSegment.from_file(audio_file, target_sr=target_sr, offset=offset, duration=duration) - - -class Perturbation(object): - def max_augmentation_length(self, length): - return length - - def perturb(self, data): - raise NotImplementedError - - -class SpeedPerturbation(Perturbation): - """ - Performs Speed Augmentation by re-sampling the data to a different sampling rate, - which does not preserve pitch. - - Note: This is a very slow operation for online augmentation. If space allows, - it is preferable to pre-compute and save the files to augment the dataset. - - Args: - sr: Original sampling rate. - resample_type: Type of resampling operation that will be performed. - For better speed using `resampy`'s fast resampling method, use `resample_type='kaiser_fast'`. - For high-quality resampling, set `resample_type='kaiser_best'`. - To use `scipy.signal.resample`, set `resample_type='fft'` or `resample_type='scipy'` - min_speed_rate: Minimum sampling rate modifier. - max_speed_rate: Maximum sampling rate modifier. - num_rates: Number of discrete rates to allow. Can be a positive or negative - integer. - If a positive integer greater than 0 is provided, the range of - speed rates will be discretized into `num_rates` values. - If a negative integer or 0 is provided, the full range of speed rates - will be sampled uniformly. 
- Note: If a positive integer is provided and the resultant discretized - range of rates contains the value '1.0', then those samples with rate=1.0, - will not be augmented at all and simply skipped. This is to unnecessary - augmentation and increase computation time. Effective augmentation chance - in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance - where `prob` is the global probability of a sample being augmented. - rng: Random seed. Default is None - """ - - def __init__(self, sr, resample_type, min_speed_rate=0.9, max_speed_rate=1.1, num_rates=5, rng=None): - - min_rate = min(min_speed_rate, max_speed_rate) - if min_rate < 0.0: - raise ValueError("Minimum sampling rate modifier must be > 0.") - - if resample_type not in ('kaiser_best', 'kaiser_fast', 'fft', 'scipy'): - raise ValueError("Supported `resample_type` values are ('kaiser_best', 'kaiser_fast', 'fft', 'scipy')") - - self._sr = sr - self._min_rate = min_speed_rate - self._max_rate = max_speed_rate - self._num_rates = num_rates - if num_rates > 0: - self._rates = np.linspace(self._min_rate, self._max_rate, self._num_rates, endpoint=True) - self._res_type = resample_type - random.seed(rng) if rng else None - - def max_augmentation_length(self, length): - return length * self._max_rate - - def perturb(self, data): - # Select speed rate either from choice or random sample - if self._num_rates < 0: - speed_rate = random.uniform(self._min_rate, self._max_rate) - else: - speed_rate = random.choice(self._rates) - - # Skip perturbation in case of identity speed rate - if speed_rate == 1.0: - return - - new_sr = int(self._sr * speed_rate) - data._samples = librosa.core.resample( - data._samples, orig_sr=self._sr, target_sr=new_sr, res_type=self._res_type - ) - - -class TimeStretchPerturbation(Perturbation): - """ - Time-stretch an audio series by a fixed rate while preserving pitch, based on [1, 2]. - - Note: - This is a simplified implementation, intended primarily for reference and pedagogical purposes. - It makes no attempt to handle transients, and is likely to produce audible artifacts. - - Reference - [1] [Ellis, D. P. W. “A phase vocoder in Matlab.” Columbia University, 2002.] - (http://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/) - [2] [librosa.effects.time_stretch] - (https://librosa.github.io/librosa/generated/librosa.effects.time_stretch.html) - - Args: - min_speed_rate: Minimum sampling rate modifier. - max_speed_rate: Maximum sampling rate modifier. - num_rates: Number of discrete rates to allow. Can be a positive or negative - integer. - If a positive integer greater than 0 is provided, the range of - speed rates will be discretized into `num_rates` values. - If a negative integer or 0 is provided, the full range of speed rates - will be sampled uniformly. - Note: If a positive integer is provided and the resultant discretized - range of rates contains the value '1.0', then those samples with rate=1.0, - will not be augmented at all and simply skipped. This is to avoid unnecessary - augmentation and increase computation time. Effective augmentation chance - in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance - where `prob` is the global probability of a sample being augmented. - n_fft: Number of fft filters to be computed. - rng: Random seed. 
Default is None - """ - - def __init__(self, min_speed_rate=0.9, max_speed_rate=1.1, num_rates=5, n_fft=512, rng=None): - - min_rate = min(min_speed_rate, max_speed_rate) - if min_rate < 0.0: - raise ValueError("Minimum sampling rate modifier must be > 0.") - - self._min_rate = min_speed_rate - self._max_rate = max_speed_rate - self._num_rates = num_rates - if num_rates > 0: - self._rates = np.linspace(self._min_rate, self._max_rate, self._num_rates, endpoint=True) - random.seed(rng) if rng else None - - # Pre-compute constants - self._n_fft = int(n_fft) - self._hop_length = int(n_fft // 2) - - # Pre-allocate buffers - self._phi_advance_fast = np.linspace(0, np.pi * self._hop_length, self._hop_length + 1) - self._scale_buffer_fast = np.empty(self._hop_length + 1, dtype=np.float32) - - self._phi_advance_slow = np.linspace(0, np.pi * self._n_fft, self._n_fft + 1) - self._scale_buffer_slow = np.empty(self._n_fft + 1, dtype=np.float32) - - def max_augmentation_length(self, length): - return length * self._max_rate - - def perturb(self, data): - # Select speed rate either from choice or random sample - if self._num_rates < 0: - speed_rate = random.uniform(self._min_rate, self._max_rate) - else: - speed_rate = random.choice(self._rates) - - # Skip perturbation in case of identity speed rate - if speed_rate == 1.0: - return - - # Increase `n_fft` based on task (speed up or slow down audio) - # This greatly reduces upper bound of maximum time taken - # to compute slowed down audio segments. - if speed_rate >= 1.0: # Speed up audio - fft_multiplier = 1 - phi_advance = self._phi_advance_fast - scale_buffer = self._scale_buffer_fast - - else: # Slow down audio - fft_multiplier = 2 - phi_advance = self._phi_advance_slow - scale_buffer = self._scale_buffer_slow - - n_fft = int(self._n_fft * fft_multiplier) - hop_length = int(self._hop_length * fft_multiplier) - - # Perform short-term Fourier transform (STFT) - stft = librosa.core.stft(data._samples, n_fft=n_fft, hop_length=hop_length) - - # Stretch by phase vocoding - if HAVE_NUMBA: - stft_stretch = numba_utils.phase_vocoder(stft, speed_rate, phi_advance, scale_buffer) - - else: - stft_stretch = librosa.core.phase_vocoder(stft, speed_rate, hop_length) - - # Predict the length of y_stretch - len_stretch = int(round(len(data._samples) / speed_rate)) - - # Invert the STFT - y_stretch = librosa.core.istft( - stft_stretch, dtype=data._samples.dtype, hop_length=hop_length, length=len_stretch - ) - - data._samples = y_stretch - - -class SilencePerturbation(Perturbation): - """ - Applies random silence at the start and/or end of the audio. - - Args: - min_start_silence_secs (float): Min start silence level in secs - max_start_silence_secs (float): Max start silence level in secs - min_end_silence_secs (float): Min end silence level in secs - max_end_silence_secs (float): Max end silence level in secs - rng (int): Random seed. Default is None - value: (float): value representing silence to be added to audio array. 
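Both speed-style perturbations above discretize the rate range the same way, and both skip the identity rate when 1.0 lands on the grid. A short numeric check of that behaviour and of the effective augmentation chance mentioned in the docstrings, assuming a hypothetical global probability `prob=0.5`:

```python
# Numeric check of the rate grid used by SpeedPerturbation / TimeStretchPerturbation
# above: with min=0.9, max=1.1 and num_rates=5 the grid contains 1.0, which is
# skipped, so the effective chance is prob * (num_rates - 1) / num_rates.
import numpy as np

rates = np.linspace(0.9, 1.1, 5, endpoint=True)
print(rates)                         # [0.9  0.95 1.   1.05 1.1 ]

prob = 0.5                           # hypothetical global augmentation probability
effective = prob * (len(rates) - 1) / len(rates)
print(effective)                     # 0.4, i.e. a 40% chance per sample
```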
- """ - - def __init__( - self, - min_start_silence_secs: float = 0, - max_start_silence_secs: float = 0, - min_end_silence_secs: float = 0, - max_end_silence_secs: float = 0, - rng: int = None, - value: float = 0, - ): - self._min_start_silence_secs = min_start_silence_secs - self._max_start_silence_secs = max_start_silence_secs - self._min_end_silence_secs = min_end_silence_secs - self._max_end_silence_secs = max_end_silence_secs - - random.seed(rng) if rng else None - self._value = value - - def perturb(self, data): - start_silence_len = random.uniform(self._min_start_silence_secs, self._max_start_silence_secs) - end_silence_len = random.uniform(self._min_end_silence_secs, self._max_end_silence_secs) - start = np.full((int(start_silence_len * data.sample_rate),), self._value) - end = np.full((int(end_silence_len * data.sample_rate),), self._value) - - data._samples = np.concatenate([start, data._samples, end]) - - -class GainPerturbation(Perturbation): - """ - Applies random gain to the audio. - - Args: - min_gain_dbfs (float): Min gain level in dB - max_gain_dbfs (float): Max gain level in dB - rng (int): Random seed. Default is None - """ - - def __init__(self, min_gain_dbfs=-10, max_gain_dbfs=10, rng=None): - self._min_gain_dbfs = min_gain_dbfs - self._max_gain_dbfs = max_gain_dbfs - random.seed(rng) if rng else None - - def perturb(self, data): - gain = random.uniform(self._min_gain_dbfs, self._max_gain_dbfs) - data._samples = data._samples * (10.0 ** (gain / 20.0)) - - -class ImpulsePerturbation(Perturbation): - """ - Convolves audio with a Room Impulse Response. - - Args: - manifest_path (list): Manifest file for RIRs - audio_tar_filepaths (list): Tar files, if RIR audio files are tarred - shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files - normalize_impulse (bool): Normalize impulse response to zero mean and amplitude 1 - shift_impulse (bool): Shift impulse response to adjust for delay at the beginning - rng (int): Random seed. 
Default is None - """ - - def __init__( - self, - manifest_path=None, - audio_tar_filepaths=None, - shuffle_n=128, - normalize_impulse=False, - shift_impulse=False, - rng=None, - ): - self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True) - self._audiodataset = None - self._tarred_audio = False - self._normalize_impulse = normalize_impulse - self._shift_impulse = shift_impulse - self._data_iterator = None - - if audio_tar_filepaths: - self._tarred_audio = True - self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n) - self._data_iterator = iter(self._audiodataset) - - self._rng = rng - random.seed(self._rng) if rng else None - - def perturb(self, data): - impulse = read_one_audiosegment( - self._manifest, data.sample_rate, tarred_audio=self._tarred_audio, audio_dataset=self._data_iterator, - ) - - # normalize if necessary - if self._normalize_impulse: - # normalize the impulse response to zero mean and amplitude 1 - impulse_norm = impulse.samples - np.mean(impulse.samples) - impulse_norm /= max(abs(impulse_norm)) - else: - impulse_norm = impulse.samples - - # len of input data samples - len_data = len(data._samples) - - # convolve with the full impulse response - data._samples = signal.fftconvolve(data._samples, impulse_norm, "full") - - # compensate the dominant path propagation delay - if self._shift_impulse: - # Find the peak of the IR and shift the output to the left - max_ind = np.argmax(np.abs(impulse_norm)) - data._samples = data._samples[max_ind:] - - # trim to match the input data length - data._samples = data._samples[:len_data] - - # normalize data samples to [-1,1] after rir convolution to avoid nans with fp16 training - data._samples = data._samples / max(abs(data._samples)) - - -class ShiftPerturbation(Perturbation): - """ - Perturbs audio by shifting the audio in time by a random amount between min_shift_ms and max_shift_ms. - The final length of the audio is kept unaltered by padding the audio with zeros. - - - Args: - min_shift_ms (float): Minimum time in milliseconds by which audio will be shifted - max_shift_ms (float): Maximum time in milliseconds by which audio will be shifted - rng (int): Random seed. Default is None - """ - - def __init__(self, min_shift_ms=-5.0, max_shift_ms=5.0, rng=None): - self._min_shift_ms = min_shift_ms - self._max_shift_ms = max_shift_ms - random.seed(rng) if rng else None - - def perturb(self, data): - shift_ms = random.uniform(self._min_shift_ms, self._max_shift_ms) - if abs(shift_ms) / 1000 > data.duration: - # TODO: do something smarter than just ignore this condition - return - shift_samples = int(shift_ms * data.sample_rate // 1000) - # logging.debug("shift: %s", shift_samples) - if shift_samples < 0: - data._samples[-shift_samples:] = data._samples[:shift_samples] - data._samples[:-shift_samples] = 0 - elif shift_samples > 0: - data._samples[:-shift_samples] = data._samples[shift_samples:] - data._samples[-shift_samples:] = 0 - - -class NoisePerturbation(Perturbation): - """ - Perturbation that adds noise to input audio. 
- - Args: - manifest_path (str): Manifest file with paths to noise files - min_snr_db (float): Minimum SNR of audio after noise is added - max_snr_db (float): Maximum SNR of audio after noise is added - max_gain_db (float): Maximum gain that can be applied on the noise sample - audio_tar_filepaths (list) : Tar files, if noise audio files are tarred - shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files - orig_sr (int): Original sampling rate of the noise files - rng (int): Random seed. Default is None - """ - - def __init__( - self, - manifest_path=None, - min_snr_db=10, - max_snr_db=50, - max_gain_db=300.0, - rng=None, - audio_tar_filepaths=None, - shuffle_n=100, - orig_sr=16000, - ): - self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True) - self._audiodataset = None - self._tarred_audio = False - self._orig_sr = orig_sr - self._data_iterator = None - - if audio_tar_filepaths: - self._tarred_audio = True - self._audiodataset = AugmentationDataset(manifest_path, audio_tar_filepaths, shuffle_n) - self._data_iterator = iter(self._audiodataset) - - random.seed(rng) if rng else None - self._rng = rng - - self._min_snr_db = min_snr_db - self._max_snr_db = max_snr_db - self._max_gain_db = max_gain_db - - @property - def orig_sr(self): - return self._orig_sr - - def get_one_noise_sample(self, target_sr): - return read_one_audiosegment( - self._manifest, target_sr, tarred_audio=self._tarred_audio, audio_dataset=self._data_iterator - ) - - def perturb(self, data, ref_mic=0): - """ - Args: - data (AudioSegment): audio data - ref_mic (int): reference mic index for scaling multi-channel audios - """ - noise = read_one_audiosegment( - self._manifest, data.sample_rate, tarred_audio=self._tarred_audio, audio_dataset=self._data_iterator, - ) - self.perturb_with_input_noise(data, noise, ref_mic=ref_mic) - - def perturb_with_input_noise(self, data, noise, data_rms=None, ref_mic=0): - """ - Args: - data (AudioSegment): audio data - noise (AudioSegment): noise data - data_rms (Union[float, List[float]): rms_db for data input - ref_mic (int): reference mic index for scaling multi-channel audios - """ - if data.num_channels != noise.num_channels: - raise ValueError( - f"Found mismatched channels for data ({data.num_channels}) and noise ({noise.num_channels})." - ) - - if not (0 <= ref_mic < data.num_channels): - raise ValueError( - f" reference mic ID must be an integer in [0, {data.num_channels}), got {ref_mic} instead." 
- ) - - snr_db = random.uniform(self._min_snr_db, self._max_snr_db) - if data_rms is None: - data_rms = data.rms_db - - if data.num_channels > 1: - noise_gain_db = data_rms[ref_mic] - noise.rms_db[ref_mic] - snr_db - else: - noise_gain_db = data_rms - noise.rms_db - snr_db - noise_gain_db = min(noise_gain_db, self._max_gain_db) - - # calculate noise segment to use - start_time = random.uniform(0.0, noise.duration - data.duration) - if noise.duration > (start_time + data.duration): - noise.subsegment(start_time=start_time, end_time=start_time + data.duration) - - # adjust gain for snr purposes and superimpose - noise.gain_db(noise_gain_db) - - if noise._samples.shape[0] < data._samples.shape[0]: - noise_idx = random.randint(0, data._samples.shape[0] - noise._samples.shape[0]) - data._samples[noise_idx : noise_idx + noise._samples.shape[0]] += noise._samples - - else: - data._samples += noise._samples - - def perturb_with_foreground_noise(self, data, noise, data_rms=None, max_noise_dur=2, max_additions=1, ref_mic=0): - """ - Args: - data (AudioSegment): audio data - noise (AudioSegment): noise data - data_rms (Union[float, List[float]): rms_db for data input - max_noise_dur: (float): max noise duration - max_additions (int): number of times for adding noise - ref_mic (int): reference mic index for scaling multi-channel audios - """ - if data.num_channels != noise.num_channels: - raise ValueError( - f"Found mismatched channels for data ({data.num_channels}) and noise ({noise.num_channels})." - ) - - if not (0 <= ref_mic < data.num_channels): - raise ValueError( - f" reference mic ID must be an integer in [0, {data.num_channels}), got {ref_mic} instead." - ) - - snr_db = random.uniform(self._min_snr_db, self._max_snr_db) - if not data_rms: - data_rms = data.rms_db - - if data.num_channels > 1: - noise_gain_db = data_rms[ref_mic] - noise.rms_db[ref_mic] - snr_db - else: - noise_gain_db = data_rms - noise.rms_db - snr_db - noise_gain_db = min(noise_gain_db, self._max_gain_db) - - n_additions = random.randint(1, max_additions) - - for i in range(n_additions): - noise_dur = random.uniform(0.0, max_noise_dur) - start_time = random.uniform(0.0, noise.duration) - start_sample = int(round(start_time * noise.sample_rate)) - end_sample = int(round(min(noise.duration, (start_time + noise_dur)) * noise.sample_rate)) - noise_samples = np.copy(noise._samples[start_sample:end_sample]) - # adjust gain for snr purposes and superimpose - noise_samples *= 10.0 ** (noise_gain_db / 20.0) - - if noise_samples.shape[0] > data._samples.shape[0]: - noise_samples = noise_samples[0 : data._samples.shape[0]] - - noise_idx = random.randint(0, data._samples.shape[0] - noise_samples.shape[0]) - data._samples[noise_idx : noise_idx + noise_samples.shape[0]] += noise_samples - - -class NoisePerturbationWithNormalization(Perturbation): - """ - Perturbation that adds noise to input audio, with normalisation to specific decibel level. - Also tiles shorter noise samples up to their corresponding clean audio length. 
- - Args: - manifest_path (str or list): Manifest file with paths to noise files, can be list if using multiple noise sources - min_snr_db (float): Minimum SNR of audio after noise is added - max_snr_db (float): Maximum SNR of audio after noise is added - snr_samples (list): A discrete list of SNRs DBs to sample from when mixing, will be used instead of [min_snr_db,max_snr_db] - norm_to_db (float): Will normalise clean, noise, and mixed samples to this DB - audio_tar_filepaths (str or list) : Tar files, if noise audio files are tarred, can be list for multiple sources - shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files - orig_sr (int): Original sampling rate of the noise files - rng (int): Random seed. Default is None - shard_strategy (str): if you're using tarred audio and wish to scatter instead of replicate, set this to 'scatter' - epsilon (float): minimum value for RMS DB normalisation to avoid divide by zero - """ - - def __init__( - self, - manifest_path=None, - min_snr_db=10, - max_snr_db=50, - snr_samples=None, - norm_to_db=None, - rng=None, - audio_tar_filepaths=None, - shuffle_n=128, - orig_sr=16000, - global_rank=0, - world_size=1, - shard_strategy='replicate', - epsilon=0.01, - ): - # import here to avoid circular import error - from nemo.collections.asr.data.audio_to_text import RandomizedChainDataset - - self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True) - self._audiodataset = None - self._tarred_audio = False - self._orig_sr = orig_sr - self._data_iterator = None - - random.seed(rng) if rng else None - self._rng = rng - - if audio_tar_filepaths: - self._tarred_audio = True - if isinstance(manifest_path, str): - manifest_path = [manifest_path] - if isinstance(audio_tar_filepaths, str): - audio_tar_filepaths = [audio_tar_filepaths] - datasets = [] - for tarred_audio_filepath, manifest_filepath in zip(audio_tar_filepaths, manifest_path): - dataset = AugmentationDataset( - manifest_filepath, - tarred_audio_filepath, - shuffle_n, - rank=global_rank, - world_size=world_size, - shard_strategy=shard_strategy, - ) - datasets.append(dataset) - self._audiodataset = RandomizedChainDataset( - datasets, rnd_seed=(rng if rng else random.randint(0, 30000)) + global_rank - ) - if len(self._audiodataset) == 0: - raise RuntimeError( - "NoisePerturbationWithNormalization detected a zero length RandomizedChainDataset, should never happen" - ) - self._data_iterator = iter(self._audiodataset) - - self._min_snr_db = min_snr_db - self._max_snr_db = max_snr_db - self._norm_to_db = norm_to_db - self._snr_samples = snr_samples if isinstance(snr_samples, list) and len(snr_samples) > 0 else None - self._epsilon = epsilon - - @property - def orig_sr(self): - return self._orig_sr - - def read_one_audiosegment(self, target_sr): - if self._tarred_audio: - if self._data_iterator is None: - raise TypeError("Expected valid iterator but got None") - try: - audio_file, file_id, manifest_entry = next(self._data_iterator) - except StopIteration: - self._data_iterator = iter(self._audiodataset) - audio_file, file_id, manifest_entry = next(self._data_iterator) - - offset = 0 if manifest_entry.offset is None else manifest_entry.offset - duration = 0 if manifest_entry.duration is None else manifest_entry.duration - - else: - audio_record = random.sample(self._manifest.data, 1)[0] - audio_file = audio_record.audio_file - offset = 0 if audio_record.offset is None else audio_record.offset - duration = 0 if audio_record.duration 
is None else audio_record.duration - - return AudioSegment.from_file(audio_file, target_sr=target_sr, offset=offset, duration=duration) - - def perturb(self, data, ref_mic=0): - """ - Args: - data (AudioSegment): audio data - ref_mic (int): reference mic index for scaling multi-channel audios - """ - - noise = self.read_one_audiosegment(data.sample_rate) - - # noise samples need to be at least 1 second long to avoid strange oddities - # in the RMS SNR mixing, so we have a fail-safe here to ensure at least 1 sec duration - while noise.duration < 1: - noise = self.read_one_audiosegment(data.sample_rate) - - self.perturb_with_input_noise(data, noise, ref_mic=ref_mic, norm_to_db=self._norm_to_db) - - def snr_mixer(self, clean, noise, snr, norm_to_db=-25.0): - """ - Mixes the clean audio with the noise - Args: - clean (numpy array): the clean audio data - noise (numpy array): the noise audio data - snr (float): the SNR value for the mixing - norm_to_db (float): the DB value to normalise to before mixing - """ - clean = self.norm_audio_to_db(clean, norm_to_db) - noise = self.norm_audio_to_db(noise, norm_to_db) - - # Set the noise level for a given SNR - # note that if your noise doesn't overlap with your audio then your target SNR - # may not be achievable. Consider using an rms-threshold in the future - noisescalar = 10 ** (-snr / 20.0) - noisenewlevel = noise * noisescalar - noisyspeech = clean + noisenewlevel - - return clean, noisenewlevel, noisyspeech - - def norm_audio_to_db(self, x, norm_to_db): - """ - Normalises audio signal to particular db, with some epsilon in-case of divide by zero - Args: - x (numpy array): input audio signal - norm_to_db (float): the db to normalise to - """ - rms = (x ** 2).mean(axis=0) ** 0.5 - rms = np.where(np.isclose(rms, 0), self._epsilon, rms) - scalar = 10 ** (norm_to_db / 20.0) / rms - return x * scalar - - def concatenate_noise_sample(self, clean, noise, fs, silence_length=0.25): - """ - Tiles the noise array to match the clean audio array, with small silence between the joins - Args: - clean (numpy array): clean audio data - noise (numpy array): noise audio data - fs (int): sample rate used by both clean and noise audio data - silence_length (float): the amount of silence (in secs) to insert before tiling - """ - while len(noise) < len(clean): - if noise.ndim > 1: - zeros = np.zeros((int(fs * silence_length), noise.shape[-1])) - else: - zeros = np.zeros((int(fs * silence_length),)) - noiseconcat = np.append(noise, zeros, axis=0) - noise = np.append(noiseconcat, noise, axis=0) - - return noise - - def perturb_with_input_noise(self, data, noise, data_rms=None, ref_mic=0, norm_to_db=-25.0): - """ - Args: - data (AudioSegment): audio data - noise (AudioSegment): noise data - data_rms (Union[float, List[float]): rms_db for data input - ref_mic (int): reference mic index for scaling multi-channel audio, if set to None then - each channel will be scaled independently - norm_to_db (float): will normalise all audio to this DB - """ - if data.num_channels != noise.num_channels: - raise ValueError( - f"Found mismatched channels for data ({data.num_channels}) and noise ({noise.num_channels})." - ) - - if not (0 <= ref_mic < data.num_channels): - raise ValueError( - f" reference mic ID must be an integer in [0, {data.num_channels}), got {ref_mic} instead." 
- ) - - if self._snr_samples: - snr_db = random.sample(self._snr_samples, 1)[0] - else: - snr_db = random.uniform(self._min_snr_db, self._max_snr_db) - if data_rms is None: - data_rms = data.rms_db[ref_mic] if isinstance(data.rms_db, (list, np.ndarray)) else data.rms_db - - if norm_to_db is None: - norm_to_db = data_rms - - data_norm = data._samples - noise_norm = noise._samples - - if len(data_norm) == 0: - return - - if len(noise_norm) < len(data_norm): - noise_norm = self.concatenate_noise_sample(data_norm, noise_norm, data.sample_rate) - noise_norm = noise_norm[0 : len(data_norm)] - - _, _, noisy_snr = self.snr_mixer(clean=data_norm, noise=noise_norm, snr=snr_db, norm_to_db=norm_to_db) - - data._samples = noisy_snr - - -class WhiteNoisePerturbation(Perturbation): - """ - Perturbation that adds white noise to an audio file in the training dataset. - - Args: - min_level (int): Minimum level in dB at which white noise should be added - max_level (int): Maximum level in dB at which white noise should be added - rng (int): Random seed. Default is None - """ - - def __init__(self, min_level=-90, max_level=-46, rng=None): - self.min_level = int(min_level) - self.max_level = int(max_level) - np.random.seed(rng) if rng else None - - def perturb(self, data): - noise_level_db = np.random.randint(self.min_level, self.max_level, dtype='int32') - noise_signal = np.random.randn(data._samples.shape[0]) * (10.0 ** (noise_level_db / 20.0)) - data._samples += noise_signal - - -class RirAndNoisePerturbation(Perturbation): - """ - RIR augmentation with additive foreground and background noise. - In this implementation audio data is augmented by first convolving the audio with a Room Impulse Response - and then adding foreground noise and background noise at various SNRs. RIR, foreground and background noises - should either be supplied with a manifest file or as tarred audio files (faster). - - Different sets of noise audio files can be supplied based on the original sampling rate of the noise. This is useful while - training a mixed sample rate model. For example, when training a mixed model with 8 kHz and 16 kHz audio with a - target sampling rate of 16 kHz, one would want to augment 8 kHz data with 8 kHz noise rather than 16 kHz noise. - - Args: - rir_manifest_path: Manifest file for RIRs - rir_tar_filepaths: Tar files, if RIR audio files are tarred - rir_prob: Probability of applying a RIR - noise_manifest_paths: Foreground noise manifest path - min_snr_db: Min SNR for foreground noise - max_snr_db: Max SNR for foreground noise - noise_tar_filepaths: Tar files, if noise files are tarred - apply_noise_rir: Whether to convolve foreground noise with a random RIR - orig_sample_rate: Original sampling rate of foreground noise audio - max_additions: Max number of times foreground noise is added to an utterance - max_duration: Max duration of foreground noise - bg_noise_manifest_paths: Background noise manifest path - bg_min_snr_db: Min SNR for background noise - bg_max_snr_db: Max SNR for background noise - bg_noise_tar_filepaths: Tar files, if noise files are tarred - bg_orig_sample_rate: Original sampling rate of background noise audio - rng: Random seed.
Default is None - - """ - - def __init__( - self, - rir_manifest_path=None, - rir_prob=0.5, - noise_manifest_paths=None, - noise_prob=1.0, - min_snr_db=0, - max_snr_db=50, - rir_tar_filepaths=None, - rir_shuffle_n=100, - noise_tar_filepaths=None, - apply_noise_rir=False, - orig_sample_rate=None, - max_additions=5, - max_duration=2.0, - bg_noise_manifest_paths=None, - bg_noise_prob=1.0, - bg_min_snr_db=10, - bg_max_snr_db=50, - bg_noise_tar_filepaths=None, - bg_orig_sample_rate=None, - rng=None, - ): - - self._rir_prob = rir_prob - self._noise_prob = noise_prob - self._bg_noise_prob = bg_noise_prob - random.seed(rng) if rng else None - self._rir_perturber = ImpulsePerturbation( - manifest_path=rir_manifest_path, - audio_tar_filepaths=rir_tar_filepaths, - shuffle_n=rir_shuffle_n, - shift_impulse=True, - ) - self._fg_noise_perturbers = None - self._bg_noise_perturbers = None - if noise_manifest_paths: - self._fg_noise_perturbers = {} - for i in range(len(noise_manifest_paths)): - if orig_sample_rate is None: - orig_sr = 16000 - else: - orig_sr = orig_sample_rate[i] - self._fg_noise_perturbers[orig_sr] = NoisePerturbation( - manifest_path=noise_manifest_paths[i], - min_snr_db=min_snr_db[i], - max_snr_db=max_snr_db[i], - audio_tar_filepaths=noise_tar_filepaths[i], - orig_sr=orig_sr, - ) - self._max_additions = max_additions - self._max_duration = max_duration - if bg_noise_manifest_paths: - self._bg_noise_perturbers = {} - for i in range(len(bg_noise_manifest_paths)): - if bg_orig_sample_rate is None: - orig_sr = 16000 - else: - orig_sr = bg_orig_sample_rate[i] - self._bg_noise_perturbers[orig_sr] = NoisePerturbation( - manifest_path=bg_noise_manifest_paths[i], - min_snr_db=bg_min_snr_db[i], - max_snr_db=bg_max_snr_db[i], - audio_tar_filepaths=bg_noise_tar_filepaths[i], - orig_sr=orig_sr, - ) - - self._apply_noise_rir = apply_noise_rir - - def perturb(self, data): - prob = random.uniform(0.0, 1.0) - - if prob < self._rir_prob: - self._rir_perturber.perturb(data) - - data_rms = data.rms_db - - if self._fg_noise_perturbers is not None and random.uniform(0.0, 1.0) < self._noise_prob: - orig_sr = data.orig_sr - if orig_sr not in self._fg_noise_perturbers: - orig_sr = max(self._fg_noise_perturbers.keys()) - fg_perturber = self._fg_noise_perturbers[orig_sr] - noise = fg_perturber.get_one_noise_sample(data.sample_rate) - if self._apply_noise_rir: - self._rir_perturber.perturb(noise) - fg_perturber.perturb_with_foreground_noise( - data, noise, data_rms=data_rms, max_noise_dur=self._max_duration, max_additions=self._max_additions - ) - - if self._bg_noise_perturbers is not None and random.uniform(0.0, 1.0) < self._bg_noise_prob: - orig_sr = data.orig_sr - if orig_sr not in self._bg_noise_perturbers: - orig_sr = max(self._bg_noise_perturbers.keys()) - bg_perturber = self._bg_noise_perturbers[orig_sr] - - noise = bg_perturber.get_one_noise_sample(data.sample_rate) - bg_perturber.perturb_with_input_noise(data, noise, data_rms=data_rms) - - -class TranscodePerturbation(Perturbation): - """ - Audio codec augmentation. This implementation uses sox to transcode audio with low rate audio codecs, - so users need to make sure that the installed sox version supports the codecs used here (G711 and amr-nb). - - Args: - codecs (List[str]):A list of codecs to be trancoded to. Default is None. - rng (int): Random seed. Default is None. 
- """ - - def __init__(self, codecs=None, rng=None): - random.seed(rng) if rng else None - self._codecs = codecs if codecs is not None else ["g711", "amr-nb", "ogg"] - self.att_factor = 0.8 # to avoid saturation while writing to wav - if codecs is not None: - for codec in codecs: - if codec not in ["g711", "amr-nb", "ogg"]: - raise ValueError( - f"TranscodePerturbation with {codec} isnot supported. Only {codecs} are supported" - ) - - def perturb(self, data): - max_level = np.max(np.abs(data._samples)) - if max_level > 0.8: - norm_factor = self.att_factor / max_level - norm_samples = norm_factor * data._samples - else: - norm_samples = data._samples - orig_f = NamedTemporaryFile(suffix=".wav") - sf.write(orig_f.name, norm_samples.transpose(), 16000) - - codec_ind = random.randint(0, len(self._codecs) - 1) - if self._codecs[codec_ind] == "amr-nb": - transcoded_f = NamedTemporaryFile(suffix="_amr.wav") - rates = list(range(0, 4)) - rate = rates[random.randint(0, len(rates) - 1)] - _ = subprocess.check_output( - f"sox {orig_f.name} -V0 -C {rate} -t amr-nb - | sox -t amr-nb - -V0 -b 16 -r 16000 {transcoded_f.name}", - shell=True, - ) - elif self._codecs[codec_ind] == "ogg": - transcoded_f = NamedTemporaryFile(suffix="_ogg.wav") - rates = list(range(-1, 8)) - rate = rates[random.randint(0, len(rates) - 1)] - _ = subprocess.check_output( - f"sox {orig_f.name} -V0 -C {rate} -t ogg - | sox -t ogg - -V0 -b 16 -r 16000 {transcoded_f.name}", - shell=True, - ) - elif self._codecs[codec_ind] == "g711": - transcoded_f = NamedTemporaryFile(suffix="_g711.wav") - _ = subprocess.check_output( - f"sox {orig_f.name} -V0 -r 8000 -c 1 -e a-law {transcoded_f.name} lowpass 3400 highpass 300", - shell=True, - ) - - new_data = AudioSegment.from_file(transcoded_f.name, target_sr=16000) - data._samples = new_data._samples[0 : data._samples.shape[0]] - return - - -class RandomSegmentPerturbation(Perturbation): - """ - Returns a random segment from input of duration "duration_sec". - If duration_sec > input audio length, pad_to_duration determines the outcome. - - RandomSegmentPerturbation is intended for self-supervised learning. - Not for supervised, as extracting corresponding text is not facilitated. - - - Args: - duration_sec (float): duration of the segment to be extracted - pad_to_duration (bool): zero pad if length of input audio < duration_sec - rng: Random seed. 
Default is None - """ - - def __init__(self, duration_sec=32.0, pad_to_duration=False, rng=None): - if duration_sec <= 0: - raise ValueError("duration_sec should be > 0") - - self._duration_sec = duration_sec - self._pad_to_duration = pad_to_duration - random.seed(rng) if rng else None - - def perturb(self, data): - if self._duration_sec > data.duration: - if not self._pad_to_duration: - raise ValueError(f"audio length < {self._duration_sec} sec and pad_to_duration is set to False") - start_time = 0.0 - pad_size = self._duration_sec * data.sample_rate - data.num_samples - data.pad(pad_size=pad_size) - else: - start_time = random.uniform(0.0, data.duration - self._duration_sec) - - end_time = start_time + self._duration_sec - data.subsegment(start_time=start_time, end_time=end_time) - - -perturbation_types = { - "speed": SpeedPerturbation, - "time_stretch": TimeStretchPerturbation, - "gain": GainPerturbation, - "silence": SilencePerturbation, - "impulse": ImpulsePerturbation, - "shift": ShiftPerturbation, - "noise": NoisePerturbation, - "noise_norm": NoisePerturbationWithNormalization, - "white_noise": WhiteNoisePerturbation, - "rir_noise_aug": RirAndNoisePerturbation, - "transcode_aug": TranscodePerturbation, - "random_segment": RandomSegmentPerturbation, -} - - -def register_perturbation(name: str, perturbation: Perturbation): - if name in perturbation_types.keys(): - raise KeyError( - f"Perturbation with the name {name} exists. " f"Type of perturbation : {perturbation_types[name]}." - ) - - perturbation_types[name] = perturbation - - -class AudioAugmentor(object): - def __init__(self, perturbations=None, rng=None): - random.seed(rng) if rng else None - self._pipeline = perturbations if perturbations is not None else [] - - def perturb(self, segment): - for (prob, p) in self._pipeline: - if random.random() < prob: - p.perturb(segment) - return - - def max_augmentation_length(self, length): - newlen = length - for (prob, p) in self._pipeline: - newlen = p.max_augmentation_length(newlen) - return newlen - - @classmethod - def from_config(cls, config): - ptbs = [] - for p in config: - if p['aug_type'] not in perturbation_types: - logging.warning("%s perturbation not known. Skipping.", p['aug_type']) - continue - perturbation = perturbation_types[p['aug_type']] - ptbs.append((p['prob'], perturbation(**p['cfg']))) - return cls(perturbations=ptbs) - - -def process_augmentations(augmenter, global_rank=0, world_size=1) -> Optional[AudioAugmentor]: - """Process list of online data augmentations. - Accepts either an AudioAugmentor object with pre-defined augmentations, - or a dictionary that points to augmentations that have been defined. - If a dictionary is passed, must follow the below structure: - Dict[str, Dict[str, Any]]: Which refers to a dictionary of string - names for augmentations, defined in `asr/parts/perturb.py`. - The inner dictionary may contain key-value arguments of the specific - augmentation, along with an essential key `prob`. `prob` declares the - probability of the augmentation being applied, and must be a float - value in the range [0, 1]. - # Example in YAML config file - Augmentations are generally applied only during training, so we can add - these augmentations to our yaml config file, and modify the behaviour - for training and evaluation. - ```yaml - AudioToSpeechLabelDataLayer: - ... 
# Parameters shared between train and evaluation time - train: - augmentor: - shift: - prob: 0.5 - min_shift_ms: -5.0 - max_shift_ms: 5.0 - white_noise: - prob: 1.0 - min_level: -90 - max_level: -46 - ... - eval: - ... - ``` - Then in the training script, - ```python - import copy - from ruamel.yaml import YAML - yaml = YAML(typ="safe") - with open(model_config) as f: - params = yaml.load(f) - # Train Config for Data Loader - train_dl_params = copy.deepcopy(params["AudioToTextDataLayer"]) - train_dl_params.update(params["AudioToTextDataLayer"]["train"]) - del train_dl_params["train"] - del train_dl_params["eval"] - data_layer_train = nemo_asr.AudioToTextDataLayer( - ..., - **train_dl_params, - ) - # Evaluation Config for Data Loader - eval_dl_params = copy.deepcopy(params["AudioToTextDataLayer"]) - eval_dl_params.update(params["AudioToTextDataLayer"]["eval"]) - del eval_dl_params["train"] - del eval_dl_params["eval"] - data_layer_eval = nemo_asr.AudioToTextDataLayer( - ..., - **eval_dl_params, - ) - ``` - # Registering your own Augmentations - To register custom augmentations to obtain the above convenience of - the declaring the augmentations in YAML, you can put additional keys in - `perturbation_types` dictionary as follows. - ```python - from nemo.collections.asr.parts import perturb - # Define your own perturbation here - class CustomPerturbation(perturb.Perturbation): - ... - perturb.register_perturbation(name_of_perturbation, CustomPerturbation) - ``` - Args: - augmenter: AudioAugmentor object or - dictionary of str -> kwargs (dict) which is parsed and used - to initialize an AudioAugmentor. - Note: It is crucial that each individual augmentation has - a keyword `prob`, that defines a float probability in the - the range [0, 1] of this augmentation being applied. - If this keyword is not present, then the augmentation is - disabled and a warning is logged. - Returns: AudioAugmentor object - """ - if augmenter is None: - return None - - if isinstance(augmenter, AudioAugmentor): - return augmenter - - augmenter_types = {dict} - if HAVE_OMEGACONG_WEBDATASET: - augmenter_types = {dict, DictConfig} - if not type(augmenter) in augmenter_types: - raise ValueError("Cannot parse augmenter. Must be a dict or an AudioAugmentor object ") - - if HAVE_OMEGACONG_WEBDATASET and isinstance(augmenter, DictConfig): - augmenter = OmegaConf.to_container(augmenter, resolve=True) - - augmenter = copy.deepcopy(augmenter) - - augmentations = [] - for augment_name, augment_kwargs in augmenter.items(): - prob = augment_kwargs.get('prob', None) - - if prob is None: - raise KeyError( - f'Augmentation "{augment_name}" will not be applied as ' - f'keyword argument "prob" was not defined for this augmentation.' - ) - - else: - _ = augment_kwargs.pop('prob') - - if prob < 0.0 or prob > 1.0: - raise ValueError("`prob` must be a float value between 0 and 1.") - - try: - augmentation_class = perturbation_types[augment_name] - if 'global_rank' in inspect.signature(augmentation_class).parameters: - augment_kwargs['global_rank'] = global_rank - if 'world_size' in inspect.signature(augmentation_class).parameters: - augment_kwargs['world_size'] = world_size - augmentation = augmentation_class(**augment_kwargs) - augmentations.append([prob, augmentation]) - except KeyError: - raise KeyError(f"Invalid perturbation name. 
Allowed values : {perturbation_types.keys()}") - - augmenter = AudioAugmentor(perturbations=augmentations) - return augmenter - - -class AugmentationDataset(IterableDataset): - """ - A class that loads tarred audio files and cycles over the files in the dataset. - Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset/AudioToBPEDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference. - This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements. - Supported opening braces - { <=> (, [, < and the special tag _OP_. - Supported closing braces - } <=> ), ], > and the special tag _CL_. - For SLURM based tasks, we suggest the use of the special tags for ease of use. - See the WebDataset documentation for more information about accepted data and input formats. - """ - - def __init__( - self, - manifest_path: str, - tar_filepaths: Union[str, List[str]], - shuffle_n: int = 128, - rank: int = 0, - world_size: int = 1, - shard_strategy: str = "replicate", - ): - # import here to avoid circular import error - from nemo.collections.asr.data.audio_to_text import expand_sharded_filepaths - - self._manifest = collections.ASRAudioText(manifest_path, parser=parsers.make_parser([]), index_by_file_id=True) - - tar_filepaths = expand_sharded_filepaths( - tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=rank - ) - - if not HAVE_OMEGACONG_WEBDATASET: - raise LightningNotInstalledException(self) - self.audio_dataset = wd.WebDataset(urls=tar_filepaths, nodesplitter=None) - - if shuffle_n > 0: - self.audio_dataset = self.audio_dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - self.audio_dataset = ( - self.audio_dataset.rename(audio='wav;ogg;flac', key='__key__') - .to_tuple('audio', 'key') - .pipe(self._loop_offsets) - ) - - def __len__(self): - return len(self._manifest) - - def _loop_offsets(self, iterator): - """This function is used to iterate through utterances with different offsets for each file. 
- """ - - class TarredAudioLoopOffsets: - def __init__(self, collection): - self.iterator = iterator - self.collection = collection - self.current_fn = None - self.current_bytes = None - self.offset_id = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.current_fn is None: - self.current_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - offset_list = self.collection.mapping[self.current_fn] - if len(offset_list) == self.offset_id + 1: - self.current_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - self.offset_id += 1 - - return self.current_bytes, self.current_fn, self.offset_id - - return TarredAudioLoopOffsets(self._manifest) - - def __iter__(self): - audio_iter = iter(self.audio_dataset) - - while True: - try: - audio_bytes, audio_filename, offset_id = next(audio_iter) - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - manifest_idx = self._manifest.mapping[file_id][offset_id] - manifest_entry = self._manifest[manifest_idx] - - # Convert audio bytes to IO stream for processing (for SoundFile to read) - audio_file = io.BytesIO(audio_bytes) - yield audio_file, file_id, manifest_entry - except StopIteration: - audio_iter = iter(self.audio_dataset) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/segment.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/segment.py deleted file mode 100644 index f614d3d2218689ca9b3ba263578a674f00207cf1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/preprocessing/segment.py +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Copyright (c) 2018 Ryan Leary -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# This file contains code artifacts adapted from https://github.com/ryanleary/patter - -import math -import os -import random -from typing import Optional - -import librosa -import numpy as np -import soundfile as sf - -from nemo.collections.asr.parts.utils.audio_utils import select_channels -from nemo.utils import logging - -# TODO @blisc: Perhaps refactor instead of import guarding -HAVE_PYDUB = True -try: - from pydub import AudioSegment as Audio - from pydub.exceptions import CouldntDecodeError -except ModuleNotFoundError: - HAVE_PYDUB = False - - -available_formats = sf.available_formats() -sf_supported_formats = ["." + i.lower() for i in available_formats.keys()] - - -class AudioSegment(object): - """Audio segment abstraction. - :param samples: Audio samples [num_samples x num_channels]. - :type samples: ndarray.float32 - :param sample_rate: Audio sample rate. - :type sample_rate: int - :raises TypeError: If the sample data type is not float or int. - """ - - def __init__( - self, - samples, - sample_rate, - target_sr=None, - trim=False, - trim_ref=np.max, - trim_top_db=60, - trim_frame_length=2048, - trim_hop_length=512, - orig_sr=None, - channel_selector=None, - normalize_db: Optional[float] = None, - ref_channel: Optional[int] = None, - ): - """Create audio segment from samples. - Samples are converted to float32 internally, with int scaled to [-1, 1]. - """ - samples = self._convert_samples_to_float32(samples) - - # Check if channel selector is necessary - if samples.ndim == 1 and channel_selector not in [None, 0, 'average']: - raise ValueError( - f'Input signal is one-dimensional, channel selector ({channel_selector}) cannot be used.' - ) - elif samples.ndim == 2: - samples = select_channels(samples, channel_selector) - elif samples.ndim >= 3: - raise NotImplementedError( - 'Signals with more than two dimensions (sample, channel) are currently not supported.'
- ) - - if target_sr is not None and target_sr != sample_rate: - # resample along the temporal dimension (axis=0) will be in librosa 0.10.0 (#1561) - samples = samples.transpose() - samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr) - samples = samples.transpose() - sample_rate = target_sr - if trim: - # librosa is using channels-first layout (num_channels, num_samples), which is transpose of AudioSegment's layout - samples = samples.transpose() - samples, _ = librosa.effects.trim( - samples, top_db=trim_top_db, ref=trim_ref, frame_length=trim_frame_length, hop_length=trim_hop_length - ) - samples = samples.transpose() - self._samples = samples - self._sample_rate = sample_rate - self._orig_sr = orig_sr if orig_sr is not None else sample_rate - self._ref_channel = ref_channel - self._normalize_db = normalize_db - - if normalize_db is not None: - self.normalize_db(normalize_db, ref_channel) - - def __eq__(self, other): - """Return whether two objects are equal.""" - if type(other) is not type(self): - return False - if self._sample_rate != other._sample_rate: - return False - if self._samples.shape != other._samples.shape: - return False - if np.any(self.samples != other._samples): - return False - return True - - def __ne__(self, other): - """Return whether two objects are unequal.""" - return not self.__eq__(other) - - def __str__(self): - """Return human-readable representation of segment.""" - if self.num_channels == 1: - return "%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, rms=%.2fdB" % ( - type(self), - self.num_samples, - self.sample_rate, - self.duration, - self.rms_db, - ) - else: - rms_db_str = ', '.join([f'{rms:.2f}dB' for rms in self.rms_db]) - return "%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, num_channels=%d, rms=[%s]" % ( - type(self), - self.num_samples, - self.sample_rate, - self.duration, - self.num_channels, - rms_db_str, - ) - - @staticmethod - def _convert_samples_to_float32(samples): - """Convert sample type to float32. - Audio sample type is usually integer or float-point. - Integers will be scaled to [-1, 1] in float32. - """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= 1.0 / 2 ** (bits - 1) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples - - @classmethod - def from_file( - cls, - audio_file, - target_sr=None, - int_values=False, - offset=0, - duration=0, - trim=False, - trim_ref=np.max, - trim_top_db=60, - trim_frame_length=2048, - trim_hop_length=512, - orig_sr=None, - channel_selector=None, - normalize_db=None, - ref_channel=None, - ): - """ - Load a file supported by librosa and return as an AudioSegment. - :param audio_file: path of file to load. - Alternatively, a list of paths of single-channel files can be provided - to form a multichannel signal. - :param target_sr: the desired sample rate - :param int_values: if true, load samples as 32-bit integers - :param offset: offset in seconds when loading audio - :param duration: duration in seconds when loading audio - :param trim: if true, trim leading and trailing silence from an audio signal - :param trim_ref: the reference amplitude. 
By default, it uses `np.max` and compares to the peak amplitude in - the signal - :param trim_top_db: the threshold (in decibels) below reference to consider as silence - :param trim_frame_length: the number of samples per analysis frame - :param trim_hop_length: the number of samples between analysis frames - :param orig_sr: the original sample rate - :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be used. - :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value - :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio, set None to use max RMS across channels - :return: AudioSegment instance - """ - samples = None - if isinstance(audio_file, list): - return cls.from_file_list( - audio_file_list=audio_file, - target_sr=target_sr, - int_values=int_values, - offset=offset, - duration=duration, - trim=trim, - trim_ref=trim_ref, - trim_top_db=trim_top_db, - trim_frame_length=trim_frame_length, - trim_hop_length=trim_hop_length, - orig_sr=orig_sr, - channel_selector=channel_selector, - normalize_db=normalize_db, - ref_channel=ref_channel, - ) - - if not isinstance(audio_file, str) or os.path.splitext(audio_file)[-1] in sf_supported_formats: - try: - with sf.SoundFile(audio_file, 'r') as f: - dtype = 'int32' if int_values else 'float32' - sample_rate = f.samplerate - if offset > 0: - f.seek(int(offset * sample_rate)) - if duration > 0: - samples = f.read(int(duration * sample_rate), dtype=dtype) - else: - samples = f.read(dtype=dtype) - except RuntimeError as e: - logging.error( - f"Loading {audio_file} via SoundFile raised RuntimeError: `{e}`. " - f"NeMo will fallback to loading via pydub." - ) - - if hasattr(audio_file, "seek"): - audio_file.seek(0) - - if HAVE_PYDUB and samples is None: - try: - samples = Audio.from_file(audio_file) - sample_rate = samples.frame_rate - num_channels = samples.channels - if offset > 0: - # pydub does things in milliseconds - seconds = offset * 1000 - samples = samples[int(seconds) :] - if duration > 0: - seconds = duration * 1000 - samples = samples[: int(seconds)] - samples = np.array(samples.get_array_of_samples()) - # For multi-channel signals, channels are stacked in a one-dimensional vector - if num_channels > 1: - samples = np.reshape(samples, (-1, num_channels)) - except CouldntDecodeError as err: - logging.error(f"Loading {audio_file} via pydub raised CouldntDecodeError: `{err}`.") - - if samples is None: - libs = "soundfile, and pydub" if HAVE_PYDUB else "soundfile" - raise Exception(f"Your audio file {audio_file} could not be decoded. We tried using {libs}.") - - return cls( - samples, - sample_rate, - target_sr=target_sr, - trim=trim, - trim_ref=trim_ref, - trim_top_db=trim_top_db, - trim_frame_length=trim_frame_length, - trim_hop_length=trim_hop_length, - orig_sr=orig_sr, - channel_selector=channel_selector, - normalize_db=normalize_db, - ref_channel=ref_channel, - ) - - @classmethod - def from_file_list( - cls, - audio_file_list, - target_sr=None, - int_values=False, - offset=0, - duration=0, - trim=False, - channel_selector=None, - *args, - **kwargs, - ): - """ - Function wrapper for `from_file` method. Load a list of files from `audio_file_list`. - The length of each audio file is unified with the duration item in the input manifest file. 
- See `from_file` method for arguments. - - If a list of files is provided, load samples from individual single-channel files and - concatenate them along the channel dimension. - """ - if isinstance(channel_selector, int): - # Shortcut when selecting a single channel - if channel_selector >= len(audio_file_list): - raise RuntimeError( - f'Channel cannot be selected: channel_selector={channel_selector}, num_audio_files={len(audio_file_list)}' - ) - # Select only a single file - audio_file_list = [audio_file_list[channel_selector]] - # Reset the channel selector since we applied it here - channel_selector = None - - samples = None - - for a_file in audio_file_list: - # Load audio from the current file - a_segment = cls.from_file( - a_file, - target_sr=target_sr, - int_values=int_values, - offset=offset, - duration=duration, - channel_selector=None, - trim=False, # Do not apply trim to individual files, it will be applied to the concatenated signal - *args, - **kwargs, - ) - - # Only single-channel individual files are supported for now - if a_segment.num_channels != 1: - raise RuntimeError( - f'Expecting a single-channel audio signal, but loaded {a_segment.num_channels} channels from file {a_file}' - ) - - if target_sr is None: - # All files need to be loaded with the same sample rate - target_sr = a_segment.sample_rate - - # Concatenate samples - a_samples = a_segment.samples[:, None] - - if samples is None: - samples = a_samples - else: - # Check the dimensions match - if len(a_samples) != len(samples): - raise RuntimeError( - f'Loaded samples need to have identical length: {a_samples.shape} != {samples.shape}' - ) - - # Concatenate along channel dimension - samples = np.concatenate([samples, a_samples], axis=1) - - # Final setup for class initialization - samples = np.squeeze(samples) - sample_rate = target_sr - - return cls( - samples, sample_rate, target_sr=target_sr, trim=trim, channel_selector=channel_selector, *args, **kwargs, - ) - - @classmethod - def segment_from_file( - cls, - audio_file, - target_sr=None, - n_segments=0, - trim=False, - orig_sr=None, - channel_selector=None, - offset=None, - dtype='float32', - ): - """Grabs n_segments number of samples from audio_file. - If offset is not provided, n_segments are selected randomly. - If offset is provided, it is used to calculate the starting sample. - - Note that audio_file can be either the file path, or a file-like object. - - :param audio_file: path to a file or a file-like object - :param target_sr: sample rate for the output samples - :param n_segments: desired number of samples - :param trim: if true, trim leading and trailing silence from an audio signal - :param orig_sr: the original sample rate - :param channel selector: select a subset of channels. If set to `None`, the original signal will be used. - :param offset: fixed offset in seconds - :param dtype: data type to load audio as. 
- :return: numpy array of samples - """ - is_segmented = False - try: - with sf.SoundFile(audio_file, 'r') as f: - sample_rate = f.samplerate - if target_sr is not None: - n_segments_at_original_sr = math.ceil(n_segments * sample_rate / target_sr) - else: - n_segments_at_original_sr = n_segments - - if 0 < n_segments_at_original_sr < len(f): - max_audio_start = len(f) - n_segments_at_original_sr - if offset is None: - audio_start = random.randint(0, max_audio_start) - else: - audio_start = math.floor(offset * sample_rate) - if audio_start > max_audio_start: - raise RuntimeError( - f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})' - ) - f.seek(audio_start) - samples = f.read(n_segments_at_original_sr, dtype=dtype) - is_segmented = True - elif n_segments_at_original_sr > len(f): - logging.warning( - f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors." - ) - samples = f.read(dtype=dtype) - else: - samples = f.read(dtype=dtype) - except RuntimeError as e: - logging.error(f"Loading {audio_file} via SoundFile raised RuntimeError: `{e}`.") - raise e - - features = cls( - samples, sample_rate, target_sr=target_sr, trim=trim, orig_sr=orig_sr, channel_selector=channel_selector - ) - - if is_segmented: - features._samples = features._samples[:n_segments] - - return features - - @property - def samples(self): - return self._samples.copy() - - @property - def sample_rate(self): - return self._sample_rate - - @property - def num_channels(self): - if self._samples.ndim == 1: - return 1 - else: - return self._samples.shape[-1] - - @property - def num_samples(self): - return self._samples.shape[0] - - @property - def duration(self): - return self.num_samples / float(self._sample_rate) - - @property - def rms_db(self): - """Return per-channel RMS value. - """ - mean_square = np.mean(self._samples ** 2, axis=0) - return 10 * np.log10(mean_square) - - @property - def orig_sr(self): - return self._orig_sr - - def gain_db(self, gain): - self._samples *= 10.0 ** (gain / 20.0) - - def normalize_db(self, target_db=-20, ref_channel=None): - """Normalize the signal to a target RMS value in decibels. - For multi-channel audio, the RMS value is determined by the reference channel (if not None), - otherwise it will be the maximum RMS across all channels. - """ - rms_db = self.rms_db - if self.num_channels > 1: - rms_db = max(rms_db) if ref_channel is None else rms_db[ref_channel] - gain = target_db - rms_db - self.gain_db(gain) - - def pad(self, pad_size, symmetric=False): - """Add zero padding to the sample. The pad size is given in number - of samples. - If symmetric=True, `pad_size` will be added to both sides. If false, - `pad_size` - zeros will be added only to the end. - """ - samples_ndim = self._samples.ndim - if samples_ndim == 1: - pad_width = pad_size if symmetric else (0, pad_size) - elif samples_ndim == 2: - # pad samples, keep channels - pad_width = ((pad_size, pad_size), (0, 0)) if symmetric else ((0, pad_size), (0, 0)) - else: - raise NotImplementedError( - f"Padding not implemented for signals with more that 2 dimensions. Current samples dimension: {samples_ndim}." - ) - # apply padding - self._samples = np.pad(self._samples, pad_width, mode='constant',) - - def subsegment(self, start_time=None, end_time=None): - """Cut the AudioSegment between given boundaries. - Note that this is an in-place transformation. 
- :param start_time: Beginning of subsegment in seconds. - :type start_time: float - :param end_time: End of subsegment in seconds. - :type end_time: float - :raise ValueError: If start_time or end_time is incorrectly set, - e.g. out of bounds in time. - """ - start_time = 0.0 if start_time is None else start_time - end_time = self.duration if end_time is None else end_time - if start_time < 0.0: - start_time = self.duration + start_time - if end_time < 0.0: - end_time = self.duration + end_time - if start_time < 0.0: - raise ValueError("The slice start position (%f s) is out of bounds." % start_time) - if end_time < 0.0: - raise ValueError("The slice end position (%f s) is out of bounds." % end_time) - if start_time > end_time: - raise ValueError( - "The slice start position (%f s) is later than the end position (%f s)." % (start_time, end_time) - ) - if end_time > self.duration: - raise ValueError("The slice end position (%f s) is out of bounds (> %f s)" % (end_time, self.duration)) - start_sample = int(round(start_time * self._sample_rate)) - end_sample = int(round(end_time * self._sample_rate)) - self._samples = self._samples[start_sample:end_sample] diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__init__.py deleted file mode 100644 index bc443be41c4c3fcf0764eb9fd3e1828d63f8438c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
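The `AudioSegment` class removed above is the container that every perturbation in this diff mutates in place. The following is a minimal usage sketch of that API, not part of the deleted sources; the file name `speech.wav` and the 2-second target length are hypothetical and chosen only for illustration.

```python
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment

# Load an audio file and resample it to 16 kHz (the path is a placeholder).
seg = AudioSegment.from_file("speech.wav", target_sr=16000)

# Normalize the RMS level in place, then force the segment to exactly 2 seconds.
seg.normalize_db(target_db=-20)
target_len = int(2.0 * seg.sample_rate)
if seg.num_samples >= target_len:
    seg.subsegment(start_time=0.0, end_time=2.0)    # in-place cut to the first 2 s
else:
    seg.pad(pad_size=target_len - seg.num_samples)  # zero-pad the tail up to 2 s

print(seg.num_samples, seg.sample_rate, seg.duration, seg.rms_db)
```

Because `normalize_db`, `subsegment`, and `pad` all operate in place, the perturbation classes earlier in this diff can modify `data._samples` directly without reallocating segments.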
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c527553a24645fce41a9fa3c67b9fb4ca52cb6df..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/batchnorm.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/batchnorm.cpython-310.pyc deleted file mode 100644 index 55b590ca23703e3ad1969761f02f98008446112e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/batchnorm.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/causal_convs.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/causal_convs.cpython-310.pyc deleted file mode 100644 index d71751fbe2c3f2b946b416a6aab1bd1074e84312..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/causal_convs.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/conformer_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/conformer_modules.cpython-310.pyc deleted file mode 100644 index c07f8379728d8a5d24e9c430eb47889cf2d2206a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/conformer_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_beam_decoding.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_beam_decoding.cpython-310.pyc deleted file mode 100644 index 0381075dd3a8008d215ae7453a6a51c69bc7dca3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_beam_decoding.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_greedy_decoding.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_greedy_decoding.cpython-310.pyc deleted file mode 100644 index 5006187bc4f9c6a545600de18e5a7103aae673f5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/ctc_greedy_decoding.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/jasper.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/jasper.cpython-310.pyc deleted file mode 100644 index 49b981f24d07a511852b84a93a23bfd1437bac8c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/jasper.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multi_head_attention.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multi_head_attention.cpython-310.pyc deleted file mode 100644 index 
fd022d1aa6e3ed33e8d6420a9056ef3007cc3185..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multi_head_attention.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multichannel_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multichannel_modules.cpython-310.pyc deleted file mode 100644 index 7fbeb12cef5e21c5a63f519653c239c365d085dc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/multichannel_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_beam_decoding.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_beam_decoding.cpython-310.pyc deleted file mode 100644 index 7ba3bedbc228ad23ac9aff6134fc25a6b996a233..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_beam_decoding.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_greedy_decoding.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_greedy_decoding.cpython-310.pyc deleted file mode 100644 index 728e92606a43d890ac57a7dae6d5fc9852103dcf..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/rnnt_greedy_decoding.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/spectr_augment.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/spectr_augment.cpython-310.pyc deleted file mode 100644 index 5f8208d3b6e0ca811cfe8a08f7f82caa06168aaa..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/spectr_augment.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/squeezeformer_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/squeezeformer_modules.cpython-310.pyc deleted file mode 100644 index bc12256494f712f6bc26e300b4f21b4edc75bf1f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/squeezeformer_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/stateless_net.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/stateless_net.cpython-310.pyc deleted file mode 100644 index 3160d1549e058d57fff8ad472920c712dd4d4ed1..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/stateless_net.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/subsampling.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/subsampling.cpython-310.pyc deleted file mode 100644 index cb7e4e3c6b93c0161afd88b715d7db8b845427ab..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/subsampling.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/tdnn_attention.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/tdnn_attention.cpython-310.pyc deleted file mode 100644 index 4b7656eebb00fea6caea530601b097c8bd0ec49a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/__pycache__/tdnn_attention.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/__init__.py deleted file mode 100644 index 6aa05d07dea1de8d16f47cceea0cbb3c2b091263..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module import ( - MHAResidualAddAdapterStrategy, - MHAResidualAddAdapterStrategyConfig, - MultiHeadAttentionAdapter, - MultiHeadAttentionAdapterConfig, - PositionalEncodingAdapter, - PositionalEncodingAdapterConfig, - RelPositionalEncodingAdapter, - RelPositionalEncodingAdapterConfig, - RelPositionMultiHeadAttentionAdapter, - RelPositionMultiHeadAttentionAdapterConfig, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py deleted file mode 100644 index 333b630ff7f6ffc0233b3c4a9399fc8770a06dd1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/adapters/multi_head_attention_adapter_module.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
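For context, the deleted `adapters/__init__.py` above simply re-exports the adapter modules and their configs from `multi_head_attention_adapter_module.py`, whose contents follow. A minimal usage sketch, assuming a working NeMo installation (arguments follow the class docstrings further below):

```python
# Hedged sketch: the import mirrors the re-exports listed in the deleted __init__.py above.
from nemo.collections.asr.parts.submodules.adapters import MultiHeadAttentionAdapter

# Build a small MHA adapter directly; proj_dim must be divisible by n_head.
adapter = MultiHeadAttentionAdapter(n_head=4, n_feat=256, dropout_rate=0.1, proj_dim=64)
```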
- -import math -from dataclasses import dataclass, field -from typing import Any, Optional - -import torch -from torch import nn as nn - -from nemo.collections.asr.parts.submodules import multi_head_attention as mha -from nemo.collections.common.parts import adapter_modules -from nemo.core.classes.mixins import adapter_mixin_strategies - - -class MHAResidualAddAdapterStrategy(adapter_mixin_strategies.ResidualAddAdapterStrategy): - """ - An implementation of residual addition of an adapter module with its input for the MHA Adapters. - """ - - def forward(self, input: torch.Tensor, adapter: torch.nn.Module, *, module: 'AdapterModuleMixin'): - """ - A basic strategy, comprising of a residual connection over the input, after forward pass by - the underlying adapter. Additional work is done to pack and unpack the dictionary of inputs and outputs. - - Note: The `value` tensor is added to the output of the attention adapter as the residual connection. - - Args: - input: A dictionary of multiple input arguments for the adapter module. - `query`, `key`, `value`: Original output tensor of the module, or the output of the - previous adapter (if more than one adapters are enabled). - `mask`: Attention mask. - `pos_emb`: Optional positional embedding for relative encoding. - adapter: The adapter module that is currently required to perform the forward pass. - module: The calling module, in its entirety. It is a module that implements `AdapterModuleMixin`, - therefore the strategy can access all other adapters in this module via `module.adapter_layer`. - - Returns: - The result tensor, after one of the active adapters has finished its forward passes. - """ - out = self.compute_output(input, adapter, module=module) - - # If not in training mode, or probability of stochastic depth is 0, skip step. - p = self.stochastic_depth - if not module.training or p == 0.0: - pass - else: - out = self.apply_stochastic_depth(out, input['value'], adapter, module=module) - - # Return the residual connection output = input + adapter(input) - result = input['value'] + out - - # If l2_lambda is activated, register the loss value - self.compute_auxiliary_losses(result, input['value'], adapter, module=module) - - return result - - def compute_output( - self, input: torch.Tensor, adapter: torch.nn.Module, *, module: 'AdapterModuleMixin' - ) -> torch.Tensor: - """ - Compute the output of a single adapter to some input. - - Args: - input: Original output tensor of the module, or the output of the previous adapter (if more than - one adapters are enabled). - adapter: The adapter module that is currently required to perform the forward pass. - module: The calling module, in its entirety. It is a module that implements `AdapterModuleMixin`, - therefore the strategy can access all other adapters in this module via `module.adapter_layer`. - - Returns: - The result tensor, after one of the active adapters has finished its forward passes. - """ - if isinstance(input, (list, tuple)): - out = adapter(*input) - elif isinstance(input, dict): - out = adapter(**input) - else: - out = adapter(input) - return out - - -@dataclass -class MHAResidualAddAdapterStrategyConfig(adapter_mixin_strategies.ResidualAddAdapterStrategyConfig): - _target_: str = "{0}.{1}".format( - MHAResidualAddAdapterStrategy.__module__, MHAResidualAddAdapterStrategy.__name__ - ) # mandatory field - - -class MultiHeadAttentionAdapter(mha.MultiHeadAttention, adapter_modules.AdapterModuleUtil): - """Multi-Head Attention layer of Transformer. 
- Args: - n_head (int): number of heads - n_feat (int): size of the features - dropout_rate (float): dropout rate - proj_dim (int, optional): Optional integer value for projection before computing attention. - If None, then there is no projection (equivalent to proj_dim = n_feat). - If > 0, then will project the n_feat to proj_dim before calculating attention. - If <0, then will equal n_head, so that each head has a projected dimension of 1. - adapter_strategy: By default, MHAResidualAddAdapterStrategyConfig. An adapter composition function object. - """ - - def __init__( - self, - n_head: int, - n_feat: int, - dropout_rate: float, - proj_dim: Optional[int] = None, - adapter_strategy: MHAResidualAddAdapterStrategy = None, - ): - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=0) - - self.pre_norm = nn.LayerNorm(n_feat) - - # Set the projection dim to number of heads automatically - if proj_dim is not None and proj_dim < 1: - proj_dim = n_head - - self.proj_dim = proj_dim - - # Recompute weights for projection dim - if self.proj_dim is not None: - if self.proj_dim % n_head != 0: - raise ValueError(f"proj_dim ({proj_dim}) is not divisible by n_head ({n_head})") - - self.d_k = self.proj_dim // n_head - self.s_d_k = math.sqrt(self.d_k) - self.linear_q = nn.Linear(n_feat, self.proj_dim) - self.linear_k = nn.Linear(n_feat, self.proj_dim) - self.linear_v = nn.Linear(n_feat, self.proj_dim) - self.linear_out = nn.Linear(self.proj_dim, n_feat) - - # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) - - # reset parameters for Q to be identity operation - self.reset_parameters() - - def forward(self, query, key, value, mask, pos_emb=None, cache=None): - """Compute 'Scaled Dot Product Attention'. - Args: - query (torch.Tensor): (batch, time1, size) - key (torch.Tensor): (batch, time2, size) - value(torch.Tensor): (batch, time2, size) - mask (torch.Tensor): (batch, time1, time2) - cache (torch.Tensor) : (batch, time_cache, size) - - returns: - output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention - cache (torch.Tensor) : (batch, time_cache_next, size) - """ - # Need to perform duplicate computations as at this point the tensors have been - # separated by the adapter forward - query = self.pre_norm(query) - key = self.pre_norm(key) - value = self.pre_norm(value) - - return super().forward(query, key, value, mask, pos_emb, cache=cache) - - def reset_parameters(self): - with torch.no_grad(): - nn.init.zeros_(self.linear_out.weight) - nn.init.zeros_(self.linear_out.bias) - - def get_default_strategy_config(self) -> 'dataclass': - return MHAResidualAddAdapterStrategyConfig() - - -@dataclass -class MultiHeadAttentionAdapterConfig: - n_head: int - n_feat: int - dropout_rate: float = 0.0 - proj_dim: Optional[int] = None - adapter_strategy: Optional[Any] = field(default_factory=lambda: MHAResidualAddAdapterStrategyConfig()) - _target_: str = "{0}.{1}".format(MultiHeadAttentionAdapter.__module__, MultiHeadAttentionAdapter.__name__) - - -class RelPositionMultiHeadAttentionAdapter(mha.RelPositionMultiHeadAttention, adapter_modules.AdapterModuleUtil): - """Multi-Head Attention layer of Transformer-XL with support of relative positional encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): number of heads - n_feat (int): size of the features - dropout_rate (float): dropout rate - proj_dim (int, optional): Optional integer value for projection before computing attention. 
- If None, then there is no projection (equivalent to proj_dim = n_feat). - If > 0, then will project the n_feat to proj_dim before calculating attention. - If <0, then will equal n_head, so that each head has a projected dimension of 1. - adapter_strategy: By default, MHAResidualAddAdapterStrategyConfig. An adapter composition function object. - """ - - def __init__( - self, - n_head: int, - n_feat: int, - dropout_rate: float, - proj_dim: Optional[int] = None, - adapter_strategy: MHAResidualAddAdapterStrategyConfig = None, - ): - super().__init__( - n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, pos_bias_u=None, pos_bias_v=None, max_cache_len=0 - ) - - self.pre_norm = nn.LayerNorm(n_feat) - - # Set the projection dim to number of heads automatically - if proj_dim is not None and proj_dim < 1: - proj_dim = n_head - - self.proj_dim = proj_dim - - # Recompute weights for projection dim - if self.proj_dim is not None: - if self.proj_dim % n_head != 0: - raise ValueError(f"proj_dim ({proj_dim}) is not divisible by n_head ({n_head})") - - self.d_k = self.proj_dim // n_head - self.s_d_k = math.sqrt(self.d_k) - self.linear_q = nn.Linear(n_feat, self.proj_dim) - self.linear_k = nn.Linear(n_feat, self.proj_dim) - self.linear_v = nn.Linear(n_feat, self.proj_dim) - self.linear_out = nn.Linear(self.proj_dim, n_feat) - self.linear_pos = nn.Linear(n_feat, self.proj_dim, bias=False) - self.pos_bias_u = nn.Parameter(torch.FloatTensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.FloatTensor(self.h, self.d_k)) - - # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) - - # reset parameters for Q to be identity operation - self.reset_parameters() - - def forward(self, query, key, value, mask, pos_emb, cache=None): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): (batch, time1, size) - key (torch.Tensor): (batch, time2, size) - value(torch.Tensor): (batch, time2, size) - mask (torch.Tensor): (batch, time1, time2) - pos_emb (torch.Tensor) : (batch, time1, size) - cache (torch.Tensor) : (batch, time_cache, size) - Returns: - output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention - cache_next (torch.Tensor) : (batch, time_cache_next, size) - """ - # Need to perform duplicate computations as at this point the tensors have been - # separated by the adapter forward - query = self.pre_norm(query) - key = self.pre_norm(key) - value = self.pre_norm(value) - - return super().forward(query, key, value, mask, pos_emb, cache=cache) - - def reset_parameters(self): - with torch.no_grad(): - nn.init.zeros_(self.linear_out.weight) - nn.init.zeros_(self.linear_out.bias) - - # NOTE: This exact procedure apparently highly important. - # Above operation is safe to do as self.linear_out.weight *= 0.0 (similar for bias) - # However: - # DO NOT REPLACE BELOW WITH self.pos_bias_u *= 0.0 OR self.pos_bias_v *= 0.0 - # For some reason at init sometimes it will cause the value of the tensor to become NaN - # All operations to compute matrix_ac and matrix_bd will then fail. 
- nn.init.zeros_(self.pos_bias_u) - nn.init.zeros_(self.pos_bias_v) - - def get_default_strategy_config(self) -> 'dataclass': - return MHAResidualAddAdapterStrategyConfig() - - -@dataclass -class RelPositionMultiHeadAttentionAdapterConfig: - n_head: int - n_feat: int - dropout_rate: float = 0.0 - proj_dim: Optional[int] = None - adapter_strategy: Optional[Any] = field(default_factory=lambda: MHAResidualAddAdapterStrategyConfig()) - _target_: str = "{0}.{1}".format( - RelPositionMultiHeadAttentionAdapter.__module__, RelPositionMultiHeadAttentionAdapter.__name__ - ) - - -class PositionalEncodingAdapter(mha.PositionalEncoding, adapter_modules.AdapterModuleUtil): - - """ - Absolute positional embedding adapter. - - .. note:: - - Absolute positional embedding value is added to the input tensor *without residual connection* ! - Therefore, the input is changed, if you only require the positional embedding, drop the returned `x` ! - - Args: - d_model (int): The input dimension of x. - max_len (int): The max sequence length. - xscale (float): The input scaling factor. Defaults to 1.0. - adapter_strategy (AbstractAdapterStrategy): By default, ReturnResultAdapterStrategyConfig. - An adapter composition function object. - NOTE: Since this is a positional encoding, it will not add a residual ! - """ - - def __init__( - self, - d_model: int, - max_len: int = 5000, - xscale=1.0, - adapter_strategy: adapter_mixin_strategies.ReturnResultAdapterStrategyConfig = None, - ): - - super().__init__( - d_model=d_model, dropout_rate=0.0, max_len=max_len, xscale=xscale, dropout_rate_emb=0.0, - ) - - # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) - - def get_default_strategy_config(self) -> 'dataclass': - return adapter_mixin_strategies.ReturnResultAdapterStrategyConfig() - - -@dataclass -class PositionalEncodingAdapterConfig: - d_model: int - max_len: int = 5000 - xscale: float = 1.0 - adapter_strategy: Optional[Any] = field( - default_factory=lambda: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() - ) - _target_: str = "{0}.{1}".format(PositionalEncodingAdapter.__module__, PositionalEncodingAdapter.__name__) - - -class RelPositionalEncodingAdapter(mha.RelPositionalEncoding, adapter_modules.AdapterModuleUtil): - """ - Relative positional encoding for TransformerXL's layers - See : Appendix B in https://arxiv.org/abs/1901.02860 - - .. note:: - - Relative positional embedding value is **not** added to the input tensor ! - Therefore, the input should be updated changed, if you only require the positional embedding, drop the returned `x` ! - - Args: - d_model (int): embedding dim - max_len (int): maximum input length - xscale (bool): whether to scale the input by sqrt(d_model) - adapter_strategy: By default, ReturnResultAdapterStrategyConfig. An adapter composition function object. 
- """ - - def __init__( - self, - d_model: int, - max_len: int = 5000, - xscale=1.0, - adapter_strategy: adapter_mixin_strategies.ReturnResultAdapterStrategyConfig = None, - ): - super().__init__(d_model=d_model, dropout_rate=0.0, max_len=max_len, xscale=xscale, dropout_rate_emb=0.0) - - # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) - - def get_default_strategy_config(self) -> 'dataclass': - return adapter_mixin_strategies.ReturnResultAdapterStrategyConfig() - - -@dataclass -class RelPositionalEncodingAdapterConfig: - d_model: int - max_len: int = 5000 - xscale: float = 1.0 - adapter_strategy: Optional[Any] = field( - default_factory=lambda: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() - ) - _target_: str = "{0}.{1}".format(RelPositionalEncodingAdapter.__module__, RelPositionalEncodingAdapter.__name__) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/batchnorm.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/batchnorm.py deleted file mode 100644 index 66a69f761c14818a4b8f322e8eb643d80c711aa9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/batchnorm.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
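The `*AdapterConfig` dataclasses above carry a Hydra-style `_target_` string pointing at the class they configure. As a rough sketch (assuming `hydra-core` is available and the config is consumed the usual Hydra way), such a config could be instantiated like this:

```python
from hydra.utils import instantiate

from nemo.collections.asr.parts.submodules.adapters import RelPositionMultiHeadAttentionAdapterConfig

cfg = RelPositionMultiHeadAttentionAdapterConfig(n_head=4, n_feat=256, dropout_rate=0.1)
# instantiate() resolves cfg._target_ to RelPositionMultiHeadAttentionAdapter and
# forwards the remaining fields as constructor arguments.
adapter = instantiate(cfg)
```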
- -from __future__ import annotations - -from functools import reduce -from typing import List - -import torch -import torch.nn as nn - - -class FusedBatchNorm1d(nn.Module): - """ - Fused BatchNorm to use in Conformer to improve accuracy in finetuning with TTS scenario - Drop-in replacement for BatchNorm1d with simple affine projection - """ - - def __init__(self, num_features: int): - """ - Args: - num_features: number of channels, see original BatchNorm1d documentation - """ - super().__init__() - self.num_features = num_features - self.weight = nn.Parameter(torch.ones(num_features)) - self.bias = nn.Parameter(torch.zeros(num_features)) - - @classmethod - def from_batchnorm(cls, bn: nn.BatchNorm1d) -> FusedBatchNorm1d: - """ - Construct FusedBatchNorm1d module from BatchNorm1d - Args: - bn: original BatchNorm module - - Returns: - FusedBatchNorm1d module with initialized params; in eval mode result is equivalent to original BatchNorm - """ - assert isinstance(bn, nn.BatchNorm1d) - fused_bn = FusedBatchNorm1d(bn.num_features) - # init projection params from original batch norm - # so, for inference mode output is the same - std = torch.sqrt(bn.running_var.data + bn.eps) - fused_bn.weight.data = bn.weight.data / std - fused_bn.bias.data = bn.bias.data - bn.running_mean.data * fused_bn.weight.data - return fused_bn - - def forward(self, x: torch.Tensor): - if x.dim() == 3: - return x * self.weight.unsqueeze(-1) + self.bias.unsqueeze(-1) - assert x.dim() == 2 - return x * self.weight + self.bias - - -def _get_module_by_name(module: nn.Module, full_layer_name: str) -> nn.Module: - names = full_layer_name.split(sep='.') - return reduce(getattr, names, module) - - -def replace_bn_with_fused_bn(module: nn.Module, full_layer_name: str): - """ - Replace BatchNorm1d named `full_layer_name` in nn.Module with FusedBatchNorm1d - Args: - module: nn.Module instance, modified inplace - full_layer_name: name of BatchNorm1d submodule in module to replace - """ - bn = _get_module_by_name(module, full_layer_name) - assert isinstance(bn, nn.BatchNorm1d) - fused_bn = FusedBatchNorm1d.from_batchnorm(bn) - try: - parent_name, norm_name = full_layer_name.rsplit(".", maxsplit=1) - setattr(_get_module_by_name(module, parent_name), norm_name, fused_bn) - except ValueError: - norm_name = full_layer_name - setattr(module, norm_name, fused_bn) - - -def replace_bn_with_fused_bn_all(model: nn.Module) -> List[str]: - """ - Replace BatchNorm1d with FusedBatchNorm1d in model - Args: - model: nn.Module instance, modified inplace - - Returns: - list of replaced module names - """ - replaced_module_names = [] - for name, module in model.named_modules(): - if isinstance(module, nn.BatchNorm1d): - replace_bn_with_fused_bn(model, name) - replaced_module_names.append(name) - return replaced_module_names diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/causal_convs.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/causal_convs.py deleted file mode 100644 index 32f08a8d2feb463b2b46f5ee096a69e5047696f1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/causal_convs.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -import torch -import torch.nn.functional as F -from torch import nn - -__all__ = ['CausalConv2D', 'CausalConv1D'] - - -class CausalConv2D(nn.Conv2d): - """ - A causal version of nn.Conv2d where each location in the 2D matrix would have no access to locations on its right or down - All arguments are the same as nn.Conv2d except padding which should be set as None - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int = 1, - padding: Union[str, int] = 0, - dilation: int = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', - device=None, - dtype=None, - ) -> None: - if padding is not None: - raise ValueError("Argument padding should be set to None for CausalConv2D.") - self._left_padding = kernel_size - 1 - self._right_padding = stride - 1 - - padding = 0 - super(CausalConv2D, self).__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - padding_mode, - device, - dtype, - ) - - def forward( - self, x, - ): - x = F.pad(x, pad=(self._left_padding, self._right_padding, self._left_padding, self._right_padding)) - x = super().forward(x) - return x - - -class CausalConv1D(nn.Conv1d): - """ - A causal version of nn.Conv1d where each step would have limited access to locations on its right or left - All arguments are the same as nn.Conv1d except padding. - - If padding is set None, then paddings are set automatically to make it a causal convolution where each location would not see any steps on its right. - - If padding is set as a list (size of 2), then padding[0] would be used as left padding and padding[1] as right padding. - It would make it possible to control the number of steps to be accessible on the right and left. - This mode is not supported when stride > 1. padding[0]+padding[1] should be equal to (kernel_size - 1). 
- """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int = 1, - padding: Union[str, int] = 0, - dilation: int = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', - device=None, - dtype=None, - ) -> None: - self.cache_drop_size = None - if padding is None: - self._left_padding = kernel_size - 1 - self._right_padding = stride - 1 - else: - if stride != 1 and padding != kernel_size - 1: - raise ValueError("No striding allowed for non-symmetric convolutions!") - if isinstance(padding, int): - self._left_padding = padding - self._right_padding = padding - elif isinstance(padding, list) and len(padding) == 2 and padding[0] + padding[1] == kernel_size - 1: - self._left_padding = padding[0] - self._right_padding = padding[1] - else: - raise ValueError(f"Invalid padding param: {padding}!") - - self._max_cache_len = self._left_padding - - super(CausalConv1D, self).__init__( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=0, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode, - device=device, - dtype=dtype, - ) - - def update_cache(self, x, cache=None): - if cache is None: - new_x = F.pad(x, pad=(self._left_padding, self._right_padding)) - next_cache = cache - else: - new_x = F.pad(x, pad=(0, self._right_padding)) - new_x = torch.cat([cache, new_x], dim=-1) - if self.cache_drop_size > 0: - next_cache = new_x[:, :, : -self.cache_drop_size] - else: - next_cache = new_x - next_cache = next_cache[:, :, -cache.size(-1) :] - return new_x, next_cache - - def forward(self, x, cache=None): - x, cache = self.update_cache(x, cache=cache) - x = super().forward(x) - if cache is None: - return x - else: - return x, cache diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/conformer_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/conformer_modules.py deleted file mode 100644 index 677d2acd9f2e7627af7f50b14fc9f70f07be4cc2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/conformer_modules.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
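The padding rule described above for `CausalConv1D` (left pad of `kernel_size - 1`, right pad of `stride - 1`, then an ordinary convolution with `padding=0`) can be checked with plain PyTorch. This is a standalone sketch of the arithmetic, not the NeMo class itself:

```python
import torch
import torch.nn.functional as F
from torch import nn

# Pad (kernel_size - 1) frames on the left and (stride - 1) on the right, then run
# a regular Conv1d with padding=0 so no output step can see future inputs.
kernel_size, stride = 5, 1
conv = nn.Conv1d(in_channels=8, out_channels=8, kernel_size=kernel_size, stride=stride, padding=0)

x = torch.randn(1, 8, 100)                       # (batch, channels, time)
x_padded = F.pad(x, pad=(kernel_size - 1, stride - 1))
y = conv(x_padded)                               # same time length as x when stride == 1
assert y.shape[-1] == x.shape[-1]
```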
-# - -import torch -from torch import nn as nn -from torch.nn import LayerNorm - -from nemo.collections.asr.parts.submodules.batchnorm import FusedBatchNorm1d -from nemo.collections.asr.parts.submodules.causal_convs import CausalConv1D -from nemo.collections.asr.parts.submodules.multi_head_attention import ( - MultiHeadAttention, - RelPositionMultiHeadAttention, - RelPositionMultiHeadAttentionLongformer, -) -from nemo.collections.asr.parts.utils.activations import Swish -from nemo.collections.common.parts import adapter_modules -from nemo.collections.common.parts.utils import activation_registry -from nemo.core.classes.mixins import AccessMixin -from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin - -__all__ = ['ConformerConvolution', 'ConformerFeedForward', 'ConformerLayer'] - - -class ConformerLayer(torch.nn.Module, AdapterModuleMixin, AccessMixin): - """A single block of the Conformer encoder. - - Args: - d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - d_ff (int): hidden dimension of PositionwiseFeedForward - self_attention_model (str): type of the attention layer and positional encoding - 'rel_pos': relative positional embedding and Transformer-XL - 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - overlapping chunks. Attention context is determined by att_context_size parameter. - 'abs_pos': absolute positional embedding and Transformer - Default is rel_pos. - global_tokens (int): number of tokens to be used for global attention. - Only relevant if self_attention_model is 'rel_pos_local_attn'. - Defaults to 0. - global_tokens_spacing (int): how far apart the global tokens are - Defaults to 1. - global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate. - Defaults to False. 
- n_heads (int): number of heads for multi-head attention - conv_kernel_size (int): kernel size for depthwise convolution in convolution module - dropout (float): dropout probabilities for linear layers - dropout_att (float): dropout probabilities for attention distributions - """ - - def __init__( - self, - d_model, - d_ff, - self_attention_model='rel_pos', - global_tokens=0, - global_tokens_spacing=1, - global_attn_separate=False, - n_heads=4, - conv_kernel_size=31, - conv_norm_type='batch_norm', - conv_context_size=None, - dropout=0.1, - dropout_att=0.1, - pos_bias_u=None, - pos_bias_v=None, - att_context_size=[-1, -1], - ): - super(ConformerLayer, self).__init__() - - self.self_attention_model = self_attention_model - self.n_heads = n_heads - self.fc_factor = 0.5 - - # first feed forward module - self.norm_feed_forward1 = LayerNorm(d_model) - self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) - - # convolution module - self.norm_conv = LayerNorm(d_model) - self.conv = ConformerConvolution( - d_model=d_model, - kernel_size=conv_kernel_size, - norm_type=conv_norm_type, - conv_context_size=conv_context_size, - ) - - # multi-headed self-attention module - self.norm_self_att = LayerNorm(d_model) - MHA_max_cache_len = att_context_size[0] - - if self_attention_model == 'rel_pos': - self.self_attn = RelPositionMultiHeadAttention( - n_head=n_heads, - n_feat=d_model, - dropout_rate=dropout_att, - pos_bias_u=pos_bias_u, - pos_bias_v=pos_bias_v, - max_cache_len=MHA_max_cache_len, - ) - elif self_attention_model == 'rel_pos_local_attn': - self.self_attn = RelPositionMultiHeadAttentionLongformer( - n_head=n_heads, - n_feat=d_model, - dropout_rate=dropout_att, - pos_bias_u=pos_bias_u, - pos_bias_v=pos_bias_v, - max_cache_len=MHA_max_cache_len, - att_context_size=att_context_size, - global_tokens=global_tokens, - global_tokens_spacing=global_tokens_spacing, - global_attn_separate=global_attn_separate, - ) - elif self_attention_model == 'abs_pos': - self.self_attn = MultiHeadAttention( - n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att, max_cache_len=MHA_max_cache_len - ) - else: - raise ValueError( - f"'{self_attention_model}' is not not a valid value for 'self_attention_model', " - f"valid values can be from ['rel_pos', 'rel_pos_local_attn', 'abs_pos']" - ) - - # second feed forward module - self.norm_feed_forward2 = LayerNorm(d_model) - self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) - - self.dropout = nn.Dropout(dropout) - self.norm_out = LayerNorm(d_model) - - def forward(self, x, att_mask=None, pos_emb=None, pad_mask=None, cache_last_channel=None, cache_last_time=None): - """ - Args: - x (torch.Tensor): input signals (B, T, d_model) - att_mask (torch.Tensor): attention masks(B, T, T) - pos_emb (torch.Tensor): (L, 1, d_model) - pad_mask (torch.tensor): padding mask - cache_last_channel (torch.tensor) : cache for MHA layers (B, T_cache, d_model) - cache_last_time (torch.tensor) : cache for convolutional layers (B, d_model, T_cache) - Returns: - x (torch.Tensor): (B, T, d_model) - cache_last_channel (torch.tensor) : next cache for MHA layers (B, T_cache, d_model) - cache_last_time (torch.tensor) : next cache for convolutional layers (B, d_model, T_cache) - """ - residual = x - x = self.norm_feed_forward1(x) - x = self.feed_forward1(x) - residual = residual + self.dropout(x) * self.fc_factor - - x = self.norm_self_att(residual) - if self.self_attention_model == 'rel_pos': - x = self.self_attn(query=x, key=x, value=x, 
mask=att_mask, pos_emb=pos_emb, cache=cache_last_channel) - elif self.self_attention_model == 'rel_pos_local_attn': - x = self.self_attn(query=x, key=x, value=x, pad_mask=pad_mask, pos_emb=pos_emb, cache=cache_last_channel) - elif self.self_attention_model == 'abs_pos': - x = self.self_attn(query=x, key=x, value=x, mask=att_mask, cache=cache_last_channel) - else: - x = None - - if x is not None and cache_last_channel is not None: - (x, cache_last_channel) = x - - residual = residual + self.dropout(x) - - if self.is_adapter_available(): - # Call the MHA adapters - pack_ip = { - 'x': residual, - 'loc': 'mha', - 'att_mask': att_mask, - 'pos_emb': pos_emb, - } - pack_ip = self.forward_enabled_adapters(pack_ip) - residual = pack_ip['x'] - - x = self.norm_conv(residual) - x = self.conv(x, pad_mask=pad_mask, cache=cache_last_time) - if cache_last_time is not None: - (x, cache_last_time) = x - residual = residual + self.dropout(x) - - x = self.norm_feed_forward2(residual) - x = self.feed_forward2(x) - residual = residual + self.dropout(x) * self.fc_factor - - x = self.norm_out(residual) - - if self.is_adapter_available(): - # Call the adapters - pack_ip = { - 'x': x, - 'loc': 'post', - } - pack_ip = self.forward_enabled_adapters(pack_ip) - x = pack_ip['x'] - - if self.is_access_enabled() and self.access_cfg.get('save_encoder_tensors', False): - self.register_accessible_tensor(name='encoder', tensor=x) - if cache_last_channel is None: - return x - else: - return x, cache_last_channel, cache_last_time - - def forward_single_enabled_adapter_( - self, - input: dict, - adapter_module: torch.nn.Module, - *, - adapter_name: str, - adapter_strategy: 'nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy', - ): - """ - Perform the forward step of a single adapter module on some input data. - - **Note**: Subclasses can override this method to accommodate more complicate adapter forward steps. - - Args: - input: Dictionary of packed tensors. The dict should contain at least - `x`: output tensor - `loc`: Semantic location in module where this adapter was called - `att_mask`: Optional, Attention mask - `pos_emb`: Optional, Positional Embedding for Relative Positional Encoding. - The output tensor of the calling module is the input to the first adapter, whose output - is then chained to the next adapter until all adapters are consumed. - adapter_module: The adapter module that is currently required to perform the forward pass. - adapter_name: The resolved name of the adapter that is undergoing the current forward pass. - adapter_strategy: A subclass of `AbstractAdapterStrategy`, that determines how the - output of the adapter should be merged with the input, or if it should be merged at all. - - Returns: - The result tensor, after the current active adapter has finished its forward pass. 
- """ - # (input: torch.Tensor, adapter: torch.nn.Module, *, module: 'AdapterModuleMixin') - x = input['x'] - loc = input['loc'] - att_mask = input.get('att_mask', None) - pos_emb = input.get('pos_emb', None) - - if isinstance(adapter_module, adapter_modules.LinearAdapter) and loc == 'post': - output = adapter_strategy(x, adapter_module, module=self) - - elif isinstance(adapter_module, MultiHeadAttention) and loc == 'mha': - if self.self_attention_model == 'rel_pos': - x = dict(query=x, key=x, value=x, mask=att_mask, pos_emb=pos_emb) - output = adapter_strategy(x, adapter_module, module=self) - - elif self.self_attention_model == 'abs_pos': - x = dict(query=x, key=x, value=x, mask=att_mask) - output = adapter_strategy(x, adapter_module, module=self) - - else: - raise ValueError(f"Unsupported value of self_attention_model , provided {self.self_attention_model}!") - - else: - # No adapter compatible, skip - output = x - - input['x'] = output - - return input - - -class ConformerConvolution(nn.Module): - """The convolution module for the Conformer model. - Args: - d_model (int): hidden dimension - kernel_size (int): kernel size for depthwise convolution - pointwise_activation (str): name of the activation function to be used for the pointwise conv. - Note that Conformer uses a special key `glu_` which is treated as the original default from - the paper. - """ - - def __init__( - self, d_model, kernel_size, norm_type='batch_norm', conv_context_size=None, pointwise_activation='glu_' - ): - super(ConformerConvolution, self).__init__() - assert (kernel_size - 1) % 2 == 0 - self.d_model = d_model - self.kernel_size = kernel_size - self.norm_type = norm_type - - if conv_context_size is None: - conv_context_size = (kernel_size - 1) // 2 - - if pointwise_activation in activation_registry: - self.pointwise_activation = activation_registry[pointwise_activation]() - dw_conv_input_dim = d_model * 2 - - if hasattr(self.pointwise_activation, 'inplace'): - self.pointwise_activation.inplace = True - else: - self.pointwise_activation = pointwise_activation - dw_conv_input_dim = d_model - - self.pointwise_conv1 = nn.Conv1d( - in_channels=d_model, out_channels=d_model * 2, kernel_size=1, stride=1, padding=0, bias=True - ) - - self.depthwise_conv = CausalConv1D( - in_channels=dw_conv_input_dim, - out_channels=dw_conv_input_dim, - kernel_size=kernel_size, - stride=1, - padding=conv_context_size, - groups=dw_conv_input_dim, - bias=True, - ) - - if norm_type == 'batch_norm': - self.batch_norm = nn.BatchNorm1d(dw_conv_input_dim) - elif norm_type == 'instance_norm': - self.batch_norm = nn.InstanceNorm1d(dw_conv_input_dim) - elif norm_type == 'layer_norm': - self.batch_norm = nn.LayerNorm(dw_conv_input_dim) - elif norm_type == 'fused_batch_norm': - self.batch_norm = FusedBatchNorm1d(dw_conv_input_dim) - elif norm_type.startswith('group_norm'): - num_groups = int(norm_type.replace("group_norm", "")) - self.batch_norm = nn.GroupNorm(num_groups=num_groups, num_channels=d_model) - else: - raise ValueError(f"conv_norm_type={norm_type} is not valid!") - - self.activation = Swish() - self.pointwise_conv2 = nn.Conv1d( - in_channels=dw_conv_input_dim, out_channels=d_model, kernel_size=1, stride=1, padding=0, bias=True - ) - - def forward(self, x, pad_mask=None, cache=None): - x = x.transpose(1, 2) - x = self.pointwise_conv1(x) - - # Compute the activation function or use GLU for original Conformer - if self.pointwise_activation == 'glu_': - x = nn.functional.glu(x, dim=1) - else: - x = self.pointwise_activation(x) - - if 
pad_mask is not None: - x = x.float().masked_fill(pad_mask.unsqueeze(1), 0.0) - - x = self.depthwise_conv(x, cache=cache) - if cache is not None: - x, cache = x - - if self.norm_type == "layer_norm": - x = x.transpose(1, 2) - x = self.batch_norm(x) - x = x.transpose(1, 2) - else: - x = self.batch_norm(x) - - x = self.activation(x) - x = self.pointwise_conv2(x) - x = x.transpose(1, 2) - if cache is None: - return x - else: - return x, cache - - def reset_parameters_conv(self): - pw1_max = pw2_max = self.d_model ** -0.5 - dw_max = self.kernel_size ** -0.5 - - with torch.no_grad(): - nn.init.uniform_(self.pointwise_conv1.weight, -pw1_max, pw1_max) - nn.init.uniform_(self.pointwise_conv1.bias, -pw1_max, pw1_max) - nn.init.uniform_(self.pointwise_conv2.weight, -pw2_max, pw2_max) - nn.init.uniform_(self.pointwise_conv2.bias, -pw2_max, pw2_max) - nn.init.uniform_(self.depthwise_conv.weight, -dw_max, dw_max) - nn.init.uniform_(self.depthwise_conv.bias, -dw_max, dw_max) - - -class ConformerFeedForward(nn.Module): - """ - feed-forward module of Conformer model. - """ - - def __init__(self, d_model, d_ff, dropout, activation=Swish()): - super(ConformerFeedForward, self).__init__() - self.d_model = d_model - self.d_ff = d_ff - self.linear1 = nn.Linear(d_model, d_ff) - self.activation = activation - self.dropout = nn.Dropout(p=dropout) - self.linear2 = nn.Linear(d_ff, d_model) - - def forward(self, x): - x = self.linear1(x) - x = self.activation(x) - x = self.dropout(x) - x = self.linear2(x) - return x - - def reset_parameters_ff(self): - ffn1_max = self.d_model ** -0.5 - ffn2_max = self.d_ff ** -0.5 - with torch.no_grad(): - nn.init.uniform_(self.linear1.weight, -ffn1_max, ffn1_max) - nn.init.uniform_(self.linear1.bias, -ffn1_max, ffn1_max) - nn.init.uniform_(self.linear2.weight, -ffn2_max, ffn2_max) - nn.init.uniform_(self.linear2.bias, -ffn2_max, ffn2_max) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py deleted file mode 100644 index 5ed504fd9c453a6c00ebce1fd1696de39d0bf10e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_beam_decoding.py +++ /dev/null @@ -1,606 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
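The `ConformerFeedForward` block above is an ordinary two-layer MLP (Linear, Swish, dropout, Linear), which `ConformerLayer` adds back to the residual stream with a 0.5 weight (`fc_factor`) in the macaron style. A minimal stand-alone sketch of that half-step update, assuming `nn.SiLU` is an acceptable stand-in for Swish:

```python
import torch
from torch import nn

d_model, d_ff, dropout = 256, 1024, 0.1

# Plain-PyTorch equivalent of ConformerFeedForward: Linear -> Swish -> Dropout -> Linear.
feed_forward = nn.Sequential(
    nn.Linear(d_model, d_ff),
    nn.SiLU(),                       # SiLU computes x * sigmoid(x), i.e. Swish
    nn.Dropout(dropout),
    nn.Linear(d_ff, d_model),
)
norm = nn.LayerNorm(d_model)
drop = nn.Dropout(dropout)

x = torch.randn(4, 50, d_model)      # (batch, time, d_model)
# Macaron half-step residual, mirroring fc_factor = 0.5 in ConformerLayer.forward:
residual = x + 0.5 * drop(feed_forward(norm(x)))
assert residual.shape == x.shape
```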
- -import math -import os -from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Union - -import torch - -from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType -from nemo.utils import logging - -DEFAULT_TOKEN_OFFSET = 100 - - -def pack_hypotheses( - hypotheses: List[rnnt_utils.NBestHypotheses], logitlen: torch.Tensor, -) -> List[rnnt_utils.NBestHypotheses]: - - if logitlen is not None: - if hasattr(logitlen, 'cpu'): - logitlen_cpu = logitlen.to('cpu') - else: - logitlen_cpu = logitlen - - for idx, hyp in enumerate(hypotheses): # type: rnnt_utils.NBestHypotheses - for candidate_idx, cand in enumerate(hyp.n_best_hypotheses): - cand.y_sequence = torch.tensor(cand.y_sequence, dtype=torch.long) - - if logitlen is not None: - cand.length = logitlen_cpu[idx] - - if cand.dec_state is not None: - cand.dec_state = _states_to_device(cand.dec_state) - - return hypotheses - - -def _states_to_device(dec_state, device='cpu'): - if torch.is_tensor(dec_state): - dec_state = dec_state.to(device) - - elif isinstance(dec_state, (list, tuple)): - dec_state = tuple(_states_to_device(dec_i, device) for dec_i in dec_state) - - return dec_state - - -class AbstractBeamCTCInfer(Typing): - """A beam CTC decoder. - - Provides a common abstraction for sample level beam decoding. - - Args: - blank_id: int, index of the blank token. Can be 0 or len(vocabulary). - beam_size: int, size of the beam used in the underlying beam search engine. - - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "decoder_output": NeuralType(('B', 'T', 'D'), LogprobsType()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"predictions": [NeuralType(elements_type=HypothesisType())]} - - def __init__(self, blank_id: int, beam_size: int): - self.blank_id = blank_id - - if beam_size < 1: - raise ValueError("Beam search size cannot be less than 1!") - - self.beam_size = beam_size - - # Variables set by corresponding setter methods - self.vocab = None - self.decoding_type = None - self.tokenizer = None - - # Utility maps for vocabulary - self.vocab_index_map = None - self.index_vocab_map = None - - # Internal variable, used to prevent double reduction of consecutive tokens (ctc collapse) - self.override_fold_consecutive_value = None - - def set_vocabulary(self, vocab: List[str]): - """ - Set the vocabulary of the decoding framework. - - Args: - vocab: List of str. Each token corresponds to its location in the vocabulary emitted by the model. - Note that this vocabulary must NOT contain the "BLANK" token. - """ - self.vocab = vocab - self.vocab_index_map = {v: i for i, v in enumerate(vocab)} - self.index_vocab_map = {i: v for i, v in enumerate(vocab)} - - def set_decoding_type(self, decoding_type: str): - """ - Sets the decoding type of the framework. Can support either char or subword models. - - Args: - decoding_type: Str corresponding to decoding type. Only supports "char" and "subword". - """ - decoding_type = decoding_type.lower() - supported_types = ['char', 'subword'] - - if decoding_type not in supported_types: - raise ValueError( - f"Unsupported decoding type. 
Supported types = {supported_types}.\n" f"Given = {decoding_type}" - ) - - self.decoding_type = decoding_type - - def set_tokenizer(self, tokenizer: TokenizerSpec): - """ - Set the tokenizer of the decoding framework. - - Args: - tokenizer: NeMo tokenizer object, which inherits from TokenizerSpec. - """ - self.tokenizer = tokenizer - - @typecheck() - def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, - ) -> Tuple[List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]]: - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. - - Args: - decoder_output: A tensor of size (batch, timesteps, features) or (batch, timesteps) (each timestep is a label). - decoder_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). - """ - raise NotImplementedError() - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -class BeamCTCInfer(AbstractBeamCTCInfer): - """A greedy CTC decoder. - - Provides a common abstraction for sample level and batch level greedy decoding. - - Args: - blank_index: int index of the blank token. Can be 0 or len(vocabulary). - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `logprobs` is a torch.Tensors. - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrite intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - - """ - - def __init__( - self, - blank_id: int, - beam_size: int, - search_type: str = "default", - return_best_hypothesis: bool = True, - preserve_alignments: bool = False, - compute_timestamps: bool = False, - beam_alpha: float = 1.0, - beam_beta: float = 0.0, - kenlm_path: str = None, - flashlight_cfg: Optional['FlashlightConfig'] = None, - pyctcdecode_cfg: Optional['PyCTCDecodeConfig'] = None, - ): - super().__init__(blank_id=blank_id, beam_size=beam_size) - - self.search_type = search_type - self.return_best_hypothesis = return_best_hypothesis - self.preserve_alignments = preserve_alignments - self.compute_timestamps = compute_timestamps - - if self.compute_timestamps: - raise ValueError(f"Currently this flag is not supported for beam search algorithms.") - - self.vocab = None # This must be set by specific method by user before calling forward() ! 
- - if search_type == "default" or search_type == "nemo": - self.search_algorithm = self.default_beam_search - elif search_type == "pyctcdecode": - self.search_algorithm = self._pyctcdecode_beam_search - elif search_type == "flashlight": - self.search_algorithm = self.flashlight_beam_search - else: - raise NotImplementedError( - f"The search type ({search_type}) supplied is not supported!\n" - f"Please use one of : (default, nemo, pyctcdecode)" - ) - - # Log the beam search algorithm - logging.info(f"Beam search algorithm: {search_type}") - - self.beam_alpha = beam_alpha - self.beam_beta = beam_beta - - # Default beam search args - self.kenlm_path = kenlm_path - - # PyCTCDecode params - if pyctcdecode_cfg is None: - pyctcdecode_cfg = PyCTCDecodeConfig() - self.pyctcdecode_cfg = pyctcdecode_cfg # type: PyCTCDecodeConfig - - if flashlight_cfg is None: - flashlight_cfg = FlashlightConfig() - self.flashlight_cfg = flashlight_cfg - - # Default beam search scorer functions - self.default_beam_scorer = None - self.pyctcdecode_beam_scorer = None - self.flashlight_beam_scorer = None - self.token_offset = 0 - - @typecheck() - def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, - ) -> Tuple[List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]]: - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. - - Args: - decoder_output: A tensor of size (batch, timesteps, features). - decoder_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). - """ - if self.vocab is None: - raise RuntimeError("Please set the vocabulary with `set_vocabulary()` before calling this function.") - - if self.decoding_type is None: - raise ValueError("Please set the decoding type with `set_decoding_type()` before calling this function.") - - with torch.no_grad(), torch.inference_mode(): - # Process each sequence independently - prediction_tensor = decoder_output - - if prediction_tensor.ndim != 3: - raise ValueError( - f"`decoder_output` must be a tensor of shape [B, T, V] (log probs, float). " - f"Provided shape = {prediction_tensor.shape}" - ) - - # determine type of input - logprobs or labels - out_len = decoder_lengths if decoder_lengths is not None else None - hypotheses = self.search_algorithm(prediction_tensor, out_len) - - # Pack results into Hypotheses - packed_result = pack_hypotheses(hypotheses, decoder_lengths) - - # Pack the result - if self.return_best_hypothesis and isinstance(packed_result[0], rnnt_utils.NBestHypotheses): - packed_result = [res.n_best_hypotheses[0] for res in packed_result] # type: Hypothesis - - return (packed_result,) - - @torch.no_grad() - def default_beam_search( - self, x: torch.Tensor, out_len: torch.Tensor - ) -> List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]: - """ - Open Seq2Seq Beam Search Algorithm (DeepSpeed) - - Args: - x: Tensor of shape [B, T, V+1], where B is the batch size, T is the maximum sequence length, - and V is the vocabulary size. The tensor contains log-probabilities. - out_len: Tensor of shape [B], contains lengths of each sequence in the batch. - - Returns: - A list of NBestHypotheses objects, one for each sequence in the batch. - """ - if self.compute_timestamps: - raise ValueError( - f"Beam Search with strategy `{self.search_type}` does not support time stamp calculation!" 
- ) - - if self.default_beam_scorer is None: - # Check for filepath - if self.kenlm_path is None or not os.path.exists(self.kenlm_path): - raise FileNotFoundError( - f"KenLM binary file not found at : {self.kenlm_path}. " - f"Please set a valid path in the decoding config." - ) - - # perform token offset for subword models - if self.decoding_type == 'subword': - vocab = [chr(idx + self.token_offset) for idx in range(len(self.vocab))] - else: - # char models - vocab = self.vocab - - # Must import at runtime to avoid circular dependency due to module level import. - from nemo.collections.asr.modules.beam_search_decoder import BeamSearchDecoderWithLM - - self.default_beam_scorer = BeamSearchDecoderWithLM( - vocab=vocab, - lm_path=self.kenlm_path, - beam_width=self.beam_size, - alpha=self.beam_alpha, - beta=self.beam_beta, - num_cpus=max(1, os.cpu_count()), - input_tensor=False, - ) - - x = x.to('cpu') - - with typecheck.disable_checks(): - data = [x[sample_id, : out_len[sample_id], :].softmax(dim=-1) for sample_id in range(len(x))] - beams_batch = self.default_beam_scorer.forward(log_probs=data, log_probs_length=None) - - # For each sample in the batch - nbest_hypotheses = [] - for beams_idx, beams in enumerate(beams_batch): - # For each beam candidate / hypothesis in each sample - hypotheses = [] - for candidate_idx, candidate in enumerate(beams): - hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None - ) - - # For subword encoding, NeMo will double encode the subword (multiple tokens) into a - # singular unicode id. In doing so, we preserve the semantic of the unicode token, and - # compress the size of the final KenLM ARPA / Binary file. - # In order to do double encoding, we shift the subword by some token offset. - # This step is ignored for character based models. - if self.decoding_type == 'subword': - pred_token_ids = [ord(c) - self.token_offset for c in candidate[1]] - else: - # Char models - pred_token_ids = [self.vocab_index_map[c] for c in candidate[1]] - - # We preserve the token ids and the score for this hypothesis - hypothesis.y_sequence = pred_token_ids - hypothesis.score = candidate[0] - - # If alignment must be preserved, we preserve a view of the output logprobs. - # Note this view is shared amongst all beams within the sample, be sure to clone it if you - # require specific processing for each sample in the beam. - # This is done to preserve memory. - if self.preserve_alignments: - hypothesis.alignments = x[beams_idx][: out_len[beams_idx]] - - hypotheses.append(hypothesis) - - # Wrap the result in NBestHypothesis. - hypotheses = rnnt_utils.NBestHypotheses(hypotheses) - nbest_hypotheses.append(hypotheses) - - return nbest_hypotheses - - @torch.no_grad() - def _pyctcdecode_beam_search( - self, x: torch.Tensor, out_len: torch.Tensor - ) -> List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]: - """ - PyCTCDecode Beam Search Algorithm. Should support Char and Subword models. - - Args: - x: Tensor of shape [B, T, V+1], where B is the batch size, T is the maximum sequence length, - and V is the vocabulary size. The tensor contains log-probabilities. - out_len: Tensor of shape [B], contains lengths of each sequence in the batch. - - Returns: - A list of NBestHypotheses objects, one for each sequence in the batch. - """ - if self.compute_timestamps: - raise ValueError( - f"Beam Search with strategy `{self.search_type}` does not support time stamp calculation!" 
- ) - - try: - import pyctcdecode - except (ImportError, ModuleNotFoundError): - raise ImportError( - f"Could not load `pyctcdecode` library. Please install it from pip using :\n" - f"pip install --upgrade pyctcdecode" - ) - - if self.pyctcdecode_beam_scorer is None: - self.pyctcdecode_beam_scorer = pyctcdecode.build_ctcdecoder( - labels=self.vocab, kenlm_model_path=self.kenlm_path, alpha=self.beam_alpha, beta=self.beam_beta - ) # type: pyctcdecode.BeamSearchDecoderCTC - - x = x.to('cpu').numpy() - - with typecheck.disable_checks(): - beams_batch = [] - for sample_id in range(len(x)): - logprobs = x[sample_id, : out_len[sample_id], :] - result = self.pyctcdecode_beam_scorer.decode_beams( - logprobs, - beam_width=self.beam_size, - beam_prune_logp=self.pyctcdecode_cfg.beam_prune_logp, - token_min_logp=self.pyctcdecode_cfg.token_min_logp, - prune_history=self.pyctcdecode_cfg.prune_history, - hotwords=self.pyctcdecode_cfg.hotwords, - hotword_weight=self.pyctcdecode_cfg.hotword_weight, - lm_start_state=None, - ) # Output format: text, last_lm_state, text_frames, logit_score, lm_score - beams_batch.append(result) - - nbest_hypotheses = [] - for beams_idx, beams in enumerate(beams_batch): - hypotheses = [] - for candidate_idx, candidate in enumerate(beams): - # Candidate = (text, last_lm_state, text_frames, logit_score, lm_score) - hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None - ) - - # TODO: Requires token ids to be returned rather than text. - if self.decoding_type == 'subword': - if self.tokenizer is None: - raise ValueError("Tokenizer must be provided for subword decoding. Use set_tokenizer().") - - pred_token_ids = self.tokenizer.text_to_ids(candidate[0]) - else: - if self.vocab is None: - raise ValueError("Vocab must be provided for character decoding. Use set_vocab().") - - chars = list(candidate[0]) - pred_token_ids = [self.vocab_index_map[c] for c in chars] - - hypothesis.y_sequence = pred_token_ids - hypothesis.text = candidate[0] # text - hypothesis.score = candidate[4] # score - - # Inject word level timestamps - hypothesis.timestep = candidate[2] # text_frames - - if self.preserve_alignments: - hypothesis.alignments = torch.from_numpy(x[beams_idx][: out_len[beams_idx]]) - - hypotheses.append(hypothesis) - - hypotheses = rnnt_utils.NBestHypotheses(hypotheses) - nbest_hypotheses.append(hypotheses) - - return nbest_hypotheses - - @torch.no_grad() - def flashlight_beam_search( - self, x: torch.Tensor, out_len: torch.Tensor - ) -> List[Union[rnnt_utils.Hypothesis, rnnt_utils.NBestHypotheses]]: - """ - Flashlight Beam Search Algorithm. Should support Char and Subword models. - - Args: - x: Tensor of shape [B, T, V+1], where B is the batch size, T is the maximum sequence length, - and V is the vocabulary size. The tensor contains log-probabilities. - out_len: Tensor of shape [B], contains lengths of each sequence in the batch. - - Returns: - A list of NBestHypotheses objects, one for each sequence in the batch. - """ - if self.compute_timestamps: - raise ValueError( - f"Beam Search with strategy `{self.search_type}` does not support time stamp calculation!" - ) - - if self.flashlight_beam_scorer is None: - # Check for filepath - if self.kenlm_path is None or not os.path.exists(self.kenlm_path): - raise FileNotFoundError( - f"KenLM binary file not found at : {self.kenlm_path}. " - f"Please set a valid path in the decoding config." 
- ) - - # perform token offset for subword models - # if self.decoding_type == 'subword': - # vocab = [chr(idx + self.token_offset) for idx in range(len(self.vocab))] - # else: - # # char models - # vocab = self.vocab - - # Must import at runtime to avoid circular dependency due to module level import. - from nemo.collections.asr.modules.flashlight_decoder import FlashLightKenLMBeamSearchDecoder - - self.flashlight_beam_scorer = FlashLightKenLMBeamSearchDecoder( - lm_path=self.kenlm_path, - vocabulary=self.vocab, - tokenizer=self.tokenizer, - lexicon_path=self.flashlight_cfg.lexicon_path, - boost_path=self.flashlight_cfg.boost_path, - beam_size=self.beam_size, - beam_size_token=self.flashlight_cfg.beam_size_token, - beam_threshold=self.flashlight_cfg.beam_threshold, - lm_weight=self.beam_alpha, - word_score=self.beam_beta, - unk_weight=self.flashlight_cfg.unk_weight, - sil_weight=self.flashlight_cfg.sil_weight, - ) - - x = x.to('cpu') - - with typecheck.disable_checks(): - beams_batch = self.flashlight_beam_scorer.forward(log_probs=x) - - # For each sample in the batch - nbest_hypotheses = [] - for beams_idx, beams in enumerate(beams_batch): - # For each beam candidate / hypothesis in each sample - hypotheses = [] - for candidate_idx, candidate in enumerate(beams): - hypothesis = rnnt_utils.Hypothesis( - score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None - ) - - # We preserve the token ids and the score for this hypothesis - hypothesis.y_sequence = candidate['tokens'].tolist() - hypothesis.score = candidate['score'] - - # If alignment must be preserved, we preserve a view of the output logprobs. - # Note this view is shared amongst all beams within the sample, be sure to clone it if you - # require specific processing for each sample in the beam. - # This is done to preserve memory. - if self.preserve_alignments: - hypothesis.alignments = x[beams_idx][: out_len[beams_idx]] - - hypotheses.append(hypothesis) - - # Wrap the result in NBestHypothesis. 
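-            # All beam candidates for this sample are kept here; when
-            # `return_best_hypothesis` is True, forward() later keeps only the first
-            # (best) candidate from each NBestHypotheses object.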
- hypotheses = rnnt_utils.NBestHypotheses(hypotheses) - nbest_hypotheses.append(hypotheses) - - return nbest_hypotheses - - def set_decoding_type(self, decoding_type: str): - super().set_decoding_type(decoding_type) - - # Please check train_kenlm.py in scripts/asr_language_modeling/ to find out why we need - # TOKEN_OFFSET for BPE-based models - if self.decoding_type == 'subword': - self.token_offset = DEFAULT_TOKEN_OFFSET - - -@dataclass -class PyCTCDecodeConfig: - # These arguments cannot be imported from pyctcdecode (optional dependency) - # Therefore we copy the values explicitly - # Taken from pyctcdecode.constant - beam_prune_logp: float = -10.0 - token_min_logp: float = -5.0 - prune_history: bool = False - hotwords: Optional[List[str]] = None - hotword_weight: float = 10.0 - - -@dataclass -class FlashlightConfig: - lexicon_path: Optional[str] = None - boost_path: Optional[str] = None - beam_size_token: int = 16 - beam_threshold: float = 20.0 - unk_weight: float = -math.inf - sil_weight: float = 0.0 - - -@dataclass -class BeamCTCInferConfig: - beam_size: int - search_type: str = 'default' - preserve_alignments: bool = False - compute_timestamps: bool = False - return_best_hypothesis: bool = True - - beam_alpha: float = 1.0 - beam_beta: float = 0.0 - kenlm_path: Optional[str] = None - - flashlight_cfg: Optional[FlashlightConfig] = field(default_factory=lambda: FlashlightConfig()) - pyctcdecode_cfg: Optional[PyCTCDecodeConfig] = field(default_factory=lambda: PyCTCDecodeConfig()) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py deleted file mode 100644 index 686ef79cabad4696e396a8ad2c7d2ac784b2ccaf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
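-
-# Rough usage sketch for the greedy decoder defined below (illustrative only:
-# `log_probs` is assumed to be a [B, T, V] float tensor of log-probabilities,
-# `lengths` a [B] tensor of valid frame counts, and `vocabulary` the model's
-# token list; the blank index may be 0 or len(vocabulary), per the class docstring):
-#
-#   decoder = GreedyCTCInfer(blank_id=len(vocabulary))
-#   (hypotheses,) = decoder(decoder_output=log_probs, decoder_lengths=lengths)
-#   best_token_ids = hypotheses[0].y_sequence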
- -from dataclasses import dataclass, field -from typing import List, Optional - -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType -from nemo.utils import logging - - -def pack_hypotheses(hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor,) -> List[rnnt_utils.Hypothesis]: - - if logitlen is not None: - if hasattr(logitlen, 'cpu'): - logitlen_cpu = logitlen.to('cpu') - else: - logitlen_cpu = logitlen - - for idx, hyp in enumerate(hypotheses): # type: rnnt_utils.Hypothesis - hyp.y_sequence = torch.tensor(hyp.y_sequence, dtype=torch.long) - - if logitlen is not None: - hyp.length = logitlen_cpu[idx] - - if hyp.dec_state is not None: - hyp.dec_state = _states_to_device(hyp.dec_state) - - return hypotheses - - -def _states_to_device(dec_state, device='cpu'): - if torch.is_tensor(dec_state): - dec_state = dec_state.to(device) - - elif isinstance(dec_state, (list, tuple)): - dec_state = tuple(_states_to_device(dec_i, device) for dec_i in dec_state) - - return dec_state - - -class GreedyCTCInfer(Typing, ConfidenceMethodMixin): - """A greedy CTC decoder. - - Provides a common abstraction for sample level and batch level greedy decoding. - - Args: - blank_index: int index of the blank token. Can be 0 or len(vocabulary). - preserve_alignments: Bool flag which preserves the history of logprobs generated during - decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `logprobs` in it. Here, `logprobs` is a torch.Tensors. - compute_timestamps: A bool flag, which determines whether to compute the character/subword, or - word based timestamp mapping the output log-probabilities to discrite intervals of timestamps. - The timestamps will be available in the returned Hypothesis.timestep as a dictionary. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding. When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. 
- Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - # Input can be of dimention - - # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels] - - return { - "decoder_output": NeuralType(None, LogprobsType()), - "decoder_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"predictions": [NeuralType(elements_type=HypothesisType())]} - - def __init__( - self, - blank_id: int, - preserve_alignments: bool = False, - compute_timestamps: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__() - - self.blank_id = blank_id - self.preserve_alignments = preserve_alignments - # we need timestamps to extract non-blank per-frame confidence - self.compute_timestamps = compute_timestamps | preserve_frame_confidence - self.preserve_frame_confidence = preserve_frame_confidence - - # set confidence calculation method - self._init_confidence_method(confidence_method_cfg) - - @typecheck() - def forward( - self, decoder_output: torch.Tensor, decoder_lengths: torch.Tensor, - ): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-repressively. - - Args: - decoder_output: A tensor of size (batch, timesteps, features) or (batch, timesteps) (each timestep is a label). - decoder_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). - """ - with torch.inference_mode(): - hypotheses = [] - # Process each sequence independently - prediction_cpu_tensor = decoder_output.cpu() - - if prediction_cpu_tensor.ndim < 2 or prediction_cpu_tensor.ndim > 3: - raise ValueError( - f"`decoder_output` must be a tensor of shape [B, T] (labels, int) or " - f"[B, T, V] (log probs, float). 
Provided shape = {prediction_cpu_tensor.shape}" - ) - - # determine type of input - logprobs or labels - if prediction_cpu_tensor.ndim == 2: # labels - greedy_decode = self._greedy_decode_labels - else: - greedy_decode = self._greedy_decode_logprobs - - for ind in range(prediction_cpu_tensor.shape[0]): - out_len = decoder_lengths[ind] if decoder_lengths is not None else None - hypothesis = greedy_decode(prediction_cpu_tensor[ind], out_len) - hypotheses.append(hypothesis) - - # Pack results into Hypotheses - packed_result = pack_hypotheses(hypotheses, decoder_lengths) - - return (packed_result,) - - @torch.no_grad() - def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: torch.Tensor): - # x: [T, D] - # out_len: [seq_len] - - # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction = x.detach().cpu() - - if out_len is not None: - prediction = prediction[:out_len] - - prediction_logprobs, prediction_labels = prediction.max(dim=-1) - - non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() - hypothesis.score = (prediction_logprobs[non_blank_ids]).sum() - - if self.preserve_alignments: - # Preserve the logprobs, as well as labels after argmax - hypothesis.alignments = (prediction.clone(), prediction_labels.clone()) - - if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() - - if self.preserve_frame_confidence: - hypothesis.frame_confidence = self._get_confidence(prediction) - - return hypothesis - - @torch.no_grad() - def _greedy_decode_labels(self, x: torch.Tensor, out_len: torch.Tensor): - # x: [T] - # out_len: [seq_len] - - # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - prediction_labels = x.detach().cpu() - - if out_len is not None: - prediction_labels = prediction_labels[:out_len] - - non_blank_ids = prediction_labels != self.blank_id - hypothesis.y_sequence = prediction_labels.numpy().tolist() - hypothesis.score = -1.0 - - if self.preserve_alignments: - raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.") - - if self.compute_timestamps: - hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].numpy().tolist() - - if self.preserve_frame_confidence: - raise ValueError( - "Requested for per-frame confidence, but predictions provided were labels, not log probabilities." 
- ) - - return hypothesis - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -@dataclass -class GreedyCTCInferConfig: - preserve_alignments: bool = False - compute_timestamps: bool = False - preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) - - def __post_init__(self): - # OmegaConf.structured ensures that post_init check is always executed - self.confidence_method_cfg = OmegaConf.structured( - self.confidence_method_cfg - if isinstance(self.confidence_method_cfg, ConfidenceMethodConfig) - else ConfidenceMethodConfig(**self.confidence_method_cfg) - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/jasper.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/jasper.py deleted file mode 100644 index 900d0bd55a10c47cb8638cf5b93d9890854ed26b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/jasper.py +++ /dev/null @@ -1,1178 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Callable, Iterable, List, Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torch.nn.init import _calculate_correct_fan -from torch.nn.modules.utils import _single - -from nemo.collections.common.parts.utils import activation_registry -from nemo.core.classes.mixins import AccessMixin -from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin -from nemo.utils import logging - -try: - from pytorch_quantization import calib - from pytorch_quantization import nn as quant_nn - from pytorch_quantization import quant_modules - from pytorch_quantization.tensor_quant import QuantDescriptor - - PYTORCH_QUANTIZATION_AVAILABLE = True -except ImportError: - PYTORCH_QUANTIZATION_AVAILABLE = False - -jasper_activations = activation_registry - - -def tds_uniform_(tensor, mode='fan_in'): - """ - Uniform Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf) - Normalized to - - - .. math:: - \\text{bound} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}} - - Args: - tensor: an n-dimensional `torch.Tensor` - mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` - preserves the magnitude of the variance of the weights in the - forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the - backwards pass. 
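-
-    For example, with ``fan_in = 64`` the bound is ``2 / sqrt(64) = 0.25``, so
-    weights are drawn uniformly from ``[-0.25, 0.25]``.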
- """ - fan = _calculate_correct_fan(tensor, mode) - gain = 2.0 # sqrt(4.0) = 2 - std = gain / math.sqrt(fan) # sqrt(4.0 / fan_in) - bound = std # Calculate uniform bounds from standard deviation - with torch.no_grad(): - return tensor.uniform_(-bound, bound) - - -def tds_normal_(tensor, mode='fan_in'): - """ - Normal Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf) - Normalized to - - - .. math:: - \\text{bound} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}} - - Args: - tensor: an n-dimensional `torch.Tensor` - mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` - preserves the magnitude of the variance of the weights in the - forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the - backwards pass. - """ - fan = _calculate_correct_fan(tensor, mode) - gain = 2.0 - std = gain / math.sqrt(fan) # sqrt(4.0 / fan_in) - bound = std # Calculate uniform bounds from standard deviation - with torch.no_grad(): - return tensor.normal_(0.0, bound) - - -def init_weights(m, mode: Optional[str] = 'xavier_uniform'): - if isinstance(m, MaskedConv1d): - init_weights(m.conv, mode) - if isinstance(m, (nn.Conv1d, nn.Linear)): - if mode is not None: - if mode == 'xavier_uniform': - nn.init.xavier_uniform_(m.weight, gain=1.0) - elif mode == 'xavier_normal': - nn.init.xavier_normal_(m.weight, gain=1.0) - elif mode == 'kaiming_uniform': - nn.init.kaiming_uniform_(m.weight, nonlinearity="relu") - elif mode == 'kaiming_normal': - nn.init.kaiming_normal_(m.weight, nonlinearity="relu") - elif mode == 'tds_uniform': - tds_uniform_(m.weight) - elif mode == 'tds_normal': - tds_normal_(m.weight) - else: - raise ValueError("Unknown Initialization mode: {0}".format(mode)) - elif isinstance(m, nn.BatchNorm1d): - if m.track_running_stats: - m.running_mean.zero_() - m.running_var.fill_(1) - m.num_batches_tracked.zero_() - if m.affine: - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - - -def compute_new_kernel_size(kernel_size, kernel_width): - new_kernel_size = max(int(kernel_size * kernel_width), 1) - # If kernel is even shape, round up to make it odd - if new_kernel_size % 2 == 0: - new_kernel_size += 1 - return new_kernel_size - - -def get_same_padding(kernel_size, stride, dilation) -> int: - if stride > 1 and dilation > 1: - raise ValueError("Only stride OR dilation may be greater than 1") - return (dilation * (kernel_size - 1)) // 2 - - -def get_asymtric_padding(kernel_size, stride, dilation, future_context): - if stride > 1 and dilation > 1: - raise ValueError("Only stride OR dilation may be greater than 1") - - left_context = kernel_size - 1 - future_context - right_context = future_context - - symmetric_padding = get_same_padding(kernel_size, stride, dilation) - - if kernel_size <= future_context: - # kernel size is smaller than future context, equivalent to using entire context of kernel - # simply return symmetric padding for this scenario - logging.warning( - f"Future context window is larger than the kernel size!\n" - f"Left context = {left_context} | Right context = greater than {right_context} | " - f"Kernel size = {kernel_size}\n" - f"Switching to symmetric padding (left context = right context = {symmetric_padding})" - ) - return symmetric_padding - - if left_context < symmetric_padding: - logging.warning( - f"Future context window is larger than half the kernel size!\n" - f"Conv layer therefore uses more future information than past 
to compute its output!\n" - f"Left context = {left_context} | Right context = {right_context} | " - f"Kernel size = {kernel_size}" - ) - - if dilation > 1: - left_context = dilation * kernel_size - 1 - dilation * future_context - right_context = dilation * future_context - return (left_context, right_context) - - return (left_context, right_context) - - -@torch.jit.script -def _se_pool_step_script_infer(x: torch.Tensor, context_window: int, mask: torch.Tensor): - """ - Calculates the masked average over padded limited context segment during inference mode. - - Args: - x: Input tensor. Shape = [B, C, T] - context_window: Integer context window, must be 0 or greater. - mask: Mask tensor, 1 represents value index, 0 represents padded index. Shape = [B, 1, T]. - - Returns: - A tensor reduced via masked average pool over some limited context. Shape = [B, C, 1] - """ - timesteps = x.shape[-1] - if timesteps < context_window: - y = torch.sum(x, dim=-1, keepdim=True) / mask.sum(dim=-1, keepdim=True).to(x.dtype) - else: - # << During inference prefer to use entire context >> - # x = x[:, :, :context_window] # [B, C, context_window] - # mask = mask[:, :, :context_window] # [B, 1, context_window] - # - # mask = mask.sum(dim=-1, keepdim=True).to(x.dtype) # [B, C, 1] - # y = x.sum(dim=-1, keepdim=True) # [B, 1, 1] - # y = y / (mask + 1e-8) # [B, C, 1] - y = torch.sum(x, dim=-1, keepdim=True) / mask.sum(dim=-1, keepdim=True).to(x.dtype) - - return y - - -@torch.jit.script -def _se_pool_step_script_train(x: torch.Tensor, context_window: int, mask: torch.Tensor): - """ - Calculates the masked average over padded limited context segment during training mode. - Randomly slices a segment of length `context_window` from signal+padded input tensor across all channels and - uses it for computing masked limited context. - - Args: - x: Input tensor. Shape = [B, C, T] - context_window: Integer context window, must be 0 or greater. - mask: Mask tensor, 1 represents value index, 0 represents padded index. Shape = [B, 1, T]. - - Returns: - A tensor reduced via masked average pool over some limited context. 
Shape = [B, C, 1] - """ - timesteps = x.shape[-1] - if timesteps < context_window: - y = torch.sum(x, dim=-1, keepdim=True) / mask.sum(dim=-1, keepdim=True).to(x.dtype) - else: - start_idx = torch.randint(0, timesteps - context_window, size=[1], dtype=torch.int32)[0] - x = x[:, :, start_idx : (start_idx + context_window)] # [B, C, context_window] - mask = mask[:, :, start_idx : (start_idx + context_window)] # [B, 1, context_window] - - mask = mask.sum(dim=-1, keepdim=True).to(x.dtype) # [B, C, 1] - y = x.sum(dim=-1, keepdim=True) # [B, 1, 1] - y = y / (mask + 1e-8) # [B, C, 1] - - return y - - -@torch.jit.script -def _masked_conv_init_lens(lens: torch.Tensor, current_maxlen: int, original_maxlen: torch.Tensor): - if current_maxlen > original_maxlen: - new_lens = torch.arange(current_maxlen) - new_max_lens = torch.tensor(current_maxlen) - else: - new_lens = lens - new_max_lens = original_maxlen - return new_lens, new_max_lens - - -class MaskedConv1d(nn.Module): - __constants__ = ["use_conv_mask", "real_out_channels", "heads"] - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - heads=-1, - bias=False, - use_mask=True, - quantize=False, - ): - super(MaskedConv1d, self).__init__() - - if not (heads == -1 or groups == in_channels): - raise ValueError("Only use heads for depthwise convolutions") - - self.real_out_channels = out_channels - if heads != -1: - in_channels = heads - out_channels = heads - groups = heads - - # preserve original padding - self._padding = padding - - # if padding is a tuple/list, it is considered as asymmetric padding - if type(padding) in (tuple, list): - self.pad_layer = nn.ConstantPad1d(padding, value=0.0) - # reset padding for conv since pad_layer will handle this - padding = 0 - else: - self.pad_layer = None - - if PYTORCH_QUANTIZATION_AVAILABLE and quantize: - self.conv = quant_nn.QuantConv1d( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - elif not PYTORCH_QUANTIZATION_AVAILABLE and quantize: - raise ImportError( - "pytorch-quantization is not installed. Install from " - "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." - ) - else: - self.conv = nn.Conv1d( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - self.use_mask = use_mask - self.heads = heads - - # Calculations for "same" padding cache - self.same_padding = (self.conv.stride[0] == 1) and ( - 2 * self.conv.padding[0] == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - ) - if self.pad_layer is None: - self.same_padding_asymmetric = False - else: - self.same_padding_asymmetric = (self.conv.stride[0] == 1) and ( - sum(self._padding) == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - ) - - # `self.lens` caches consecutive integers from 0 to `self.max_len` that are used to compute the mask for a - # batch. Recomputed to bigger size as needed. Stored on a device of the latest batch lens. 
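-        # When masking is enabled, `mask_input` zeroes padded timesteps before the
-        # convolution and `get_seq_len` maps input lengths to output lengths via the
-        # usual formula: out = (in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1.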
- if self.use_mask: - self.max_len = torch.tensor(0) - self.lens = torch.tensor(0) - - def get_seq_len(self, lens): - if self.same_padding or self.same_padding_asymmetric: - return lens - - if self.pad_layer is None: - return ( - torch.div( - lens + 2 * self.conv.padding[0] - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1, - self.conv.stride[0], - rounding_mode='trunc', - ) - + 1 - ) - else: - return ( - torch.div( - lens + sum(self._padding) - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1, - self.conv.stride[0], - rounding_mode='trunc', - ) - + 1 - ) - - def forward(self, x, lens): - if self.use_mask: - # Generally will be called by ConvASREncoder, but kept as single gpu backup. - if x.size(2) > self.max_len: - self.update_masked_length(x.size(2), device=lens.device) - x = self.mask_input(x, lens) - - # Update lengths - lens = self.get_seq_len(lens) - - # asymmtric pad if necessary - if self.pad_layer is not None: - x = self.pad_layer(x) - - sh = x.shape - if self.heads != -1: - x = x.view(-1, self.heads, sh[-1]) - - out = self.conv(x) - - if self.heads != -1: - out = out.view(sh[0], self.real_out_channels, -1) - - return out, lens - - def update_masked_length(self, max_len, seq_range=None, device=None): - if seq_range is None: - self.lens, self.max_len = _masked_conv_init_lens(self.lens, max_len, self.max_len) - self.lens = self.lens.to(device) - else: - self.lens = seq_range - self.max_len = torch.tensor(max_len) - - def mask_input(self, x, lens): - max_len = x.size(2) - mask = self.lens[:max_len].unsqueeze(0).to(lens.device) < lens.unsqueeze(1) - x = x * mask.unsqueeze(1).to(device=x.device) - return x - - -class GroupShuffle(nn.Module): - def __init__(self, groups, channels): - super(GroupShuffle, self).__init__() - - self.groups = groups - self.channels_per_group = channels // groups - - def forward(self, x): - sh = x.shape - - x = x.view(-1, self.groups, self.channels_per_group, sh[-1]) - - x = torch.transpose(x, 1, 2).contiguous() - - x = x.view(-1, self.groups * self.channels_per_group, sh[-1]) - - return x - - -class SqueezeExcite(nn.Module): - def __init__( - self, - channels: int, - reduction_ratio: int, - context_window: int = -1, - interpolation_mode: str = 'nearest', - activation: Optional[Callable] = None, - quantize: bool = False, - ): - """ - Squeeze-and-Excitation sub-module. - - Args: - channels: Input number of channels. - reduction_ratio: Reduction ratio for "squeeze" layer. - context_window: Integer number of timesteps that the context - should be computed over, using stride 1 average pooling. - If value < 1, then global context is computed. - interpolation_mode: Interpolation mode of timestep dimension. - Used only if context window is > 1. - The modes available for resizing are: `nearest`, `linear` (3D-only), - `bilinear`, `area` - activation: Intermediate activation function used. Must be a - callable activation function. - """ - super(SqueezeExcite, self).__init__() - self.interpolation_mode = interpolation_mode - self._quantize = quantize - - self.pool = None # prepare a placeholder which will be updated - - if activation is None: - activation = nn.ReLU(inplace=True) - - if PYTORCH_QUANTIZATION_AVAILABLE and quantize: - self.fc = nn.Sequential( - quant_nn.QuantLinear(channels, channels // reduction_ratio, bias=False), - activation, - quant_nn.QuantLinear(channels // reduction_ratio, channels, bias=False), - ) - elif not PYTORCH_QUANTIZATION_AVAILABLE and quantize: - raise ImportError( - "pytorch-quantization is not installed. 
Install from " - "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." - ) - else: - self.fc = nn.Sequential( - nn.Linear(channels, channels // reduction_ratio, bias=False), - activation, - nn.Linear(channels // reduction_ratio, channels, bias=False), - ) - self.gap = nn.AdaptiveAvgPool1d(1) - - # Set default context window - self.change_context_window(context_window=context_window) - - # Set default max sequence length - self.set_max_len(16) - - def forward(self, x, lengths): - return self.forward_for_export(x, lengths) - - def forward_for_export(self, x, lengths): - # The use of negative indices on the transpose allow for expanded SqueezeExcite - max_len = x.shape[-1] - if max_len > self.max_len: - self.set_max_len(max_len) - dtype = x.dtype - # Computes in float32 to avoid instabilities during training with AMP. - with torch.cuda.amp.autocast(enabled=False): - # Create sample mask - 1 represents value, 0 represents pad - mask = self.make_pad_mask(lengths, max_audio_length=max_len, device=x.device) - mask = ~mask # 0 represents value, 1 represents pad - x = x.float() # For stable AMP, SE must be computed at fp32. - x.masked_fill_(mask, 0.0) # mask padded values explicitly to 0 - y = self._se_pool_step(x, mask) # [B, C, 1] - y = y.transpose(1, -1) # [B, 1, C] - y = self.fc(y) # [B, 1, C] - y = y.transpose(1, -1) # [B, C, 1] - - # Note: Keep for future, in case we improve WER from doing so. - # if self.context_window >= 0: - # y = F.interpolate(y, size=x.shape[-1], mode=self.interpolation_mode) - - y = torch.sigmoid(y) - y = x * y - return y, lengths - - def _se_pool_step(self, x, mask): - # Negate mask back to represent 1 for signal and 0 for padded timestep. - mask = ~mask - - if self.context_window < 0: - # [B, C, 1] - Masked Average over value + padding. - y = torch.sum(x, dim=-1, keepdim=True) / mask.sum(dim=-1, keepdim=True).type(x.dtype) - else: - # [B, C, 1] - Masked Average over value + padding with limited context. - # During training randomly subsegments a context_window chunk of timesteps. - # During inference selects only the first context_window chunk of timesteps. - if self.training: - y = _se_pool_step_script_train(x, self.context_window, mask) - else: - y = _se_pool_step_script_infer(x, self.context_window, mask) - return y - - def set_max_len(self, max_len, seq_range=None): - """ Sets maximum input length. - Pre-calculates internal seq_range mask. - """ - self.max_len = max_len - if seq_range is None: - device = next(self.parameters()).device - seq_range = torch.arange(0, self.max_len, device=device) - if hasattr(self, 'seq_range'): - self.seq_range = seq_range - else: - self.register_buffer('seq_range', seq_range, persistent=False) - - def make_pad_mask(self, seq_lens, max_audio_length, device=None): - """Make masking for padding.""" - if device and self.seq_range.device != device: - self.seq_range = self.seq_range.to(device) - if self.seq_range.device != seq_lens.device: - seq_lens = seq_lens.to(self.seq_range.device) - - mask = self.seq_range[:max_audio_length].expand(seq_lens.size(0), -1) < seq_lens.unsqueeze(-1) # [B, T]; bool - mask = mask.unsqueeze(1) # [B, 1, T] - - return mask - - def change_context_window(self, context_window: int): - """ - Update the context window of the SqueezeExcitation module, in-place if possible. - - Will update the pooling layer to either nn.AdaptiveAvgPool1d() (for global SE) or nn.AvgPool1d() - (for limited context SE). 
- - If only the context window is changing but still a limited SE context block - then - the earlier instance of nn.AvgPool1d() will be updated. - - Args: - context_window: An integer representing the number of input timeframes that will be used - to compute the context. Each timeframe corresponds to a single window stride of the - STFT features. - - Say the window_stride = 0.01s, then a context window of 128 represents 128 * 0.01 s - of context to compute the Squeeze step. - """ - if hasattr(self, 'context_window'): - logging.info(f"Changing Squeeze-Excitation context window from {self.context_window} to {context_window}") - - self.context_window = context_window - - -class JasperBlock(nn.Module, AdapterModuleMixin, AccessMixin): - """ - Constructs a single "Jasper" block. With modified parameters, also constructs other blocks for models - such as `QuartzNet` and `Citrinet`. - - - For `Jasper` : `separable` flag should be False - - For `QuartzNet` : `separable` flag should be True - - For `Citrinet` : `separable` flag and `se` flag should be True - - Note that above are general distinctions, each model has intricate differences that expand over - multiple such blocks. - - For further information about the differences between models which use JasperBlock, please review - the configs for ASR models found in the ASR examples directory. - - Args: - inplanes: Number of input channels. - planes: Number of output channels. - repeat: Number of repeated sub-blocks (R) for this block. - kernel_size: Convolution kernel size across all repeated sub-blocks. - kernel_size_factor: Floating point scale value that is multiplied with kernel size, - then rounded down to nearest odd integer to compose the kernel size. Defaults to 1.0. - stride: Stride of the convolutional layers. - dilation: Integer which defined dilation factor of kernel. Note that when dilation > 1, stride must - be equal to 1. - padding: String representing type of padding. Currently only supports "same" padding, - which symmetrically pads the input tensor with zeros. - dropout: Floating point value, determins percentage of output that is zeroed out. - activation: String representing activation functions. Valid activation functions are : - {"hardtanh": nn.Hardtanh, "relu": nn.ReLU, "selu": nn.SELU, "swish": Swish}. - Defaults to "relu". - residual: Bool that determined whether a residual branch should be added or not. - All residual branches are constructed using a pointwise convolution kernel, that may or may not - perform strided convolution depending on the parameter `residual_mode`. - groups: Number of groups for Grouped Convolutions. Defaults to 1. - separable: Bool flag that describes whether Time-Channel depthwise separable convolution should be - constructed, or ordinary convolution should be constructed. - heads: Number of "heads" for the masked convolution. Defaults to -1, which disables it. - normalization: String that represents type of normalization performed. Can be one of - "batch", "group", "instance" or "layer" to compute BatchNorm1D, GroupNorm1D, InstanceNorm or - LayerNorm (which are special cases of GroupNorm1D). - norm_groups: Number of groups used for GroupNorm (if `normalization` == "group"). - residual_mode: String argument which describes whether the residual branch should be simply - added ("add") or should first stride, then add ("stride_add"). Required when performing stride on - parallel branch as well as utilizing residual add. - residual_panes: Number of residual panes, used for Jasper-DR models. 
Please refer to the paper. - conv_mask: Bool flag which determines whether to utilize masked convolutions or not. In general, - it should be set to True. - se: Bool flag that determines whether Squeeze-and-Excitation layer should be used. - se_reduction_ratio: Integer value, which determines to what extend the hidden dimension of the SE - intermediate step should be reduced. Larger values reduce number of parameters, but also limit - the effectiveness of SE layers. - se_context_window: Integer value determining the number of timesteps that should be utilized in order - to compute the averaged context window. Defaults to -1, which means it uses global context - such - that all timesteps are averaged. If any positive integer is used, it will utilize limited context - window of that size. - se_interpolation_mode: String used for interpolation mode of timestep dimension for SE blocks. - Used only if context window is > 1. - The modes available for resizing are: `nearest`, `linear` (3D-only), - `bilinear`, `area`. - stride_last: Bool flag that determines whether all repeated blocks should stride at once, - (stride of S^R when this flag is False) or just the last repeated block should stride - (stride of S when this flag is True). - future_context: Int value that determins how many "right" / "future" context frames will be utilized - when calculating the output of the conv kernel. All calculations are done for odd kernel sizes only. - - By default, this is -1, which is recomputed as the symmetric padding case. - - When future_context >= 0, will compute the asymmetric padding as follows : - (left context, right context) = [K - 1 - future_context, future_context] - - Determining an exact formula to limit future context is dependent on global layout of the model. - As such, we provide both "local" and "global" guidelines below. - - Local context limit (should always be enforced) - - future context should be <= half the kernel size for any given layer - - future context > kernel size defaults to symmetric kernel - - future context of layer = number of future frames * width of each frame (dependent on stride) - - Global context limit (should be carefully considered) - - future context should be layed out in an ever reducing pattern. Initial layers should restrict - future context less than later layers, since shallow depth (and reduced stride) means each frame uses - less amounts of future context. - - Beyond a certain point, future context should remain static for a given stride level. This is - the upper bound of the amount of future context that can be provided to the model on a global scale. - - future context is calculated (roughly) as - (2 ^ stride) * (K // 2) number of future frames. - This resultant value should be bound to some global maximum number of future seconds of audio (in ms). - - Note: In the special case where K < future_context, it is assumed that the kernel is too small to limit - its future context, so symmetric padding is used instead. - - Note: There is no explicit limitation on the amount of future context used, as long as - K > future_context constraint is maintained. This might lead to cases where future_context is - more than half the actual kernel size K! In such cases, the conv layer is utilizing more of the future - context than its current and past context to compute the output. While this is possible to do, - it is not recommended and the layer will raise a warning to notify the user of such cases. - It is advised to simply use symmetric padding for such cases. 
- - Example: - Say we have a model that performs 8x stride and receives spectrogram frames with stride of 0.01s. - Say we wish to upper bound future context to 80 ms. - - Layer ID, Kernel Size, Stride, Future Context, Global Context - 0, K=5, S=1, FC=8, GC= 2 * (2^0) = 2 * 0.01 ms (special case, K < FC so use symmetric pad) - 1, K=7, S=1, FC=3, GC= 3 * (2^0) = 3 * 0.01 ms (note that symmetric pad here uses 3 FC frames!) - 2, K=11, S=2, FC=4, GC= 4 * (2^1) = 8 * 0.01 ms (note that symmetric pad here uses 5 FC frames!) - 3, K=15, S=1, FC=4, GC= 4 * (2^1) = 8 * 0.01 ms (note that symmetric pad here uses 7 FC frames!) - 4, K=21, S=2, FC=2, GC= 2 * (2^2) = 8 * 0.01 ms (note that symmetric pad here uses 10 FC frames!) - 5, K=25, S=2, FC=1, GC= 1 * (2^3) = 8 * 0.01 ms (note that symmetric pad here uses 14 FC frames!) - 6, K=29, S=1, FC=1, GC= 1 * (2^3) = 8 * 0.01 ms ... - quantize: Bool flag whether to quantize the Convolutional blocks. - layer_idx (int, optional): can be specified to allow layer output capture for InterCTC loss. Defaults to -1. - """ - - __constants__ = ["conv_mask", "separable", "residual_mode", "res", "mconv"] - - def __init__( - self, - inplanes, - planes, - repeat=3, - kernel_size=11, - kernel_size_factor=1, - stride=1, - dilation=1, - padding='same', - dropout=0.2, - activation=None, - residual=True, - groups=1, - separable=False, - heads=-1, - normalization="batch", - norm_groups=1, - residual_mode='add', - residual_panes=[], - conv_mask=False, - se=False, - se_reduction_ratio=16, - se_context_window=-1, - se_interpolation_mode='nearest', - stride_last=False, - future_context: int = -1, - quantize=False, - layer_idx: int = -1, # only used for capturing tensors for interctc loss - ): - super(JasperBlock, self).__init__() - - if padding != "same": - raise ValueError("currently only 'same' padding is supported") - - kernel_size_factor = float(kernel_size_factor) - if isinstance(kernel_size, Iterable): - kernel_size = [compute_new_kernel_size(k, kernel_size_factor) for k in kernel_size] - else: - kernel_size = [compute_new_kernel_size(kernel_size, kernel_size_factor)] - - if future_context < 0: - padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0]) - else: - padding_val = get_asymtric_padding(kernel_size[0], stride[0], dilation[0], future_context) - - self.inplanes = inplanes - self.planes = planes - self.conv_mask = conv_mask - self.separable = separable - self.residual_mode = residual_mode - self.se = se - self.quantize = quantize - self.layer_idx = layer_idx - # will be set in self.forward() if defined in AccessMixin config - self.interctc_should_capture = None - - inplanes_loop = inplanes - conv = nn.ModuleList() - - for _ in range(repeat - 1): - # Stride last means only the last convolution in block will have stride - if stride_last: - stride_val = [1] - else: - stride_val = stride - - conv.extend( - self._get_conv_bn_layer( - inplanes_loop, - planes, - kernel_size=kernel_size, - stride=stride_val, - dilation=dilation, - padding=padding_val, - groups=groups, - heads=heads, - separable=separable, - normalization=normalization, - norm_groups=norm_groups, - quantize=quantize, - ) - ) - - conv.extend(self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) - - inplanes_loop = planes - - conv.extend( - self._get_conv_bn_layer( - inplanes_loop, - planes, - kernel_size=kernel_size, - stride=stride, - dilation=dilation, - padding=padding_val, - groups=groups, - heads=heads, - separable=separable, - normalization=normalization, - 
norm_groups=norm_groups, - quantize=quantize, - ) - ) - - if se: - conv.append( - SqueezeExcite( - planes, - reduction_ratio=se_reduction_ratio, - context_window=se_context_window, - interpolation_mode=se_interpolation_mode, - activation=activation, - quantize=quantize, - ) - ) - - self.mconv = conv - - res_panes = residual_panes.copy() - self.dense_residual = residual - - if residual: - res_list = nn.ModuleList() - - if residual_mode == 'stride_add': - stride_val = stride - else: - stride_val = [1] - - if len(residual_panes) == 0: - res_panes = [inplanes] - self.dense_residual = False - for ip in res_panes: - res = nn.ModuleList( - self._get_conv_bn_layer( - ip, - planes, - kernel_size=1, - normalization=normalization, - norm_groups=norm_groups, - stride=stride_val, - quantize=quantize, - ) - ) - - res_list.append(res) - - self.res = res_list - if PYTORCH_QUANTIZATION_AVAILABLE and self.quantize: - self.residual_quantizer = quant_nn.TensorQuantizer(quant_nn.QuantConv2d.default_quant_desc_input) - elif not PYTORCH_QUANTIZATION_AVAILABLE and quantize: - raise ImportError( - "pytorch-quantization is not installed. Install from " - "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." - ) - else: - self.res = None - - self.mout = nn.Sequential(*self._get_act_dropout_layer(drop_prob=dropout, activation=activation)) - - def _get_conv( - self, - in_channels, - out_channels, - kernel_size=11, - stride=1, - dilation=1, - padding=0, - bias=False, - groups=1, - heads=-1, - separable=False, - quantize=False, - ): - use_mask = self.conv_mask - if use_mask: - return MaskedConv1d( - in_channels, - out_channels, - kernel_size, - stride=stride, - dilation=dilation, - padding=padding, - bias=bias, - groups=groups, - heads=heads, - use_mask=use_mask, - quantize=quantize, - ) - else: - if PYTORCH_QUANTIZATION_AVAILABLE and quantize: - return quant_nn.QuantConv1d( - in_channels, - out_channels, - kernel_size, - stride=stride, - dilation=dilation, - padding=padding, - bias=bias, - groups=groups, - ) - elif not PYTORCH_QUANTIZATION_AVAILABLE and quantize: - raise ImportError( - "pytorch-quantization is not installed. Install from " - "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." 
- ) - else: - return nn.Conv1d( - in_channels, - out_channels, - kernel_size, - stride=stride, - dilation=dilation, - padding=padding, - bias=bias, - groups=groups, - ) - - def _get_conv_bn_layer( - self, - in_channels, - out_channels, - kernel_size=11, - stride=1, - dilation=1, - padding=0, - bias=False, - groups=1, - heads=-1, - separable=False, - normalization="batch", - norm_groups=1, - quantize=False, - ): - if norm_groups == -1: - norm_groups = out_channels - - if separable: - layers = [ - self._get_conv( - in_channels, - in_channels, - kernel_size, - stride=stride, - dilation=dilation, - padding=padding, - bias=bias, - groups=in_channels, - heads=heads, - quantize=quantize, - ), - self._get_conv( - in_channels, - out_channels, - kernel_size=1, - stride=1, - dilation=1, - padding=0, - bias=bias, - groups=groups, - quantize=quantize, - ), - ] - else: - layers = [ - self._get_conv( - in_channels, - out_channels, - kernel_size, - stride=stride, - dilation=dilation, - padding=padding, - bias=bias, - groups=groups, - quantize=quantize, - ) - ] - - if normalization == "group": - layers.append(nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels)) - elif normalization == "instance": - layers.append(nn.GroupNorm(num_groups=out_channels, num_channels=out_channels)) - elif normalization == "layer": - layers.append(nn.GroupNorm(num_groups=1, num_channels=out_channels)) - elif normalization == "batch": - layers.append(nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1)) - else: - raise ValueError( - f"Normalization method ({normalization}) does not match" f" one of [batch, layer, group, instance]." - ) - - if groups > 1: - layers.append(GroupShuffle(groups, out_channels)) - return layers - - def _get_act_dropout_layer(self, drop_prob=0.2, activation=None): - if activation is None: - activation = nn.Hardtanh(min_val=0.0, max_val=20.0) - layers = [activation, nn.Dropout(p=drop_prob)] - return layers - - def forward(self, input_: Tuple[List[Tensor], Optional[Tensor]]) -> Tuple[List[Tensor], Optional[Tensor]]: - """ - Forward pass of the module. - - Args: - input_: The input is a tuple of two values - the preprocessed audio signal as well as the lengths - of the audio signal. The audio signal is padded to the shape [B, D, T] and the lengths are - a torch vector of length B. - - Returns: - The output of the block after processing the input through `repeat` number of sub-blocks, - as well as the lengths of the encoded audio after padding/striding. - """ - lens_orig = None - xs = input_[0] - if len(input_) == 2: - xs, lens_orig = input_ - - # compute forward convolutions - out = xs[-1] - - lens = lens_orig - for i, l in enumerate(self.mconv): - # if we're doing masked convolutions, we need to pass in and - # possibly update the sequence lengths - # if (i % 4) == 0 and self.conv_mask: - if isinstance(l, (MaskedConv1d, SqueezeExcite)): - out, lens = l(out, lens) - else: - out = l(out) - - # compute the residuals - if self.res is not None: - for i, layer in enumerate(self.res): - res_out = xs[i] - for j, res_layer in enumerate(layer): - if isinstance(res_layer, MaskedConv1d): - res_out, _ = res_layer(res_out, lens_orig) - else: - res_out = res_layer(res_out) - - if self.residual_mode == 'add' or self.residual_mode == 'stride_add': - if PYTORCH_QUANTIZATION_AVAILABLE and self.quantize: - out = self.residual_quantizer(out) + res_out - elif not PYTORCH_QUANTIZATION_AVAILABLE and self.quantize: - raise ImportError( - "pytorch-quantization is not installed. 
Install from " - "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." - ) - else: - out = out + res_out - else: - out = torch.max(out, res_out) - - # compute the output - out = self.mout(out) - - # Support ASR Adapters - if self.is_adapter_available(): - # Check for all available and enabled adapters - adapter_names = self.get_enabled_adapters() - - if len(adapter_names) > 0: - out = out.transpose(1, 2) # (B, T, C) - - # Call the adapters - out = self.forward_enabled_adapters(out) - - out = out.transpose(1, 2) # (B, C, T) - - if self.is_access_enabled(): - # for adapters - if self.access_cfg.get('save_encoder_tensors', False): - self.register_accessible_tensor(name='encoder', tensor=out) - # for interctc - even though in some cases it's the same, we - # want to register separate key to be able to modify it later - # during interctc processing, if required - if self.interctc_should_capture is None: - capture_layers = self.access_cfg.get('interctc', {}).get('capture_layers', []) - self.interctc_should_capture = self.layer_idx in capture_layers - if self.interctc_should_capture: - # shape is the same as the shape of audio_signal output, i.e. [B, D, T] - self.register_accessible_tensor(name=f'interctc/layer_output_{self.layer_idx}', tensor=out) - self.register_accessible_tensor(name=f'interctc/layer_length_{self.layer_idx}', tensor=lens) - - if self.res is not None and self.dense_residual: - return xs + [out], lens - - return [out], lens - - -class ParallelBlock(nn.Module): - """ - Computational module that computes several `blocks` independently from each other and aggregates the outputs. - It expects audio inputs to be passed together with lengths, just like Jasper blocks, and all outputs to have - the same dimensions but it does not impose any additional requirements on the structure of the blocks themselves. - - Args: - blocks: List of Jasper blocks that will be computed concurently. It is expected that they accept the same - input and return outputs with the same number of channels. - aggregation_mode: an optional string, indicating how the outputs will be aggregated. Supported values are - ['sum', 'dropout']. "sum" value forces outputs to be summed together. "dropout" value enables tower - dropout training with different blocks being dropped out during training. - block_dropout_prob: a probability of dropping any individual block during training with "dropout" aggregation - mode. Acts as a regularization technique. - residual_mode: an optional string indicating how residuals will be applied. Supported values are - ['sum', 'conv']. In 'sum' mode input features are summed together with the output. This will fail if the - number of channels in the input is different from the number of channels in an output tensor. In 'conv' mode - inputs are passed through pointwise convolution to make input channel dimension match output channel - dimension. In this mode `in_filters` and `out_filters` params are required. - in_filters: number of filters (channels) in the input tensor of each block. - out_filters: number of filters (channels) in the output tensor of each block. 
- """ - - def __init__( - self, - blocks, - aggregation_mode: str = "sum", - block_dropout_prob: int = 0.0, - residual_mode: str = "sum", - in_filters: int = None, - out_filters: int = None, - ): - super().__init__() - self.blocks = nn.ModuleList(blocks) - - self.supported_aggregations = ["sum", "dropout"] - if aggregation_mode not in self.supported_aggregations: - raise ValueError( - f"Got non-supported aggregation mode: {aggregation_mode}. Supported values are {self.supported_aggregations}." - ) - self.aggregation_mode = aggregation_mode - - if aggregation_mode == "dropout": - self.weights = nn.Parameter(torch.ones(len(blocks)), requires_grad=False) - self.dropout = nn.Dropout(block_dropout_prob) - - self.supported_residuals = ["sum", "conv"] - if residual_mode not in self.supported_residuals: - raise ValueError( - f"Got non-supported residual mode: {residual_mode}. Supported values are {self.supported_residuals}." - ) - self.residual_mode = residual_mode - - if residual_mode == "conv": - if in_filters is None or out_filters is None: - raise ValueError("in_filters and out_filters have to be specified when using 'conv' residual mode.") - self.res_conv = MaskedConv1d(in_filters, out_filters, kernel_size=1, bias=False, use_mask=True) - - def get_dropout_mask(self): - weights = self.dropout(self.weights) - while torch.sum(weights) == 0 and self.dropout.p < 1.0: - weights = self.dropout(self.weights) - return weights - - def forward(self, x: Tuple[List[Tensor], Optional[Tensor]]): - """ - Forward pass computing aggregated output. - - Args: - x: tuple of padded signal and lengths the signal. The shape of the signal is [B, D, T]. The lengths are - 1D torch tensor of length B. - - Returns: - torch tensor after passing input throught each block and aggregating these outputs according to the - aggregation mode. - """ - if len(self.blocks) == 1: - return self.blocks[0](x) - - result = None - max_mask = None - - scaling_weights = None - if self.aggregation_mode == "dropout": - scaling_weights = self.get_dropout_mask() - - for i, block in enumerate(self.blocks): - output, mask = block(x) - - weighted_output = output[-1] - if self.aggregation_mode == "dropout": - weighted_output = scaling_weights[i] * output[-1] - - if result is None: - result = weighted_output - else: - result = result + weighted_output - - if max_mask is None: - max_mask = mask - else: - max_mask = torch.max(torch.stack([mask, max_mask]), dim=0)[0] - input_feat = x[0][-1] - lens = x[1] - if self.residual_mode == "sum": - result = result + input_feat - elif self.residual_mode == "conv": - result = result + self.res_conv(input_feat, lens)[0] - return [result], max_mask diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multi_head_attention.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multi_head_attention.py deleted file mode 100644 index 6a866a617f356d21ff05a0771a9626fcb73be6dd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ /dev/null @@ -1,1026 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Part of this code is adopted from https://github.com/espnet/espnet -""" - -import math -from functools import lru_cache -from typing import List, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from nemo.utils import avoid_float16_autocast_context - -__all__ = [ - 'RelPositionMultiHeadAttention', - 'RelPositionalEncoding', - 'PositionalEncoding', -] - - -class MultiHeadAttention(nn.Module): - """Multi-Head Attention layer of Transformer. - Args: - n_head (int): number of heads - n_feat (int): size of the features - dropout_rate (float): dropout rate - """ - - def __init__(self, n_head, n_feat, dropout_rate, max_cache_len=0): - """Construct an MultiHeadedAttention object.""" - super(MultiHeadAttention, self).__init__() - self.cache_drop_size = None - assert n_feat % n_head == 0 - # We assume d_v always equals d_k - self.d_k = n_feat // n_head - self.s_d_k = math.sqrt(self.d_k) - self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) - self.dropout = nn.Dropout(p=dropout_rate) - - self._max_cache_len = max_cache_len - - def forward_qkv(self, query, key, value): - """Transforms query, key and value. - Args: - query (torch.Tensor): (batch, time1, size) - key (torch.Tensor): (batch, time2, size) - value (torch.Tensor): (batch, time2, size) - returns: - q (torch.Tensor): (batch, head, time1, size) - k (torch.Tensor): (batch, head, time2, size) - v (torch.Tensor): (batch, head, time2, size) - """ - n_batch = query.size(0) - q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - return q, k, v - - def forward_attention(self, value, scores, mask): - """Compute attention context vector. 
- Args: - value (torch.Tensor): (batch, time2, size) - scores(torch.Tensor): (batch, time1, time2) - mask(torch.Tensor): (batch, time1, time2) - returns: - value (torch.Tensor): transformed `value` (batch, time2, d_model) weighted by the attention scores - """ - n_batch = value.size(0) - if mask is not None: - mask = mask.unsqueeze(1) # (batch, 1, time1, time2) - scores = scores.masked_fill(mask, -10000.0) - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2) - else: - attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - - p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = x.transpose(1, 2).reshape(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) - - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, query, key, value, mask, pos_emb=None, cache=None): - """Compute 'Scaled Dot Product Attention'. - Args: - query (torch.Tensor): (batch, time1, size) - key (torch.Tensor): (batch, time2, size) - value(torch.Tensor): (batch, time2, size) - mask (torch.Tensor): (batch, time1, time2) - cache (torch.Tensor) : (batch, time_cache, size) - - returns: - output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention - cache (torch.Tensor) : (batch, time_cache_next, size) - """ - key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) - - if torch.is_autocast_enabled(): - query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) - - # temporary until we solve this more gracefully - with avoid_float16_autocast_context(): - q, k, v = self.forward_qkv(query, key, value) - scores = torch.matmul(q, k.transpose(-2, -1)) / self.s_d_k - out = self.forward_attention(v, scores, mask) - if cache is None: - return out - else: - return out, cache - - def update_cache(self, key, value, query, cache): - if cache is not None: - key = value = torch.cat([cache, key], dim=1) - q_keep_size = query.shape[1] - self.cache_drop_size - cache = torch.cat([cache[:, q_keep_size:, :], query[:, :q_keep_size, :]], dim=1) - return key, value, query, cache - - -class RelPositionMultiHeadAttention(MultiHeadAttention): - """Multi-Head Attention layer of Transformer-XL with support of relative positional encoding. - Paper: https://arxiv.org/abs/1901.02860 - Args: - n_head (int): number of heads - n_feat (int): size of the features - dropout_rate (float): dropout rate - """ - - def __init__(self, n_head, n_feat, dropout_rate, pos_bias_u, pos_bias_v, max_cache_len=0): - """Construct an RelPositionMultiHeadedAttention object.""" - super().__init__(n_head=n_head, n_feat=n_feat, dropout_rate=dropout_rate, max_cache_len=max_cache_len) - # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) - # these two learnable biases are used in matrix c and matrix d - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - if pos_bias_u is None or pos_bias_v is None: - self.pos_bias_u = nn.Parameter(torch.FloatTensor(self.h, self.d_k)) - self.pos_bias_v = nn.Parameter(torch.FloatTensor(self.h, self.d_k)) - # nn.init.normal_(self.pos_bias_u, 0.0, 0.02) - # nn.init.normal_(self.pos_bias_v, 0.0, 0.02) - nn.init.zeros_(self.pos_bias_u) - nn.init.zeros_(self.pos_bias_v) - else: - self.pos_bias_u = pos_bias_u - self.pos_bias_v = pos_bias_v - - def rel_shift(self, x): - """Compute relative positional encoding. 
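Editor's note: for readers skimming the attention code above, this is the masking pattern of `forward_attention` rewritten as a standalone function. The toy mask layout and the name `masked_scaled_dot_product` are illustrative only.

```python
import torch

def masked_scaled_dot_product(q, k, v, mask=None):
    """q, k, v: (batch, head, time, d_k); mask: (batch, time_q, time_k), True = masked.

    Mirrors the masking pattern above: fill masked score positions with a large
    negative value before the softmax, then zero the corresponding probabilities.
    """
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / d_k ** 0.5   # (B, H, Tq, Tk)
    if mask is not None:
        mask = mask.unsqueeze(1)                                  # broadcast over heads
        scores = scores.masked_fill(mask, -10000.0)
        attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)
    else:
        attn = torch.softmax(scores, dim=-1)
    return torch.matmul(attn, v)                                  # (B, H, Tq, d_k)

# toy check: mask out the last key position for every query
q = k = v = torch.randn(1, 2, 4, 8)
mask = torch.zeros(1, 4, 4, dtype=torch.bool)
mask[..., -1] = True
print(masked_scaled_dot_product(q, k, v, mask).shape)  # torch.Size([1, 2, 4, 8])
```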
- Args: - x (torch.Tensor): (batch, nheads, time, 2*time-1) - """ - b, h, qlen, pos_len = x.size() # (b, h, t1, t2) - # need to add a column of zeros on the left side of last dimension to perform the relative shifting - x = torch.nn.functional.pad(x, pad=(1, 0)) # (b, h, t1, t2+1) - x = x.view(b, h, -1, qlen) # (b, h, t2+1, t1) - # need to drop the first row - x = x[:, :, 1:].view(b, h, qlen, pos_len) # (b, h, t1, t2) - return x - - def forward(self, query, key, value, mask, pos_emb, cache=None): - """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Args: - query (torch.Tensor): (batch, time1, size) - key (torch.Tensor): (batch, time2, size) - value(torch.Tensor): (batch, time2, size) - mask (torch.Tensor): (batch, time1, time2) - pos_emb (torch.Tensor) : (batch, time1, size) - cache (torch.Tensor) : (batch, time_cache, size) - - Returns: - output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention - cache (torch.Tensor) : (batch, time_cache_next, size) - """ - key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) - - if torch.is_autocast_enabled(): - query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) - - # temporary until we solve this more gracefully - with avoid_float16_autocast_context(): - q, k, v = self.forward_qkv(query, key, value) - q = q.transpose(1, 2) # (batch, time1, head, d_k) - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) - p = p.transpose(1, 2) # (batch, head, time1, d_k) - - # (batch, head, time1, d_k) - q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) - # (batch, head, time1, d_k) - q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) - - # compute attention score - # first compute matrix a and matrix c - # as described in https://arxiv.org/abs/1901.02860 Section 3.3 - # (batch, head, time1, time2) - matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) - - # compute matrix b and matrix d - # (batch, head, time1, time2) - matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - matrix_bd = self.rel_shift(matrix_bd) - # drops extra elements in the matrix_bd to match the matrix_ac's size - matrix_bd = matrix_bd[:, :, :, : matrix_ac.size(-1)] - - scores = (matrix_ac + matrix_bd) / self.s_d_k # (batch, head, time1, time2) - - out = self.forward_attention(v, scores, mask) - - if cache is None: - return out - else: - return out, cache - - -class RelPositionMultiHeadAttentionLongformer(RelPositionMultiHeadAttention): - """Multi-Head Attention layer of Transformer-XL with sliding window local+global attention from Longformer. - Partially adapted from allenai (https://github.com/allenai/longformer/blob/master/longformer/sliding_chunks.py) - and huggingface (https://github.com/huggingface/transformers/blob/main/src/transformers/models/longformer/modeling_longformer.py) - Paper: https://arxiv.org/abs/1901.02860 (Transformer-XL), - https://arxiv.org/abs/2004.05150 (Longformer) - Args: - n_head (int): number of heads - n_feat (int): size of the features - dropout_rate (float): dropout rate - pos_bias_u (Tensor): the positional bias matrix U - pos_bias_v (Tensor): the positional bias matrix V - att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes. 
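Editor's note: the pad-reshape-slice trick in `rel_shift` above is easiest to see on a tiny tensor. The sketch below reproduces the same trick standalone so it can be run on a toy input; the shapes are illustrative.

```python
import torch

def rel_shift(x):
    """x: (batch, heads, qlen, pos_len). Pad-reshape-slice trick used above to
    realign relative-position scores against the query positions."""
    b, h, qlen, pos_len = x.size()
    x = torch.nn.functional.pad(x, pad=(1, 0))   # (b, h, qlen, pos_len + 1)
    x = x.view(b, h, -1, qlen)                   # folding shifts each row by one step
    x = x[:, :, 1:].view(b, h, qlen, pos_len)    # drop the first row, restore the shape
    return x

# toy example: qlen = 3, so pos_len = 2*3 - 1 = 5 relative positions
x = torch.arange(15, dtype=torch.float32).view(1, 1, 3, 5)
# each row i ends up shifted left by (qlen - 1 - i); trailing entries are
# spillover that the caller truncates or masks afterwards
print(rel_shift(x)[0, 0])
```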
- max_cache_len (int): the maximum size of cache - global_tokens (int): number of tokens to be used for global attention - global_tokens_spacing (int): how far apart the global tokens are - global_attn_separate (bool): whether the q, k, v layers used for global tokens should be separate - """ - - def __init__( - self, - n_head, - n_feat, - dropout_rate, - pos_bias_u, - pos_bias_v, - att_context_size, - max_cache_len=0, - global_tokens=0, - global_tokens_spacing=1, - global_attn_separate=False, - ): - """Construct an RelPositionMultiHeadAttentionLongformer object.""" - super().__init__( - n_head=n_head, - n_feat=n_feat, - dropout_rate=dropout_rate, - pos_bias_u=pos_bias_u, - pos_bias_v=pos_bias_v, - max_cache_len=max_cache_len, - ) - self.att_context_size = att_context_size - self.global_tokens = global_tokens - self.global_tokens_spacing = global_tokens_spacing - self.global_attn_separate = global_attn_separate - - if self.global_attn_separate: - self.global_q = nn.Linear(n_feat, n_feat) - self.global_k = nn.Linear(n_feat, n_feat) - self.global_v = nn.Linear(n_feat, n_feat) - - def forward(self, query, key, value, pad_mask, pos_emb, cache=None): - """Compute Scaled Dot Product Local Attention with rel. positional encoding. using overlapping chunks - Args: - query (torch.Tensor): (batch, time, size) - key (torch.Tensor): (batch, time, size) - value(torch.Tensor): (batch, time, size) - pad_mask (torch.Tensor): (batch, time) - pos_emb (torch.Tensor) : (batch, 2w + 1, size) - cache (torch.Tensor) : (batch, time_cache, size) - Returns: - output (torch.Tensor): transformed `value` (batch, time1, d_model) weighted by the query dot key attention - cache (torch.Tensor) : (batch, time_cache_next, size) - """ - - key, value, query, cache = self.update_cache(key=key, value=value, query=query, cache=cache) - - if torch.is_autocast_enabled(): - query, key, value = query.to(torch.float32), key.to(torch.float32), value.to(torch.float32) - - # temporary until we solve this more gracefully - with avoid_float16_autocast_context(): - q, k, v = self.forward_qkv(query, key, value) - n_batch, _, T, _ = q.size() - - w = max(self.att_context_size[0], self.att_context_size[1]) - if w <= 0: - raise ValueError("When using local attention, context size must be set > 0") - pad_len = (2 * w - T % (2 * w)) % (2 * w) # pad time to 2w - q = F.pad(q, (0, 0, 0, pad_len)) # (batch, head, time, size) - k = F.pad(k, (0, 0, 0, pad_len)) # (batch, head, time, size) - v = F.pad(v, (0, 0, 0, pad_len)) # (batch, head, time, size) - mask = F.pad(pad_mask, (0, pad_len), value=1.0) - - q_with_bias_u = q + self.pos_bias_u.unsqueeze(1) # (batch, head, time, size) - q_with_bias_v = q + self.pos_bias_v.unsqueeze(1) # (batch, head, time, size) - - diagonal_matrix_ac = self.sliding_chunks_matmul_qk( - q_with_bias_u, k, w, padding_value=0.0 - ) # (batch, head, time, 2w + 1) - - # add relative positional embedding - - n_batch_pos = pos_emb.size(0) - p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k).transpose(1, 2) - # (batch, head, 2w, size) - diagonal_matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) - # (batch, head, time, 2w + 1) - - start_pos = w - self.att_context_size[0] - end_pos = w + self.att_context_size[1] - - diagonal_matrix_ac[:, :, :, : self.att_context_size[0]] += diagonal_matrix_bd[ - :, :, :, : self.att_context_size[0] - ] - diagonal_matrix_ac[:, :, :, -(self.att_context_size[1] + 1) :] += diagonal_matrix_bd[ - :, :, :, self.att_context_size[0] : - ] - scores = diagonal_matrix_ac / self.s_d_k - # 
(batch, head, time, 2w + 1) - - # mask invalid positions - scores[:, :, :, :start_pos] = -10000.0 - scores[:, :, :, end_pos + 1 :] = -10000.0 - - # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 - # from (bsz x seq_len) to (bsz x num_heads x seqlen x hidden_size) - mask = mask.unsqueeze(dim=1).unsqueeze(dim=-1) - # cast to float/half then replace 1's with -inf - float_mask = mask.type_as(scores).masked_fill(mask, -10000.0) - ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones - # diagonal mask with zeros everywhere and -inf inplace of padding - d_mask = self.sliding_chunks_matmul_qk(ones, float_mask, w, padding_value=0.0) - # (batch, head, time, 2w + 1) - - scores += d_mask - - if self.global_tokens > 0: - - # create q, k, v for global attn - if self.global_attn_separate: - global_q = self.global_q(query).view(n_batch, -1, self.h, self.d_k) - global_k = self.global_k(key).view(n_batch, -1, self.h, self.d_k) - global_v = self.global_v(value).view(n_batch, -1, self.h, self.d_k) - global_q = global_q.transpose(1, 2) - global_k = global_k.transpose(1, 2) - global_v = global_v.transpose(1, 2) - global_q = F.pad(global_q, (0, 0, 0, pad_len)) # (batch, head, time, size) - global_k = F.pad(global_k, (0, 0, 0, pad_len)) # (batch, head, time, size) - global_v = F.pad(global_v, (0, 0, 0, pad_len)) # (batch, head, time, size) - else: - global_q, global_k, global_v = q, k, v - - global_q /= self.s_d_k - - # assign which tokens are global - is_index_global_attn = torch.zeros_like(pad_mask) - is_index_global_attn[ - :, : self.global_tokens * self.global_tokens_spacing : self.global_tokens_spacing - ] = 1.0 - - # compute global attn indices - ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) = self._get_global_attn_indices(is_index_global_attn=is_index_global_attn) - - # calculate global attn probs with global keys - # (batch, time, head, max_num_global_attn_indices) - global_key_attn = self._compute_global_key_attn( - query=global_q.transpose(1, 2), - key=global_k.transpose(1, 2), - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - ).transpose(1, 2) - - # concat to local_attn_probs - # (batch, time, head, max_num_global_attn_indices + 2*w) - scores = torch.cat((global_key_attn, scores), dim=-1) - - # free memory - del global_key_attn - - attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) - p_attn = self.dropout(attn) - # (batch, head, time, 2w + 1) - - if self.global_tokens > 0: - # compute sum of global and local attn - out = self._compute_attn_output_with_global_indices( - value=v, - attn_probs=p_attn, - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - w=w, - ) - else: - # compute local attn only - out = self.sliding_chunks_matmul_pv(p_attn, v, w) - - out = out.reshape(n_batch, -1, self.h * self.d_k)[:, :T] - - if self.global_tokens > 0: - out_global_to_all = self._compute_out_global_to_all( - query=global_q, - key=global_k, - value=global_v, - max_num_global_attn_indices=max_num_global_attn_indices, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - 
is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - is_index_masked=mask, - ) - - # overwrite values with global attention - out[is_index_global_attn_nonzero] = out_global_to_all - - ret = self.linear_out(out) - - if cache is None: - return ret - else: - return ret, cache - - def _get_global_attn_indices(self, is_index_global_attn: torch.Tensor) -> Tuple: - """ - Compute global attention indices. - - Args: - is_index_global_attn (torch.Tensor): (batch, time) A boolean tensor indicating if an index is a global attention index. - - Returns: - max_num_global_attn_indices (int): Maximum number of global attention indices in the batch. - is_index_global_attn_nonzero (tuple): Indices of global attention (non-zero elements). - is_local_index_global_attn_nonzero (tuple): Indices of non-padding values within global attention indices. - is_local_index_no_global_attn_nonzero (tuple): Indices of padding values within global attention indices. - """ - # Calculate the number of global attention indices in the batch - num_global_attn_indices = is_index_global_attn.long().sum(dim=1) - - # Find the maximum number of global attention indices in the batch - max_num_global_attn_indices = num_global_attn_indices.max() - - # Get the indices of global attention (non-zero elements) - is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True) - - # Create a helper tensor to find the local indices of global attention - is_local_index_global_attn = torch.arange( - max_num_global_attn_indices, device=is_index_global_attn.device - ) < num_global_attn_indices.unsqueeze(dim=-1) - - # Find the non-padding values within global attention indices - is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True) - - # Find the padding values within global attention indices - is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True) - - return ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) - - def _compute_global_key_attn( - self, - key: torch.Tensor, - query: torch.Tensor, - max_num_global_attn_indices: int, - is_index_global_attn_nonzero: tuple, - is_local_index_global_attn_nonzero: tuple, - is_local_index_no_global_attn_nonzero: tuple, - ) -> torch.Tensor: - """ - Compute the attention probabilities using only global key vectors. - - Args: - key (torch.Tensor): (batch, time, head, head_dim) The key vectors. - query (torch.Tensor): (batch, time, head, head_dim) The query vectors. - max_num_global_attn_indices (int): Maximum number of global attention indices in the batch. - is_index_global_attn_nonzero (tuple): Indices of global attention (non-zero elements). - is_local_index_global_attn_nonzero (tuple): Non-padding values within global attention indices. - is_local_index_no_global_attn_nonzero (tuple): Padding values within global attention indices. - - Returns: - attn_probs_from_global_key (torch.Tensor): (batch, time, head, max_num_global_attn_indices) The computed attention probabilities using only global key vectors. 
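Editor's note: the index bookkeeping in `_get_global_attn_indices` above packs a variable number of global tokens per batch element into a fixed-size buffer. A condensed, runnable version of just that bookkeeping is sketched below; the toy mask is illustrative.

```python
import torch

def global_attn_indices(is_index_global_attn):
    """is_index_global_attn: (batch, time), nonzero where a token attends globally.

    Same bookkeeping as the helper above: per-batch counts may differ, so global
    tokens are packed into a (batch, max_count) buffer and the padded slots are
    tracked separately.
    """
    num_global = is_index_global_attn.long().sum(dim=1)            # (batch,)
    max_num_global = int(num_global.max())
    idx_global = is_index_global_attn.nonzero(as_tuple=True)       # positions in (batch, time)
    is_local = torch.arange(max_num_global) < num_global.unsqueeze(-1)
    idx_local = is_local.nonzero(as_tuple=True)                    # filled buffer slots
    idx_local_pad = (is_local == 0).nonzero(as_tuple=True)         # padded buffer slots
    return max_num_global, idx_global, idx_local, idx_local_pad

# toy: batch of 2 sequences; the first has 2 global tokens, the second has 1
mask = torch.tensor([[1, 0, 1, 0], [1, 0, 0, 0]], dtype=torch.bool)
print(global_attn_indices(mask)[0])  # 2
```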
- """ - batch_size = key.shape[0] - - # create only global key vectors - key_only_global = key.new_zeros(batch_size, max_num_global_attn_indices, self.h, self.d_k) - - key_only_global[is_local_index_global_attn_nonzero] = key[is_index_global_attn_nonzero] - - # (batch_size, seq_len, head, max_num_global_attn_indices) - attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query, key_only_global)) - - # need to transpose since ONNX export only supports consecutive indexing: https://pytorch.org/docs/stable/onnx.html#writes-sets - attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3) - attn_probs_from_global_key[ - is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, : - ] = torch.finfo(attn_probs_from_global_key.dtype).min - attn_probs_from_global_key = attn_probs_from_global_key.transpose(1, 3) - - return attn_probs_from_global_key - - def _compute_attn_output_with_global_indices( - self, - value: torch.Tensor, - attn_probs: torch.Tensor, - max_num_global_attn_indices: int, - is_index_global_attn_nonzero: tuple, - is_local_index_global_attn_nonzero: tuple, - w: int, - ) -> torch.Tensor: - """ - Compute the attention output with global indices. - - Args: - value (torch.Tensor): (batch, head, time, head_dim) The value vectors for global attention. - attn_probs (torch.Tensor): (batch, time, head, 2w) The attention probabilities. - max_num_global_attn_indices (int): Maximum number of global attention indices in the batch. - is_index_global_attn_nonzero (tuple): Indices of global attention (non-zero elements). - is_local_index_global_attn_nonzero (tuple): Non-padding values within global attention indices. - w (int): Local context size - Returns: - torch.Tensor: (batch, time, head x head_dim) The attention output of all tokens attending to global. - """ - batch_size, time = attn_probs.shape[0], attn_probs.shape[2] - - value = value.transpose(1, 2) - - # get value vectors for global only - value_vectors_only_global = value.new_zeros(batch_size, max_num_global_attn_indices, self.h, self.d_k) - value_vectors_only_global[is_local_index_global_attn_nonzero] = value[is_index_global_attn_nonzero] - - # cut local attn probs to global only - attn_probs_only_global = attn_probs.narrow(-1, 0, max_num_global_attn_indices) - # compute attn output only global - attn_output_only_global = torch.matmul( - attn_probs_only_global.clone(), value_vectors_only_global.transpose(1, 2).clone() - ).transpose(1, 2) - - # reshape attn probs - attn_probs_without_global = attn_probs.narrow( - -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices - ).contiguous() - - # compute attn output with global - attn_output_without_global = self.sliding_chunks_matmul_pv(attn_probs_without_global, value.transpose(1, 2), w) - - return attn_output_only_global + attn_output_without_global - - def _compute_out_global_to_all( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - max_num_global_attn_indices: int, - is_local_index_global_attn_nonzero: tuple, - is_index_global_attn_nonzero: tuple, - is_local_index_no_global_attn_nonzero: tuple, - is_index_masked: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Compute the attention output of global tokens attending to all. - - Args: - query (torch.Tensor): (batch, head, time, head_dim) The queries for global attention. - key (torch.Tensor): (batch, head, time, head_dim) The keys for global attention. 
- value (torch.Tensor): (batch, head, time, head_dim) The values for global attention. - max_num_global_attn_indices (int): Maximum number of global attention indices in the batch. - is_local_index_global_attn_nonzero (tuple): Non-padding values within global attention indices. - is_index_global_attn_nonzero (tuple): Indices of global attention (non-zero elements). - is_local_index_no_global_attn_nonzero (tuple): Padding values within global attention indices. - is_index_masked (torch.Tensor): (batch, time) A boolean tensor indicating if an index is masked. - - Returns: - global_attn_output (torch.Tensor): (batch, max_num_global_attn_indices, head x head_dim) - The attention output of global tokens attending to all. - """ - - batch_size = key.shape[0] - seq_len = key.shape[2] - - global_k = key.reshape(batch_size * self.h, -1, self.d_k) - global_v = value.reshape(batch_size * self.h, -1, self.d_k) - - global_q = query.transpose(1, 2) - global_q_from_global = global_q.new_zeros(batch_size, max_num_global_attn_indices, self.h, self.d_k) - global_q_from_global[is_local_index_global_attn_nonzero] = global_q[is_index_global_attn_nonzero] - global_q_from_global = global_q_from_global.transpose(0, 1).reshape(batch_size * self.h, -1, self.d_k) - - # compute attn scores - global_attn_scores = torch.bmm(global_q_from_global, global_k.transpose(1, 2)) - global_attn_scores = global_attn_scores.view(batch_size, self.h, max_num_global_attn_indices, seq_len) - - # need to transpose since ONNX export only supports consecutive indexing: https://pytorch.org/docs/stable/onnx.html#writes-sets - global_attn_scores = global_attn_scores.transpose(1, 2) - global_attn_scores[ - is_local_index_no_global_attn_nonzero[0], is_local_index_no_global_attn_nonzero[1], :, : - ] = torch.finfo(global_attn_scores.dtype).min - global_attn_scores = global_attn_scores.transpose(1, 2) - - global_attn_scores = global_attn_scores.masked_fill( - is_index_masked.transpose(2, 3), torch.finfo(global_attn_scores.dtype).min, - ) - - global_attn_scores = global_attn_scores.view(batch_size * self.h, max_num_global_attn_indices, seq_len) - - # compute global attn probs - global_attn_probs_float = nn.functional.softmax(global_attn_scores, dim=-1, dtype=torch.float32) - - global_attn_probs = self.dropout(global_attn_probs_float) - - # global attn output - global_attn_output = torch.bmm(global_attn_probs, global_v) - global_attn_output = global_attn_output.view(batch_size, self.h, max_num_global_attn_indices, self.d_k) - - global_attn_output = global_attn_output[ - is_local_index_global_attn_nonzero[0], :, is_local_index_global_attn_nonzero[1] - ] - - global_attn_output = global_attn_output.reshape(global_attn_output.shape[0], -1) - - return global_attn_output - - # Longformer implementation for overlap case - # - def _skew(self, x: torch.Tensor, direction: List[int], padding_value: float) -> torch.Tensor: - """Convert diagonals into columns (or columns into diagonals depending on `direction` - - Args: - x (torch.Tensor): (batch x head, chunk_count, 2w, 2w) - direction (List[int]): padding directions - padding_value (float): value to pad with - - Returns: - output (torch.Tensor): (batch x head, chunk_count, 2w, 2w + 1) - - """ - x_padded = F.pad(x, direction, value=padding_value) - x_padded = x_padded.view(*x_padded.size()[:-2], x_padded.size(-1), x_padded.size(-2)) - return x_padded - - def _skew2(self, x: torch.Tensor, padding_value: float) -> torch.Tensor: - """Shift every row 1 step to right converting columns into diagonals - - Args: - x 
(torch.Tensor): (batch x head, chunks_count + 1, w, 2w + 1) - padding_value (float): value to pad with - - Returns: - output (torch.Tensor): (batch x head, chunks_count + 1, w, 3w) - """ - # X = B x C x M x L - B, C, M, L = x.size() - x = F.pad(x, (0, M + 1), value=padding_value) # B x C x M x (L+M+1) - x = x.view(B, C, -1) # B x C x ML+MM+M - x = x[:, :, :-M] # B x C x ML+MM - x = x.view(B, C, M, M + L) # B x C, M x L+M - x = x[:, :, :, :-1] - return x - - def _chunk_overlap(self, x: torch.Tensor, w: int) -> torch.Tensor: - """Convert into overlapping chunks. - - Args: - x (torch.Tensor): # (batch x head, time, size) - w (int): Chunk overlap size - - Returns: - output (torch.Tensor): # (batch x head, chunk_count, 2w, size) - """ - - # non-overlapping chunks of size = 2w - x = x.view(x.size(0), x.size(1) // (w * 2), w * 2, x.size(2)) - - # use `as_strided` to make the chunks overlap with an overlap size = w - chunk_size = list(x.size()) - chunk_size[1] = chunk_size[1] * 2 - 1 - - chunk_stride = list(x.stride()) - chunk_stride[1] = chunk_stride[1] // 2 - return x.as_strided(size=chunk_size, stride=chunk_stride) - - @lru_cache() - def _get_invalid_locations_mask(self, w: int, device: str): - - diagonals_list = [] - for j in range(-w, 1): - diagonal_mask = torch.zeros(w, device='cpu', dtype=torch.uint8) - diagonal_mask[:-j] = 1 - diagonals_list.append(diagonal_mask) - - mask = torch.stack(diagonals_list, dim=-1) - mask = mask[None, None, :, :] - - ending_mask = mask.flip(dims=(2, 3)).bool().to(device) - return mask.bool().to(device), ending_mask - - def mask_invalid_locations( - self, input_tensor: torch.Tensor, w: int, - ): - """ - Mask locations invalid for the sliding window attention - - Args: - input_tensor (torch.Tensor): # (batch x head, time, size) - w (int): Chunk overlap size - """ - beginning_mask, ending_mask = self._get_invalid_locations_mask(w, input_tensor.device) - seq_len = input_tensor.size(2) - beginning_input = input_tensor[:, :, :w, : w + 1] - beginning_mask = beginning_mask[:, :, :seq_len].expand(beginning_input.size()) - beginning_input.masked_fill_(beginning_mask, -float('inf')) - - ending_input = input_tensor[:, :, -w:, -(w + 1) :] - ending_mask = ending_mask[:, :, -seq_len:].expand(ending_input.size()) - ending_input.masked_fill_(ending_mask, -float('inf')) - - def sliding_chunks_matmul_qk(self, q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float) -> torch.Tensor: - """Matrix multiplication of query x key tensors using with a sliding window attention pattern. 
- This implementation splits the input into overlapping chunks of size 2w - with an overlap of size w - - Args: - q (torch.Tensor): (batch, head, time, size) - k (torch.Tensor): (batch, head, time, size) - w (int): Chunk overlap size - padding_value (float): Value to pad with - - Returns: - output (torch.Tensor): (batch, head, time, 2w + 1) - """ - bsz, num_heads, seqlen, head_dim = q.size() - assert seqlen % (w * 2) == 0 - assert q.size() == k.size() - - chunks_count = seqlen // w - 1 - - # group bsz and num_heads dimensions into one, then chunk seqlen into chunks of size w * 2 - q = q.reshape(bsz * num_heads, seqlen, head_dim) - k = k.reshape(bsz * num_heads, seqlen, head_dim) - - chunk_q = self._chunk_overlap(q, w) # (batch x head, chunk_count, 2w, size) - chunk_k = self._chunk_overlap(k, w) # (batch x head, chunk_count, 2w, size) - - # matrix multipication - # bcxd: bsz*num_heads x chunks x 2w x head_dim - # bcyd: bsz*num_heads x chunks x 2w x head_dim - # bcxy: bsz*num_heads x chunks x 2w x 2w - chunk_attn = torch.einsum('bcxd,bcyd->bcxy', (chunk_q, chunk_k)) # multiply - # (batch x head, chunk_count, 2w, 2w) - - # convert diagonals into columns - diagonal_chunk_attn = self._skew(chunk_attn, direction=(0, 0, 0, 1), padding_value=padding_value) - # (batch x head, chunk_count, 2w, 2w + 1) - - # allocate space for the overall attention matrix where the chunks are combined. The last dimension - # has (w * 2 + 1) columns. The first (w) columns are the w lower triangles (attention from a word to - # w previous words). The following column is attention score from each word to itself, then - # followed by w columns for the upper triangle. - - diagonal_attn = diagonal_chunk_attn.new_empty((bsz * num_heads, chunks_count + 1, w, w * 2 + 1)) - # (batch x head, chunk_count + 1, w, 2w + 1) - - # copy parts from diagonal_chunk_attn into the compined matrix of attentions - # - copying the main diagonal and the upper triangle - diagonal_attn[:, :-1, :, w:] = diagonal_chunk_attn[:, :, :w, : w + 1] - diagonal_attn[:, -1, :, w:] = diagonal_chunk_attn[:, -1, w:, : w + 1] - # - copying the lower triangle - diagonal_attn[:, 1:, :, :w] = diagonal_chunk_attn[:, :, -(w + 1) : -1, w + 1 :] - diagonal_attn[:, 0, 1:w, 1:w] = diagonal_chunk_attn[:, 0, : w - 1, 1 - w :] - - # separate bsz and num_heads dimensions again - diagonal_attn = diagonal_attn.view(bsz, num_heads, seqlen, 2 * w + 1) - # (batch, head, time, 2w + 1) - - self.mask_invalid_locations(diagonal_attn, w) - - return diagonal_attn - - def sliding_chunks_matmul_pv(self, prob: torch.Tensor, v: torch.Tensor, w: int): - """Same as sliding_chunks_matmul_qk but for prob and value tensors. 
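Editor's note: the `as_strided` chunking used by the sliding-window attention above produces overlapping windows of length 2w with hop w. A runnable sketch of just that step, with toy sizes chosen for readability:

```python
import torch

def chunk_overlap(x, w):
    """x: (batch_x_head, time, size) with time divisible by 2*w.
    Returns overlapping chunks of length 2*w with hop w as a strided view,
    the same idea as the `as_strided` chunking above."""
    bh, time, size = x.size()
    x = x.view(bh, time // (2 * w), 2 * w, size)        # non-overlapping 2w chunks
    chunk_size = [bh, x.size(1) * 2 - 1, 2 * w, size]   # twice as many chunks minus one
    chunk_stride = list(x.stride())
    chunk_stride[1] = chunk_stride[1] // 2               # advance by w instead of 2w
    return x.as_strided(size=chunk_size, stride=chunk_stride)

# toy: 1 "batch x head", time=8, size=1, window w=2 -> chunks of length 4 with hop 2
x = torch.arange(8, dtype=torch.float32).view(1, 8, 1)
chunks = chunk_overlap(x, w=2)
print(chunks[0, :, :, 0])  # rows: [0,1,2,3], [2,3,4,5], [4,5,6,7]
```

For comparison, a roughly equivalent view can be obtained with `x.unfold(1, 2 * w, w)`, which places the window dimension last instead.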
- - Args: - prob (torch.Tensor): (batch, head, time, size) - v (torch.Tensor): (batch, head, time, size) - w (int): Chunk overlap size - - Returns: - output (torch.Tensor): (batch, time, head, size) - """ - bsz, num_heads, seqlen, head_dim = v.size() - chunks_count = seqlen // w - 1 - # group bsz and num_heads dimensions into one, then chunk seqlen into chunks of size 2w - chunk_prob = prob.reshape(bsz * num_heads, seqlen // w, w, 2 * w + 1) - # (batch x head, chunks_count + 1, w, 2w + 1) - - # group bsz and num_heads dimensions into one - v = v.reshape(bsz * num_heads, seqlen, head_dim) - # (batch x head, time, size) - - # pad seqlen with w at the beginning of the sequence and another w at the end - padded_v = F.pad(v, (0, 0, w, w), value=-1) - # (batch x head, time + 2w, size) - - # chunk padded_v into chunks of size 3w and an overlap of size w - chunk_v_size = (bsz * num_heads, chunks_count + 1, 3 * w, head_dim) - chunk_v_stride = padded_v.stride() - chunk_v_stride = chunk_v_stride[0], w * chunk_v_stride[1], chunk_v_stride[1], chunk_v_stride[2] - chunk_v = padded_v.as_strided(size=chunk_v_size, stride=chunk_v_stride) - # (batch x head, chunks_count + 1, 3w, size) - - skewed_prob = self._skew2(chunk_prob, padding_value=0) - # (batch x head, chunks_count + 1, w, 3w) - - context = torch.einsum('bcwd,bcdh->bcwh', (skewed_prob, chunk_v)) - # (batch x head, chunks_count + 1, w, size) - - return context.view(bsz, num_heads, seqlen, head_dim).transpose(1, 2) - - -class PositionalEncoding(torch.nn.Module): - """Fixed sinusoidal positional encoding. - Args: - d_model (int): embedding dim - dropout_rate (float): dropout rate - max_len (int): maximum input length - xscale (bool): whether to scale the input by sqrt(d_model) - dropout_rate_emb (float): dropout rate for the positional embeddings - """ - - def __init__(self, d_model, dropout_rate, max_len=5000, xscale=None, dropout_rate_emb=0.0): - """Construct an PositionalEncoding object.""" - super(PositionalEncoding, self).__init__() - self.d_model = d_model - self.xscale = xscale - self.dropout = torch.nn.Dropout(p=dropout_rate) - self.max_len = max_len - if dropout_rate_emb > 0: - self.dropout_emb = nn.Dropout(dropout_rate_emb) - else: - self.dropout_emb = None - - def create_pe(self, positions): - pos_length = positions.size(0) - pe = torch.zeros(pos_length, self.d_model, device=positions.device) - div_term = torch.exp( - torch.arange(0, self.d_model, 2, dtype=torch.float32, device=positions.device) - * -(math.log(10000.0) / self.d_model) - ) - pe[:, 0::2] = torch.sin(positions * div_term) - pe[:, 1::2] = torch.cos(positions * div_term) - pe = pe.unsqueeze(0) - if hasattr(self, 'pe'): - self.pe = pe - else: - self.register_buffer('pe', pe, persistent=False) - - def extend_pe(self, length, device): - """Reset and extend the positional encodings if needed.""" - if hasattr(self, 'pe') and self.pe.size(1) >= length: - return - positions = torch.arange(0, length, dtype=torch.float32, device=device).unsqueeze(1) - self.create_pe(positions=positions) - - def forward(self, x: torch.Tensor, cache_len=0): - """Adds positional encoding. - Args: - x (torch.Tensor): Input. 
Its shape is (batch, time, feature_size) - cache_len (int): the size of the cache which is used to shift positions - Returns: - x+pos_emb (torch.Tensor): Its shape is (batch, time, feature_size) - pos_emb (torch.Tensor): Its shape is (1, time, feature_size) - """ - input_len = x.size(1) + cache_len - if self.xscale: - x = x * self.xscale - pos_emb = self.pe[:, :input_len] - if self.dropout_emb: - pos_emb = self.dropout_emb(pos_emb) - x = x + pos_emb - return self.dropout(x), pos_emb - - -class RelPositionalEncoding(PositionalEncoding): - """Relative positional encoding for TransformerXL's layers - See : Appendix B in https://arxiv.org/abs/1901.02860 - Args: - d_model (int): embedding dim - dropout_rate (float): dropout rate - max_len (int): maximum input length - xscale (bool): whether to scale the input by sqrt(d_model) - dropout_rate_emb (float): dropout rate for the positional embeddings - """ - - def extend_pe(self, length, device): - """Reset and extend the positional encodings if needed.""" - needed_size = 2 * length - 1 - if hasattr(self, 'pe') and self.pe.size(1) >= needed_size: - return - # positions would be from negative numbers to positive - # positive positions would be used for left positions and negative for right positions - positions = torch.arange(length - 1, -length, -1, dtype=torch.float32, device=device).unsqueeze(1) - self.create_pe(positions=positions) - - def forward(self, x, cache_len=0): - """Compute positional encoding. - Args: - x (torch.Tensor): Input. Its shape is (batch, time, feature_size) - cache_len (int): the size of the cache which is used to shift positions - Returns: - x (torch.Tensor): Its shape is (batch, time, feature_size) - pos_emb (torch.Tensor): Its shape is (1, time, feature_size) - """ - - if self.xscale: - x = x * self.xscale - - # center_pos would be the index of position 0 - # negative positions would be used for right and positive for left tokens - # for input of length L, 2*L-1 positions are needed, positions from (L-1) to -(L-1) - input_len = x.size(1) + cache_len - center_pos = self.pe.size(1) // 2 + 1 - start_pos = center_pos - input_len - end_pos = center_pos + input_len - 1 - pos_emb = self.pe[:, start_pos:end_pos] - if self.dropout_emb: - pos_emb = self.dropout_emb(pos_emb) - return self.dropout(x), pos_emb - - -class LocalAttRelPositionalEncoding(PositionalEncoding): - """Relative positional encoding for sliding window attention or chunked attention. - See above for relative positional encoding based on Transformer-XL paper - Args: - left_chunk_size (int): number of frames to in past chunks - chunk size (int): number of frames (max frames if using multimode) in current chunk - d_model (int): embedding dim - dropout_rate (float): dropout rate - max_len (int): maximum input length - xscale (bool): whether to scale the input by sqrt(d_model) - dropout_rate_emb (float): dropout rate for the positional embeddings - """ - - def __init__(self, att_context_size, **kwargs): - super(LocalAttRelPositionalEncoding, self).__init__(**kwargs) - self.left_context = att_context_size[0] - self.right_context = att_context_size[1] - - def extend_pe(self, length, device): - """Reset and extend the positional encodings only at the beginning""" - if hasattr(self, 'pe'): - return - - positions = torch.arange( - self.left_context, -self.right_context - 1, -1, dtype=torch.float32, device=device - ).unsqueeze(1) - self.create_pe(positions=positions) - - def forward(self, x, cache_len=0): - """Compute positional encoding. - Args: - x (torch.Tensor): Input. 
Its shape is (batch, time, feature_size) - Returns: - x (torch.Tensor): Its shape is (batch, time, feature_size) - pos_emb (torch.Tensor): Its shape is (1, time, feature_size) - """ - - if self.xscale: - x = x * self.xscale - - end_pos = self.left_context + self.right_context + 1 - pos_emb = self.pe[:, :end_pos] - if self.dropout_emb: - pos_emb = self.dropout_emb(pos_emb) - return self.dropout(x), pos_emb diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multichannel_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multichannel_modules.py deleted file mode 100644 index 04ab9985d6415838d203cb69a9f74156dd86cd95..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/multichannel_modules.py +++ /dev/null @@ -1,780 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -from typing import Callable, Optional - -import torch - -from nemo.collections.asr.parts.submodules.multi_head_attention import MultiHeadAttention -from nemo.core.classes import NeuralModule, typecheck -from nemo.core.neural_types import AudioSignal, FloatType, NeuralType, SpectrogramType -from nemo.utils import logging - -try: - import torchaudio - - HAVE_TORCHAUDIO = True -except ModuleNotFoundError: - HAVE_TORCHAUDIO = False - - -class ChannelAugment(NeuralModule): - """Randomly permute and selects a subset of channels. - - Args: - permute_channels (bool): Apply a random permutation of channels. - num_channels_min (int): Minimum number of channels to select. - num_channels_max (int): Max number of channels to select. - rng: Optional, random generator. - seed: Optional, seed for the generator. 
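Editor's note: the channel augmentation described above boils down to a random permutation followed by keeping a random-sized subset of channels. A minimal functional sketch (the RNG seeding, type checks, and bounds validation of the full module are omitted; `channel_augment` and the toy shapes are illustrative):

```python
import random
import torch

def channel_augment(x, num_channels_min=1, num_channels_max=None, permute=True, rng=None):
    """x: (B, C, T). Randomly permute channels and keep a random-sized subset,
    mirroring the augmentation described above."""
    rng = rng or random.Random()
    num_in = x.size(1)
    num_max = num_in if num_channels_max is None else num_channels_max
    num_out = rng.randint(num_channels_min, num_max)
    channels = list(range(num_in))
    if permute:
        rng.shuffle(channels)
    return x[:, channels[:num_out], :]

x = torch.randn(2, 6, 16000)   # batch=2, 6 microphones, 1 s of audio at 16 kHz
print(channel_augment(x, rng=random.Random(0)).shape)
```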
- """ - - def __init__( - self, - permute_channels: bool = True, - num_channels_min: int = 1, - num_channels_max: Optional[int] = None, - rng: Optional[Callable] = None, - seed: Optional[int] = None, - ): - super().__init__() - - self._rng = random.Random(seed) if rng is None else rng - self.permute_channels = permute_channels - self.num_channels_min = num_channels_min - self.num_channels_max = num_channels_max - - if num_channels_max is not None and num_channels_min > num_channels_max: - raise ValueError( - f'Min number of channels {num_channels_min} cannot be greater than max number of channels {num_channels_max}' - ) - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tpermute_channels: %s', self.permute_channels) - logging.debug('\tnum_channels_min: %s', self.num_channels_min) - logging.debug('\tnum_channels_max: %s', self.num_channels_max) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'T'), AudioSignal()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'C', 'T'), AudioSignal()), - } - - @typecheck() - @torch.no_grad() - def forward(self, input: torch.Tensor) -> torch.Tensor: - # Expecting (B, C, T) - assert input.ndim == 3, f'Expecting input with shape (B, C, T)' - num_channels_in = input.size(1) - - if num_channels_in < self.num_channels_min: - raise RuntimeError( - f'Number of input channels ({num_channels_in}) is smaller than the min number of output channels ({self.num_channels_min})' - ) - - num_channels_max = num_channels_in if self.num_channels_max is None else self.num_channels_max - num_channels_out = self._rng.randint(self.num_channels_min, num_channels_max) - - channels = list(range(num_channels_in)) - - if self.permute_channels: - self._rng.shuffle(channels) - - channels = channels[:num_channels_out] - - return input[:, channels, :] - - -class TransformAverageConcatenate(NeuralModule): - """Apply transform-average-concatenate across channels. - We're using a version from [2]. 
- - Args: - in_features: Number of input features - out_features: Number of output features - - References: - [1] Luo et al, End-to-end Microphone Permutation and Number Invariant Multi-channel Speech Separation, 2019 - [2] Yoshioka et al, VarArray: Array-Geometry-Agnostic Continuous Speech Separation, 2022 - """ - - def __init__(self, in_features: int, out_features: Optional[int] = None): - super().__init__() - - if out_features is None: - out_features = in_features - - # Parametrize with the total number of features (needs to be divisible by two due to stacking) - if out_features % 2 != 0: - raise ValueError(f'Number of output features should be divisible by two, currently set to {out_features}') - - self.transform_channel = torch.nn.Sequential( - torch.nn.Linear(in_features, out_features // 2, bias=False), torch.nn.ReLU() - ) - self.transform_average = torch.nn.Sequential( - torch.nn.Linear(in_features, out_features // 2, bias=False), torch.nn.ReLU() - ) - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tin_features: %d', in_features) - logging.debug('\tout_features: %d', out_features) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, input: torch.Tensor) -> torch.Tensor: - """ - Args: - input: shape (B, M, in_features, T) - - Returns: - Output tensor with shape shape (B, M, out_features, T) - """ - B, M, F, T = input.shape - - # (B, M, F, T) -> (B, T, M, F) - input = input.permute(0, 3, 1, 2) - - # transform and average across channels - average = self.transform_average(input) - average = torch.mean(average, dim=-2, keepdim=True) - # view with the number of channels expanded to M - average = average.expand(-1, -1, M, -1) - - # transform each channel - transform = self.transform_channel(input) - - # concatenate along feature dimension - output = torch.cat([transform, average], dim=-1) - - # Return to the original layout - # (B, T, M, F) -> (B, M, F, T) - output = output.permute(0, 2, 3, 1) - - return output - - -class TransformAttendConcatenate(NeuralModule): - """Apply transform-attend-concatenate across channels. - The output is a concatenation of transformed channel and MHA - over channels. 
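Editor's note: the transform-average-concatenate data flow above is compact but the permutes can obscure it. Below is a minimal module with the same flow; the class name `TinyTAC` and the sizes in the usage line are illustrative, not from the repository.

```python
import torch
import torch.nn as nn

class TinyTAC(nn.Module):
    """Minimal transform-average-concatenate over channels, mirroring the module
    above: a per-channel transform and a broadcast channel-average transform,
    concatenated along the feature axis."""

    def __init__(self, in_features, out_features):
        super().__init__()
        assert out_features % 2 == 0, "output features are split in half for the two branches"
        self.transform_channel = nn.Sequential(
            nn.Linear(in_features, out_features // 2, bias=False), nn.ReLU()
        )
        self.transform_average = nn.Sequential(
            nn.Linear(in_features, out_features // 2, bias=False), nn.ReLU()
        )

    def forward(self, x):
        # x: (B, M, F, T) -> (B, T, M, F) so the Linear layers act on the feature axis
        B, M, F, T = x.shape
        x = x.permute(0, 3, 1, 2)
        avg = self.transform_average(x).mean(dim=-2, keepdim=True).expand(-1, -1, M, -1)
        per_channel = self.transform_channel(x)
        out = torch.cat([per_channel, avg], dim=-1)   # (B, T, M, out_features)
        return out.permute(0, 2, 3, 1)                # back to (B, M, out_features, T)

x = torch.randn(2, 4, 16, 10)        # batch=2, channels=4, features=16, frames=10
print(TinyTAC(16, 32)(x).shape)      # torch.Size([2, 4, 32, 10])
```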
- - Args: - in_features: Number of input features - out_features: Number of output features - n_head: Number of heads for the MHA module - dropout_rate: Dropout rate for the MHA module - - References: - - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023 - """ - - def __init__(self, in_features: int, out_features: Optional[int] = None, n_head: int = 4, dropout_rate: float = 0): - super().__init__() - - if out_features is None: - out_features = in_features - - # Parametrize with the total number of features (needs to be divisible by two due to stacking) - if out_features % 2 != 0: - raise ValueError(f'Number of output features should be divisible by two, currently set to {out_features}') - - self.transform_channel = torch.nn.Sequential( - torch.nn.Linear(in_features, out_features // 2, bias=False), torch.nn.ReLU() - ) - self.transform_attend = torch.nn.Sequential( - torch.nn.Linear(in_features, out_features // 2, bias=False), torch.nn.ReLU() - ) - self.attention = MultiHeadAttention(n_head=n_head, n_feat=out_features // 2, dropout_rate=dropout_rate) - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tin_features: %d', in_features) - logging.debug('\tout_features: %d', out_features) - logging.debug('\tn_head: %d', n_head) - logging.debug('\tdropout_rate: %f', dropout_rate) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, input: torch.Tensor) -> torch.Tensor: - """ - Args: - input: shape (B, M, in_features, T) - - Returns: - Output tensor with shape shape (B, M, out_features, T) - """ - B, M, F, T = input.shape - - # (B, M, F, T) -> (B, T, M, F) - input = input.permute(0, 3, 1, 2) - input = input.reshape(B * T, M, F) - - # transform each channel - transform = self.transform_channel(input) - - # attend - attend = self.transform_attend(input) - # attention across channels - attend = self.attention(query=attend, key=attend, value=attend, mask=None) - - # concatenate along feature dimension - output = torch.cat([transform, attend], dim=-1) - - # return to the original layout - output = output.view(B, T, M, -1) - - # (B, T, M, num_features) -> (B, M, num_features, T) - output = output.permute(0, 2, 3, 1) - - return output - - -class ChannelAveragePool(NeuralModule): - """Apply average pooling across channels. - """ - - def __init__(self): - super().__init__() - logging.debug('Initialized %s', self.__class__.__name__) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, input: torch.Tensor) -> torch.Tensor: - """ - Args: - input: shape (B, M, F, T) - - Returns: - Output tensor with shape shape (B, F, T) - """ - return torch.mean(input, dim=-3) - - -class ChannelAttentionPool(NeuralModule): - """Use attention pooling to aggregate information across channels. - First apply MHA across channels and then apply averaging. 
- - Args: - in_features: Number of input features - out_features: Number of output features - n_head: Number of heads for the MHA module - dropout_rate: Dropout rate for the MHA module - - References: - - Wang et al, Neural speech separation using sparially distributed microphones, 2020 - - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023 - """ - - def __init__(self, in_features: int, n_head: int = 1, dropout_rate: float = 0): - super().__init__() - self.in_features = in_features - self.attention = MultiHeadAttention(n_head=n_head, n_feat=in_features, dropout_rate=dropout_rate) - - logging.debug('Initialized %s with', self.__class__.__name__) - logging.debug('\tin_features: %d', in_features) - logging.debug('\tnum_heads: %d', n_head) - logging.debug('\tdropout_rate: %d', dropout_rate) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, input: torch.Tensor) -> torch.Tensor: - """ - Args: - input: shape (B, M, F, T) - - Returns: - Output tensor with shape shape (B, F, T) - """ - B, M, F, T = input.shape - - # (B, M, F, T) -> (B, T, M, F) - input = input.permute(0, 3, 1, 2) - input = input.reshape(B * T, M, F) - - # attention across channels - output = self.attention(query=input, key=input, value=input, mask=None) - - # return to the original layout - output = output.view(B, T, M, -1) - - # (B, T, M, num_features) -> (B, M, out_features, T) - output = output.permute(0, 2, 3, 1) - - # average across channels - output = torch.mean(output, axis=-3) - - return output - - -class ParametricMultichannelWienerFilter(NeuralModule): - """Parametric multichannel Wiener filter, with an adjustable - tradeoff between noise reduction and speech distortion. - It supports automatic reference channel selection based - on the estimated output SNR. - - Args: - beta: Parameter of the parameteric filter, tradeoff between noise reduction - and speech distortion (0: MVDR, 1: MWF). - rank: Rank assumption for the speech covariance matrix. - postfilter: Optional postfilter. If None, no postfilter is applied. - ref_channel: Optional, reference channel. If None, it will be estimated automatically. - ref_hard: If true, estimate a hard (one-hot) reference. If false, a soft reference. - ref_hard_use_grad: If true, use straight-through gradient when using the hard reference - ref_subband_weighting: If true, use subband weighting when estimating reference channel - num_subbands: Optional, used to determine the parameter size for reference estimation - diag_reg: Optional, diagonal regularization for the multichannel filter - eps: Small regularization constant to avoid division by zero - - References: - - Souden et al, On Optimal Frequency-Domain Multichannel Linear Filtering for Noise Reduction, 2010 - """ - - def __init__( - self, - beta: float = 1.0, - rank: str = 'one', - postfilter: Optional[str] = None, - ref_channel: Optional[int] = None, - ref_hard: bool = True, - ref_hard_use_grad: bool = True, - ref_subband_weighting: bool = False, - num_subbands: Optional[int] = None, - diag_reg: Optional[float] = 1e-6, - eps: float = 1e-8, - ): - if not HAVE_TORCHAUDIO: - logging.error('Could not import torchaudio. 
Some features might not work.') - - raise ModuleNotFoundError( - f"torchaudio is not installed but is necessary to instantiate a {self.__class__.__name__}" - ) - - super().__init__() - - # Parametric filter - # 0=MVDR, 1=MWF - self.beta = beta - - # Rank - # Assumed rank for the signal covariance matrix (psd_s) - self.rank = rank - - if self.rank == 'full' and self.beta == 0: - raise ValueError(f'Rank {self.rank} is not compatible with beta {self.beta}.') - - # Postfilter, applied on the output of the multichannel filter - if postfilter not in [None, 'ban']: - raise ValueError(f'Postfilter {postfilter} is not supported.') - self.postfilter = postfilter - - # Regularization - if diag_reg is not None and diag_reg < 0: - raise ValueError(f'Diagonal regularization {diag_reg} must be positive.') - self.diag_reg = diag_reg - - if eps <= 0: - raise ValueError(f'Epsilon {eps} must be positive.') - self.eps = eps - - # PSD estimator - self.psd = torchaudio.transforms.PSD() - - # Reference channel - self.ref_channel = ref_channel - if self.ref_channel == 'max_snr': - self.ref_estimator = ReferenceChannelEstimatorSNR( - hard=ref_hard, - hard_use_grad=ref_hard_use_grad, - subband_weighting=ref_subband_weighting, - num_subbands=num_subbands, - eps=eps, - ) - else: - self.ref_estimator = None - # Flag to determine if the filter is MISO or MIMO - self.is_mimo = self.ref_channel is None - - logging.debug('Initialized %s', self.__class__.__name__) - logging.debug('\tbeta: %f', self.beta) - logging.debug('\trank: %s', self.rank) - logging.debug('\tpostfilter: %s', self.postfilter) - logging.debug('\tdiag_reg: %g', self.diag_reg) - logging.debug('\teps: %g', self.eps) - logging.debug('\tref_channel: %s', self.ref_channel) - logging.debug('\tis_mimo: %s', self.is_mimo) - - @staticmethod - def trace(x: torch.Tensor, keepdim: bool = False) -> torch.Tensor: - """Calculate trace of matrix slices over the last - two dimensions in the input tensor. - - Args: - x: tensor, shape (..., C, C) - - Returns: - Trace for each (C, C) matrix. shape (...) - """ - trace = torch.diagonal(x, dim1=-2, dim2=-1).sum(-1) - if keepdim: - trace = trace.unsqueeze(-1).unsqueeze(-1) - return trace - - def apply_diag_reg(self, psd: torch.Tensor) -> torch.Tensor: - """Apply diagonal regularization on psd. - - Args: - psd: tensor, shape (..., C, C) - - Returns: - Tensor, same shape as input. - """ - # Regularization: diag_reg * trace(psd) + eps - diag_reg = self.diag_reg * self.trace(psd).real + self.eps - - # Apply regularization - psd = psd + torch.diag_embed(diag_reg.unsqueeze(-1) * torch.ones(psd.shape[-1], device=psd.device)) - - return psd - - def apply_filter(self, input: torch.Tensor, filter: torch.Tensor) -> torch.Tensor: - """Apply the MIMO filter on the input. 
- - Args: - input: batch with C input channels, shape (B, C, F, T) - filter: batch of C-input, M-output filters, shape (B, F, C, M) - - Returns: - M-channel filter output, shape (B, M, F, T) - """ - if not filter.is_complex(): - raise TypeError(f'Expecting complex-valued filter, found {filter.dtype}') - - if not input.is_complex(): - raise TypeError(f'Expecting complex-valued input, found {input.dtype}') - - if filter.ndim != 4 or filter.size(-2) != input.size(-3) or filter.size(-3) != input.size(-2): - raise ValueError(f'Filter shape {filter.shape}, not compatible with input shape {input.shape}') - - output = torch.einsum('bfcm,bcft->bmft', filter.conj(), input) - - return output - - def apply_ban(self, input: torch.Tensor, filter: torch.Tensor, psd_n: torch.Tensor) -> torch.Tensor: - """Apply blind analytic normalization postfilter. Note that this normalization has been - derived for the GEV beamformer in [1]. More specifically, the BAN postfilter aims to scale GEV - to satisfy the distortionless constraint and the final analytical expression is derived using - an assumption on the norm of the transfer function. - However, this may still be useful in some instances. - - Args: - input: batch with M output channels (B, M, F, T) - filter: batch of C-input, M-output filters, shape (B, F, C, M) - psd_n: batch of noise PSDs, shape (B, F, C, C) - - Returns: - Filtere input, shape (B, M, F, T) - - References: - - Warsitz and Haeb-Umbach, Blind Acoustic Beamforming Based on Generalized Eigenvalue Decomposition, 2007 - """ - # number of input channel, used to normalize the numerator - num_inputs = filter.size(-2) - numerator = torch.einsum('bfcm,bfci,bfij,bfjm->bmf', filter.conj(), psd_n, psd_n, filter) - numerator = torch.sqrt(numerator.abs() / num_inputs) - - denominator = torch.einsum('bfcm,bfci,bfim->bmf', filter.conj(), psd_n, filter) - denominator = denominator.abs() - - # Scalar filter per output channel, frequency and batch - # shape (B, M, F) - ban = numerator / (denominator + self.eps) - - input = ban[..., None] * input - - return input - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'input': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - 'mask_s': NeuralType(('B', 'D', 'T'), FloatType()), - 'mask_n': NeuralType(('B', 'D', 'T'), FloatType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), - } - - @typecheck() - def forward(self, input: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor: - """Return processed signal. - The output has either one channel (M=1) if a ref_channel is selected, - or the same number of channels as the input (M=C) if ref_channel is None. 
- - Args: - input: Input signal, complex tensor with shape (B, C, F, T) - mask_s: Mask for the desired signal, shape (B, F, T) - mask_n: Mask for the undesired noise, shape (B, F, T) - - Returns: - Processed signal, shape (B, M, F, T) - """ - iodtype = input.dtype - - with torch.cuda.amp.autocast(enabled=False): - # Convert to double - input = input.cdouble() - mask_s = mask_s.double() - mask_n = mask_n.double() - - # Calculate signal statistics - psd_s = self.psd(input, mask_s) - psd_n = self.psd(input, mask_n) - - if self.rank == 'one': - # Calculate filter W using (18) in [1] - # Diagonal regularization - if self.diag_reg: - psd_n = self.apply_diag_reg(psd_n) - - # MIMO filter - # (B, F, C, C) - W = torch.linalg.solve(psd_n, psd_s) - lam = self.trace(W, keepdim=True).real - W = W / (self.beta + lam + self.eps) - elif self.rank == 'full': - # Calculate filter W using (15) in [1] - psd_sn = psd_s + self.beta * psd_n - - if self.diag_reg: - psd_sn = self.apply_diag_reg(psd_sn) - - # MIMO filter - # (B, F, C, C) - W = torch.linalg.solve(psd_sn, psd_s) - else: - raise RuntimeError(f'Unexpected rank {self.rank}') - - if torch.jit.isinstance(self.ref_channel, int): - # Fixed ref channel - # (B, F, C, 1) - W = W[..., self.ref_channel].unsqueeze(-1) - elif self.ref_estimator is not None: - # Estimate ref channel tensor (one-hot or soft across C) - # (B, C) - ref_channel_tensor = self.ref_estimator(W=W, psd_s=psd_s, psd_n=psd_n).to(W.dtype) - # Weighting across channels - # (B, F, C, 1) - W = torch.sum(W * ref_channel_tensor[:, None, None, :], dim=-1, keepdim=True) - - output = self.apply_filter(input=input, filter=W) - - # Optional: postfilter - if self.postfilter == 'ban': - output = self.apply_ban(input=output, filter=W, psd_n=psd_n) - - return output.to(iodtype) - - -class ReferenceChannelEstimatorSNR(NeuralModule): - """Estimate a reference channel by selecting the reference - that maximizes the output SNR. It returns one-hot encoded - vector or a soft reference. - - A straight-through estimator is used for gradient when using - hard reference. - - Args: - hard: If true, use hard estimate of ref channel. - If false, use a soft estimate across channels. - hard_use_grad: Use straight-through estimator for - the gradient. - subband_weighting: If true, use subband weighting when - adding across subband SNRs. If false, use average - across subbands. 
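
The core of the deleted `forward()` above is a batched linear solve followed by a trace normalization. A minimal sketch of that rank-one filter computation, assuming well-conditioned noise covariances; the function name is illustrative only:

```python
import torch

def rank_one_mwf(psd_s: torch.Tensor, psd_n: torch.Tensor, beta: float = 0.0, eps: float = 1e-8) -> torch.Tensor:
    """Parametric multichannel Wiener filter, (B, F, C, C) -> (B, F, C, C).
    beta=0 corresponds to the MVDR special case of the rank-one solution."""
    w = torch.linalg.solve(psd_n, psd_s)                     # psd_n^{-1} @ psd_s
    lam = torch.diagonal(w, dim1=-2, dim2=-1).sum(-1).real   # trace, shape (B, F)
    return w / (beta + lam[..., None, None] + eps)

# Toy covariances: Hermitian PSD matrices for B=1 batch, F=3 bins, C=2 channels.
a = torch.randn(1, 3, 2, 2, dtype=torch.cfloat)
b = torch.randn(1, 3, 2, 2, dtype=torch.cfloat)
psd_s = a @ a.conj().transpose(-2, -1)
psd_n = b @ b.conj().transpose(-2, -1) + 1e-3 * torch.eye(2, dtype=torch.cfloat)
print(rank_one_mwf(psd_s, psd_n).shape)  # torch.Size([1, 3, 2, 2])
```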
- - References: - Boeddeker et al, Front-End Processing for the CHiME-5 Dinner Party Scenario, 2018 - """ - - def __init__( - self, - hard: bool = True, - hard_use_grad: bool = True, - subband_weighting: bool = False, - num_subbands: Optional[int] = None, - eps: float = 1e-8, - ): - super().__init__() - - self.hard = hard - self.hard_use_grad = hard_use_grad - self.subband_weighting = subband_weighting - self.eps = eps - - if subband_weighting and num_subbands is None: - raise ValueError(f'Number of subbands must be provided when using subband_weighting={subband_weighting}.') - # Subband weighting - self.weight_s = torch.nn.Parameter(torch.ones(num_subbands)) if subband_weighting else None - self.weight_n = torch.nn.Parameter(torch.ones(num_subbands)) if subband_weighting else None - - logging.debug('Initialized %s', self.__class__.__name__) - logging.debug('\thard: %d', self.hard) - logging.debug('\thard_use_grad: %d', self.hard_use_grad) - logging.debug('\tsubband_weighting: %d', self.subband_weighting) - logging.debug('\tnum_subbands: %s', num_subbands) - logging.debug('\teps: %e', self.eps) - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - 'W': NeuralType(('B', 'D', 'C', 'C'), SpectrogramType()), - 'psd_s': NeuralType(('B', 'D', 'C', 'C'), SpectrogramType()), - 'psd_n': NeuralType(('B', 'D', 'C', 'C'), SpectrogramType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return { - 'output': NeuralType(('B', 'C'), FloatType()), - } - - @typecheck() - def forward(self, W: torch.Tensor, psd_s: torch.Tensor, psd_n: torch.Tensor) -> torch.Tensor: - """ - Args: - W: Multichannel input multichannel output filter, shape (B, F, C, M), where - C is the number of input channels and M is the number of output channels - psd_s: Covariance for the signal, shape (B, F, C, C) - psd_n: Covariance for the noise, shape (B, F, C, C) - - Returns: - One-hot or soft reference channel, shape (B, M) - """ - if self.subband_weighting: - # (B, F, M) - pow_s = torch.einsum('...jm,...jk,...km->...m', W.conj(), psd_s, W).abs() - pow_n = torch.einsum('...jm,...jk,...km->...m', W.conj(), psd_n, W).abs() - - # Subband-weighting - # (B, F, M) -> (B, M) - pow_s = torch.sum(pow_s * self.weight_s.softmax(dim=0).unsqueeze(1), dim=-2) - pow_n = torch.sum(pow_n * self.weight_n.softmax(dim=0).unsqueeze(1), dim=-2) - else: - # Sum across f as well - # (B, F, C, M), (B, F, C, C), (B, F, C, M) -> (B, M) - pow_s = torch.einsum('...fjm,...fjk,...fkm->...m', W.conj(), psd_s, W).abs() - pow_n = torch.einsum('...fjm,...fjk,...fkm->...m', W.conj(), psd_n, W).abs() - - # Estimated SNR per channel (B, C) - snr = pow_s / (pow_n + self.eps) - snr = 10 * torch.log10(snr + self.eps) - - # Soft reference - ref_soft = snr.softmax(dim=-1) - - if self.hard: - _, idx = ref_soft.max(dim=-1, keepdim=True) - ref_hard = torch.zeros_like(snr).scatter(-1, idx, 1.0) - if self.hard_use_grad: - # Straight-through for gradient - # Propagate ref_soft gradient, as if thresholding is identity - ref = ref_hard - ref_soft.detach() + ref_soft - else: - # No gradient - ref = ref_hard - else: - ref = ref_soft - - return ref diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py deleted file mode 100644 index e27f9dac043ea28af26ec96e3c37a0000651f034..0000000000000000000000000000000000000000 --- 
a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_beam_decoding.py +++ /dev/null @@ -1,1504 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from tqdm import tqdm - -from nemo.collections.asr.modules import rnnt_abstract -from nemo.collections.asr.parts.utils.rnnt_utils import ( - HATJointOutput, - Hypothesis, - NBestHypotheses, - is_prefix, - select_k_expansions, -) -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import AcousticEncodedRepresentation, HypothesisType, LengthsType, NeuralType -from nemo.utils import logging - -try: - import kenlm - - KENLM_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - KENLM_AVAILABLE = False - - -def pack_hypotheses(hypotheses: List[Hypothesis]) -> List[Hypothesis]: - for idx, hyp in enumerate(hypotheses): # type: rnnt_utils.Hypothesis - hyp.y_sequence = torch.tensor(hyp.y_sequence, dtype=torch.long) - - if hyp.dec_state is not None: - hyp.dec_state = _states_to_device(hyp.dec_state) - - # Remove -1 from timestep - if hyp.timestep is not None and len(hyp.timestep) > 0 and hyp.timestep[0] == -1: - hyp.timestep = hyp.timestep[1:] - - return hypotheses - - -def _states_to_device(dec_state, device='cpu'): - if torch.is_tensor(dec_state): - dec_state = dec_state.to(device) - - elif isinstance(dec_state, (list, tuple)): - dec_state = tuple(_states_to_device(dec_i, device) for dec_i in dec_state) - - return dec_state - - -class BeamRNNTInfer(Typing): - """ - Beam Search implementation ported from ESPNet implementation - - https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py - - Sequence level beam decoding or batched-beam decoding, performed auto-repressively - depending on the search type chosen. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - - beam_size: number of beams for beam search. Must be a positive integer >= 1. - If beam size is 1, defaults to stateful greedy search. - This greedy search might result in slightly different results than - the greedy results obtained by GreedyRNNTInfer due to implementation differences. 
-
-            For accurate greedy results, please use GreedyRNNTInfer or GreedyBatchedRNNTInfer.
-
-        search_type: str representing the type of beam search to perform.
-            Must be one of ['beam', 'tsd', 'alsd']. 'nsc' is currently not supported.
-
-            Algorithm used:
-            `beam` - basic beam search strategy. Larger beams generally result in better decoding,
-                however the time required for the search also grows steadily.
-
-            `tsd` - time synchronous decoding. Please refer to the paper:
-                [Alignment-Length Synchronous Decoding for RNN Transducer](https://ieeexplore.ieee.org/document/9053040)
-                for details on the algorithm implemented.
-
-                Time synchronous decoding (TSD) execution time grows by the factor T * max_symmetric_expansions.
-                For longer sequences, T is greater, and can therefore take a long time for beams to obtain
-                good results. This also requires greater memory to execute.
-
-            `alsd` - alignment-length synchronous decoding. Please refer to the paper:
-                [Alignment-Length Synchronous Decoding for RNN Transducer](https://ieeexplore.ieee.org/document/9053040)
-                for details on the algorithm implemented.
-
-                Alignment-length synchronous decoding (ALSD) execution time is faster than TSD, with growth
-                factor of T + U_max, where U_max is the maximum target length expected during execution.
-
-                Generally, T + U_max < T * max_symmetric_expansions. However, ALSD beams are non-unique,
-                therefore it is required to use larger beam sizes to achieve the same (or close to the same)
-                decoding accuracy as TSD.
-
-                For a given decoding accuracy, it is possible to attain faster decoding via ALSD than TSD.
-
-            `maes` - modified adaptive expansion search. Please refer to the paper:
-                [Accelerating RNN Transducer Inference via Adaptive Expansion Search](https://ieeexplore.ieee.org/document/9250505)
-
-                Modified adaptive expansion search (mAES) execution time is adaptive w.r.t. the
-                number of expansions (for tokens) required per timestep. The number of expansions can usually
-                be constrained to 1 or 2, and in most cases 2 is sufficient.
-
-                This beam search technique can possibly obtain superior WER while sacrificing some evaluation time.
-
-        score_norm: bool, whether to normalize the scores of the log probabilities.
-
-        return_best_hypothesis: bool, decides whether to return a single hypothesis (the best out of N),
-            or return all N hypotheses (sorted with the best score first). The container class changes based
-            on this flag.
-            When set to True (default), returns a single Hypothesis.
-            When set to False, returns a NBestHypotheses container, which contains a list of Hypothesis.
-
-        # The following arguments are specific to the chosen `search_type`
-
-        tsd_max_sym_exp_per_step: Used for `search_type=tsd`. The maximum symmetric expansions allowed
-            per timestep during beam search. Larger values should be used to attempt decoding of longer
-            sequences, but this in turn increases execution time and memory usage.
-
-        alsd_max_target_len: Used for `search_type=alsd`. The maximum expected target sequence length
-            during beam search. Larger values allow decoding of longer sequences at the expense of
-            execution time and memory.
-
-        # The following two flags are placeholders and unused until `nsc` implementation is stabilized.
-        nsc_max_timesteps_expansion: Unused int.
-
-        nsc_prefix_alpha: Unused int.
-
-        # mAES flags
-        maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient. int > 1.
-
-        maes_prefix_alpha: Maximum prefix length in prefix search.
Must be an integer, and is advised to keep this as 1 - in order to reduce expensive beam search cost later. int >= 0. - - maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size. - Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0, - and affects the speed of inference since large values will perform large beam search in the next step. - - maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the expansions. - The default (2.3) is selected from the paper. It performs a comparison (max_log_prob - gamma <= log_prob[v]) - where v is all vocabulary indices in the Vocab set and max_log_prob is the "most" likely token to be - predicted. Gamma therefore provides a margin of additional tokens which can be potential candidates for - expansion apart from the "most likely" candidate. - Lower values will reduce the number of expansions (by increasing pruning-by-value, thereby improving speed - but hurting accuracy). Higher values will increase the number of expansions (by reducing pruning-by-value, - thereby reducing speed but potentially improving accuracy). This is a hyper parameter to be experimentally - tuned on a validation set. - - softmax_temperature: Scales the logits of the joint prior to computing log_softmax. - - preserve_alignments: Bool flag which preserves the history of alignments generated during - beam decoding (sample). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of Tensor (of length V + 1). - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - - NOTE: `preserve_alignments` is an invalid argument for any `search_type` - other than basic beam search. - - ngram_lm_model: str - The path to the N-gram LM - ngram_lm_alpha: float - Alpha weight of N-gram LM - tokens_type: str - Tokenization type ['subword', 'char'] - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "partial_hypotheses": [NeuralType(elements_type=HypothesisType(), optional=True)], # must always be last - } - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - return {"predictions": [NeuralType(elements_type=HypothesisType())]} - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - beam_size: int, - search_type: str = 'default', - score_norm: bool = True, - return_best_hypothesis: bool = True, - tsd_max_sym_exp_per_step: Optional[int] = 50, - alsd_max_target_len: Union[int, float] = 1.0, - nsc_max_timesteps_expansion: int = 1, - nsc_prefix_alpha: int = 1, - maes_num_steps: int = 2, - maes_prefix_alpha: int = 1, - maes_expansion_gamma: float = 2.3, - maes_expansion_beta: int = 2, - language_model: Optional[Dict[str, Any]] = None, - softmax_temperature: float = 1.0, - preserve_alignments: bool = False, - ngram_lm_model: Optional[str] = None, - ngram_lm_alpha: float = 0.0, - hat_subtract_ilm: bool = False, - hat_ilm_weight: float = 0.0, - ): - self.decoder = decoder_model - self.joint = joint_model - - self.blank = decoder_model.blank_idx - self.vocab_size = decoder_model.vocab_size - self.search_type = search_type - self.return_best_hypothesis = return_best_hypothesis - - if beam_size < 1: - raise ValueError("Beam search size cannot be less than 1!") - - self.beam_size = beam_size - self.score_norm = score_norm - self.max_candidates = beam_size - - if self.beam_size == 1: - logging.info("Beam size of 1 was used, switching to sample level `greedy_search`") - self.search_algorithm = self.greedy_search - elif search_type == "default": - self.search_algorithm = self.default_beam_search - elif search_type == "tsd": - self.search_algorithm = self.time_sync_decoding - elif search_type == "alsd": - self.search_algorithm = self.align_length_sync_decoding - elif search_type == "nsc": - raise NotImplementedError("`nsc` (Constrained Beam Search) has not been implemented.") - # self.search_algorithm = self.nsc_beam_search - elif search_type == "maes": - self.search_algorithm = self.modified_adaptive_expansion_search - else: - raise NotImplementedError( - f"The search type ({search_type}) supplied is not supported!\n" - f"Please use one of : (default, tsd, alsd, nsc)" - ) - - if tsd_max_sym_exp_per_step is None: - tsd_max_sym_exp_per_step = -1 - - if search_type in ['tsd', 'alsd', 'nsc'] and not self.decoder.blank_as_pad: - raise ValueError( - f"Search type was chosen as '{search_type}', however the decoder module provided " - f"does not support the `blank` token as a pad value. {search_type} requires " - f"the blank token as pad value support in order to perform batched beam search." - f"Please chose one of the other beam search methods, or re-train your model " - f"with this support." - ) - - self.tsd_max_symmetric_expansion_per_step = tsd_max_sym_exp_per_step - self.alsd_max_target_length = alsd_max_target_len - self.nsc_max_timesteps_expansion = nsc_max_timesteps_expansion - self.nsc_prefix_alpha = int(nsc_prefix_alpha) - self.maes_prefix_alpha = int(maes_prefix_alpha) - self.maes_num_steps = int(maes_num_steps) - self.maes_expansion_gamma = float(maes_expansion_gamma) - self.maes_expansion_beta = int(maes_expansion_beta) - - if self.search_type == 'maes' and self.maes_prefix_alpha < 0: - raise ValueError("`maes_prefix_alpha` must be a positive integer.") - - if self.search_type == 'maes' and self.vocab_size < beam_size + maes_expansion_beta: - raise ValueError( - f"beam_size ({beam_size}) + expansion_beta ({maes_expansion_beta}) " - f"should be smaller or equal to vocabulary size ({self.vocab_size})." 
- ) - - if search_type == 'maes': - self.max_candidates += maes_expansion_beta - - if self.search_type == 'maes' and self.maes_num_steps < 2: - raise ValueError("`maes_num_steps` must be greater than 1.") - - if softmax_temperature != 1.0 and language_model is not None: - logging.warning( - "Softmax temperature is not supported with LM decoding." "Setting softmax-temperature value to 1.0." - ) - - self.softmax_temperature = 1.0 - else: - self.softmax_temperature = softmax_temperature - self.language_model = language_model - self.preserve_alignments = preserve_alignments - - self.token_offset = 0 - - if ngram_lm_model: - if KENLM_AVAILABLE: - self.ngram_lm = kenlm.Model(ngram_lm_model) - self.ngram_lm_alpha = ngram_lm_alpha - else: - raise ImportError( - "KenLM package (https://github.com/kpu/kenlm) is not installed. " "Use ngram_lm_model=None." - ) - else: - self.ngram_lm = None - - if hat_subtract_ilm: - assert hasattr(self.joint, "return_hat_ilm") - assert search_type == "maes" - self.hat_subtract_ilm = hat_subtract_ilm - self.hat_ilm_weight = hat_ilm_weight - - @typecheck() - def __call__( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[List[Hypothesis]] = None, - ) -> Union[Hypothesis, NBestHypotheses]: - """Perform general beam search. - - Args: - encoder_output: Encoded speech features (B, D_enc, T_max) - encoded_lengths: Lengths of the encoder outputs - - Returns: - Either a list containing a single Hypothesis (when `return_best_hypothesis=True`, - otherwise a list containing a single NBestHypotheses, which itself contains a list of - Hypothesis. This list is sorted such that the best hypothesis is the first element. - """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - # setup hat outputs mode - return_hat_ilm_default = False - if self.hat_subtract_ilm: - assert hasattr(self.joint, "return_hat_ilm") - return_hat_ilm_default = self.joint.return_hat_ilm - self.joint.return_hat_ilm = self.hat_subtract_ilm - - with torch.no_grad(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - - self.decoder.eval() - self.joint.eval() - - hypotheses = [] - with tqdm( - range(encoder_output.size(0)), - desc='Beam search progress:', - total=encoder_output.size(0), - unit='sample', - ) as idx_gen: - - # Freeze the decoder and joint to prevent recording of gradients - # during the beam loop. - with self.decoder.as_frozen(), self.joint.as_frozen(): - - _p = next(self.joint.parameters()) - dtype = _p.dtype - - # Decode every sample in the batch independently. 
- for batch_idx in idx_gen: - inseq = encoder_output[batch_idx : batch_idx + 1, : encoded_lengths[batch_idx], :] # [1, T, D] - logitlen = encoded_lengths[batch_idx] - - if inseq.dtype != dtype: - inseq = inseq.to(dtype=dtype) - - # Extract partial hypothesis if exists - partial_hypothesis = partial_hypotheses[batch_idx] if partial_hypotheses is not None else None - - # Execute the specific search strategy - nbest_hyps = self.search_algorithm( - inseq, logitlen, partial_hypotheses=partial_hypothesis - ) # sorted list of hypothesis - - # Prepare the list of hypotheses - nbest_hyps = pack_hypotheses(nbest_hyps) - - # Pack the result - if self.return_best_hypothesis: - best_hypothesis = nbest_hyps[0] # type: Hypothesis - else: - best_hypothesis = NBestHypotheses(nbest_hyps) # type: NBestHypotheses - hypotheses.append(best_hypothesis) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - if self.hat_subtract_ilm: - self.joint.return_hat_ilm = return_hat_ilm_default - - return (hypotheses,) - - def sort_nbest(self, hyps: List[Hypothesis]) -> List[Hypothesis]: - """Sort hypotheses by score or score given sequence length. - - Args: - hyps: list of hypotheses - - Return: - hyps: sorted list of hypotheses - """ - if self.score_norm: - return sorted(hyps, key=lambda x: x.score / len(x.y_sequence), reverse=True) - else: - return sorted(hyps, key=lambda x: x.score, reverse=True) - - def greedy_search( - self, h: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[Hypothesis] = None - ) -> List[Hypothesis]: - """Greedy search implementation for transducer. - Generic case when beam size = 1. Results might differ slightly due to implementation details - as compared to `GreedyRNNTInfer` and `GreedyBatchRNNTInfer`. 
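
The `sort_nbest` helper above orders candidates either by raw score or by length-normalized score. A standalone illustration, using a simplified stand-in for `rnnt_utils.Hypothesis`:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Hyp:                      # simplified stand-in for rnnt_utils.Hypothesis
    score: float
    y_sequence: List[int] = field(default_factory=list)

def sort_nbest(hyps: List[Hyp], score_norm: bool = True) -> List[Hyp]:
    # With score_norm=True, longer sequences are not penalized simply for
    # accumulating more (negative) log-probability terms.
    key = (lambda h: h.score / len(h.y_sequence)) if score_norm else (lambda h: h.score)
    return sorted(hyps, key=key, reverse=True)

hyps = [Hyp(-6.0, [1, 2, 3, 4]), Hyp(-2.0, [1])]
print([h.y_sequence for h in sort_nbest(hyps)])          # [[1, 2, 3, 4], [1]]
print([h.y_sequence for h in sort_nbest(hyps, False)])   # [[1], [1, 2, 3, 4]]
```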
- - Args: - h: Encoded speech features (1, T_max, D_enc) - - Returns: - hyp: 1-best decoding results - """ - if self.preserve_alignments: - # Alignments is a 2-dimensional dangling list representing T x U - alignments = [[]] - else: - alignments = None - - # Initialize zero state vectors - dec_state = self.decoder.initialize_state(h) - - # Construct initial hypothesis - hyp = Hypothesis( - score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=encoded_lengths - ) - - if partial_hypotheses is not None: - if len(partial_hypotheses.y_sequence) > 0: - hyp.y_sequence = [int(partial_hypotheses.y_sequence[-1].cpu().numpy())] - hyp.dec_state = partial_hypotheses.dec_state - hyp.dec_state = _states_to_device(hyp.dec_state, h.device) - - cache = {} - - # Initialize state and first token - y, state, _ = self.decoder.score_hypothesis(hyp, cache) - - for i in range(int(encoded_lengths)): - hi = h[:, i : i + 1, :] # [1, 1, D] - - not_blank = True - symbols_added = 0 - - # TODO: Figure out how to remove this hard coding afterwords - while not_blank and (symbols_added < 5): - ytu = torch.log_softmax(self.joint.joint(hi, y) / self.softmax_temperature, dim=-1) # [1, 1, 1, V + 1] - ytu = ytu[0, 0, 0, :] # [V + 1] - - # max() requires float - if ytu.dtype != torch.float32: - ytu = ytu.float() - - logp, pred = torch.max(ytu, dim=-1) # [1, 1] - pred = pred.item() - - if self.preserve_alignments: - # insert logprobs into last timestep - alignments[-1].append((ytu.to('cpu'), torch.tensor(pred, dtype=torch.int32))) - - if pred == self.blank: - not_blank = False - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - alignments.append([]) # blank buffer for next timestep - else: - # Update state and current sequence - hyp.y_sequence.append(int(pred)) - hyp.score += float(logp) - hyp.dec_state = state - hyp.timestep.append(i) - - # Compute next state and token - y, state, _ = self.decoder.score_hypothesis(hyp, cache) - symbols_added += 1 - - # Remove trailing empty list of alignments - if self.preserve_alignments: - if len(alignments[-1]) == 0: - del alignments[-1] - - # attach alignments to hypothesis - hyp.alignments = alignments - - # Remove the original input label if partial hypothesis was provided - if partial_hypotheses is not None: - hyp.y_sequence = hyp.y_sequence[1:] - - return [hyp] - - def default_beam_search( - self, h: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[Hypothesis] = None - ) -> List[Hypothesis]: - """Beam search implementation. 
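
The greedy path above advances one encoder frame at a time and keeps emitting tokens until blank is predicted or a per-frame cap is reached. A compressed, self-contained rendition of that loop; the linear layers are random stand-ins for the real prediction network and joint, not NeMo modules:

```python
import torch

V, BLANK, MAX_SYMBOLS = 8, 0, 5
torch.manual_seed(0)
enc = torch.randn(12, 16)               # (T, D_enc) for a single utterance
pred_proj = torch.nn.Linear(1, 16)      # embeds the last emitted label (toy predictor)
joint_proj = torch.nn.Linear(32, V + 1) # joint over concatenated [enc; pred] features

def joint_logprobs(enc_t: torch.Tensor, last_label: int) -> torch.Tensor:
    dec = pred_proj(torch.tensor([[float(last_label)]]))
    return torch.log_softmax(joint_proj(torch.cat([enc_t[None, :], dec], dim=-1)), dim=-1)[0]

hyp, last = [], BLANK
for t in range(enc.size(0)):
    emitted = 0
    while emitted < MAX_SYMBOLS:
        logp = joint_logprobs(enc[t], last)
        token = int(logp.argmax())
        if token == BLANK:
            break                       # blank: advance to the next frame
        hyp.append(token)               # non-blank: emit and query the joint again
        last = token
        emitted += 1
print(hyp)
```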
- - Args: - x: Encoded speech features (1, T_max, D_enc) - - Returns: - nbest_hyps: N-best decoding results - """ - # Initialize states - beam = min(self.beam_size, self.vocab_size) - beam_k = min(beam, (self.vocab_size - 1)) - blank_tensor = torch.tensor([self.blank], device=h.device, dtype=torch.long) - - # Precompute some constants for blank position - ids = list(range(self.vocab_size + 1)) - ids.remove(self.blank) - - # Used when blank token is first vs last token - if self.blank == 0: - index_incr = 1 - else: - index_incr = 0 - - # Initialize zero vector states - dec_state = self.decoder.initialize_state(h) - - # Initialize first hypothesis for the beam (blank) - kept_hyps = [Hypothesis(score=0.0, y_sequence=[self.blank], dec_state=dec_state, timestep=[-1], length=0)] - cache = {} - - if partial_hypotheses is not None: - if len(partial_hypotheses.y_sequence) > 0: - kept_hyps[0].y_sequence = [int(partial_hypotheses.y_sequence[-1].cpu().numpy())] - kept_hyps[0].dec_state = partial_hypotheses.dec_state - kept_hyps[0].dec_state = _states_to_device(kept_hyps[0].dec_state, h.device) - - if self.preserve_alignments: - kept_hyps[0].alignments = [[]] - - for i in range(int(encoded_lengths)): - hi = h[:, i : i + 1, :] # [1, 1, D] - hyps = kept_hyps - kept_hyps = [] - - while True: - max_hyp = max(hyps, key=lambda x: x.score) - hyps.remove(max_hyp) - - # update decoder state and get next score - y, state, lm_tokens = self.decoder.score_hypothesis(max_hyp, cache) # [1, 1, D] - - # get next token - ytu = torch.log_softmax(self.joint.joint(hi, y) / self.softmax_temperature, dim=-1) # [1, 1, 1, V + 1] - ytu = ytu[0, 0, 0, :] # [V + 1] - - # preserve alignments - if self.preserve_alignments: - logprobs = ytu.cpu().clone() - - # remove blank token before top k - top_k = ytu[ids].topk(beam_k, dim=-1) - - # Two possible steps - blank token or non-blank token predicted - ytu = ( - torch.cat((top_k[0], ytu[self.blank].unsqueeze(0))), - torch.cat((top_k[1] + index_incr, blank_tensor)), - ) - - # for each possible step - for logp, k in zip(*ytu): - # construct hypothesis for step - new_hyp = Hypothesis( - score=(max_hyp.score + float(logp)), - y_sequence=max_hyp.y_sequence[:], - dec_state=max_hyp.dec_state, - lm_state=max_hyp.lm_state, - timestep=max_hyp.timestep[:], - length=encoded_lengths, - ) - - if self.preserve_alignments: - new_hyp.alignments = copy.deepcopy(max_hyp.alignments) - - # if current token is blank, dont update sequence, just store the current hypothesis - if k == self.blank: - kept_hyps.append(new_hyp) - else: - # if non-blank token was predicted, update state and sequence and then search more hypothesis - new_hyp.dec_state = state - new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(i) - - hyps.append(new_hyp) - - # Determine whether the alignment should be blank or token - if self.preserve_alignments: - if k == self.blank: - new_hyp.alignments[-1].append( - (logprobs.clone(), torch.tensor(self.blank, dtype=torch.int32)) - ) - else: - new_hyp.alignments[-1].append( - (logprobs.clone(), torch.tensor(new_hyp.y_sequence[-1], dtype=torch.int32)) - ) - - # keep those hypothesis that have scores greater than next search generation - hyps_max = float(max(hyps, key=lambda x: x.score).score) - kept_most_prob = sorted([hyp for hyp in kept_hyps if hyp.score > hyps_max], key=lambda x: x.score,) - - # If enough hypothesis have scores greater than next search generation, - # stop beam search. 
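
The stopping rule referenced in the comment above compares finished (blank-terminated) hypotheses against the best hypothesis still awaiting expansion. A plain-float sketch with made-up scores:

```python
# Stop expanding once at least `beam` kept hypotheses already score higher than
# the best hypothesis that is still waiting to be expanded.
beam = 2
active_scores = [-4.1, -5.0, -6.3]   # hypotheses still being expanded this frame
kept_scores = [-3.2, -3.9, -4.8]     # hypotheses that ended in blank this frame

best_active = max(active_scores)
kept_better = sorted(s for s in kept_scores if s > best_active)
if len(kept_better) >= beam:
    print("stop search for this frame:", kept_better)
```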
- if len(kept_most_prob) >= beam: - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for kept_h in kept_most_prob: - kept_h.alignments.append([]) # blank buffer for next timestep - - kept_hyps = kept_most_prob - break - - # Remove trailing empty list of alignments - if self.preserve_alignments: - for h in kept_hyps: - if len(h.alignments[-1]) == 0: - del h.alignments[-1] - - # Remove the original input label if partial hypothesis was provided - if partial_hypotheses is not None: - for hyp in kept_hyps: - if hyp.y_sequence[0] == partial_hypotheses.y_sequence[-1] and len(hyp.y_sequence) > 1: - hyp.y_sequence = hyp.y_sequence[1:] - - return self.sort_nbest(kept_hyps) - - def time_sync_decoding( - self, h: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[Hypothesis] = None - ) -> List[Hypothesis]: - """Time synchronous beam search implementation. - Based on https://ieeexplore.ieee.org/document/9053040 - - Args: - h: Encoded speech features (1, T_max, D_enc) - - Returns: - nbest_hyps: N-best decoding results - """ - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - # Precompute some constants for blank position - ids = list(range(self.vocab_size + 1)) - ids.remove(self.blank) - - # Used when blank token is first vs last token - if self.blank == 0: - index_incr = 1 - else: - index_incr = 0 - - # prepare the batched beam states - beam = min(self.beam_size, self.vocab_size) - beam_state = self.decoder.initialize_state( - torch.zeros(beam, device=h.device, dtype=h.dtype) - ) # [L, B, H], [L, B, H] (for LSTMs) - - # Initialize first hypothesis for the beam (blank) - B = [ - Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], - length=0, - ) - ] - cache = {} - - # Initialize alignments - if self.preserve_alignments: - for hyp in B: - hyp.alignments = [[]] - - for i in range(int(encoded_lengths)): - hi = h[:, i : i + 1, :] - - # Update caches - A = [] - C = B - - h_enc = hi - - # For a limited number of symmetric expansions per timestep "i" - for v in range(self.tsd_max_symmetric_expansion_per_step): - D = [] - - # Decode a batch of beam states and scores - beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis(C, cache, beam_state) - - # Extract the log probabilities and the predicted tokens - beam_logp = torch.log_softmax( - self.joint.joint(h_enc, beam_y) / self.softmax_temperature, dim=-1 - ) # [B, 1, 1, V + 1] - beam_logp = beam_logp[:, 0, 0, :] # [B, V + 1] - beam_topk = beam_logp[:, ids].topk(beam, dim=-1) - - seq_A = [h.y_sequence for h in A] - - for j, hyp in enumerate(C): - # create a new hypothesis in A - if hyp.y_sequence not in seq_A: - # If the sequence is not in seq_A, add it as the blank token - # In this step, we dont add a token but simply update score - _temp_hyp = Hypothesis( - score=(hyp.score + float(beam_logp[j, self.blank])), - y_sequence=hyp.y_sequence[:], - dec_state=hyp.dec_state, - lm_state=hyp.lm_state, - timestep=hyp.timestep[:], - length=encoded_lengths, - ) - - # Preserve the blank token alignment - if self.preserve_alignments: - _temp_hyp.alignments = copy.deepcopy(hyp.alignments) - _temp_hyp.alignments[-1].append( - (beam_logp[j].clone(), torch.tensor(self.blank, dtype=torch.int32)), - ) - - A.append(_temp_hyp) - else: - # merge the existing blank hypothesis score with current score. 
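
The merge just described combines the probabilities of two paths that yield the same label sequence, which in log space is a log-add-exp rather than a max. A tiny numeric check:

```python
import numpy as np

# Two paths producing the same label sequence are combined by adding their
# probabilities, i.e. log-add-exp of their log-scores.
score_existing = np.log(0.20)   # hypothesis already in A with this label sequence
score_new = np.log(0.05)        # blank extension that reproduces the same sequence

merged = np.logaddexp(score_existing, score_new)
print(np.isclose(np.exp(merged), 0.25))   # True: the probabilities add up
```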
- dict_pos = seq_A.index(hyp.y_sequence) - - A[dict_pos].score = np.logaddexp( - A[dict_pos].score, (hyp.score + float(beam_logp[j, self.blank])) - ) - - if v < self.tsd_max_symmetric_expansion_per_step: - for j, hyp in enumerate(C): - # for each current hypothesis j - # extract the top token score and top token id for the jth hypothesis - for logp, k in zip(beam_topk[0][j], beam_topk[1][j] + index_incr): - # create new hypothesis and store in D - # Note: This loop does *not* include the blank token! - new_hyp = Hypothesis( - score=(hyp.score + float(logp)), - y_sequence=(hyp.y_sequence + [int(k)]), - dec_state=self.decoder.batch_select_state(beam_state, j), - lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], - length=encoded_lengths, - ) - - # Preserve token alignment - if self.preserve_alignments: - new_hyp.alignments = copy.deepcopy(hyp.alignments) - new_hyp.alignments[-1].append( - (beam_topk[0].clone().cpu(), torch.tensor(k, dtype=torch.int32)), - ) - - D.append(new_hyp) - - # Prune beam - C = sorted(D, key=lambda x: x.score, reverse=True)[:beam] - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for C_i in C: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = C_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - C_i.alignments.append([]) # blank buffer for next timestep - - # Prune beam - B = sorted(A, key=lambda x: x.score, reverse=True)[:beam] - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for B_i in B: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = B_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - B_i.alignments.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments - if self.preserve_alignments: - for h in B: - if len(h.alignments[-1]) == 0: - del h.alignments[-1] - - return self.sort_nbest(B) - - def align_length_sync_decoding( - self, h: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[Hypothesis] = None - ) -> List[Hypothesis]: - """Alignment-length synchronous beam search implementation. - Based on https://ieeexplore.ieee.org/document/9053040 - - Args: - h: Encoded speech features (1, T_max, D_enc) - - Returns: - nbest_hyps: N-best decoding results - """ - # delay this import here instead of at the beginning to avoid circular imports. - from nemo.collections.asr.modules.rnnt import RNNTDecoder, StatelessTransducerDecoder - - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - # Precompute some constants for blank position - ids = list(range(self.vocab_size + 1)) - ids.remove(self.blank) - - # Used when blank token is first vs last token - if self.blank == 0: - index_incr = 1 - else: - index_incr = 0 - - # prepare the batched beam states - beam = min(self.beam_size, self.vocab_size) - - h = h[0] # [T, D] - h_length = int(encoded_lengths) - beam_state = self.decoder.initialize_state( - torch.zeros(beam, device=h.device, dtype=h.dtype) - ) # [L, B, H], [L, B, H] for LSTMS - - # compute u_max as either a specific static limit, - # or a multiple of current `h_length` dynamically. 
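
The u_max computation described in the comment above interprets a float as a multiple of the acoustic length and an int as a fixed cap. A standalone sketch (using `isinstance` rather than the `type()` comparison in the deleted code):

```python
from typing import Union

def compute_u_max(alsd_max_target_len: Union[int, float], h_length: int) -> int:
    # A float is a multiple of the acoustic length; an int is a fixed cap on
    # the number of emitted tokens.
    if isinstance(alsd_max_target_len, float):
        return int(alsd_max_target_len * h_length)
    return int(alsd_max_target_len)

print(compute_u_max(1.0, 120))   # 120 -> ALSD runs for T + U_max = 240 steps
print(compute_u_max(50, 120))    # 50  -> ALSD runs for 170 steps
```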
- if type(self.alsd_max_target_length) == float: - u_max = int(self.alsd_max_target_length * h_length) - else: - u_max = int(self.alsd_max_target_length) - - # Initialize first hypothesis for the beam (blank) - B = [ - Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], - length=0, - ) - ] - - # Initialize alignments - if self.preserve_alignments: - B[0].alignments = [[]] - - final = [] - cache = {} - - # ALSD runs for T + U_max steps - for i in range(h_length + u_max): - # Update caches - A = [] - B_ = [] - h_states = [] - - # preserve the list of batch indices which are added into the list - # and those which are removed from the list - # This is necessary to perform state updates in the correct batch indices later - batch_ids = list(range(len(B))) # initialize as a list of all batch ids - batch_removal_ids = [] # update with sample ids which are removed - - for bid, hyp in enumerate(B): - u = len(hyp.y_sequence) - 1 - t = i - u - - if t > (h_length - 1): - batch_removal_ids.append(bid) - continue - - B_.append(hyp) - h_states.append((t, h[t])) - - if B_: - # Compute the subset of batch ids which were *not* removed from the list above - sub_batch_ids = None - if len(B_) != beam: - sub_batch_ids = batch_ids - for id in batch_removal_ids: - # sub_batch_ids contains list of ids *that were not removed* - sub_batch_ids.remove(id) - - # extract the states of the sub batch only. - if isinstance(self.decoder, RNNTDecoder): - # LSTM decoder, state is [layer x batch x hidden] - beam_state_ = [ - beam_state[state_id][:, sub_batch_ids, :] for state_id in range(len(beam_state)) - ] - elif isinstance(self.decoder, StatelessTransducerDecoder): - # stateless decoder, state is [batch x hidden] - beam_state_ = [beam_state[state_id][sub_batch_ids, :] for state_id in range(len(beam_state))] - else: - raise NotImplementedError("Unknown decoder type.") - - else: - # If entire batch was used (none were removed), simply take all the states - beam_state_ = beam_state - - # Decode a batch/sub-batch of beam states and scores - beam_y, beam_state_, beam_lm_tokens = self.decoder.batch_score_hypothesis(B_, cache, beam_state_) - - # If only a subset of batch ids were updated (some were removed) - if sub_batch_ids is not None: - # For each state in the RNN (2 for LSTM) - for state_id in range(len(beam_state)): - # Update the current batch states with the sub-batch states (in the correct indices) - # These indices are specified by sub_batch_ids, the ids of samples which were updated. - if isinstance(self.decoder, RNNTDecoder): - # LSTM decoder, state is [layer x batch x hidden] - beam_state[state_id][:, sub_batch_ids, :] = beam_state_[state_id][...] - elif isinstance(self.decoder, StatelessTransducerDecoder): - # stateless decoder, state is [batch x hidden] - beam_state[state_id][sub_batch_ids, :] = beam_state_[state_id][...] 
- else: - raise NotImplementedError("Unknown decoder type.") - else: - # If entire batch was updated, simply update all the states - beam_state = beam_state_ - - # h_states = list of [t, h[t]] - # so h[1] here is a h[t] of shape [D] - # Simply stack all of the h[t] within the sub_batch/batch (T <= beam) - h_enc = torch.stack([h[1] for h in h_states]) # [T=beam, D] - h_enc = h_enc.unsqueeze(1) # [B=beam, T=1, D]; batch over the beams - - # Extract the log probabilities and the predicted tokens - beam_logp = torch.log_softmax( - self.joint.joint(h_enc, beam_y) / self.softmax_temperature, dim=-1 - ) # [B=beam, 1, 1, V + 1] - beam_logp = beam_logp[:, 0, 0, :] # [B=beam, V + 1] - beam_topk = beam_logp[:, ids].topk(beam, dim=-1) - - for j, hyp in enumerate(B_): - # For all updated samples in the batch, add it as the blank token - # In this step, we dont add a token but simply update score - new_hyp = Hypothesis( - score=(hyp.score + float(beam_logp[j, self.blank])), - y_sequence=hyp.y_sequence[:], - dec_state=hyp.dec_state, - lm_state=hyp.lm_state, - timestep=hyp.timestep[:], - length=i, - ) - - if self.preserve_alignments: - new_hyp.alignments = copy.deepcopy(hyp.alignments) - - # Add the alignment of blank at this step - new_hyp.alignments[-1].append( - (beam_logp[j].clone().cpu(), torch.tensor(self.blank, dtype=torch.int32)) - ) - - # Add blank prediction to A - A.append(new_hyp) - - # If the prediction "timestep" t has reached the length of the input sequence - # we can add it to the "finished" hypothesis list. - if h_states[j][0] == (h_length - 1): - final.append(new_hyp) - - # Here, we carefully select the indices of the states that we want to preserve - # for the next token (non-blank) update. - if sub_batch_ids is not None: - h_states_idx = sub_batch_ids[j] - else: - h_states_idx = j - - # for each current hypothesis j - # extract the top token score and top token id for the jth hypothesis - for logp, k in zip(beam_topk[0][j], beam_topk[1][j] + index_incr): - # create new hypothesis and store in A - # Note: This loop does *not* include the blank token! - new_hyp = Hypothesis( - score=(hyp.score + float(logp)), - y_sequence=(hyp.y_sequence[:] + [int(k)]), - dec_state=self.decoder.batch_select_state(beam_state, h_states_idx), - lm_state=hyp.lm_state, - timestep=hyp.timestep[:] + [i], - length=i, - ) - - if self.preserve_alignments: - new_hyp.alignments = copy.deepcopy(hyp.alignments) - - # Add the alignment of Uj for this beam candidate at this step - new_hyp.alignments[-1].append( - (beam_logp[j].clone().cpu(), torch.tensor(new_hyp.y_sequence[-1], dtype=torch.int32)) - ) - - A.append(new_hyp) - - # Prune and recombine same hypothesis - # This may cause next beam to be smaller than max beam size - # Therefore larger beam sizes may be required for better decoding. 
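
The state bookkeeping above gathers decoder states for the still-active beam entries and scatters the updated states back into their original batch positions. A minimal sketch assuming an LSTM-style (layers, batch, hidden) state layout:

```python
import torch

num_layers, batch, hidden = 2, 4, 3
beam_state = [torch.zeros(num_layers, batch, hidden) for _ in range(2)]  # (h, c)

sub_batch_ids = [0, 2, 3]                       # beam entries still active
sub_state = [s[:, sub_batch_ids, :] for s in beam_state]                 # gather

# ... the decoder would advance `sub_state` here; simulated with an increment ...
sub_state = [s + 1.0 for s in sub_state]

# Scatter the updated states back into their original batch positions.
for s, s_new in zip(beam_state, sub_state):
    s[:, sub_batch_ids, :] = s_new
print(beam_state[0][:, :, 0])   # batch entries 0, 2, 3 updated; entry 1 untouched
```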
- B = sorted(A, key=lambda x: x.score, reverse=True)[:beam] - B = self.recombine_hypotheses(B) - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for B_i in B: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = B_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - B_i.alignments.append([]) # blank buffer for next timestep - - # If B_ is empty list, then we may be able to early exit - elif len(batch_ids) == len(batch_removal_ids): - # break early - break - - if final: - # Remove trailing empty list of alignments - if self.preserve_alignments: - for h in final: - if len(h.alignments[-1]) == 0: - del h.alignments[-1] - - return self.sort_nbest(final) - else: - # Remove trailing empty list of alignments - if self.preserve_alignments: - for h in B: - if len(h.alignments[-1]) == 0: - del h.alignments[-1] - - return B - - def modified_adaptive_expansion_search( - self, h: torch.Tensor, encoded_lengths: torch.Tensor, partial_hypotheses: Optional[Hypothesis] = None - ) -> List[Hypothesis]: - """ - Based on/modified from https://ieeexplore.ieee.org/document/9250505 - - Args: - h: Encoded speech features (1, T_max, D_enc) - - Returns: - nbest_hyps: N-best decoding results - """ - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - h = h[0] # [T, D] - - # prepare the batched beam states - beam = min(self.beam_size, self.vocab_size) - beam_state = self.decoder.initialize_state( - torch.zeros(beam, device=h.device, dtype=h.dtype) - ) # [L, B, H], [L, B, H] for LSTMS - - # Initialize first hypothesis for the beam (blank) - init_tokens = [ - Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=self.decoder.batch_select_state(beam_state, 0), - timestep=[-1], - length=0, - ) - ] - - cache = {} - - # Initialize alignment buffer - if self.preserve_alignments: - for hyp in init_tokens: - hyp.alignments = [[]] - - # Decode a batch of beam states and scores - beam_dec_out, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis(init_tokens, cache, beam_state) - state = self.decoder.batch_select_state(beam_state, 0) - - # Setup ngram LM: - if self.ngram_lm: - init_lm_state = kenlm.State() - self.ngram_lm.BeginSentenceWrite(init_lm_state) - - # TODO: Setup LM - if self.language_model is not None: - # beam_lm_states, beam_lm_scores = self.lm.buff_predict( - # None, beam_lm_tokens, 1 - # ) - # lm_state = select_lm_state( - # beam_lm_states, 0, self.lm_layers, self.is_wordlm - # ) - # lm_scores = beam_lm_scores[0] - raise NotImplementedError() - else: - lm_state = None - lm_scores = None - - # Initialize first hypothesis for the beam (blank) for kept hypotheses - kept_hyps = [ - Hypothesis( - y_sequence=[self.blank], - score=0.0, - dec_state=state, - dec_out=[beam_dec_out[0]], - lm_state=lm_state, - lm_scores=lm_scores, - timestep=[-1], - length=0, - ) - ] - if self.ngram_lm: - kept_hyps[0].ngram_lm_state = init_lm_state - - # Initialize alignment buffer - if self.preserve_alignments: - for hyp in kept_hyps: - hyp.alignments = [[]] - - for t in range(encoded_lengths): - enc_out_t = h[t : t + 1].unsqueeze(0) # [1, 1, D] - - # Perform prefix search to obtain hypothesis - hyps = self.prefix_search( - sorted(kept_hyps, key=lambda x: len(x.y_sequence), reverse=True), - enc_out_t, - prefix_alpha=self.maes_prefix_alpha, - ) # type: List[Hypothesis] - kept_hyps = [] - - # Prepare output tensor - beam_enc_out = 
enc_out_t - - # List that contains the blank token emisions - list_b = [] - duplication_check = [hyp.y_sequence for hyp in hyps] - - # Repeat for number of mAES steps - for n in range(self.maes_num_steps): - # Pack the decoder logits for all current hypothesis - beam_dec_out = torch.stack([h.dec_out[-1] for h in hyps]) # [H, 1, D] - - # Extract the log probabilities - ytm, ilm_ytm = self.resolve_joint_output(beam_enc_out, beam_dec_out) - beam_logp, beam_idx = ytm.topk(self.max_candidates, dim=-1) - - beam_logp = beam_logp[:, 0, 0, :] # [B, V + 1] - beam_idx = beam_idx[:, 0, 0, :] # [B, max_candidates] - - # Compute k expansions for all the current hypotheses - k_expansions = select_k_expansions( - hyps, beam_idx, beam_logp, self.maes_expansion_gamma, self.maes_expansion_beta - ) - - # List that contains the hypothesis after prefix expansion - list_exp = [] - for i, hyp in enumerate(hyps): # For all hypothesis - for k, new_score in k_expansions[i]: # for all expansion within these hypothesis - new_hyp = Hypothesis( - y_sequence=hyp.y_sequence[:], - score=new_score, - dec_out=hyp.dec_out[:], - dec_state=hyp.dec_state, - lm_state=hyp.lm_state, - lm_scores=hyp.lm_scores, - timestep=hyp.timestep[:], - length=t, - ) - if self.ngram_lm: - new_hyp.ngram_lm_state = hyp.ngram_lm_state - - # If the expansion was for blank - if k == self.blank: - list_b.append(new_hyp) - else: - # If the expansion was a token - # new_hyp.y_sequence.append(int(k)) - if (new_hyp.y_sequence + [int(k)]) not in duplication_check: - new_hyp.y_sequence.append(int(k)) - new_hyp.timestep.append(t) - - # Setup ngram LM: - if self.ngram_lm: - lm_score, new_hyp.ngram_lm_state = self.compute_ngram_score( - hyp.ngram_lm_state, int(k) - ) - if self.hat_subtract_ilm: - new_hyp.score += self.ngram_lm_alpha * lm_score - float( - self.hat_ilm_weight * ilm_ytm[i, 0, 0, k] - ) - else: - new_hyp.score += self.ngram_lm_alpha * lm_score - - # TODO: Setup LM - if self.language_model is not None: - # new_hyp.score += self.lm_weight * float( - # hyp.lm_scores[k] - # ) - pass - - list_exp.append(new_hyp) - - # Preserve alignments - if self.preserve_alignments: - new_hyp.alignments = copy.deepcopy(hyp.alignments) - - if k == self.blank: - new_hyp.alignments[-1].append( - (beam_logp[i].clone().cpu(), torch.tensor(self.blank, dtype=torch.int32)), - ) - else: - new_hyp.alignments[-1].append( - ( - beam_logp[i].clone().cpu(), - torch.tensor(new_hyp.y_sequence[-1], dtype=torch.int32), - ), - ) - - # If there were no token expansions in any of the hypotheses, - # Early exit - if not list_exp: - kept_hyps = sorted(list_b, key=lambda x: x.score, reverse=True)[:beam] - - # Update aligments with next step - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for h_i in kept_hyps: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = h_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - h_i.alignments.append([]) # blank buffer for next timestep - - # Early exit - break - - else: - # Initialize the beam states for the hypotheses in the expannsion list - beam_state = self.decoder.batch_initialize_states( - beam_state, - [hyp.dec_state for hyp in list_exp], - # [hyp.y_sequence for hyp in list_exp], # - ) - - # Decode a batch of beam states and scores - beam_dec_out, beam_state, beam_lm_tokens = self.decoder.batch_score_hypothesis( - list_exp, - cache, - beam_state, - # self.language_model is not None, - ) - - # TODO: Setup LM - if 
self.language_model is not None: - # beam_lm_states = create_lm_batch_states( - # [hyp.lm_state for hyp in list_exp], - # self.lm_layers, - # self.is_wordlm, - # ) - # beam_lm_states, beam_lm_scores = self.lm.buff_predict( - # beam_lm_states, beam_lm_tokens, len(list_exp) - # ) - pass - - # If this isnt the last mAES step - if n < (self.maes_num_steps - 1): - # For all expanded hypothesis - for i, hyp in enumerate(list_exp): - # Preserve the decoder logits for the current beam - hyp.dec_out.append(beam_dec_out[i]) - hyp.dec_state = self.decoder.batch_select_state(beam_state, i) - - # TODO: Setup LM - if self.language_model is not None: - # hyp.lm_state = select_lm_state( - # beam_lm_states, i, self.lm_layers, self.is_wordlm - # ) - # hyp.lm_scores = beam_lm_scores[i] - pass - - # Copy the expanded hypothesis - hyps = list_exp[:] - - # Update aligments with next step - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for h_i in hyps: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = h_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - h_i.alignments.append([]) # blank buffer for next timestep - - else: - # Extract the log probabilities - beam_logp, _ = self.resolve_joint_output(beam_enc_out, beam_dec_out) - beam_logp = beam_logp[:, 0, 0, :] - - # For all expansions, add the score for the blank label - for i, hyp in enumerate(list_exp): - hyp.score += float(beam_logp[i, self.blank]) - - # Preserve the decoder's output and state - hyp.dec_out.append(beam_dec_out[i]) - hyp.dec_state = self.decoder.batch_select_state(beam_state, i) - - # TODO: Setup LM - if self.language_model is not None: - # hyp.lm_state = select_lm_state( - # beam_lm_states, i, self.lm_layers, self.is_wordlm - # ) - # hyp.lm_scores = beam_lm_scores[i] - pass - - # Finally, update the kept hypothesis of sorted top Beam candidates - kept_hyps = sorted(list_b + list_exp, key=lambda x: x.score, reverse=True)[:beam] - - # Update aligments with next step - if self.preserve_alignments: - # convert Ti-th logits into a torch array - for h_i in kept_hyps: - # Check if the last token emitted at last timestep was a blank - # If so, move to next timestep - logp, label = h_i.alignments[-1][-1] # The last alignment of this step - if int(label) == self.blank: - h_i.alignments.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments - if self.preserve_alignments: - for h in kept_hyps: - if len(h.alignments[-1]) == 0: - del h.alignments[-1] - - # Sort the hypothesis with best scores - return self.sort_nbest(kept_hyps) - - def recombine_hypotheses(self, hypotheses: List[Hypothesis]) -> List[Hypothesis]: - """Recombine hypotheses with equivalent output sequence. 
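
The mAES expansion step that concludes above selects candidate tokens by prune-by-value: only tokens within `maes_expansion_gamma` of the best log-probability survive. A simplified stand-in for `select_k_expansions`, not the actual NeMo helper:

```python
import torch

def prune_by_value(beam_logp: torch.Tensor, gamma: float = 2.3, beta: int = 2, beam: int = 4):
    # Keep tokens with max_log_prob - gamma <= log_prob[v], per hypothesis.
    topk_logp, topk_idx = beam_logp.topk(beam + beta, dim=-1)
    keep = topk_logp >= (topk_logp[..., :1] - gamma)
    return [
        [(int(i), float(p)) for i, p, k in zip(idx, logp, mask) if k]
        for idx, logp, mask in zip(topk_idx, topk_logp, keep)
    ]

logits = torch.randn(3, 29)                        # 3 hypotheses, V + 1 = 29 tokens
print(prune_by_value(torch.log_softmax(logits, dim=-1)))
```

Smaller gamma values keep fewer candidates per step, trading accuracy for speed, which matches the tuning guidance in the constructor docstring.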
- - Args: - hypotheses (list): list of hypotheses - - Returns: - final (list): list of recombined hypotheses - """ - final = [] - - for hyp in hypotheses: - seq_final = [f.y_sequence for f in final if f.y_sequence] - - if hyp.y_sequence in seq_final: - seq_pos = seq_final.index(hyp.y_sequence) - - final[seq_pos].score = np.logaddexp(final[seq_pos].score, hyp.score) - else: - final.append(hyp) - - return hypotheses - - def resolve_joint_output(self, enc_out: torch.Tensor, dec_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Resolve output types for RNNT and HAT joint models - """ - - joint_output = self.joint.joint(enc_out, dec_out) - if torch.is_tensor(joint_output): - ytm = torch.log_softmax(joint_output / self.softmax_temperature, dim=-1) - ilm_ytm = None - elif self.hat_subtract_ilm and isinstance(joint_output, HATJointOutput): - ytm, ilm_ytm = joint_output.hat_logprobs, joint_output.ilm_logprobs - else: - raise TypeError( - f"Joint output ({type(joint_output)}) must be torch.Tensor or HATJointOutput in case of HAT joint" - ) - - return ytm, ilm_ytm - - def prefix_search( - self, hypotheses: List[Hypothesis], enc_out: torch.Tensor, prefix_alpha: int - ) -> List[Hypothesis]: - """ - Prefix search for NSC and mAES strategies. - Based on https://arxiv.org/pdf/1211.3711.pdf - """ - - for j, hyp_j in enumerate(hypotheses[:-1]): - for hyp_i in hypotheses[(j + 1) :]: - curr_id = len(hyp_j.y_sequence) - pref_id = len(hyp_i.y_sequence) - - if is_prefix(hyp_j.y_sequence, hyp_i.y_sequence) and (curr_id - pref_id) <= prefix_alpha: - logp, ilm_logp = self.resolve_joint_output(enc_out, hyp_i.dec_out[-1]) - logp = logp[0, 0, 0, :] - curr_score = hyp_i.score + float(logp[hyp_j.y_sequence[pref_id]]) - # Setup ngram LM: - if self.ngram_lm: - lm_score, next_state = self.compute_ngram_score( - hyp_i.ngram_lm_state, int(hyp_j.y_sequence[pref_id]) - ) - if self.hat_subtract_ilm: - curr_score += self.ngram_lm_alpha * lm_score - self.hat_ilm_weight * float( - ilm_logp[0, 0, hyp_j.y_sequence[pref_id]] - ) - else: - curr_score += self.ngram_lm_alpha * lm_score - - for k in range(pref_id, (curr_id - 1)): - logp, ilm_logp = self.resolve_joint_output(enc_out, hyp_j.dec_out[k]) - logp = logp[0, 0, 0, :] - curr_score += float(logp[hyp_j.y_sequence[k + 1]]) - # Setup ngram LM: - if self.ngram_lm: - lm_score, next_state = self.compute_ngram_score(next_state, int(hyp_j.y_sequence[k + 1])) - if self.hat_subtract_ilm: - curr_score += self.ngram_lm_alpha * lm_score - self.hat_ilm_weight * float( - ilm_logp[0, 0, hyp_j.y_sequence[k + 1]] - ) - else: - curr_score += self.ngram_lm_alpha * lm_score - - hyp_j.score = np.logaddexp(hyp_j.score, curr_score) - - return hypotheses - - def compute_ngram_score(self, current_lm_state: "kenlm.State", label: int) -> Tuple[float, "kenlm.State"]: - """ - Score computation for kenlm ngram language model. 
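
Two details of the KenLM scoring used here are easy to miss: BPE token ids are mapped to printable characters before lookup, and KenLM's log10 scores are rescaled to natural log. A dependency-free sketch; `DEFAULT_TOKEN_OFFSET` below is an assumed value, not read from the deleted file:

```python
import math

DEFAULT_TOKEN_OFFSET = 100   # assumed value for illustration

def token_to_lm_word(label: int, token_offset: int = 0) -> str:
    # BPE models map token ids to printable characters (id + offset); char/word
    # models simply use the decimal string of the id.
    return chr(label + token_offset) if token_offset else str(label)

def log10_to_ln(log10_score: float) -> float:
    # KenLM's BaseScore returns log10 probabilities; the decoder accumulates
    # natural-log scores, hence the 1 / log10(e) factor.
    return log10_score / math.log10(math.e)

print(repr(token_to_lm_word(42, DEFAULT_TOKEN_OFFSET)))   # chr(142)
print(round(log10_to_ln(-0.5), 4))                        # -1.1513 == -0.5 * ln(10)
```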
- """ - - if self.token_offset: - label = chr(label + self.token_offset) - else: - label = str(label) - next_state = kenlm.State() - lm_score = self.ngram_lm.BaseScore(current_lm_state, label, next_state) - lm_score *= 1.0 / np.log10(np.e) - - return lm_score, next_state - - def set_decoding_type(self, decoding_type: str): - - # Please check train_kenlm.py in scripts/asr_language_modeling/ to find out why we need - # TOKEN_OFFSET for BPE-based models - if decoding_type == 'subword': - from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET - - self.token_offset = DEFAULT_TOKEN_OFFSET - - -@dataclass -class BeamRNNTInferConfig: - beam_size: int - search_type: str = 'default' - score_norm: bool = True - return_best_hypothesis: bool = True - tsd_max_sym_exp_per_step: Optional[int] = 50 - alsd_max_target_len: float = 1.0 - nsc_max_timesteps_expansion: int = 1 - nsc_prefix_alpha: int = 1 - maes_num_steps: int = 2 - maes_prefix_alpha: int = 1 - maes_expansion_gamma: float = 2.3 - maes_expansion_beta: int = 2 - language_model: Optional[Dict[str, Any]] = None - softmax_temperature: float = 1.0 - preserve_alignments: bool = False - ngram_lm_model: Optional[str] = None - ngram_lm_alpha: Optional[float] = 0.0 - hat_subtract_ilm: bool = False - hat_ilm_weight: float = 0.0 diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py deleted file mode 100644 index a0aea07f7bc8f90bbdc1036da03f8f98a19cdf74..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ /dev/null @@ -1,2782 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.modules import rnnt_abstract -from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin -from nemo.collections.common.parts.rnn import label_collate -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import AcousticEncodedRepresentation, ElementType, HypothesisType, LengthsType, NeuralType -from nemo.utils import logging - - -def pack_hypotheses(hypotheses: List[rnnt_utils.Hypothesis], logitlen: torch.Tensor,) -> List[rnnt_utils.Hypothesis]: - - if hasattr(logitlen, 'cpu'): - logitlen_cpu = logitlen.to('cpu') - else: - logitlen_cpu = logitlen - - for idx, hyp in enumerate(hypotheses): # type: rnnt_utils.Hypothesis - hyp.y_sequence = torch.tensor(hyp.y_sequence, dtype=torch.long) - hyp.length = logitlen_cpu[idx] - - if hyp.dec_state is not None: - hyp.dec_state = _states_to_device(hyp.dec_state) - - return hypotheses - - -def _states_to_device(dec_state, device='cpu'): - if torch.is_tensor(dec_state): - dec_state = dec_state.to(device) - - elif isinstance(dec_state, (list, tuple)): - dec_state = tuple(_states_to_device(dec_i, device) for dec_i in dec_state) - - return dec_state - - -class _GreedyRNNTInfer(Typing, ConfidenceMethodMixin): - """A greedy transducer decoder. - - Provides a common abstraction for sample level and batch level greedy decoding. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Can be 0 or len(vocabulary). - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. 
- Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "encoder_output": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "partial_hypotheses": [NeuralType(elements_type=HypothesisType(), optional=True)], # must always be last - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"predictions": [NeuralType(elements_type=HypothesisType())]} - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__() - self.decoder = decoder_model - self.joint = joint_model - - self._blank_index = blank_index - self._SOS = blank_index # Start of single index - self.max_symbols = max_symbols_per_step - self.preserve_alignments = preserve_alignments - self.preserve_frame_confidence = preserve_frame_confidence - - # set confidence calculation method - self._init_confidence_method(confidence_method_cfg) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - @torch.no_grad() - def _pred_step( - self, - label: Union[torch.Tensor, int], - hidden: Optional[torch.Tensor], - add_sos: bool = False, - batch_size: Optional[int] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Common prediction step based on the AbstractRNNTDecoder implementation. - - Args: - label: (int/torch.Tensor): Label or "Start-of-Signal" token. - hidden: (Optional torch.Tensor): RNN State vector - add_sos (bool): Whether to add a zero vector at the begging as "start of sentence" token. - batch_size: Batch size of the output tensor. 
- - Returns: - g: (B, U, H) if add_sos is false, else (B, U + 1, H) - hid: (h, c) where h is the final sequence hidden state and c is - the final cell state: - h (tensor), shape (L, B, H) - c (tensor), shape (L, B, H) - """ - if isinstance(label, torch.Tensor): - # label: [batch, 1] - if label.dtype != torch.long: - label = label.long() - - else: - # Label is an integer - if label == self._SOS: - return self.decoder.predict(None, hidden, add_sos=add_sos, batch_size=batch_size) - - label = label_collate([[label]]) - - # output: [B, 1, K] - return self.decoder.predict(label, hidden, add_sos=add_sos, batch_size=batch_size) - - def _joint_step(self, enc, pred, log_normalize: Optional[bool] = None): - """ - Common joint step based on AbstractRNNTJoint implementation. - - Args: - enc: Output of the Encoder model. A torch.Tensor of shape [B, 1, H1] - pred: Output of the Decoder model. A torch.Tensor of shape [B, 1, H2] - log_normalize: Whether to log normalize or not. None will log normalize only for CPU. - - Returns: - logits of shape (B, T=1, U=1, V + 1) - """ - with torch.no_grad(): - logits = self.joint.joint(enc, pred) - - if log_normalize is None: - if not logits.is_cuda: # Use log softmax only if on CPU - logits = logits.log_softmax(dim=len(logits.shape) - 1) - else: - if log_normalize: - logits = logits.log_softmax(dim=len(logits.shape) - 1) - - return logits - - -class GreedyRNNTInfer(_GreedyRNNTInfer): - """A greedy transducer decoder. - - Sequence level greedy decoding, performed auto-regressively. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Can be 0 or len(vocabulary). - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. 
If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - - @typecheck() - def forward( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-regressively. - - Args: - encoder_output: A tensor of size (batch, features, timesteps). - encoded_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). 
- """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - with torch.inference_mode(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - - self.decoder.eval() - self.joint.eval() - - hypotheses = [] - # Process each sequence independently - with self.decoder.as_frozen(), self.joint.as_frozen(): - for batch_idx in range(encoder_output.size(0)): - inseq = encoder_output[batch_idx, :, :].unsqueeze(1) # [T, 1, D] - logitlen = encoded_lengths[batch_idx] - - partial_hypothesis = partial_hypotheses[batch_idx] if partial_hypotheses is not None else None - hypothesis = self._greedy_decode(inseq, logitlen, partial_hypotheses=partial_hypothesis) - hypotheses.append(hypothesis) - - # Pack results into Hypotheses - packed_result = pack_hypotheses(hypotheses, encoded_lengths) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - - return (packed_result,) - - @torch.no_grad() - def _greedy_decode( - self, x: torch.Tensor, out_len: torch.Tensor, partial_hypotheses: Optional[rnnt_utils.Hypothesis] = None - ): - # x: [T, 1, D] - # out_len: [seq_len] - - # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - - if partial_hypotheses is not None: - hypothesis.last_token = partial_hypotheses.last_token - hypothesis.y_sequence = ( - partial_hypotheses.y_sequence.cpu().tolist() - if isinstance(partial_hypotheses.y_sequence, torch.Tensor) - else partial_hypotheses.y_sequence - ) - if partial_hypotheses.dec_state is not None: - hypothesis.dec_state = self.decoder.batch_concat_states([partial_hypotheses.dec_state]) - hypothesis.dec_state = _states_to_device(hypothesis.dec_state, x.device) - - if self.preserve_alignments: - # Alignments is a 2-dimensional dangling list representing T x U - hypothesis.alignments = [[]] - - if self.preserve_frame_confidence: - hypothesis.frame_confidence = [[]] - - # For timestep t in X_t - for time_idx in range(out_len): - # Extract encoder embedding at timestep t - # f = x[time_idx, :, :].unsqueeze(0) # [1, 1, D] - f = x.narrow(dim=0, start=time_idx, length=1) - - # Setup exit flags and counter - not_blank = True - symbols_added = 0 - # While blank is not predicted, or we dont run out of max symbols per timestep - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # In the first timestep, we initialize the network with RNNT Blank - # In later timesteps, we provide previous predicted label as input. - if hypothesis.last_token is None and hypothesis.dec_state is None: - last_label = self._SOS - else: - last_label = label_collate([[hypothesis.last_token]]) - - # Perform prediction network and joint network steps. - g, hidden_prime = self._pred_step(last_label, hypothesis.dec_state) - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - 0, 0, 0, : - ] - - del g - - # torch.max(0) op doesnt exist for FP 16. - if logp.dtype != torch.float32: - logp = logp.float() - - # get index k, of max prob - v, k = logp.max(0) - k = k.item() # K is the label at timestep t_s in inner loop, s >= 0. 
- - if self.preserve_alignments: - # insert logprobs into last timestep - hypothesis.alignments[-1].append((logp.to('cpu'), torch.tensor(k, dtype=torch.int32))) - - if self.preserve_frame_confidence: - # insert confidence into last timestep - hypothesis.frame_confidence[-1].append(self._get_confidence(logp)) - - del logp - - # If blank token is predicted, exit inner loop, move onto next timestep t - if k == self._blank_index: - not_blank = False - else: - # Append token to label set, update RNN state. - hypothesis.y_sequence.append(k) - hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) - hypothesis.dec_state = hidden_prime - hypothesis.last_token = k - - # Increment token counter. - symbols_added += 1 - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of Alignments - if self.preserve_alignments: - if len(hypothesis.alignments[-1]) == 0: - del hypothesis.alignments[-1] - - # Remove trailing empty list of per-frame confidence - if self.preserve_frame_confidence: - if len(hypothesis.frame_confidence[-1]) == 0: - del hypothesis.frame_confidence[-1] - - # Unpack the hidden states - hypothesis.dec_state = self.decoder.batch_select_state(hypothesis.dec_state, 0) - - return hypothesis - - -class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): - """A batch level greedy transducer decoder. - - Batch level greedy decoding, performed auto-regressively. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Can be 0 or len(vocabulary). - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)). - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. 
- Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - - # Depending on availability of `blank_as_pad` support - # switch between more efficient batch decoding technique - if self.decoder.blank_as_pad: - self._greedy_decode = self._greedy_decode_blank_as_pad - else: - self._greedy_decode = self._greedy_decode_masked - - @typecheck() - def forward( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-regressively. - - Args: - encoder_output: A tensor of size (batch, features, timesteps). - encoded_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). 
- """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - with torch.inference_mode(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - logitlen = encoded_lengths - - self.decoder.eval() - self.joint.eval() - - with self.decoder.as_frozen(), self.joint.as_frozen(): - inseq = encoder_output # [B, T, D] - hypotheses = self._greedy_decode( - inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses - ) - - # Pack the hypotheses results - packed_result = pack_hypotheses(hypotheses, logitlen) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - - return (packed_result,) - - def _greedy_decode_blank_as_pad( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - with torch.inference_mode(): - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize list of Hypothesis - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If alignments need to be preserved, register a dangling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - - # If confidence scores need to be preserved, register a dangling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - blank_mask_prev = None - - # Get max sequence length - max_out_len = out_len.max() - for time_idx in range(max_out_len): - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # Prepare t timestamp batch variables - not_blank = True - symbols_added = 0 - - # Reset blank mask - blank_mask.mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - blank_mask_prev = blank_mask.clone() - - # Start inner loop - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. 
- # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1] - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - :, 0, 0, : - ] - - if logp.dtype != torch.float32: - logp = logp.float() - - # Get index k, of max prob for batch - v, k = logp.max(1) - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask.bitwise_or_(k_is_blank) - - del k_is_blank - - # If preserving alignments, check if sequence length of sample has been reached - # before adding alignment - if self.preserve_alignments: - # Insert logprobs into last timestep per sample - logp_vals = logp.to('cpu') - logp_ids = logp_vals.max(1)[1] - for batch_idx, is_blank in enumerate(blank_mask): - # we only want to update non-blanks and first-time blanks, - # otherwise alignments will contain duplicate predictions - if time_idx < out_len[batch_idx] and (not blank_mask_prev[batch_idx] or not is_blank): - hypotheses[batch_idx].alignments[-1].append( - (logp_vals[batch_idx], logp_ids[batch_idx]) - ) - del logp_vals - - # If preserving per-frame confidence, check if sequence length of sample has been reached - # before adding confidence scores - if self.preserve_frame_confidence: - # Insert probabilities into last timestep per sample - confidence = self._get_confidence(logp) - for batch_idx, is_blank in enumerate(blank_mask): - if time_idx < out_len[batch_idx] and (not blank_mask_prev[batch_idx] or not is_blank): - hypotheses[batch_idx].frame_confidence[-1].append(confidence[batch_idx]) - del logp - - blank_mask_prev.bitwise_or_(blank_mask) - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if blank_mask.all(): - not_blank = False - else: - # Collect batch indices where blanks occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.clone().view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). 
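A toy illustration, with made-up labels, of the blank-mask bookkeeping this batched loop relies on: once a sample has emitted blank, its entry in `last_label` keeps the previous value so the shared decoder step leaves it unchanged:

```python
import torch

# Made-up values for a batch of 4; 28 plays the role of the blank index.
blank_id = 28
k = torch.tensor([5, 28, 7, 28])                  # labels predicted at this inner step
last_label = torch.tensor([[3], [9], [2], [28]])  # labels fed to the previous decoder step
blank_mask = torch.zeros(4, dtype=torch.bool)

blank_mask |= k == blank_id                       # accumulate blanks (now or in earlier steps)
blank_indices = blank_mask.nonzero(as_tuple=False).squeeze(-1)
k[blank_indices] = last_label[blank_indices, 0]   # blanked samples keep their prior label
last_label = k.view(-1, 1)

print(last_label.squeeze(-1))                     # tensor([ 5,  9,  7, 28])
```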
- for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - symbols_added += 1 - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. - if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses - - def _greedy_decode_masked( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize state - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If alignments need to be preserved, register a danling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - else: - alignments = None - - # If confidence scores need to be preserved, register a danling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) - last_label_without_blank = last_label.clone() - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - blank_mask_prev = None - - # Get max sequence length - max_out_len = out_len.max() - - with torch.inference_mode(): - for time_idx in 
range(max_out_len): - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # Prepare t timestamp batch variables - not_blank = True - symbols_added = 0 - - # Reset blank mask - blank_mask.mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - blank_mask_prev = blank_mask.clone() - - # Start inner loop - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. - # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Set a dummy label for the blank value - # This value will be overwritten by "blank" again the last label update below - # This is done as vocabulary of prediction network does not contain "blank" token of RNNT - last_label_without_blank_mask = last_label == self._blank_index - last_label_without_blank[last_label_without_blank_mask] = 0 # temp change of label - last_label_without_blank[~last_label_without_blank_mask] = last_label[ - ~last_label_without_blank_mask - ] - - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label_without_blank, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1] - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - :, 0, 0, : - ] - - if logp.dtype != torch.float32: - logp = logp.float() - - # Get index k, of max prob for batch - v, k = logp.max(1) - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask.bitwise_or_(k_is_blank) - - # If preserving alignments, check if sequence length of sample has been reached - # before adding alignment - if self.preserve_alignments: - # Insert logprobs into last timestep per sample - logp_vals = logp.to('cpu') - logp_ids = logp_vals.max(1)[1] - for batch_idx, is_blank in enumerate(blank_mask): - # we only want to update non-blanks and first-time blanks, - # otherwise alignments will contain duplicate predictions - if time_idx < out_len[batch_idx] and (not blank_mask_prev[batch_idx] or not is_blank): - hypotheses[batch_idx].alignments[-1].append( - (logp_vals[batch_idx], logp_ids[batch_idx]) - ) - - del logp_vals - - # If preserving per-frame confidence, check if sequence length of sample has been reached - # before adding confidence scores - if self.preserve_frame_confidence: - # Insert probabilities into last timestep per sample - confidence = self._get_confidence(logp) - for batch_idx, is_blank in enumerate(blank_mask): - if time_idx < out_len[batch_idx] and (not blank_mask_prev[batch_idx] or not is_blank): - hypotheses[batch_idx].frame_confidence[-1].append(confidence[batch_idx]) - del logp - - blank_mask_prev.bitwise_or_(blank_mask) - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if blank_mask.all(): - not_blank = False - else: - # Collect batch indices where blanks 
occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). - for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - - symbols_added += 1 - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. 
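The masked variant above cannot feed the blank id to the prediction network (blank is not part of its vocabulary), so blank entries of `last_label` are temporarily replaced with a dummy token before the prediction step; a toy version of that substitution, with made-up values:

```python
import torch

blank_id = 28
last_label = torch.tensor([[5], [28], [7]])  # 28 == blank for this toy example

mask = last_label == blank_id
last_label_without_blank = last_label.clone()
last_label_without_blank[mask] = 0           # dummy token fed to the prediction network

# The decoder states of the masked samples are restored afterwards (batch_copy_states),
# so the dummy input never leaks into their hypotheses.
print(last_label_without_blank.squeeze(-1))  # tensor([5, 0, 7])
```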
- if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses - - -class ExportedModelGreedyBatchedRNNTInfer: - def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per_step: Optional[int] = None): - self.encoder_model_path = encoder_model - self.decoder_joint_model_path = decoder_joint_model - self.max_symbols_per_step = max_symbols_per_step - - # Will be populated at runtime - self._blank_index = None - - def __call__(self, audio_signal: torch.Tensor, length: torch.Tensor): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-regressively. - - Args: - encoder_output: A tensor of size (batch, features, timesteps). - encoded_lengths: list of int representing the length of each sequence - output sequence. - - Returns: - packed list containing batch number of sentences (Hypotheses). 
- """ - with torch.no_grad(): - # Apply optional preprocessing - encoder_output, encoded_lengths = self.run_encoder(audio_signal=audio_signal, length=length) - - if torch.is_tensor(encoder_output): - encoder_output = encoder_output.transpose(1, 2) - else: - encoder_output = encoder_output.transpose([0, 2, 1]) # (B, T, D) - logitlen = encoded_lengths - - inseq = encoder_output # [B, T, D] - hypotheses, timestamps = self._greedy_decode(inseq, logitlen) - - # Pack the hypotheses results - packed_result = [rnnt_utils.Hypothesis(score=-1.0, y_sequence=[]) for _ in range(len(hypotheses))] - for i in range(len(packed_result)): - packed_result[i].y_sequence = torch.tensor(hypotheses[i], dtype=torch.long) - packed_result[i].length = timestamps[i] - - del hypotheses - - return packed_result - - def _greedy_decode(self, x, out_len): - # x: [B, T, D] - # out_len: [B] - - # Initialize state - batchsize = x.shape[0] - hidden = self._get_initial_states(batchsize) - target_lengths = torch.ones(batchsize, dtype=torch.int32) - - # Output string buffer - label = [[] for _ in range(batchsize)] - timesteps = [[] for _ in range(batchsize)] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long).numpy() - if torch.is_tensor(x): - last_label = torch.from_numpy(last_label).to(self.device) - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool).numpy() - - # Get max sequence length - max_out_len = out_len.max() - for time_idx in range(max_out_len): - f = x[:, time_idx : time_idx + 1, :] # [B, 1, D] - - if torch.is_tensor(f): - f = f.transpose(1, 2) - else: - f = f.transpose([0, 2, 1]) - - # Prepare t timestamp batch variables - not_blank = True - symbols_added = 0 - - # Reset blank mask - blank_mask *= False - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - # Start inner loop - while not_blank and (self.max_symbols_per_step is None or symbols_added < self.max_symbols_per_step): - - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. 
- # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0: - g = torch.tensor([self._blank_index] * batchsize, dtype=torch.int32).view(-1, 1) - else: - if torch.is_tensor(last_label): - g = last_label.type(torch.int32) - else: - g = last_label.astype(np.int32) - - # Batched joint step - Output = [B, V + 1] - joint_out, hidden_prime = self.run_decoder_joint(f, g, target_lengths, *hidden) - logp, pred_lengths = joint_out - logp = logp[:, 0, 0, :] - - # Get index k, of max prob for batch - if torch.is_tensor(logp): - v, k = logp.max(1) - else: - k = np.argmax(logp, axis=1).astype(np.int32) - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask |= k_is_blank - - del k_is_blank - del logp - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if blank_mask.all(): - not_blank = False - - else: - # Collect batch indices where blanks occurred now/past - if torch.is_tensor(blank_mask): - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - else: - blank_indices = blank_mask.astype(np.int32).nonzero() - - if type(blank_indices) in (list, tuple): - blank_indices = blank_indices[0] - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - # LSTM has 2 states - for state_id in range(len(hidden)): - hidden_prime[state_id][:, blank_indices, :] = hidden[state_id][:, blank_indices, :] - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - for state_id in range(len(hidden_prime)): - hidden_prime[state_id][:, blank_indices, :] *= 0.0 - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - if torch.is_tensor(k): - last_label = k.clone().reshape(-1, 1) - else: - last_label = k.copy().reshape(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). 
- for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - label[kidx].append(ki) - timesteps[kidx].append(time_idx) - - symbols_added += 1 - - return label, timesteps - - def _setup_blank_index(self): - raise NotImplementedError() - - def run_encoder(self, audio_signal, length): - raise NotImplementedError() - - def run_decoder_joint(self, enc_logits, targets, target_length, *states): - raise NotImplementedError() - - def _get_initial_states(self, batchsize): - raise NotImplementedError() - - -class ONNXGreedyBatchedRNNTInfer(ExportedModelGreedyBatchedRNNTInfer): - def __init__(self, encoder_model: str, decoder_joint_model: str, max_symbols_per_step: Optional[int] = 10): - super().__init__( - encoder_model=encoder_model, - decoder_joint_model=decoder_joint_model, - max_symbols_per_step=max_symbols_per_step, - ) - - try: - import onnx - import onnxruntime - except (ModuleNotFoundError, ImportError): - raise ImportError(f"`onnx` or `onnxruntime` could not be imported, please install the libraries.\n") - - if torch.cuda.is_available(): - # Try to use onnxruntime-gpu - providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider'] - else: - # Fall back to CPU and onnxruntime-cpu - providers = ['CPUExecutionProvider'] - - onnx_session_opt = onnxruntime.SessionOptions() - onnx_session_opt.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - - onnx_model = onnx.load(self.encoder_model_path) - onnx.checker.check_model(onnx_model, full_check=True) - self.encoder_model = onnx_model - self.encoder = onnxruntime.InferenceSession( - onnx_model.SerializeToString(), providers=providers, provider_options=onnx_session_opt - ) - - onnx_model = onnx.load(self.decoder_joint_model_path) - onnx.checker.check_model(onnx_model, full_check=True) - self.decoder_joint_model = onnx_model - self.decoder_joint = onnxruntime.InferenceSession( - onnx_model.SerializeToString(), providers=providers, provider_options=onnx_session_opt - ) - - logging.info("Successfully loaded encoder, decoder and joint onnx models !") - - # Will be populated at runtime - self._blank_index = None - self.max_symbols_per_step = max_symbols_per_step - - self._setup_encoder_input_output_keys() - self._setup_decoder_joint_input_output_keys() - self._setup_blank_index() - - def _setup_encoder_input_output_keys(self): - self.encoder_inputs = list(self.encoder_model.graph.input) - self.encoder_outputs = list(self.encoder_model.graph.output) - - def _setup_decoder_joint_input_output_keys(self): - self.decoder_joint_inputs = list(self.decoder_joint_model.graph.input) - self.decoder_joint_outputs = list(self.decoder_joint_model.graph.output) - - def _setup_blank_index(self): - # ASSUME: Single input with no time length information - dynamic_dim = 257 - shapes = self.encoder_inputs[0].type.tensor_type.shape.dim - ip_shape = [] - for shape in shapes: - if hasattr(shape, 'dim_param') and 'dynamic' in shape.dim_param: - ip_shape.append(dynamic_dim) # replace dynamic axes with constant - else: - ip_shape.append(int(shape.dim_value)) - - enc_logits, encoded_length = self.run_encoder( - audio_signal=torch.randn(*ip_shape), length=torch.randint(0, 1, size=(dynamic_dim,)) - ) - - # prepare states - states = self._get_initial_states(batchsize=dynamic_dim) - - # run decoder 1 step - joint_out, states = self.run_decoder_joint(enc_logits, None, None, *states) - log_probs, lengths = joint_out - - self._blank_index = log_probs.shape[-1] - 1 # last token of vocab size is blank token - logging.info( - f"Enc-Dec-Joint step was evaluated, 
blank token id = {self._blank_index}; vocab size = {log_probs.shape[-1]}" - ) - - def run_encoder(self, audio_signal, length): - if hasattr(audio_signal, 'cpu'): - audio_signal = audio_signal.cpu().numpy() - - if hasattr(length, 'cpu'): - length = length.cpu().numpy() - - ip = { - self.encoder_inputs[0].name: audio_signal, - self.encoder_inputs[1].name: length, - } - enc_out = self.encoder.run(None, ip) - enc_out, encoded_length = enc_out # ASSUME: single output - return enc_out, encoded_length - - def run_decoder_joint(self, enc_logits, targets, target_length, *states): - # ASSUME: Decoder is RNN Transducer - if targets is None: - targets = torch.zeros(enc_logits.shape[0], 1, dtype=torch.int32) - target_length = torch.ones(enc_logits.shape[0], dtype=torch.int32) - - if hasattr(targets, 'cpu'): - targets = targets.cpu().numpy() - - if hasattr(target_length, 'cpu'): - target_length = target_length.cpu().numpy() - - ip = { - self.decoder_joint_inputs[0].name: enc_logits, - self.decoder_joint_inputs[1].name: targets, - self.decoder_joint_inputs[2].name: target_length, - } - - num_states = 0 - if states is not None and len(states) > 0: - num_states = len(states) - for idx, state in enumerate(states): - if hasattr(state, 'cpu'): - state = state.cpu().numpy() - - ip[self.decoder_joint_inputs[len(ip)].name] = state - - dec_out = self.decoder_joint.run(None, ip) - - # unpack dec output - if num_states > 0: - new_states = dec_out[-num_states:] - dec_out = dec_out[:-num_states] - else: - new_states = None - - return dec_out, new_states - - def _get_initial_states(self, batchsize): - # ASSUME: LSTM STATES of shape (layers, batchsize, dim) - input_state_nodes = [ip for ip in self.decoder_joint_inputs if 'state' in ip.name] - num_states = len(input_state_nodes) - if num_states == 0: - return - - input_states = [] - for state_id in range(num_states): - node = input_state_nodes[state_id] - ip_shape = [] - for shape_idx, shape in enumerate(node.type.tensor_type.shape.dim): - if hasattr(shape, 'dim_param') and 'dynamic' in shape.dim_param: - ip_shape.append(batchsize) # replace dynamic axes with constant - else: - ip_shape.append(int(shape.dim_value)) - - input_states.append(torch.zeros(*ip_shape)) - - return input_states - - -class TorchscriptGreedyBatchedRNNTInfer(ExportedModelGreedyBatchedRNNTInfer): - def __init__( - self, - encoder_model: str, - decoder_joint_model: str, - cfg: DictConfig, - device: str, - max_symbols_per_step: Optional[int] = 10, - ): - super().__init__( - encoder_model=encoder_model, - decoder_joint_model=decoder_joint_model, - max_symbols_per_step=max_symbols_per_step, - ) - - self.cfg = cfg - self.device = device - - self.encoder = torch.jit.load(self.encoder_model_path, map_location=self.device) - self.decoder_joint = torch.jit.load(self.decoder_joint_model_path, map_location=self.device) - - logging.info("Successfully loaded encoder, decoder and joint torchscript models !") - - # Will be populated at runtime - self._blank_index = None - self.max_symbols_per_step = max_symbols_per_step - - self._setup_encoder_input_keys() - self._setup_decoder_joint_input_keys() - self._setup_blank_index() - - def _setup_encoder_input_keys(self): - arguments = self.encoder.forward.schema.arguments[1:] - self.encoder_inputs = [arg for arg in arguments] - - def _setup_decoder_joint_input_keys(self): - arguments = self.decoder_joint.forward.schema.arguments[1:] - self.decoder_joint_inputs = [arg for arg in arguments] - - def _setup_blank_index(self): - self._blank_index = 
len(self.cfg.joint.vocabulary) - - logging.info(f"Blank token id = {self._blank_index}; vocab size = {len(self.cfg.joint.vocabulary) + 1}") - - def run_encoder(self, audio_signal, length): - enc_out = self.encoder(audio_signal, length) - enc_out, encoded_length = enc_out # ASSUME: single output - return enc_out, encoded_length - - def run_decoder_joint(self, enc_logits, targets, target_length, *states): - # ASSUME: Decoder is RNN Transducer - if targets is None: - targets = torch.zeros(enc_logits.shape[0], 1, dtype=torch.int32, device=enc_logits.device) - target_length = torch.ones(enc_logits.shape[0], dtype=torch.int32, device=enc_logits.device) - - num_states = 0 - if states is not None and len(states) > 0: - num_states = len(states) - - dec_out = self.decoder_joint(enc_logits, targets, target_length, *states) - - # unpack dec output - if num_states > 0: - new_states = dec_out[-num_states:] - dec_out = dec_out[:-num_states] - else: - new_states = None - - return dec_out, new_states - - def _get_initial_states(self, batchsize): - # ASSUME: LSTM STATES of shape (layers, batchsize, dim) - input_state_nodes = [ip for ip in self.decoder_joint_inputs if 'state' in ip.name] - num_states = len(input_state_nodes) - if num_states == 0: - return - - input_states = [] - for state_id in range(num_states): - # Hardcode shape size for LSTM (1 is for num layers in LSTM, which is flattened for export) - ip_shape = [1, batchsize, self.cfg.model_defaults.pred_hidden] - input_states.append(torch.zeros(*ip_shape, device=self.device)) - - return input_states - - -class GreedyMultiblankRNNTInfer(GreedyRNNTInfer): - """A greedy transducer decoder for multi-blank RNN-T. - - Sequence level greedy decoding, performed auto-regressively. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Must be len(vocabulary) for multi-blank RNNTs. - big_blank_durations: a list containing durations for big blanks the model supports. - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). 
- Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - big_blank_durations: list, - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - self.big_blank_durations = big_blank_durations - self._SOS = blank_index - len(big_blank_durations) - - @torch.no_grad() - def _greedy_decode( - self, x: torch.Tensor, out_len: torch.Tensor, partial_hypotheses: Optional[rnnt_utils.Hypothesis] = None - ): - # x: [T, 1, D] - # out_len: [seq_len] - - # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - - if partial_hypotheses is not None: - hypothesis.last_token = partial_hypotheses.last_token - hypothesis.y_sequence = ( - partial_hypotheses.y_sequence.cpu().tolist() - if isinstance(partial_hypotheses.y_sequence, torch.Tensor) - else partial_hypotheses.y_sequence - ) - if partial_hypotheses.dec_state is not None: - hypothesis.dec_state = self.decoder.batch_concat_states([partial_hypotheses.dec_state]) - hypothesis.dec_state = _states_to_device(hypothesis.dec_state, x.device) - - if self.preserve_alignments: - # Alignments is a 2-dimensional dangling list representing T x U - hypothesis.alignments = [[]] - - if self.preserve_frame_confidence: - hypothesis.frame_confidence = [[]] - - # if this variable is > 1, it means the last emission was a big-blank and we need to skip frames. 
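As a concrete reading of the label layout used by this decoder (non-blank tokens first, the big blanks immediately before the standard blank, the standard blank last), a toy mapping from predicted label to skip duration; the numbers are made up and only the index arithmetic mirrors the loop below:

```python
# Toy example of the multi-blank label layout described above.
big_blank_durations = [2, 4, 8]
blank_index = 31  # standard blank id (last label), chosen arbitrarily for the example

for k in range(blank_index - len(big_blank_durations), blank_index + 1):
    if k < blank_index:
        skip = big_blank_durations[blank_index - k - 1]
        print(f"label {k}: big blank, skip {skip} frames")
    else:
        print(f"label {k}: standard blank, advance one frame")
```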
- big_blank_duration = 1 - - # For timestep t in X_t - for time_idx in range(out_len): - if big_blank_duration > 1: - # skip frames until big_blank_duration == 1. - big_blank_duration -= 1 - continue - # Extract encoder embedding at timestep t - # f = x[time_idx, :, :].unsqueeze(0) # [1, 1, D] - f = x.narrow(dim=0, start=time_idx, length=1) - - # Setup exit flags and counter - not_blank = True - symbols_added = 0 - - # While blank is not predicted, or we dont run out of max symbols per timestep - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # In the first timestep, we initialize the network with RNNT Blank - # In later timesteps, we provide previous predicted label as input. - if hypothesis.last_token is None and hypothesis.dec_state is None: - last_label = self._SOS - else: - last_label = label_collate([[hypothesis.last_token]]) - - # Perform prediction network and joint network steps. - g, hidden_prime = self._pred_step(last_label, hypothesis.dec_state) - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - 0, 0, 0, : - ] - - del g - - # torch.max(0) op doesnt exist for FP 16. - if logp.dtype != torch.float32: - logp = logp.float() - - # get index k, of max prob - v, k = logp.max(0) - k = k.item() # K is the label at timestep t_s in inner loop, s >= 0. - - # Note, we have non-blanks in the vocab first, followed by big blanks, and standard blank at last. - # here we check if it's a big blank and if yes, set the duration variable. - if k >= self._blank_index - len(self.big_blank_durations) and k < self._blank_index: - big_blank_duration = self.big_blank_durations[self._blank_index - k - 1] - - if self.preserve_alignments: - # insert logprobs into last timestep - hypothesis.alignments[-1].append((logp.to('cpu'), torch.tensor(k, dtype=torch.int32))) - - if self.preserve_frame_confidence: - # insert confidence into last timestep - hypothesis.frame_confidence[-1].append(self._get_confidence(logp)) - - del logp - - # If any type of blank token is predicted, exit inner loop, move onto next timestep t - if k >= self._blank_index - len(self.big_blank_durations): - not_blank = False - else: - # Append token to label set, update RNN state. - hypothesis.y_sequence.append(k) - hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) - hypothesis.dec_state = hidden_prime - hypothesis.last_token = k - - # Increment token counter. - symbols_added += 1 - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of Alignments - if self.preserve_alignments: - if len(hypothesis.alignments[-1]) == 0: - del hypothesis.alignments[-1] - - # Remove trailing empty list of per-frame confidence - if self.preserve_frame_confidence: - if len(hypothesis.frame_confidence[-1]) == 0: - del hypothesis.frame_confidence[-1] - - # Unpack the hidden states - hypothesis.dec_state = self.decoder.batch_select_state(hypothesis.dec_state, 0) - - return hypothesis - - -class GreedyBatchedMultiblankRNNTInfer(GreedyBatchedRNNTInfer): - """A batch level greedy transducer decoder. - Batch level greedy decoding, performed auto-regressively. - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. 
- joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Must be len(vocabulary) for multi-blank RNNTs. - big_blank_durations: a list containing durations for big blanks the model supports. - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. 
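Aside (not part of the diffed file): the `confidence_method_cfg` block documented above recurs for every greedy decoder in this module. A hedged example of how such a config might be written with OmegaConf; the key names follow the docstring, but the authoritative schema is the `ConfidenceMethodConfig` dataclass in the installed NeMo version, so treat these values as illustrative.
```
from omegaconf import OmegaConf

# Illustrative confidence settings only; verify against ConfidenceMethodConfig before use.
confidence_method_cfg = OmegaConf.create(
    {
        "name": "entropy",          # 'max_prob' or 'entropy'
        "entropy_type": "tsallis",  # 'gibbs', 'tsallis' or 'renyi'
        "alpha": 0.5,               # power scale applied to the distribution, must be > 0
        "entropy_norm": "exp",      # 'lin' or 'exp' mapping of the entropy onto [0, 1]
    }
)
print(OmegaConf.to_yaml(confidence_method_cfg))
```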
- """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - big_blank_durations: List[int], - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - self.big_blank_durations = big_blank_durations - - # Depending on availability of `blank_as_pad` support - # switch between more efficient batch decoding technique - if self.decoder.blank_as_pad: - self._greedy_decode = self._greedy_decode_blank_as_pad - else: - self._greedy_decode = self._greedy_decode_masked - self._SOS = blank_index - len(big_blank_durations) - - def _greedy_decode_blank_as_pad( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - with torch.inference_mode(): - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize list of Hypothesis - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If alignments need to be preserved, register a danling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - - # If confidence scores need to be preserved, register a danling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._SOS, dtype=torch.long, device=device) - - # this mask is true for if the emission is *any type* of blank. - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - - # Get max sequence length - max_out_len = out_len.max() - - # We have a mask for each big blank. A mask is "true" means: the previous emission is exactly the big-blank - # with the corresponding duration, or has larger duration. E.g., for big_blank_mask for duration 2, it will - # be set true if the previous emission was a big blank with duration 4, or 3 or 2; but false if prevoius - # emission was a standard blank (with duration = 1). - big_blank_masks = [torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device)] * len( - self.big_blank_durations - ) - - # if this variable > 1, it means the previous emission was big-blank and we need to skip frames. 
- big_blank_duration = 1 - - for time_idx in range(max_out_len): - if big_blank_duration > 1: - # skip frames until big_blank_duration == 1 - big_blank_duration -= 1 - continue - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # Prepare t timestamp batch variables - not_blank = True - symbols_added = 0 - - # Reset all blank masks - blank_mask.mul_(False) - for i in range(len(big_blank_masks)): - big_blank_masks[i].mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - for i in range(len(big_blank_masks)): - big_blank_masks[i] = time_idx >= out_len - - # Start inner loop - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. - # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1 + num-big-blanks] - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - :, 0, 0, : - ] - - if logp.dtype != torch.float32: - logp = logp.float() - - # Get index k, of max prob for batch - v, k = logp.max(1) - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k >= self._blank_index - len(self.big_blank_durations) - blank_mask.bitwise_or_(k_is_blank) - - for i in range(len(big_blank_masks)): - # using <= since as we mentioned before, the mask doesn't store exact matches. - # instead, it is True when the predicted blank's duration is >= the duration that the - # mask corresponds to. - k_is_big_blank = k <= self._blank_index - 1 - i - - # need to do a bitwise_and since it could also be a non-blank. 
- k_is_big_blank.bitwise_and_(k_is_blank) - big_blank_masks[i].bitwise_or_(k_is_big_blank) - - del k_is_blank - - # If preserving alignments, check if sequence length of sample has been reached - # before adding alignment - if self.preserve_alignments: - # Insert logprobs into last timestep per sample - logp_vals = logp.to('cpu') - logp_ids = logp_vals.max(1)[1] - for batch_idx in range(batchsize): - if time_idx < out_len[batch_idx]: - hypotheses[batch_idx].alignments[-1].append( - (logp_vals[batch_idx], logp_ids[batch_idx]) - ) - del logp_vals - - # If preserving per-frame confidence, check if sequence length of sample has been reached - # before adding confidence scores - if self.preserve_frame_confidence: - # Insert probabilities into last timestep per sample - confidence = self._get_confidence(logp) - for batch_idx in range(batchsize): - if time_idx < out_len[batch_idx]: - hypotheses[batch_idx].frame_confidence[-1].append(confidence[batch_idx]) - del logp - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if blank_mask.all(): - not_blank = False - else: - # Collect batch indices where blanks occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.clone().view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). - for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - - symbols_added += 1 - - for i in range(len(big_blank_masks) + 1): - # The task here is find the shortest blank duration of all batches. - # so we start from the shortest blank duration and go up, - # and stop once we found the duration whose corresponding mask isn't all True. - if i == len(big_blank_masks) or not big_blank_masks[i].all(): - big_blank_duration = self.big_blank_durations[i - 1] if i > 0 else 1 - break - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. 
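Aside (not part of the diffed file): the masks above are cumulative, so `big_blank_masks[i]` is set for a sample once it has emitted a blank whose duration is at least `big_blank_durations[i]`, and the whole batch may only jump by the smallest duration that every sample has reached. A toy sketch of that reduction with invented tensor values:
```
import torch

big_blank_durations = [2, 4, 8]  # hypothetical durations supported by the model

# big_blank_masks[i][b] is True once sample b emitted a blank of duration >= big_blank_durations[i].
big_blank_masks = [
    torch.tensor([True, True, True]),    # all samples reached duration 2
    torch.tensor([True, False, True]),   # sample 1 stopped at duration 2
    torch.tensor([False, False, True]),  # only sample 2 emitted the duration-8 blank
]

# Walk from the shortest duration upward; the first mask that is not all-True bounds the jump.
skip = 1
for i in range(len(big_blank_masks) + 1):
    if i == len(big_blank_masks) or not big_blank_masks[i].all():
        skip = big_blank_durations[i - 1] if i > 0 else 1
        break

print(skip)  # 2, limited by sample 1
```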
- if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. - if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses - - def _greedy_decode_masked( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - if self.big_blank_durations != [1] * len(self.big_blank_durations): - raise NotImplementedError( - "Efficient frame-skipping version for multi-blank masked decoding is not supported." 
- ) - - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize state - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If alignments need to be preserved, register a danling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - else: - hyp.alignments = None - - # If confidence scores need to be preserved, register a danling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) - last_label_without_blank = last_label.clone() - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - - # Get max sequence length - max_out_len = out_len.max() - - with torch.inference_mode(): - for time_idx in range(max_out_len): - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # Prepare t timestamp batch variables - not_blank = True - symbols_added = 0 - - # Reset blank mask - blank_mask.mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - - # Start inner loop - while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. 
- # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Set a dummy label for the blank value - # This value will be overwritten by "blank" again the last label update below - # This is done as vocabulary of prediction network does not contain "blank" token of RNNT - last_label_without_blank_mask = last_label >= self._blank_index - last_label_without_blank[last_label_without_blank_mask] = 0 # temp change of label - last_label_without_blank[~last_label_without_blank_mask] = last_label[ - ~last_label_without_blank_mask - ] - - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label_without_blank, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1 + num-big-blanks] - # If preserving per-frame confidence, log_normalize must be true - logp = self._joint_step(f, g, log_normalize=True if self.preserve_frame_confidence else None)[ - :, 0, 0, : - ] - - if logp.dtype != torch.float32: - logp = logp.float() - - # Get index k, of max prob for batch - v, k = logp.max(1) - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask.bitwise_or_(k_is_blank) - - # If preserving alignments, check if sequence length of sample has been reached - # before adding alignment - if self.preserve_alignments: - # Insert logprobs into last timestep per sample - logp_vals = logp.to('cpu') - logp_ids = logp_vals.max(1)[1] - for batch_idx in range(batchsize): - if time_idx < out_len[batch_idx]: - hypotheses[batch_idx].alignments[-1].append( - (logp_vals[batch_idx], logp_ids[batch_idx]) - ) - del logp_vals - - # If preserving per-frame confidence, check if sequence length of sample has been reached - # before adding confidence scores - if self.preserve_frame_confidence: - # Insert probabilities into last timestep per sample - confidence = self._get_confidence(logp) - for batch_idx in range(batchsize): - if time_idx < out_len[batch_idx]: - hypotheses[batch_idx].frame_confidence[-1].append(confidence[batch_idx]) - del logp - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if blank_mask.all(): - not_blank = False - else: - # Collect batch indices where blanks occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to 
also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). - for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - - symbols_added += 1 - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. - if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses - - -@dataclass -class GreedyRNNTInferConfig: - max_symbols_per_step: Optional[int] = 10 - preserve_alignments: bool = False - preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) - - def __post_init__(self): - # OmegaConf.structured ensures that post_init check is always executed - self.confidence_method_cfg = OmegaConf.structured( - self.confidence_method_cfg - if isinstance(self.confidence_method_cfg, ConfidenceMethodConfig) - else ConfidenceMethodConfig(**self.confidence_method_cfg) - ) - - -@dataclass -class GreedyBatchedRNNTInferConfig: - max_symbols_per_step: Optional[int] = 10 - preserve_alignments: bool = False - preserve_frame_confidence: bool = False - confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) - - def __post_init__(self): - # OmegaConf.structured ensures that post_init check is always executed - self.confidence_method_cfg = OmegaConf.structured( - self.confidence_method_cfg - if isinstance(self.confidence_method_cfg, ConfidenceMethodConfig) - else ConfidenceMethodConfig(**self.confidence_method_cfg) - ) - - -class GreedyTDTInfer(_GreedyRNNTInfer): - """A greedy TDT decoder. - - Sequence level greedy decoding, performed auto-regressively. - - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. 
- blank_index: int index of the blank token. Must be len(vocabulary) for TDT models. - durations: a list containing durations for TDT. - max_symbols_per_step: Optional int. The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. 
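Aside (not part of the diffed file): a TDT joint produces token logits and duration logits in a single vector; the greedy loop normalizes the two groups separately, takes the best token and the best duration, and advances the time index by that duration. A self-contained sketch of the split, with invented sizes and values:
```
import torch

durations = [0, 1, 2, 4]       # durations a hypothetical TDT model was trained with
num_tokens_with_blank = 5      # toy vocabulary of 4 tokens plus blank

# Pretend this is the raw joint output for one (frame, label-step) pair:
# the trailing len(durations) entries are duration logits, the rest are token logits.
joint_out = torch.randn(num_tokens_with_blank + len(durations))

token_logp = torch.log_softmax(joint_out[: -len(durations)], dim=-1)
duration_logp = torch.log_softmax(joint_out[-len(durations):], dim=-1)

best_token = int(token_logp.argmax())
skip = durations[int(duration_logp.argmax())]

# Duration 0 means "stay on this frame and emit another token"; the decoder later
# forces skip to at least 1 when a blank is emitted so the loop cannot stall.
print(best_token, skip)
```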
- """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - durations: list, - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - self.durations = durations - - @typecheck() - def forward( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-regressively. - Args: - encoder_output: A tensor of size (batch, features, timesteps). - encoded_lengths: list of int representing the length of each sequence - output sequence. - Returns: - packed list containing batch number of sentences (Hypotheses). - """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - with torch.inference_mode(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - - self.decoder.eval() - self.joint.eval() - - hypotheses = [] - # Process each sequence independently - with self.decoder.as_frozen(), self.joint.as_frozen(): - for batch_idx in range(encoder_output.size(0)): - inseq = encoder_output[batch_idx, :, :].unsqueeze(1) # [T, 1, D] - logitlen = encoded_lengths[batch_idx] - - partial_hypothesis = partial_hypotheses[batch_idx] if partial_hypotheses is not None else None - hypothesis = self._greedy_decode(inseq, logitlen, partial_hypotheses=partial_hypothesis) - hypotheses.append(hypothesis) - - # Pack results into Hypotheses - packed_result = pack_hypotheses(hypotheses, encoded_lengths) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - - return (packed_result,) - - @torch.no_grad() - def _greedy_decode( - self, x: torch.Tensor, out_len: torch.Tensor, partial_hypotheses: Optional[rnnt_utils.Hypothesis] = None - ): - # x: [T, 1, D] - # out_len: [seq_len] - - # Initialize blank state and empty label set in Hypothesis - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None) - - if partial_hypotheses is not None: - hypothesis.last_token = partial_hypotheses.last_token - hypothesis.y_sequence = ( - partial_hypotheses.y_sequence.cpu().tolist() - if isinstance(partial_hypotheses.y_sequence, torch.Tensor) - else partial_hypotheses.y_sequence - ) - if partial_hypotheses.dec_state is not None: - hypothesis.dec_state = self.decoder.batch_concat_states([partial_hypotheses.dec_state]) - hypothesis.dec_state = _states_to_device(hypothesis.dec_state, x.device) - - if self.preserve_alignments: - # Alignments is a 2-dimensional dangling list representing T x U - hypothesis.alignments = [[]] - - if self.preserve_frame_confidence: - hypothesis.frame_confidence = [[]] - - time_idx = 0 - while time_idx < out_len: - # Extract encoder embedding at timestep t - # f = x[time_idx, :, :].unsqueeze(0) # [1, 1, D] - f = x.narrow(dim=0, start=time_idx, length=1) - - # Setup exit flags and counter - 
not_blank = True - symbols_added = 0 - - need_loop = True - # While blank is not predicted, or we dont run out of max symbols per timestep - while need_loop and (self.max_symbols is None or symbols_added < self.max_symbols): - # In the first timestep, we initialize the network with RNNT Blank - # In later timesteps, we provide previous predicted label as input. - if hypothesis.last_token is None and hypothesis.dec_state is None: - last_label = self._SOS - else: - last_label = label_collate([[hypothesis.last_token]]) - - # Perform prediction network and joint network steps. - g, hidden_prime = self._pred_step(last_label, hypothesis.dec_state) - # If preserving per-frame confidence, log_normalize must be true - logits = self._joint_step(f, g, log_normalize=False) - logp = logits[0, 0, 0, : -len(self.durations)] - if self.preserve_frame_confidence: - logp = torch.log_softmax(logp, -1) - - duration_logp = torch.log_softmax(logits[0, 0, 0, -len(self.durations) :], dim=-1) - del g - - # torch.max(0) op doesnt exist for FP 16. - if logp.dtype != torch.float32: - logp = logp.float() - - # get index k, of max prob - v, k = logp.max(0) - k = k.item() # K is the label at timestep t_s in inner loop, s >= 0. - - d_v, d_k = duration_logp.max(0) - d_k = d_k.item() - - skip = self.durations[d_k] - - if self.preserve_alignments: - # insert logprobs into last timestep - hypothesis.alignments[-1].append((logp.to('cpu'), torch.tensor(k, dtype=torch.int32))) - - if self.preserve_frame_confidence: - # insert confidence into last timestep - hypothesis.frame_confidence[-1].append(self._get_confidence(logp)) - - del logp - - # If blank token is predicted, exit inner loop, move onto next timestep t - if k == self._blank_index: - not_blank = False - else: - # Append token to label set, update RNN state. - hypothesis.y_sequence.append(k) - hypothesis.score += float(v) - hypothesis.timestep.append(time_idx) - hypothesis.dec_state = hidden_prime - hypothesis.last_token = k - - # Increment token counter. - symbols_added += 1 - time_idx += skip - need_loop = skip == 0 - - # this rarely happens, but we manually increment the `skip` number - # if blank is emitted and duration=0 is predicted. This prevents possible - # infinite loops. - if skip == 0: - skip = 1 - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep - - if symbols_added == self.max_symbols: - time_idx += 1 - - # Remove trailing empty list of Alignments - if self.preserve_alignments: - if len(hypothesis.alignments[-1]) == 0: - del hypothesis.alignments[-1] - - # Remove trailing empty list of per-frame confidence - if self.preserve_frame_confidence: - if len(hypothesis.frame_confidence[-1]) == 0: - del hypothesis.frame_confidence[-1] - - # Unpack the hidden states - hypothesis.dec_state = self.decoder.batch_select_state(hypothesis.dec_state, 0) - - return hypothesis - - -class GreedyBatchedTDTInfer(_GreedyRNNTInfer): - """A batch level greedy TDT decoder. - Batch level greedy decoding, performed auto-regressively. - Args: - decoder_model: rnnt_utils.AbstractRNNTDecoder implementation. - joint_model: rnnt_utils.AbstractRNNTJoint implementation. - blank_index: int index of the blank token. Must be len(vocabulary) for TDT models. - durations: a list containing durations. - max_symbols_per_step: Optional int. 
The maximum number of symbols that can be added - to a sequence in a single time step; if set to None then there is - no limit. - preserve_alignments: Bool flag which preserves the history of alignments generated during - greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `alignments` in it. Here, `alignments` is a List of List of - Tuple(Tensor (of length V + 1 + num-big-blanks), Tensor(scalar, label after argmax)). - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary. - U is the number of target tokens for the current timestep Ti. - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores generated - during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of List of floats. - The length of the list corresponds to the Acoustic Length (T). - Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores. - U is the number of target tokens for the current timestep Ti. - confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. 
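Aside (not part of the diffed file): the entropy-based confidence options are only described in prose, so a small numeric sketch may help. It evaluates the Gibbs, Tsallis and Rényi formulas from the docstring on one toy distribution with α = 0.5; the final linear normalization against the uniform-distribution maximum is an assumption here, not necessarily the exact mapping NeMo applies.
```
import math

p = [0.7, 0.2, 0.1]  # toy posterior over V = 3 classes
alpha = 0.5
V = len(p)

gibbs = -sum((q ** alpha) * math.log(q ** alpha) for q in p)
tsallis = (1.0 - sum(q ** alpha for q in p)) / (alpha - 1.0)
renyi = math.log2(sum(q ** alpha for q in p)) / (1.0 - alpha)

# Assumed 'lin' style normalization: confidence = 1 - H / H_max, with H_max taken at the uniform distribution.
h_max = -V * ((1.0 / V) ** alpha) * math.log((1.0 / V) ** alpha)
confidence = 1.0 - gibbs / h_max
print(f"gibbs={gibbs:.3f} tsallis={tsallis:.3f} renyi={renyi:.3f} confidence={confidence:.3f}")
```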
- """ - - def __init__( - self, - decoder_model: rnnt_abstract.AbstractRNNTDecoder, - joint_model: rnnt_abstract.AbstractRNNTJoint, - blank_index: int, - durations: List[int], - max_symbols_per_step: Optional[int] = None, - preserve_alignments: bool = False, - preserve_frame_confidence: bool = False, - confidence_method_cfg: Optional[DictConfig] = None, - ): - super().__init__( - decoder_model=decoder_model, - joint_model=joint_model, - blank_index=blank_index, - max_symbols_per_step=max_symbols_per_step, - preserve_alignments=preserve_alignments, - preserve_frame_confidence=preserve_frame_confidence, - confidence_method_cfg=confidence_method_cfg, - ) - self.durations = durations - - # Depending on availability of `blank_as_pad` support - # switch between more efficient batch decoding technique - if self.decoder.blank_as_pad: - self._greedy_decode = self._greedy_decode_blank_as_pad - else: - self._greedy_decode = self._greedy_decode_masked - - @typecheck() - def forward( - self, - encoder_output: torch.Tensor, - encoded_lengths: torch.Tensor, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - """Returns a list of hypotheses given an input batch of the encoder hidden embedding. - Output token is generated auto-regressively. - Args: - encoder_output: A tensor of size (batch, features, timesteps). - encoded_lengths: list of int representing the length of each sequence - output sequence. - Returns: - packed list containing batch number of sentences (Hypotheses). - """ - # Preserve decoder and joint training state - decoder_training_state = self.decoder.training - joint_training_state = self.joint.training - - with torch.inference_mode(): - # Apply optional preprocessing - encoder_output = encoder_output.transpose(1, 2) # (B, T, D) - logitlen = encoded_lengths - - self.decoder.eval() - self.joint.eval() - - with self.decoder.as_frozen(), self.joint.as_frozen(): - inseq = encoder_output # [B, T, D] - hypotheses = self._greedy_decode( - inseq, logitlen, device=inseq.device, partial_hypotheses=partial_hypotheses - ) - - # Pack the hypotheses results - packed_result = pack_hypotheses(hypotheses, logitlen) - - self.decoder.train(decoder_training_state) - self.joint.train(joint_training_state) - - return (packed_result,) - - def _greedy_decode_blank_as_pad( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - if partial_hypotheses is not None: - raise NotImplementedError("`partial_hypotheses` support is not supported") - - with torch.inference_mode(): - # x: [B, T, D] - # out_len: [B] - # device: torch.device - - # Initialize list of Hypothesis - batchsize = x.shape[0] - hypotheses = [ - rnnt_utils.Hypothesis(score=0.0, y_sequence=[], timestep=[], dec_state=None) for _ in range(batchsize) - ] - - # Initialize Hidden state matrix (shared by entire batch) - hidden = None - - # If alignments need to be preserved, register a danling list to hold the values - if self.preserve_alignments: - # alignments is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.alignments = [[]] - - # If confidence scores need to be preserved, register a danling list to hold the values - if self.preserve_frame_confidence: - # frame_confidence is a 3-dimensional dangling list representing B x T x U - for hyp in hypotheses: - hyp.frame_confidence = [[]] - - # Last Label buffer + Last Label without blank buffer - # batch level equivalent of the last_label - last_label = 
torch.full([batchsize, 1], fill_value=self._blank_index, dtype=torch.long, device=device) - - # Mask buffers - blank_mask = torch.full([batchsize], fill_value=0, dtype=torch.bool, device=device) - - # Get max sequence length - max_out_len = out_len.max() - - # skip means the number of frames the next decoding step should "jump" to. When skip == 1 - # it means the next decoding step will just use the next input frame. - skip = 1 - for time_idx in range(max_out_len): - if skip > 1: # if skip > 1 at the current step, we decrement it and skip the current frame. - skip -= 1 - continue - f = x.narrow(dim=1, start=time_idx, length=1) # [B, 1, D] - - # need_to_stay is a boolean indicates whether the next decoding step should remain in the same frame. - need_to_stay = True - symbols_added = 0 - - # Reset blank mask - blank_mask.mul_(False) - - # Update blank mask with time mask - # Batch: [B, T, D], but Bi may have seq len < max(seq_lens_in_batch) - # Forcibly mask with "blank" tokens, for all sample where current time step T > seq_len - blank_mask = time_idx >= out_len - - # Start inner loop - while need_to_stay and (self.max_symbols is None or symbols_added < self.max_symbols): - # Batch prediction and joint network steps - # If very first prediction step, submit SOS tag (blank) to pred_step. - # This feeds a zero tensor as input to AbstractRNNTDecoder to prime the state - if time_idx == 0 and symbols_added == 0 and hidden is None: - g, hidden_prime = self._pred_step(self._SOS, hidden, batch_size=batchsize) - else: - # Perform batch step prediction of decoder, getting new states and scores ("g") - g, hidden_prime = self._pred_step(last_label, hidden, batch_size=batchsize) - - # Batched joint step - Output = [B, V + 1 + num-big-blanks] - # Note: log_normalize must not be True here since the joiner output is contanetation of both token logits and duration logits, - # and they need to be normalized independently. - joined = self._joint_step(f, g, log_normalize=None) - logp = joined[:, 0, 0, : -len(self.durations)] - duration_logp = joined[:, 0, 0, -len(self.durations) :] - - if logp.dtype != torch.float32: - logp = logp.float() - duration_logp = duration_logp.float() - - # get the max for both token and duration predictions. - v, k = logp.max(1) - dv, dk = duration_logp.max(1) - - # here we set the skip value to be the minimum of all predicted durations, hense the "torch.min(dk)" call there. - # Please refer to Section 5.2 of our paper https://arxiv.org/pdf/2304.06795.pdf for explanation of this. - skip = self.durations[int(torch.min(dk))] - - # this is a special case: if all batches emit blanks, we require that skip be at least 1 - # so we don't loop forever at the current frame. 
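Aside (not part of the diffed file): in the batched TDT loop the jump must be shared by every sample, so the skip is the minimum of the per-sample predicted durations, bumped to 1 when the whole batch emitted blanks, which is exactly the special case described in the comment above. A toy illustration with invented values:
```
import torch

durations = [0, 1, 2, 4]                       # hypothetical TDT duration set
dk = torch.tensor([2, 1, 3])                   # per-sample argmax over the duration logits
blank_mask = torch.tensor([True, True, True])  # every sample emitted a blank this step

# The batch can only advance by the smallest predicted duration.
skip = durations[int(torch.min(dk))]

# Guard against stalling: all-blank emissions with duration 0 still move one frame forward.
if blank_mask.all() and skip == 0:
    skip = 1

need_to_stay = skip == 0
print(skip, need_to_stay)  # 1 False for these toy values
```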
- if blank_mask.all(): - if skip == 0: - skip = 1 - - need_to_stay = skip == 0 - del g - - # Update blank mask with current predicted blanks - # This is accumulating blanks over all time steps T and all target steps min(max_symbols, U) - k_is_blank = k == self._blank_index - blank_mask.bitwise_or_(k_is_blank) - - del k_is_blank - del logp, duration_logp - - # If all samples predict / have predicted prior blanks, exit loop early - # This is equivalent to if single sample predicted k - if not blank_mask.all(): - # Collect batch indices where blanks occurred now/past - blank_indices = (blank_mask == 1).nonzero(as_tuple=False) - - # Recover prior state for all samples which predicted blank now/past - if hidden is not None: - hidden_prime = self.decoder.batch_copy_states(hidden_prime, hidden, blank_indices) - - elif len(blank_indices) > 0 and hidden is None: - # Reset state if there were some blank and other non-blank predictions in batch - # Original state is filled with zeros so we just multiply - # LSTM has 2 states - hidden_prime = self.decoder.batch_copy_states(hidden_prime, None, blank_indices, value=0.0) - - # Recover prior predicted label for all samples which predicted blank now/past - k[blank_indices] = last_label[blank_indices, 0] - - # Update new label and hidden state for next iteration - last_label = k.clone().view(-1, 1) - hidden = hidden_prime - - # Update predicted labels, accounting for time mask - # If blank was predicted even once, now or in the past, - # Force the current predicted label to also be blank - # This ensures that blanks propogate across all timesteps - # once they have occured (normally stopping condition of sample level loop). - for kidx, ki in enumerate(k): - if blank_mask[kidx] == 0: - hypotheses[kidx].y_sequence.append(ki) - hypotheses[kidx].timestep.append(time_idx) - hypotheses[kidx].score += float(v[kidx]) - - symbols_added += 1 - - # Remove trailing empty list of alignments at T_{am-len} x Uj - if self.preserve_alignments: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].alignments[-1]) == 0: - del hypotheses[batch_idx].alignments[-1] - - # Remove trailing empty list of confidence scores at T_{am-len} x Uj - if self.preserve_frame_confidence: - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: - del hypotheses[batch_idx].frame_confidence[-1] - - # Preserve states - for batch_idx in range(batchsize): - hypotheses[batch_idx].dec_state = self.decoder.batch_select_state(hidden, batch_idx) - - return hypotheses - - def _greedy_decode_masked( - self, - x: torch.Tensor, - out_len: torch.Tensor, - device: torch.device, - partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None, - ): - raise NotImplementedError("masked greedy-batched decode is not supported for TDT models.") diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/spectr_augment.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/spectr_augment.py deleted file mode 100644 index 9b379ce10f375d0d75b0b8bf61c85e301f8f22a9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/spectr_augment.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import numpy as np -import torch -import torch.nn as nn - -from nemo.core.classes import Typing, typecheck -from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType - - -class SpecAugment(nn.Module, Typing): - """ - Zeroes out(cuts) random continuous horisontal or - vertical segments of the spectrogram as described in - SpecAugment (https://arxiv.org/abs/1904.08779). - - params: - freq_masks - how many frequency segments should be cut - time_masks - how many time segments should be cut - freq_width - maximum number of frequencies to be cut in one segment - time_width - maximum number of time steps to be cut in one segment. - Can be a positive integer or a float value in the range [0, 1]. - If positive integer value, defines maximum number of time steps - to be cut in one segment. - If a float value, defines maximum percentage of timesteps that - are cut adaptively. - """ - - @property - def input_types(self): - """Returns definitions of module input types - """ - return { - "input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - - @property - def output_types(self): - """Returns definitions of module output types - """ - return {"augmented_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - def __init__( - self, freq_masks=0, time_masks=0, freq_width=10, time_width=10, rng=None, mask_value=0.0, - ): - super().__init__() - - self._rng = random.Random() if rng is None else rng - - self.freq_masks = freq_masks - self.time_masks = time_masks - - self.freq_width = freq_width - self.time_width = time_width - - self.mask_value = mask_value - - if isinstance(time_width, int): - self.adaptive_temporal_width = False - else: - if time_width > 1.0 or time_width < 0.0: - raise ValueError("If `time_width` is a float value, must be in range [0, 1]") - - self.adaptive_temporal_width = True - - @typecheck() - @torch.no_grad() - def forward(self, input_spec, length): - batch_size, num_freq_bins, _ = input_spec.shape - # Move lengths to CPU before repeated indexing - lengths_cpu = length.cpu().numpy() - # Generate a numpy boolean mask. `True` elements represent where the input spec will be augmented. - fill_mask: np.array = np.full(shape=input_spec.shape, fill_value=False) - freq_start_upper_bound = num_freq_bins - self.freq_width - # Choose different mask ranges for each element of the batch - for idx in range(batch_size): - # Set freq masking - for _ in range(self.freq_masks): - start = self._rng.randint(0, freq_start_upper_bound) - width = self._rng.randint(0, self.freq_width) - fill_mask[idx, start : start + width, :] = True - - # Derive time width, sometimes based percentage of input length. 
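Aside (not part of the diffed file): when `time_width` is a float, the maximum time-mask width is derived per utterance from its unpadded length rather than from a fixed frame count. A compact sketch of that adaptive computation; the helper name is invented, while the constructor call in the trailing comment mirrors the arguments shown in `__init__` above.
```
# Hypothetical helper, written outside the class for clarity.
def adaptive_time_mask_width(length, time_width):
    """length is the unpadded number of frames for one utterance."""
    if isinstance(time_width, int):
        return time_width                    # fixed cap in frames
    return max(1, int(length * time_width))  # fraction of the utterance length

print(adaptive_time_mask_width(480, 0.05))   # 24 frames for a 480-frame utterance
print(adaptive_time_mask_width(480, 10))     # fixed 10-frame cap

# With the class above this corresponds to something like:
#   spec_augment = SpecAugment(freq_masks=2, time_masks=10, freq_width=27, time_width=0.05)
```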
- if self.adaptive_temporal_width: - time_max_width = max(1, int(lengths_cpu[idx] * self.time_width)) - else: - time_max_width = self.time_width - time_start_upper_bound = max(1, lengths_cpu[idx] - time_max_width) - - # Set time masking - for _ in range(self.time_masks): - start = self._rng.randint(0, time_start_upper_bound) - width = self._rng.randint(0, time_max_width) - fill_mask[idx, :, start : start + width] = True - # Bring the mask to device and fill spec - fill_mask = torch.from_numpy(fill_mask).to(input_spec.device) - masked_spec = input_spec.masked_fill(mask=fill_mask, value=self.mask_value) - return masked_spec - - -class SpecCutout(nn.Module, Typing): - """ - Zeroes out(cuts) random rectangles in the spectrogram - as described in (https://arxiv.org/abs/1708.04552). - - params: - rect_masks - how many rectangular masks should be cut - rect_freq - maximum size of cut rectangles along the frequency dimension - rect_time - maximum size of cut rectangles along the time dimension - """ - - @property - def input_types(self): - """Returns definitions of module input types - """ - return {"input_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - @property - def output_types(self): - """Returns definitions of module output types - """ - return {"augmented_spec": NeuralType(('B', 'D', 'T'), SpectrogramType())} - - def __init__(self, rect_masks=0, rect_time=5, rect_freq=20, rng=None): - super(SpecCutout, self).__init__() - - self._rng = random.Random() if rng is None else rng - - self.rect_masks = rect_masks - self.rect_time = rect_time - self.rect_freq = rect_freq - - @typecheck() - @torch.no_grad() - def forward(self, input_spec): - sh = input_spec.shape - - for idx in range(sh[0]): - for i in range(self.rect_masks): - rect_x = self._rng.randint(0, sh[1] - self.rect_freq) - rect_y = self._rng.randint(0, sh[2] - self.rect_time) - - w_x = self._rng.randint(0, self.rect_freq) - w_y = self._rng.randint(0, self.rect_time) - - input_spec[idx, rect_x : rect_x + w_x, rect_y : rect_y + w_y] = 0.0 - - return input_spec diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/squeezeformer_modules.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/squeezeformer_modules.py deleted file mode 100644 index 56c462396523b22bf988e4adf4db48c45cd6afce..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/squeezeformer_modules.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import torch -from torch import nn as nn -from torch.nn import LayerNorm - -from nemo.collections.asr.parts.submodules.conformer_modules import ConformerConvolution, ConformerFeedForward -from nemo.collections.asr.parts.submodules.multi_head_attention import ( - MultiHeadAttention, - RelPositionMultiHeadAttention, -) -from nemo.collections.common.parts import adapter_modules -from nemo.core.classes.mixins import AccessMixin -from nemo.core.classes.mixins.adapter_mixins import AdapterModuleMixin - -__all__ = ['SqueezeformerLayer', 'ConformerFeedForward', 'SqueezeformerLayer'] - - -class ScaleBiasLayer(torch.nn.Module): - """ - Computes an affine transformation y = x * scale + bias, either learned via adaptive weights, or fixed. - Efficient alternative to LayerNorm where we can avoid computing the mean and variance of the input, and - just rescale the output of the previous layer. - - Args: - d_model (int): input dimension of layer. - adaptive_scale (bool): whether to learn the affine transformation parameters or not. If set to False, - the scale is fixed to 1 and bias to 0, effectively performing a No-Op on the input. - This is done for export compatibility. - """ - - def __init__(self, d_model: int, adaptive_scale: bool): - super().__init__() - self.adaptive_scale = adaptive_scale - if adaptive_scale: - self.scale = nn.Parameter(torch.ones(d_model)) - self.bias = nn.Parameter(torch.zeros(d_model)) - else: - self.register_buffer('scale', torch.ones(d_model), persistent=True) - self.register_buffer('bias', torch.zeros(d_model), persistent=True) - - def forward(self, x): - scale = self.scale.view(1, 1, -1) - bias = self.bias.view(1, 1, -1) - return x * scale + bias - - -class SqueezeformerLayer(torch.nn.Module, AdapterModuleMixin, AccessMixin): - """A single block of the Squeezeformer encoder. - - Args: - d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - d_ff (int): hidden dimension of PositionwiseFeedForward - n_heads (int): number of heads for multi-head attention - conv_kernel_size (int): kernel size for depthwise convolution in convolution module - dropout (float): dropout probabilities for linear layers - dropout_att (float): dropout probabilities for attention distributions - adaptive_scale (bool): Whether to scale the inputs to each component by affine `scale` and `bias` layer. - Or use a fixed scale=1 and bias=0. 
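Aside (not part of the diffed file): `ScaleBiasLayer` above is the cheap substitute for LayerNorm used throughout the Squeezeformer block, namely a per-channel affine y = x * scale + bias that is learned only when `adaptive_scale=True`. A minimal standalone re-implementation to make the behaviour concrete (not imported from the deleted module):
```
import torch
from torch import nn

class TinyScaleBias(nn.Module):
    """Minimal sketch of the ScaleBiasLayer idea: y = x * scale + bias over the channel dim."""
    def __init__(self, d_model, adaptive_scale):
        super().__init__()
        if adaptive_scale:
            self.scale = nn.Parameter(torch.ones(d_model))
            self.bias = nn.Parameter(torch.zeros(d_model))
        else:  # frozen no-op, kept as buffers for export friendliness
            self.register_buffer('scale', torch.ones(d_model))
            self.register_buffer('bias', torch.zeros(d_model))

    def forward(self, x):
        return x * self.scale.view(1, 1, -1) + self.bias.view(1, 1, -1)

x = torch.randn(2, 7, 16)  # (batch, time, d_model)
assert torch.allclose(TinyScaleBias(16, adaptive_scale=False)(x), x)  # identity when not adaptive
print(sum(p.numel() for p in TinyScaleBias(16, adaptive_scale=True).parameters()))  # 32 learnable values
```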
- """ - - def __init__( - self, - d_model, - d_ff, - self_attention_model='rel_pos', - n_heads=4, - conv_kernel_size=31, - conv_norm_type='batch_norm', - dropout=0.1, - dropout_att=0.1, - pos_bias_u=None, - pos_bias_v=None, - adaptive_scale: bool = True, - ): - super().__init__() - - self.self_attention_model = self_attention_model - self.n_heads = n_heads - self.fc_factor = 1.0 - self.adaptive_scale = adaptive_scale - - # first feed forward module - self.norm_feed_forward1 = LayerNorm(d_model) - self.feed_forward1 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) - self.feed_forward1_scale = ScaleBiasLayer(d_model=d_model, adaptive_scale=adaptive_scale) - - # convolution module - self.norm_conv = LayerNorm(d_model) - self.conv = ConformerConvolution( - d_model=d_model, kernel_size=conv_kernel_size, norm_type=conv_norm_type, pointwise_activation='swish' - ) - self.conv_scale = ScaleBiasLayer(d_model=d_model, adaptive_scale=adaptive_scale) - - # multi-headed self-attention module - self.norm_self_att = LayerNorm(d_model) - if self_attention_model == 'rel_pos': - self.self_attn = RelPositionMultiHeadAttention( - n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att, pos_bias_u=pos_bias_u, pos_bias_v=pos_bias_v - ) - elif self_attention_model == 'abs_pos': - self.self_attn = MultiHeadAttention(n_head=n_heads, n_feat=d_model, dropout_rate=dropout_att) - else: - raise ValueError( - f"'{self_attention_model}' is not not a valid value for 'self_attention_model', " - f"valid values can be from ['rel_pos', 'abs_pos']" - ) - self.self_attn_scale = ScaleBiasLayer(d_model=d_model, adaptive_scale=adaptive_scale) - - # second feed forward module - self.norm_feed_forward2 = LayerNorm(d_model) - self.feed_forward2 = ConformerFeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout) - self.feed_forward2_scale = ScaleBiasLayer(d_model=d_model, adaptive_scale=adaptive_scale) - - self.dropout = nn.Dropout(dropout) - # self.norm_out = LayerNorm(d_model) - - # initialize parameters properly - self.reset_parameters() - - def forward(self, x, att_mask=None, pos_emb=None, pad_mask=None): - """ - Args: - x (torch.Tensor): input signals (B, T, d_model) - att_mask (torch.Tensor): attention masks(B, T, T) - pos_emb (torch.Tensor): (L, 1, d_model) - pad_mask (torch.tensor): padding mask - Returns: - x (torch.Tensor): (B, T, d_model) - """ - residual = x - - x = self.self_attn_scale(x) - if self.self_attention_model == 'rel_pos': - x = self.self_attn(query=x, key=x, value=x, mask=att_mask, pos_emb=pos_emb) - elif self.self_attention_model == 'abs_pos': - x = self.self_attn(query=x, key=x, value=x, mask=att_mask) - else: - x = None - x = residual + self.dropout(x) - x = self.norm_self_att(x) - residual = x - - if self.is_adapter_available(): - # Call the MHA adapters - pack_ip = { - 'x': residual, - 'loc': 'mha', - 'att_mask': att_mask, - 'pos_emb': pos_emb, - } - pack_ip = self.forward_enabled_adapters(pack_ip) - x = pack_ip['x'] - - x = self.feed_forward1_scale(x) - x = self.feed_forward1(x) - x = residual + self.dropout(x) * self.fc_factor - x = self.norm_feed_forward1(x) - residual = x - - x = self.conv_scale(x) - x = self.conv(x, pad_mask) - x = residual + self.dropout(x) - x = self.norm_conv(x) - residual = x - - x = self.feed_forward2_scale(x) - x = self.feed_forward2(x) - x = residual + self.dropout(x) * self.fc_factor - x = self.norm_feed_forward2(x) - - if self.is_adapter_available(): - # Call the adapters - pack_ip = { - 'x': x, - 'loc': 'post', - } - pack_ip = 
self.forward_enabled_adapters(pack_ip) - x = pack_ip['x'] - - if self.is_access_enabled() and self.access_cfg.get('save_encoder_tensors', False): - self.register_accessible_tensor(name='encoder', tensor=x) - - return x - - def forward_single_enabled_adapter_( - self, - input: dict, - adapter_module: torch.nn.Module, - *, - adapter_name: str, - adapter_strategy: 'nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy', - ): - """ - Perform the forward step of a single adapter module on some input data. - - **Note**: Subclasses can override this method to accommodate more complicate adapter forward steps. - - Args: - input: Dictionary of packed tensors. The dict should contain at least - `x`: output tensor - `loc`: Semantic location in module where this adapter was called - `att_mask`: Optional, Attention mask - `pos_emb`: Optional, Positional Embedding for Relative Positional Encoding. - The output tensor of the calling module is the input to the first adapter, whose output - is then chained to the next adapter until all adapters are consumed. - adapter_module: The adapter module that is currently required to perform the forward pass. - adapter_name: The resolved name of the adapter that is undergoing the current forward pass. - adapter_strategy: A subclass of `AbstractAdapterStrategy`, that determines how the - output of the adapter should be merged with the input, or if it should be merged at all. - - Returns: - The result tensor, after the current active adapter has finished its forward pass. - """ - # (input: torch.Tensor, adapter: torch.nn.Module, *, module: 'AdapterModuleMixin') - x = input['x'] - loc = input['loc'] - att_mask = input.get('att_mask', None) - pos_emb = input.get('pos_emb', None) - - if isinstance(adapter_module, adapter_modules.LinearAdapter) and loc == 'post': - output = adapter_strategy(x, adapter_module, module=self) - - elif isinstance(adapter_module, MultiHeadAttention) and loc == 'mha': - if self.self_attention_model == 'rel_pos': - x = dict(query=x, key=x, value=x, mask=att_mask, pos_emb=pos_emb) - output = adapter_strategy(x, adapter_module, module=self) - - elif self.self_attention_model == 'abs_pos': - x = dict(query=x, key=x, value=x, mask=att_mask) - output = adapter_strategy(x, adapter_module, module=self) - - else: - raise ValueError(f"Unsupported value of self_attention_model , provided {self.self_attention_model}!") - - else: - # No adapter compatible, skip - output = x - - input['x'] = output - - return input - - def reset_parameters(self): - # Used for Squeezeformer initialization only - self.feed_forward1.reset_parameters_ff() - self.feed_forward2.reset_parameters_ff() - self.conv.reset_parameters_conv() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ssl_quantizers.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ssl_quantizers.py deleted file mode 100644 index 944589e3f2cccde0841e9a9426ccf2bba13e799f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/ssl_quantizers.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
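The `ScaleBiasLayer` and the scale → sub-module → residual → post-LayerNorm ordering used by the removed `SqueezeformerLayer` can be sketched without any NeMo dependencies. The block below is an illustrative re-statement under assumed shapes; the `nn.Linear` stand-in sub-module is hypothetical and not the deleted attention/conv/feed-forward modules.

```python
# Standalone sketch of the ScaleBiasLayer idea from squeezeformer_modules.py above:
# a per-feature affine y = x * scale + bias in place of a pre-sub-module LayerNorm,
# followed by the residual-then-LayerNorm (post-norm) ordering of the Squeezeformer block.
import torch
from torch import nn


class ScaleBias(nn.Module):
    def __init__(self, d_model: int, adaptive: bool = True):
        super().__init__()
        scale, bias = torch.ones(d_model), torch.zeros(d_model)
        if adaptive:
            self.scale, self.bias = nn.Parameter(scale), nn.Parameter(bias)
        else:
            # fixed scale=1 / bias=0 keeps this a no-op, e.g. for export compatibility
            self.register_buffer("scale", scale)
            self.register_buffer("bias", bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (B, T, D)
        return x * self.scale.view(1, 1, -1) + self.bias.view(1, 1, -1)


if __name__ == "__main__":
    d_model = 8
    block = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU())  # hypothetical stand-in sub-module
    scale_bias, post_norm = ScaleBias(d_model), nn.LayerNorm(d_model)

    x = torch.randn(2, 10, d_model)
    y = post_norm(x + block(scale_bias(x)))  # scale -> sub-module -> residual -> LayerNorm
    print(y.shape)  # torch.Size([2, 10, 8])
```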
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn.functional as F -from torch import nn - -from nemo.collections.asr.parts.submodules.jasper import jasper_activations -from nemo.core import NeuralModule -from nemo.core.neural_types import EncodedRepresentation, LossType, NeuralType - - -class GumbelVectorQuantizer(NeuralModule): - def __init__( - self, - dim, - num_vars, - temp, - groups, - combine_groups, - vq_dim, - time_first, - activation="gelu", - weight_proj_depth=1, - weight_proj_factor=1, - ): - """Vector quantization using gumbel softmax - - Args: - dim: input dimension (channels) - num_vars: number of quantized vectors per group - temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor) - groups: number of groups for vector quantization - combine_groups: whether to use the vectors for all groups - vq_dim: dimensionality of the resulting quantized vector - time_first: if true, expect input in BxTxC format, otherwise in BxCxT - activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1 - weight_proj_depth: number of layers (with activation in between) to project input before computing logits - weight_proj_factor: this is used only if weight_proj_depth is > 1. 
scales the inner dimensionality of - projections by this factor - """ - super().__init__() - - self.groups = groups - self.combine_groups = combine_groups - self.input_dim = dim - self.num_vars = num_vars - self.time_first = time_first - - assert vq_dim % groups == 0, f"dim {vq_dim} must be divisible by groups {groups} for concatenation" - - var_dim = vq_dim // groups - num_groups = groups if not combine_groups else 1 - - self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim)) - nn.init.uniform_(self.vars) - - if weight_proj_depth > 1: - activation = jasper_activations["gelu"] - - def block(input_dim, output_dim): - return nn.Sequential(nn.Linear(input_dim, output_dim), activation) - - inner_dim = self.input_dim * weight_proj_factor - self.weight_proj = nn.Sequential( - *[block(self.input_dim if i == 0 else inner_dim, inner_dim) for i in range(weight_proj_depth - 1)], - nn.Linear(inner_dim, groups * num_vars), - ) - else: - self.weight_proj = nn.Linear(self.input_dim, groups * num_vars) - nn.init.normal_(self.weight_proj.weight, mean=0, std=1) - nn.init.zeros_(self.weight_proj.bias) - - assert len(temp) == 3, "Quantize temperature should be a tuple of 3 elements: (start, stop, decay factor)" - - self.max_temp, self.min_temp, self.temp_decay = temp - self.curr_temp = self.max_temp - self.codebook_indices = None - - def set_num_updates(self, num_updates): - self.curr_temp = max(self.max_temp * self.temp_decay ** num_updates, self.min_temp) - - def get_codebook_indices(self): - if self.codebook_indices is None: - from itertools import product - - p = [range(self.num_vars)] * self.groups - inds = list(product(*p)) - self.codebook_indices = torch.tensor(inds, dtype=torch.long, device=self.vars.device).flatten() - - if not self.combine_groups: - self.codebook_indices = self.codebook_indices.view(self.num_vars ** self.groups, -1) - for b in range(1, self.groups): - self.codebook_indices[:, b] += self.num_vars * b - self.codebook_indices = self.codebook_indices.flatten() - return self.codebook_indices - - def sample_from_codebook(self, b, n): - indices = self.get_codebook_indices() - indices = indices.view(-1, self.groups) - cb_size = indices.size(0) - assert n < cb_size, f"sample size {n} is greater than size of codebook {cb_size}" - sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,)) - indices = indices[sample_idx] - - z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1) - return z - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - if self.time_first: - return {"x": NeuralType(('B', 'T', 'D'), EncodedRepresentation())} - return {"x": NeuralType(('B', 'D', 'T'), EncodedRepresentation())} - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - if self.time_first: - return { - "x": NeuralType(('B', 'T', 'D'), EncodedRepresentation()), - "quantize_prob_ppl": NeuralType(elements_type=LossType()), - } - return { - "x": NeuralType(('B', 'D', 'T'), EncodedRepresentation()), - "quantize_prob_ppl": NeuralType(elements_type=LossType()), - } - - def forward(self, x, return_ids=False): - - if not self.time_first: - x = x.transpose(1, 2) - - bsz, tsz, fsz = x.shape - x = x.reshape(-1, fsz) - x = self.weight_proj(x) - x = x.view(bsz * tsz * self.groups, -1) - - _, k = x.max(-1) - hard_x = x.new_zeros(*x.shape).scatter_(-1, k.view(-1, 1), 1.0).view(bsz * tsz, self.groups, -1) - - # Calculate quantize prob perplexity - num_vars = self.num_vars * self.groups - avg_probs = torch.softmax(x.view(bsz * tsz, self.groups, -1).float(), dim=-1).mean(dim=0) - quantize_prob_ppl = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)).sum() - quantize_prob_ppl = (num_vars - quantize_prob_ppl) / num_vars - - if self.training: - x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x) - else: - x = hard_x - - x = x.view(bsz * tsz, -1) - - vars = self.vars - if self.combine_groups: - vars = vars.repeat(1, self.groups, 1) - - x = x.unsqueeze(-1) * vars - x = x.view(bsz * tsz, self.groups, self.num_vars, -1) - x = x.sum(-2) - x = x.view(bsz, tsz, -1) - - cur_codebook_temp = self.curr_temp - - if not self.time_first: - x = x.transpose(1, 2) # BTC -> BCT - - if return_ids: - hard_x_max = hard_x.argmax(-1).reshape(bsz, tsz, -1) - # BxTxG - - # create single id from multiple group ids - target_ids = hard_x.new_zeros(bsz, tsz).long() - - for i in range(self.groups): - target_ids *= self.num_vars - target_ids += hard_x_max[:, :, i] - - return x, quantize_prob_ppl, cur_codebook_temp, target_ids - else: - return x, quantize_prob_ppl, cur_codebook_temp diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/stateless_net.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/stateless_net.py deleted file mode 100644 index 7581fdc2834d7acee9ad5c2c008c948955fff945..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/stateless_net.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch - - -class StatelessNet(torch.nn.Module): - """ - Helper class used in transducer models with stateless decoders. This stateless - simply outputs embedding or concatenated embeddings for the input label[s], - depending on the configured context size. - - Args: - context_size: history context size for the stateless decoder network. Could be any positive integer. We recommend setting this as 2. - vocab_size: total vocabulary size. - emb_dim: total embedding size of the stateless net output. - blank_idx: index for the blank symbol for the transducer model. 
- normalization_mode: normalization run on the output embeddings. Could be either 'layer' or None. We recommend using 'layer' to stabilize training. - dropout: dropout rate on the embedding outputs. - """ - - def __init__(self, context_size, vocab_size, emb_dim, blank_idx, normalization_mode, dropout): - super().__init__() - assert context_size > 0 - self.context_size = context_size - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.dropout = torch.nn.Dropout(dropout) - self.norm = torch.nn.Identity() - if normalization_mode == 'layer': - self.norm = torch.nn.LayerNorm(emb_dim, elementwise_affine=False) - - embeds = [] - for i in range(self.context_size): - # We use different embedding matrices for different context positions. - # In this list, a smaller index means more recent history word. - # We assign more dimensions for the most recent word in the history. - # The detailed method is, we first allocate half the embedding-size - # to the most recent history word, and then allocate the remaining - # dimensions evenly among all history contexts. E.g. if total embedding - # size is 200, and context_size is 2, then we allocate 150 dimensions - # to the last word, and 50 dimensions to the second-to-last word. - if i != 0: - embed_size = emb_dim // 2 // self.context_size - else: - embed_size = emb_dim - (emb_dim // 2 // self.context_size) * (self.context_size - 1) - - embed = torch.nn.Embedding(vocab_size + 1, embed_size, padding_idx=blank_idx) - embeds.append(embed) - - self.embeds = torch.nn.ModuleList(embeds) - self.blank_idx = blank_idx - - def forward( - self, y: Optional[torch.Tensor] = None, state: Optional[List[torch.Tensor]] = None, - ): - """ - Although this is a *stateless* net, we use the "state" parameter to - pass in the previous labels, unlike LSTMs where state would represent - hidden activations of the network. - - Args: - y: a Integer tensor of shape B x U. - state: a list of 1 tensor in order to be consistent with the stateful - decoder interface, and the element is a tensor of shape [B x context-length]. - - Returns: - The return dimension of this function's output is B x U x D, with D being the total embedding dim. - """ - outs = [] - - [B, U] = y.shape - appended_y = y - if state != None: - appended_y = torch.concat([state[0], y], axis=1) - context_size = appended_y.shape[1] - - if context_size < self.context_size: - # This is the case at the beginning of an utterance where we have - # seen less words than context_size. In this case, we need to pad - # it to the right length. - padded_state = torch.ones([B, self.context_size], dtype=torch.long, device=y.device) * self.blank_idx - padded_state[:, self.context_size - context_size :] = appended_y - elif context_size == self.context_size + 1: - padded_state = appended_y[:, 1:] - # This is the case where the previous state already has reached context_size. - # We need to truncate the history by omitting the 0'th token. - else: - # Context has just the right size. Copy directly. - padded_state = appended_y - - for i in range(self.context_size): - out = self.embeds[i](padded_state[:, self.context_size - 1 - i : self.context_size - i]) - outs.append(out) - else: - for i in range(self.context_size): - out = self.embeds[i](y) - - if i != 0: - out[:, i:, :] = out[ - :, :-i, : - ].clone() # needs clone() here or it might complain about src and dst mem location have overlaps. 
- out[:, :i, :] *= 0.0 - outs.append(out) - - out = self.dropout(torch.concat(outs, axis=-1)) - out = self.norm(out) - - state = None - if y is not None: - state = [appended_y[:, appended_y.shape[1] - self.context_size + 1 :]] - return out, state diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/subsampling.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/subsampling.py deleted file mode 100644 index 068cd36022b0fca73dbdbab2924682e447c3a005..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/subsampling.py +++ /dev/null @@ -1,693 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import torch -import torch.nn as nn -from torch.nn import LayerNorm - -from nemo.collections.asr.parts.submodules.causal_convs import CausalConv1D, CausalConv2D -from nemo.utils import logging - - -class StackingSubsampling(torch.nn.Module): - """Stacking subsampling which simply stacks consecutive frames to reduce the sampling rate - Args: - subsampling_factor (int): The subsampling factor - feat_in (int): size of the input features - feat_out (int): size of the output features - norm (bool): whether to use an MLP layer after the stacking along with normalization. default is False. - """ - - def __init__(self, subsampling_factor, feat_in, feat_out, norm=False): - super(StackingSubsampling, self).__init__() - self.subsampling_factor = subsampling_factor - self.proj_out = torch.nn.Linear(subsampling_factor * feat_in, feat_out) - if norm: - self.pre_norm = LayerNorm(feat_in) - else: - self.pre_norm = None - - def get_sampling_frames(self): - return self.subsampling_factor - - def get_streaming_cache_size(self): - return 0 - - def forward(self, x, lengths): - b, t, h = x.size() - pad_size = (self.subsampling_factor - (t % self.subsampling_factor)) % self.subsampling_factor - x = torch.nn.functional.pad(x, (0, 0, 0, pad_size)) - if self.pre_norm is not None: - x = self.pre_norm(x) - _, t, _ = x.size() - x = torch.reshape(x, (b, t // self.subsampling_factor, h * self.subsampling_factor)) - x = self.proj_out(x) - lengths = torch.div(lengths + pad_size, self.subsampling_factor, rounding_mode='floor') - return x, lengths - - -class ConvSubsampling(torch.nn.Module): - """Convolutional subsampling which supports VGGNet and striding approach introduced in: - VGGNet Subsampling: Transformer-transducer: end-to-end speech recognition with self-attention (https://arxiv.org/pdf/1910.12977.pdf) - Striding Subsampling: "Speech-Transformer: A No-Recurrence Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong et al. 
(https://ieeexplore.ieee.org/document/8462506) - Args: - subsampling (str): The subsampling technique from {"vggnet", "striding", "dw-striding"} - subsampling_factor (int): The subsampling factor which should be a power of 2 - subsampling_conv_chunking_factor (int): Input chunking factor which can be -1 (no chunking) - 1 (auto) or a power of 2. Default is 1 - feat_in (int): size of the input features - feat_out (int): size of the output features - conv_channels (int): Number of channels for the convolution layers. - activation (Module): activation function, default is nn.ReLU() - """ - - def __init__( - self, - subsampling, - subsampling_factor, - feat_in, - feat_out, - conv_channels, - subsampling_conv_chunking_factor=1, - activation=nn.ReLU(), - is_causal=False, - ): - super(ConvSubsampling, self).__init__() - self._subsampling = subsampling - self._conv_channels = conv_channels - self._feat_in = feat_in - self._feat_out = feat_out - - if subsampling_factor % 2 != 0: - raise ValueError("Sampling factor should be a multiply of 2!") - self._sampling_num = int(math.log(subsampling_factor, 2)) - self.subsampling_factor = subsampling_factor - self.is_causal = is_causal - - if ( - subsampling_conv_chunking_factor != -1 - and subsampling_conv_chunking_factor != 1 - and subsampling_conv_chunking_factor % 2 != 0 - ): - raise ValueError("subsampling_conv_chunking_factor should be -1, 1, or a power of 2") - self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor - - in_channels = 1 - layers = [] - - if subsampling == 'vggnet': - self._stride = 2 - self._kernel_size = 2 - self._ceil_mode = True - - self._left_padding = 0 - self._right_padding = 0 - - for i in range(self._sampling_num): - layers.append( - torch.nn.Conv2d( - in_channels=in_channels, out_channels=conv_channels, kernel_size=3, stride=1, padding=1 - ) - ) - layers.append(activation) - layers.append( - torch.nn.Conv2d( - in_channels=conv_channels, out_channels=conv_channels, kernel_size=3, stride=1, padding=1 - ) - ) - layers.append(activation) - layers.append( - torch.nn.MaxPool2d( - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - ceil_mode=self._ceil_mode, - ) - ) - in_channels = conv_channels - - elif subsampling == 'dw_striding': - self._stride = 2 - self._kernel_size = 3 - self._ceil_mode = False - - if self.is_causal: - self._left_padding = self._kernel_size - 1 - self._right_padding = self._stride - 1 - self._max_cache_len = subsampling_factor + 1 - else: - self._left_padding = (self._kernel_size - 1) // 2 - self._right_padding = (self._kernel_size - 1) // 2 - self._max_cache_len = 0 - - # Layer 1 - if self.is_causal: - layers.append( - CausalConv2D( - in_channels=in_channels, - out_channels=conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=None, - ) - ) - else: - layers.append( - torch.nn.Conv2d( - in_channels=in_channels, - out_channels=conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - ) - ) - in_channels = conv_channels - layers.append(activation) - - for i in range(self._sampling_num - 1): - if self.is_causal: - layers.append( - CausalConv2D( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=None, - groups=in_channels, - ) - ) - else: - layers.append( - torch.nn.Conv2d( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - 
groups=in_channels, - ) - ) - - layers.append( - torch.nn.Conv2d( - in_channels=in_channels, - out_channels=conv_channels, - kernel_size=1, - stride=1, - padding=0, - groups=1, - ) - ) - layers.append(activation) - in_channels = conv_channels - - elif subsampling == 'striding': - self._stride = 2 - self._kernel_size = 3 - self._ceil_mode = False - - if self.is_causal: - self._left_padding = self._kernel_size - 1 - self._right_padding = self._stride - 1 - self._max_cache_len = subsampling_factor + 1 - else: - self._left_padding = (self._kernel_size - 1) // 2 - self._right_padding = (self._kernel_size - 1) // 2 - self._max_cache_len = 0 - - for i in range(self._sampling_num): - if self.is_causal: - layers.append( - CausalConv2D( - in_channels=in_channels, - out_channels=conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=None, - ) - ) - else: - layers.append( - torch.nn.Conv2d( - in_channels=in_channels, - out_channels=conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - ) - ) - layers.append(activation) - in_channels = conv_channels - - elif subsampling == 'striding_conv1d': - - in_channels = feat_in - - self._stride = 2 - self._kernel_size = 5 - self._ceil_mode = False - - if self.is_causal: - self._left_padding = self._kernel_size - 1 - self._right_padding = self._stride - 1 - self._max_cache_len = subsampling_factor + 1 - else: - self._left_padding = (self._kernel_size - 1) // 2 - self._right_padding = (self._kernel_size - 1) // 2 - self._max_cache_len = 0 - - for i in range(self._sampling_num): - if self.is_causal: - layers.append( - CausalConv1D( - in_channels=in_channels, - out_channels=feat_out if self._sampling_num == i + 1 else conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=None, - ) - ) - else: - layers.append( - torch.nn.Conv1d( - in_channels=in_channels, - out_channels=feat_out if self._sampling_num == i + 1 else conv_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - ) - ) - layers.append(activation) - in_channels = conv_channels - - elif subsampling == 'dw_striding_conv1d': - - in_channels = feat_in - - self._stride = 2 - self._kernel_size = 5 - self._ceil_mode = False - - self._left_padding = (self._kernel_size - 1) // 2 - self._right_padding = (self._kernel_size - 1) // 2 - - # Layer 1 - layers.extend( - [ - torch.nn.Conv1d( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - groups=in_channels, - ), - torch.nn.Conv1d( - in_channels=in_channels, - out_channels=feat_out if self._sampling_num == 1 else conv_channels, - kernel_size=1, - stride=1, - padding=0, - groups=1, - ), - ] - ) - in_channels = conv_channels - layers.append(activation) - - for i in range(self._sampling_num - 1): - layers.extend( - [ - torch.nn.Conv1d( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=self._kernel_size, - stride=self._stride, - padding=self._left_padding, - groups=in_channels, - ), - torch.nn.Conv1d( - in_channels=in_channels, - out_channels=feat_out if self._sampling_num == i + 2 else conv_channels, - kernel_size=1, - stride=1, - padding=0, - groups=1, - ), - ] - ) - layers.append(activation) - in_channels = conv_channels - - else: - raise ValueError(f"Not valid sub-sampling: {subsampling}!") - - if subsampling in ["vggnet", "dw_striding", "striding"]: - - in_length = torch.tensor(feat_in, dtype=torch.float) - out_length = 
calc_length( - lengths=in_length, - all_paddings=self._left_padding + self._right_padding, - kernel_size=self._kernel_size, - stride=self._stride, - ceil_mode=self._ceil_mode, - repeat_num=self._sampling_num, - ) - self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out) - self.conv2d_subsampling = True - elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]: - self.out = None - self.conv2d_subsampling = False - else: - raise ValueError(f"Not valid sub-sampling: {subsampling}!") - - self.conv = torch.nn.Sequential(*layers) - - def get_sampling_frames(self): - return [1, self.subsampling_factor] - - def get_streaming_cache_size(self): - return [0, self.subsampling_factor + 1] - - def forward(self, x, lengths): - lengths = calc_length( - lengths, - all_paddings=self._left_padding + self._right_padding, - kernel_size=self._kernel_size, - stride=self._stride, - ceil_mode=self._ceil_mode, - repeat_num=self._sampling_num, - ) - - # Unsqueeze Channel Axis - if self.conv2d_subsampling: - x = x.unsqueeze(1) - # Transpose to Channel First mode - else: - x = x.transpose(1, 2) - - # split inputs if chunking_factor is set - if self.subsampling_conv_chunking_factor != -1 and self.conv2d_subsampling: - if self.subsampling_conv_chunking_factor == 1: - # if subsampling_conv_chunking_factor is 1, we split only if needed - # avoiding a bug / feature limiting indexing of tensors to 2**31 - # see https://github.com/pytorch/pytorch/issues/80020 - x_ceil = 2 ** 31 / self._conv_channels * self._stride * self._stride - if torch.numel(x) > x_ceil: - need_to_split = True - else: - need_to_split = False - else: - # if subsampling_conv_chunking_factor > 1 we always split - need_to_split = True - - if need_to_split: - x, success = self.conv_split_by_batch(x) - if not success: # if unable to split by batch, try by channel - if self._subsampling == 'dw_striding': - x = self.conv_split_by_channel(x) - else: - x = self.conv(x) # try anyway - else: - x = self.conv(x) - else: - x = self.conv(x) - - # Flatten Channel and Frequency Axes - if self.conv2d_subsampling: - b, c, t, f = x.size() - x = self.out(x.transpose(1, 2).reshape(b, t, -1)) - # Transpose to Channel Last mode - else: - x = x.transpose(1, 2) - - return x, lengths - - def reset_parameters(self): - # initialize weights - if self._subsampling == 'dw_striding': - with torch.no_grad(): - # init conv - scale = 1.0 / self._kernel_size - dw_max = (self._kernel_size ** 2) ** -0.5 - pw_max = self._conv_channels ** -0.5 - - torch.nn.init.uniform_(self.conv[0].weight, -scale, scale) - torch.nn.init.uniform_(self.conv[0].bias, -scale, scale) - - for idx in range(2, len(self.conv), 3): - torch.nn.init.uniform_(self.conv[idx].weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.conv[idx].bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.conv[idx + 1].weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.conv[idx + 1].bias, -pw_max, pw_max) - - # init fc (80 * 64 = 5120 from https://github.com/kssteven418/Squeezeformer/blob/13c97d6cf92f2844d2cb3142b4c5bfa9ad1a8951/src/models/conformer_encoder.py#L487 - fc_scale = (self._feat_out * self._feat_in / self._sampling_num) ** -0.5 - torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale) - torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale) - - def conv_split_by_batch(self, x): - """ Tries to split input by batch, run conv and concat results """ - b, _, _, _ = x.size() - if b == 1: # can't split if batch size is 1 - return x, False - - if self.subsampling_conv_chunking_factor > 1: - cf = 
self.subsampling_conv_chunking_factor - logging.debug(f'using manually set chunking factor: {cf}') - else: - # avoiding a bug / feature limiting indexing of tensors to 2**31 - # see https://github.com/pytorch/pytorch/issues/80020 - x_ceil = 2 ** 31 / self._conv_channels * self._stride * self._stride - p = math.ceil(math.log(torch.numel(x) / x_ceil, 2)) - cf = 2 ** p - logging.debug(f'using auto set chunking factor: {cf}') - - new_batch_size = b // cf - if new_batch_size == 0: # input is too big - return x, False - - logging.debug(f'conv subsampling: using split batch size {new_batch_size}') - return torch.cat([self.conv(chunk) for chunk in torch.split(x, new_batch_size, 0)]), True - - def conv_split_by_channel(self, x): - """ For dw convs, tries to split input by time, run conv and concat results """ - x = self.conv[0](x) # full conv2D - x = self.conv[1](x) # activation - - for i in range(self._sampling_num - 1): - _, c, t, _ = x.size() - - if self.subsampling_conv_chunking_factor > 1: - cf = self.subsampling_conv_chunking_factor - logging.debug(f'using manually set chunking factor: {cf}') - else: - # avoiding a bug / feature limiting indexing of tensors to 2**31 - # see https://github.com/pytorch/pytorch/issues/80020 - p = math.ceil(math.log(torch.numel(x) / 2 ** 31, 2)) - cf = 2 ** p - logging.debug(f'using auto set chunking factor: {cf}') - - new_c = int(c // cf) - if new_c == 0: - logging.warning(f'chunking factor {cf} is too high; splitting down to one channel.') - new_c = 1 - - new_t = int(t // cf) - if new_t == 0: - logging.warning(f'chunking factor {cf} is too high; splitting down to one timestep.') - new_t = 1 - - logging.debug(f'conv dw subsampling: using split C size {new_c} and split T size {new_t}') - x = self.channel_chunked_conv(self.conv[i * 3 + 2], new_c, x) # conv2D, depthwise - - # splitting pointwise convs by time - x = torch.cat([self.conv[i * 3 + 3](chunk) for chunk in torch.split(x, new_t, 2)], 2) # conv2D, pointwise - x = self.conv[i * 3 + 4](x) # activation - return x - - def channel_chunked_conv(self, conv, chunk_size, x): - """ Performs channel chunked convolution""" - - ind = 0 - out_chunks = [] - for chunk in torch.split(x, chunk_size, 1): - step = chunk.size()[1] - - if self.is_causal: - chunk = nn.functional.pad( - chunk, pad=(self._kernel_size - 1, self._stride - 1, self._kernel_size - 1, self._stride - 1) - ) - ch_out = nn.functional.conv2d( - chunk, - conv.weight[ind : ind + step, :, :, :], - bias=conv.bias[ind : ind + step], - stride=self._stride, - padding=0, - groups=step, - ) - else: - ch_out = nn.functional.conv2d( - chunk, - conv.weight[ind : ind + step, :, :, :], - bias=conv.bias[ind : ind + step], - stride=self._stride, - padding=self._left_padding, - groups=step, - ) - out_chunks.append(ch_out) - ind += step - - return torch.cat(out_chunks, 1) - - def change_subsampling_conv_chunking_factor(self, subsampling_conv_chunking_factor: int): - if ( - subsampling_conv_chunking_factor != -1 - and subsampling_conv_chunking_factor != 1 - and subsampling_conv_chunking_factor % 2 != 0 - ): - raise ValueError("subsampling_conv_chunking_factor should be -1, 1, or a power of 2") - self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor - - -def calc_length(lengths, all_paddings, kernel_size, stride, ceil_mode, repeat_num=1): - """ Calculates the output length of a Tensor passed through a convolution or max pooling layer""" - add_pad: float = all_paddings - kernel_size - one: float = 1.0 - for i in range(repeat_num): - lengths = 
torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one - if ceil_mode: - lengths = torch.ceil(lengths) - else: - lengths = torch.floor(lengths) - return lengths.to(dtype=torch.int) - - -class TimeReductionModule(nn.Module): - """ - Squeezeformer Time Reduction procedure. Downsamples the audio by `stride` in the time dimension. - - Args: - d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward - out_dim (int): Output dimension of the module. - kernel_size (int): Conv kernel size for depthwise convolution in convolution module - stride (int): Downsampling factor in time dimension. - """ - - def __init__(self, d_model: int, out_dim: int, kernel_size: int = 5, stride: int = 2): - super().__init__() - - self.d_model = d_model - self.out_dim = out_dim - self.kernel_size = kernel_size - self.stride = stride - self.padding = max(0, self.kernel_size - self.stride) - - self.dw_conv = nn.Conv1d( - in_channels=d_model, - out_channels=d_model, - kernel_size=kernel_size, - stride=stride, - padding=self.padding, - groups=d_model, - ) - - self.pw_conv = nn.Conv1d( - in_channels=d_model, out_channels=out_dim, kernel_size=1, stride=1, padding=0, groups=1, - ) - - self.reset_parameters() - - def forward(self, x, att_mask=None, pad_mask=None): - x = x.transpose(1, 2) # [B, C, T] - if pad_mask is not None: - x = x.float().masked_fill(pad_mask.unsqueeze(1), 0.0) - - x = self.dw_conv(x) - x = self.pw_conv(x) - - x = x.transpose(1, 2) # [B, T, C] - - B, T, D = x.size() - if att_mask is not None and pad_mask is not None: - att_mask = att_mask[:, :: self.stride, :: self.stride] - pad_mask = pad_mask[:, :: self.stride] - L = pad_mask.size(-1) - x = torch.nn.functional.pad(x, (0, 0, 0, L - T)) - - return x, att_mask, pad_mask - - def reset_parameters(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.d_model ** -0.5 - - with torch.no_grad(): - torch.nn.init.uniform_(self.dw_conv.weight, -dw_max, dw_max) - torch.nn.init.uniform_(self.dw_conv.bias, -dw_max, dw_max) - torch.nn.init.uniform_(self.pw_conv.weight, -pw_max, pw_max) - torch.nn.init.uniform_(self.pw_conv.bias, -pw_max, pw_max) - - -class SubsamplingReductionModule(nn.Module): - """Downsamples the audio signal in time dimension.""" - - def __init__(self, reduction: str, d_model: int, reduction_factor: int = 2): - super().__init__() - - assert reduction in ['pooling', 'striding'] - - self.reduction = reduction - self.d_model = d_model - self._sampling_num = int(math.log(reduction_factor, 2)) - - if reduction == 'pooling': - self.reduction_enc = nn.MaxPool1d(kernel_size=reduction_factor) - self.padding = 0 - self.kernel_size = self.reduction_enc.kernel_size - self.stride = self.reduction_enc.stride - elif reduction == 'striding': - self.reduction_enc = ConvSubsampling( - subsampling='striding', - subsampling_factor=reduction_factor, - feat_in=d_model, - feat_out=d_model, - conv_channels=d_model, - activation=nn.ReLU(), - is_causal=False, - ) - - def forward(self, x, lengths): - """Shapes: - - x: [B, T, C] - - lengths: [B] - """ - - if self.reduction == 'striding': - x, lengths = self.reduction_enc(x=x, lengths=lengths) - else: - x = torch.transpose(x, 1, 2) # [B, C, T] - lengths = calc_length( - lengths=lengths, - all_paddings=self.padding, - kernel_size=self.kernel_size, - stride=self.stride, - ceil_mode=False, - repeat_num=self._sampling_num, - ) - x = self.reduction_enc(x) - x = torch.transpose(x, 1, 2) # [B, T, C] - - return x, lengths diff --git 
a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/tdnn_attention.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/tdnn_attention.py deleted file mode 100644 index 14f27ef41af7ddc35271b7f6cc8fca57c8d831c7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/submodules/tdnn_attention.py +++ /dev/null @@ -1,324 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from typing import List - -import torch -from numpy import inf -from torch import nn as nn -from torch.nn import functional as F - -from nemo.collections.asr.parts.submodules.jasper import get_same_padding, init_weights - - -class StatsPoolLayer(nn.Module): - """Statistics and time average pooling (TAP) layer - - This computes mean and, optionally, standard deviation statistics across the time dimension. - - Args: - feat_in: Input features with shape [B, D, T] - pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time - average pooling, i.e., mean) - eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode. - biased: Whether to use the biased estimator for the standard deviation when using 'xvector' mode. The default - for torch.Tensor.std() is True. - - Returns: - Pooled statistics with shape [B, D]. - - Raises: - ValueError if an unsupported pooling mode is specified. 
- """ - - def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, biased: bool = True): - super().__init__() - supported_modes = {"xvector", "tap"} - if pool_mode not in supported_modes: - raise ValueError(f"Pool mode must be one of {supported_modes}; got '{pool_mode}'") - self.pool_mode = pool_mode - self.feat_in = feat_in - self.eps = eps - self.biased = biased - if self.pool_mode == 'xvector': - # Mean + std - self.feat_in *= 2 - - def forward(self, encoder_output, length=None): - if length is None: - mean = encoder_output.mean(dim=-1) # Time Axis - if self.pool_mode == 'xvector': - std = encoder_output.std(dim=-1) - pooled = torch.cat([mean, std], dim=-1) - else: - pooled = mean - else: - mask = make_seq_mask_like(like=encoder_output, lengths=length, valid_ones=False) - encoder_output = encoder_output.masked_fill(mask, 0.0) - # [B, D, T] -> [B, D] - means = encoder_output.mean(dim=-1) - # Re-scale to get padded means - means = means * (encoder_output.shape[-1] / length).unsqueeze(-1) - if self.pool_mode == "xvector": - stds = ( - encoder_output.sub(means.unsqueeze(-1)) - .masked_fill(mask, 0.0) - .pow(2.0) - .sum(-1) # [B, D, T] -> [B, D] - .div(length.view(-1, 1).sub(1 if self.biased else 0)) - .clamp(min=self.eps) - .sqrt() - ) - pooled = torch.cat((means, stds), dim=-1) - else: - pooled = means - return pooled - - -@torch.jit.script_if_tracing -def make_seq_mask_like( - like: torch.Tensor, lengths: torch.Tensor, valid_ones: bool = True, time_dim: int = -1 -) -> torch.Tensor: - mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.unsqueeze(-1)) - # Match number of dims in `like` tensor - for _ in range(like.dim() - mask.dim()): - mask = mask.unsqueeze(1) - # If time dim != -1, transpose to proper dim. 
- if time_dim != -1: - mask = mask.transpose(time_dim, -1) - if not valid_ones: - mask = ~mask - return mask - - -def lens_to_mask(lens: List[int], max_len: int, device: str = None): - """ - outputs masking labels for list of lengths of audio features, with max length of any - mask as max_len - input: - lens: list of lens - max_len: max length of any audio feature - output: - mask: masked labels - num_values: sum of mask values for each feature (useful for computing statistics later) - """ - lens_mat = torch.arange(max_len).to(device) - mask = lens_mat[:max_len].unsqueeze(0) < lens.unsqueeze(1) - mask = mask.unsqueeze(1) - num_values = torch.sum(mask, dim=2, keepdim=True) - return mask, num_values - - -def get_statistics_with_mask(x: torch.Tensor, m: torch.Tensor, dim: int = 2, eps: float = 1e-10): - """ - compute mean and standard deviation of input(x) provided with its masking labels (m) - input: - x: feature input - m: averaged mask labels - output: - mean: mean of input features - std: stadard deviation of input features - """ - mean = torch.sum((m * x), dim=dim) - std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)) - return mean, std - - -class TDNNModule(nn.Module): - """ - Time Delayed Neural Module (TDNN) - 1D - input: - inp_filters: input filter channels for conv layer - out_filters: output filter channels for conv layer - kernel_size: kernel weight size for conv layer - dilation: dilation for conv layer - stride: stride for conv layer - padding: padding for conv layer (default None: chooses padding value such that input and output feature shape matches) - output: - tdnn layer output - """ - - def __init__( - self, - inp_filters: int, - out_filters: int, - kernel_size: int = 1, - dilation: int = 1, - stride: int = 1, - padding: int = None, - ): - super().__init__() - if padding is None: - padding = get_same_padding(kernel_size, stride=stride, dilation=dilation) - - self.conv_layer = nn.Conv1d( - in_channels=inp_filters, - out_channels=out_filters, - kernel_size=kernel_size, - dilation=dilation, - padding=padding, - ) - - self.activation = nn.ReLU() - self.bn = nn.BatchNorm1d(out_filters) - - def forward(self, x, length=None): - x = self.conv_layer(x) - x = self.activation(x) - return self.bn(x) - - -class MaskedSEModule(nn.Module): - """ - Squeeze and Excite module implementation with conv1d layers - input: - inp_filters: input filter channel size - se_filters: intermediate squeeze and excite channel output and input size - out_filters: output filter channel size - kernel_size: kernel_size for both conv1d layers - dilation: dilation size for both conv1d layers - - output: - squeeze and excite layer output - """ - - def __init__(self, inp_filters: int, se_filters: int, out_filters: int, kernel_size: int = 1, dilation: int = 1): - super().__init__() - self.se_layer = nn.Sequential( - nn.Conv1d(inp_filters, se_filters, kernel_size=kernel_size, dilation=dilation,), - nn.ReLU(), - nn.BatchNorm1d(se_filters), - nn.Conv1d(se_filters, out_filters, kernel_size=kernel_size, dilation=dilation,), - nn.Sigmoid(), - ) - - def forward(self, input, length=None): - if length is None: - x = torch.mean(input, dim=2, keep_dim=True) - else: - max_len = input.size(2) - mask, num_values = lens_to_mask(length, max_len=max_len, device=input.device) - x = torch.sum((input * mask), dim=2, keepdim=True) / (num_values) - - out = self.se_layer(x) - return out * input - - -class TDNNSEModule(nn.Module): - """ - Modified building SE_TDNN group module block from ECAPA implementation for 
faster training and inference - Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf) - inputs: - inp_filters: input filter channel size - out_filters: output filter channel size - group_scale: scale value to group wider conv channels (deafult:8) - se_channels: squeeze and excite output channel size (deafult: 1024/8= 128) - kernel_size: kernel_size for group conv1d layers (default: 1) - dilation: dilation size for group conv1d layers (default: 1) - """ - - def __init__( - self, - inp_filters: int, - out_filters: int, - group_scale: int = 8, - se_channels: int = 128, - kernel_size: int = 1, - dilation: int = 1, - init_mode: str = 'xavier_uniform', - ): - super().__init__() - self.out_filters = out_filters - padding_val = get_same_padding(kernel_size=kernel_size, dilation=dilation, stride=1) - - group_conv = nn.Conv1d( - out_filters, - out_filters, - kernel_size=kernel_size, - dilation=dilation, - padding=padding_val, - groups=group_scale, - ) - self.group_tdnn_block = nn.Sequential( - TDNNModule(inp_filters, out_filters, kernel_size=1, dilation=1), - group_conv, - nn.ReLU(), - nn.BatchNorm1d(out_filters), - TDNNModule(out_filters, out_filters, kernel_size=1, dilation=1), - ) - - self.se_layer = MaskedSEModule(out_filters, se_channels, out_filters) - - self.apply(lambda x: init_weights(x, mode=init_mode)) - - def forward(self, input, length=None): - x = self.group_tdnn_block(input) - x = self.se_layer(x, length) - return x + input - - -class AttentivePoolLayer(nn.Module): - """ - Attention pooling layer for pooling speaker embeddings - Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf) - inputs: - inp_filters: input feature channel length from encoder - attention_channels: intermediate attention channel size - kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1) - dilation: dilation size for TDNN and attention conv1d layers (default: 1) - """ - - def __init__( - self, - inp_filters: int, - attention_channels: int = 128, - kernel_size: int = 1, - dilation: int = 1, - eps: float = 1e-10, - ): - super().__init__() - - self.feat_in = 2 * inp_filters - - self.attention_layer = nn.Sequential( - TDNNModule(inp_filters * 3, attention_channels, kernel_size=kernel_size, dilation=dilation), - nn.Tanh(), - nn.Conv1d( - in_channels=attention_channels, out_channels=inp_filters, kernel_size=kernel_size, dilation=dilation, - ), - ) - self.eps = eps - - def forward(self, x, length=None): - max_len = x.size(2) - - if length is None: - length = torch.ones(x.shape[0], device=x.device) - - mask, num_values = lens_to_mask(length, max_len=max_len, device=x.device) - - # encoder statistics - mean, std = get_statistics_with_mask(x, mask / num_values) - mean = mean.unsqueeze(2).repeat(1, 1, max_len) - std = std.unsqueeze(2).repeat(1, 1, max_len) - attn = torch.cat([x, mean, std], dim=1) - - # attention statistics - attn = self.attention_layer(attn) # attention pass - attn = attn.masked_fill(mask == 0, -inf) - alpha = F.softmax(attn, dim=2) # attention values, α - mu, sg = get_statistics_with_mask(x, alpha) # µ and ∑ - - # gather - return torch.cat((mu, sg), dim=1).unsqueeze(2) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__init__.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__init__.py deleted file mode 100644 index bc443be41c4c3fcf0764eb9fd3e1828d63f8438c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__init__.py +++ 
/dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 930efff8cdb15ddc5512d56b933b089fd33c695b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/activations.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/activations.cpython-310.pyc deleted file mode 100644 index 12d8df7719bcc4b0b7b99ae6006c8bf1149259f7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/activations.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/adapter_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/adapter_utils.cpython-310.pyc deleted file mode 100644 index e5adcb6d356817c77282e2902fb8427c7867204b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/adapter_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_confidence_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_confidence_utils.cpython-310.pyc deleted file mode 100644 index e97e934caba20b62b5d344af0082140c07195530..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_confidence_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_module_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_module_utils.cpython-310.pyc deleted file mode 100644 index 040449cbf76c3564cc0f9b887d6d9b093b91cc66..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/asr_module_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/audio_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/audio_utils.cpython-310.pyc deleted file mode 100644 index 527178d7db93cb7263875ce84d9de2dc3b5f218e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/audio_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/numba_utils.cpython-310.pyc 
b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/numba_utils.cpython-310.pyc deleted file mode 100644 index 0cb01646ae3287f10624aaef3801150e9f16e019..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/numba_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/offline_clustering.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/offline_clustering.cpython-310.pyc deleted file mode 100644 index 3eb5e8859e2c4d9ca3ff5090db54cd76487d8aaf..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/offline_clustering.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/optimization_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/optimization_utils.cpython-310.pyc deleted file mode 100644 index 0ca494b08463a5210f6ef485457c979293ba8619..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/optimization_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/regularization_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/regularization_utils.cpython-310.pyc deleted file mode 100644 index 20b3a4f8b5efbdf93beddf43dbf1d3f62aa1b7b4..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/regularization_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/rnnt_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/rnnt_utils.cpython-310.pyc deleted file mode 100644 index 6222d5b656770e39febb0dfd887c6b78c09e5b95..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/rnnt_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/slu_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/slu_utils.cpython-310.pyc deleted file mode 100644 index 108ff964f021de8f7473a1979aa84c90e43184c3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/slu_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/speaker_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/speaker_utils.cpython-310.pyc deleted file mode 100644 index 2588fda4351ab5e8ac06ab6da5b4b515e5b52a0e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/speaker_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/vad_utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/vad_utils.cpython-310.pyc deleted file mode 100644 index 7d5a03e480f264b6daccb11bc373afc27dc95d22..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/__pycache__/vad_utils.cpython-310.pyc and /dev/null differ 
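For context on the speaker-embedding pooling removed in `tdnn_attention.py` above, here is a minimal plain-PyTorch sketch of the masked "xvector" statistics pooling performed by `StatsPoolLayer`: a per-utterance mean and standard deviation over the valid time frames of a `(B, D, T)` encoder output. The helper name and test shapes are illustrative assumptions, not NeMo API.

```python
# Minimal sketch of masked mean/std ("xvector") pooling over the time axis,
# mirroring StatsPoolLayer in the removed tdnn_attention.py. Illustrative only.
import torch


def stats_pool(x: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    b, d, t = x.shape
    # (B, 1, T) mask that is True on valid (non-padded) frames
    mask = torch.arange(t, device=x.device).unsqueeze(0) < lengths.unsqueeze(1)
    mask = mask.unsqueeze(1)
    num_valid = lengths.view(b, 1).to(x.dtype)

    x = x.masked_fill(~mask, 0.0)
    mean = x.sum(dim=-1) / num_valid
    # unbiased variance over valid frames only
    var = ((x - mean.unsqueeze(-1)).masked_fill(~mask, 0.0) ** 2).sum(dim=-1) / (num_valid - 1)
    std = var.clamp(min=eps).sqrt()
    return torch.cat([mean, std], dim=-1)  # (B, 2*D)


if __name__ == "__main__":
    feats = torch.randn(3, 192, 120)
    lens = torch.tensor([120, 90, 60])
    print(stats_pool(feats, lens).shape)  # torch.Size([3, 384])
```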
diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/activations.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/activations.py deleted file mode 100644 index c4ba911e2e11329afd63d94bfe77377dc337591e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/activations.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn - -__all__ = ['Swish', 'Snake'] - - -@torch.jit.script -def snake(x: torch.Tensor, alpha: torch.Tensor, eps: float = 1e-9) -> torch.Tensor: - """ - equation for snake activation function: x + (alpha + eps)^-1 * sin(alpha * x)^2 - """ - shape = x.shape - x = x.reshape(shape[0], shape[1], -1) - x = x + (alpha + eps).reciprocal() * torch.sin(alpha * x).pow(2) - x = x.reshape(shape) - return x - - -class Snake(nn.Module): - """ - Snake activation function introduced in 'https://arxiv.org/abs/2006.08195' - """ - - def __init__(self, channels: int): - super().__init__() - self.alpha = nn.Parameter(torch.ones(1, channels, 1)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return snake(x, self.alpha) - - -class Swish(nn.SiLU): - """ - Swish activation function introduced in 'https://arxiv.org/abs/1710.05941' - Mathematically identical to SiLU. See note in nn.SiLU for references. - """ diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/adapter_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/adapter_utils.py deleted file mode 100644 index 5b74a296419a94b97c28a357b22643a427129ddd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/adapter_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
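> Editor's note: for reference, the snake activation deleted above reduces to the element-wise formula x + (alpha + eps)^-1 * sin(alpha * x)^2, with one learnable alpha per channel. Below is a minimal standalone sketch of that formula; shapes and values are illustrative only, and the reshape bookkeeping of the original is omitted since plain broadcasting suffices here.

```python
import torch

def snake(x: torch.Tensor, alpha: torch.Tensor, eps: float = 1e-9) -> torch.Tensor:
    # x + (alpha + eps)^-1 * sin(alpha * x)^2, with alpha broadcast over the channel dim
    return x + (alpha + eps).reciprocal() * torch.sin(alpha * x).pow(2)

x = torch.randn(2, 4, 16)      # (batch, channels, time) -- illustrative shape
alpha = torch.ones(1, 4, 1)    # one slope per channel, as in the deleted Snake module
print(snake(x, alpha).shape)   # torch.Size([2, 4, 16])
```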
- -from dataclasses import is_dataclass - -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.utils import logging - -# Constants -LINEAR_ADAPTER_CLASSPATH = "nemo.collections.common.parts.adapter_modules.LinearAdapter" -MHA_ADAPTER_CLASSPATH = ( - "nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MultiHeadAttentionAdapter" -) -RELMHA_ADAPTER_CLASSPATH = "nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionMultiHeadAttentionAdapter" -POS_ENCODING_ADAPTER_CLASSPATH = ( - "nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.PositionalEncodingAdapter" -) -REL_POS_ENCODING_ADAPTER_CLASSPATH = ( - "nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionalEncodingAdapter" -) - - -def convert_adapter_cfg_to_dict_config(cfg: DictConfig): - # Convert to DictConfig from dict or Dataclass - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - if not isinstance(cfg, DictConfig): - cfg = DictConfig(cfg) - - return cfg - - -def update_adapter_cfg_input_dim(module: torch.nn.Module, cfg: DictConfig, *, module_dim: int): - """ - Update the input dimension of the provided adapter config with some default value. - - Args: - module: The module that implements AdapterModuleMixin. - cfg: A DictConfig or a Dataclass representing the adapter config. - module_dim: A default module dimension, used if cfg has an incorrect input dimension. - - Returns: - A DictConfig representing the adapter's config. - """ - cfg = convert_adapter_cfg_to_dict_config(cfg) - - input_dim_valid_keys = ['in_features', 'n_feat'] - input_key = None - - for key in input_dim_valid_keys: - if key in cfg: - input_key = key - break - - if input_key is None: - raise ValueError( - f"Failed to infer the input dimension of the Adapter cfg. \nExpected one of : {input_dim_valid_keys}.\n" - f"Provided config : \n" - f"{OmegaConf.to_yaml(cfg)}" - ) - - input_dim = cfg[input_key] - - if input_dim != module_dim: - logging.info(f"Updating {module.__class__.__name__} Adapter input dim from {input_dim} to {module_dim}") - input_dim = module_dim - - cfg[input_key] = input_dim - return cfg diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py deleted file mode 100644 index 0e057e012542a939b945e22f0941fba1a6b20d67..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
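> Editor's note: the `update_adapter_cfg_input_dim` helper deleted above only probes the adapter config for an input-dimension key (`in_features` or `n_feat`) and forces it to match the host module's dimension. Below is a dependency-free sketch of that logic, with the NeMo logger replaced by a plain print; `override_adapter_input_dim` is an illustrative name, not a NeMo API.

```python
from omegaconf import DictConfig, OmegaConf

INPUT_DIM_KEYS = ("in_features", "n_feat")  # keys probed by the deleted helper

def override_adapter_input_dim(cfg: DictConfig, module_dim: int) -> DictConfig:
    # Find whichever input-dimension key the adapter config uses and force it
    # to the dimension of the module the adapter is attached to.
    for key in INPUT_DIM_KEYS:
        if key in cfg:
            if cfg[key] != module_dim:
                print(f"overriding {key}: {cfg[key]} -> {module_dim}")
            cfg[key] = module_dim
            return cfg
    raise ValueError(f"Expected one of {INPUT_DIM_KEYS} in the adapter config")

cfg = OmegaConf.create({"in_features": 256, "dim": 32})
cfg = override_adapter_input_dim(cfg, module_dim=512)  # in_features becomes 512
```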
- -import contextlib -import copy -import os -from pathlib import Path -from typing import List, Optional, Tuple, Union - -import numpy as np -import texterrors -import torch -from omegaconf import open_dict - -from nemo.collections.asr.models import ASRModel, EncDecRNNTModel -from nemo.collections.asr.parts.utils.confidence_metrics import ( - auc_nt, - auc_pr, - auc_roc, - auc_yc, - ece, - nce, - save_confidence_hist, - save_custom_confidence_curve, - save_nt_curve, - save_pr_curve, - save_roc_curve, -) -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis - - -def get_correct_marks(r: Union[List[int], List[str]], h: Union[List[int], List[str]]) -> List[bool]: - """Get correct marks by aligning the reference text with a hypothesis. - - This method considers only insertions and substitutions as incorrect marks. - """ - return [ - a == b - for a, b in zip(*(texterrors.align_texts([str(rr) for rr in r], [str(hh) for hh in h], False)[:-1])) - if b != "" - ] - - -def get_token_targets_with_confidence(hyp: Hypothesis) -> List[Tuple[str, float]]: - return [(y, c) for y, c in zip(hyp.y_sequence, hyp.token_confidence)] - - -def get_word_targets_with_confidence(hyp: Hypothesis) -> List[Tuple[str, float]]: - return [(y, c) for y, c in zip(hyp.words, hyp.word_confidence)] - - -def run_confidence_benchmark( - model: ASRModel, - target_level: str, - filepaths: List[str], - reference_texts: List[str], - batch_size: int = 8, - num_workers: int = 4, - plot_dir: Optional[Union[str, Path]] = None, - autocast: Optional = None, -): - """Run benchmark and plot histograms and curves, if plot_dir is provided. - - Returns: - Dictionary with benchmark results of the following scheme: - `level: (auc_roc, auc_pr, auc_nt, nce, ece, auc_yc, std_yc, max_yc)` with `level` being 'token' or 'word'. 
- """ - draw_plot = plot_dir is not None - if isinstance(plot_dir, str): - plot_dir = Path(plot_dir) - is_rnnt = isinstance(model, EncDecRNNTModel) - - # setup autocast if necessary - if autocast is None: - - @contextlib.contextmanager - def autocast(): - yield - - # transcribe audio - with autocast(): - with torch.no_grad(): - transcriptions = model.transcribe( - paths2audio_files=filepaths, batch_size=batch_size, return_hypotheses=True, num_workers=num_workers - ) - if is_rnnt: - transcriptions = transcriptions[0] - - levels = [] - if target_level != "word": - levels.append("token") - if target_level != "token": - levels.append("word") - results = {} - for level in levels: - if level == "token": - targets_with_confidence = [get_token_targets_with_confidence(tran) for tran in transcriptions] - correct_marks = [ - get_correct_marks(model.tokenizer.text_to_ids(r), model.tokenizer.text_to_ids(h.text)) - for r, h in zip(reference_texts, transcriptions) - ] - else: # "word" - targets_with_confidence = [get_word_targets_with_confidence(tran) for tran in transcriptions] - correct_marks = [get_correct_marks(r.split(), h.words) for r, h in zip(reference_texts, transcriptions)] - - y_true, y_score = np.array( - [[f, p[1]] for cm, twc in zip(correct_marks, targets_with_confidence) for f, p in zip(cm, twc)] - ).T - # output scheme: yc.mean(), yc.max(), yc.std() or yc.mean(), yc.max(), yc.std(), (thresholds, yc) - result_yc = auc_yc(y_true, y_score, return_std_maximum=True, return_curve=draw_plot) - # output scheme: ece or ece, (thresholds, ece_curve) - results_ece = ece(y_true, y_score, return_curve=draw_plot) - results[level] = [ - auc_roc(y_true, y_score), - auc_pr(y_true, y_score), - auc_nt(y_true, y_score), - nce(y_true, y_score), - results_ece if isinstance(results_ece, float) else results_ece[0], - ] + list(result_yc[:3]) - - if draw_plot: - os.makedirs(plot_dir, exist_ok=True) - - mask_correct = y_true == 1 - y_score_correct = y_score[mask_correct] - y_score_incorrect = y_score[~mask_correct] - # histogram of the correct distribution - save_confidence_hist(y_score_correct, plot_dir, level + "_" + "hist_correct") - # histogram of the incorrect distribution - save_confidence_hist(y_score_incorrect, plot_dir, level + "_" + "hist_incorrect") - # AUC-ROC curve - save_roc_curve(y_true, y_score, plot_dir, level + "_" + "roc") - # AUC-PR curve - save_pr_curve(y_true, y_score, plot_dir, level + "_" + "pr") - # AUC-NT curve - save_nt_curve(y_true, y_score, plot_dir, level + "_" + "nt") - # AUC-YC curve - yc_thresholds, yc_values = result_yc[-1] - save_custom_confidence_curve( - yc_thresholds, - yc_values, - plot_dir, - level + "_" + "yc", - "Threshold", - "True positive rate − False Positive Rate", - ) - # ECE curve - ece_thresholds, ece_values = results_ece[-1] - ece_values /= max(ece_values) - save_custom_confidence_curve( - ece_thresholds, ece_values, plot_dir, level + "_" + "ece", "Threshold", "|Accuracy − Confidence score|" - ) - - return results - - -def apply_confidence_parameters(decoding_cfg, hp): - """Apply parameters from a parameter grid to a decoding config. - - Returns: - Updated decoding config. 
- """ - new_decoding_cfg = copy.deepcopy(decoding_cfg) - confidence_cfg_fields = ("aggregation", "exclude_blank") - confidence_method_cfg_fields = ("name", "alpha", "entropy_type", "entropy_norm") - with open_dict(new_decoding_cfg): - for p, v in hp.items(): - if p in confidence_cfg_fields: - new_decoding_cfg.confidence_cfg[p] = v - elif p in confidence_method_cfg_fields: - new_decoding_cfg.confidence_cfg.method_cfg[p] = v - return new_decoding_cfg diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_utils.py deleted file mode 100644 index c406f5451e844268178cbe4a21421ea9f78cd62a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_confidence_utils.py +++ /dev/null @@ -1,461 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from functools import partial -from typing import List, Optional - -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.utils import logging - - -class ConfidenceMethodConstants: - NAMES = ("max_prob", "entropy") - ENTROPY_TYPES = ("gibbs", "tsallis", "renyi") - ENTROPY_NORMS = ("lin", "exp") - - @classmethod - def print(cls): - return ( - cls.__name__ - + ": " - + str({"NAMES": cls.NAMES, "ENTROPY_TYPES": cls.ENTROPY_TYPES, "ENTROPY_NORMS": cls.ENTROPY_NORMS}) - ) - - -class ConfidenceConstants: - AGGREGATIONS = ("mean", "min", "max", "prod") - - @classmethod - def print(cls): - return cls.__name__ + ": " + str({"AGGREGATIONS": cls.AGGREGATIONS}) - - -@dataclass -class ConfidenceMethodConfig: - """A Config which contains the method name and settings to compute per-frame confidence scores. - - Args: - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). - Used if confidence_method_cfg.name is set to `entropy`. - Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. 
When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - name: str = "entropy" - entropy_type: str = "tsallis" - alpha: float = 0.33 - entropy_norm: str = "exp" - temperature: str = "DEPRECATED" - - def __post_init__(self): - if self.temperature != "DEPRECATED": - # self.temperature has type str - self.alpha = float(self.temperature) - self.temperature = "DEPRECATED" - if self.name not in ConfidenceMethodConstants.NAMES: - raise ValueError( - f"`name` must be one of the following: " - f"{'`' + '`, `'.join(ConfidenceMethodConstants.NAMES) + '`'}. Provided: `{self.name}`" - ) - if self.entropy_type not in ConfidenceMethodConstants.ENTROPY_TYPES: - raise ValueError( - f"`entropy_type` must be one of the following: " - f"{'`' + '`, `'.join(ConfidenceMethodConstants.ENTROPY_TYPES) + '`'}. Provided: `{self.entropy_type}`" - ) - if self.alpha <= 0.0: - raise ValueError(f"`alpha` must be > 0. Provided: {self.alpha}") - if self.entropy_norm not in ConfidenceMethodConstants.ENTROPY_NORMS: - raise ValueError( - f"`entropy_norm` must be one of the following: " - f"{'`' + '`, `'.join(ConfidenceMethodConstants.ENTROPY_NORMS) + '`'}. Provided: `{self.entropy_norm}`" - ) - - -@dataclass -class ConfidenceConfig: - """A config which contains the following key-value pairs related to confidence scores. - - Args: - preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores - generated during decoding. When set to true, the Hypothesis will contain - the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats. - preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized tokens. - preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores - generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain - the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats. - - The length of the list corresponds to the number of recognized words. - exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded - from the `token_confidence`. - aggregation: Which aggregation type to use for collapsing per-token confidence into per-word confidence. - Valid options are `mean`, `min`, `max`, `prod`. - method_cfg: A dict-like object which contains the method name and settings to compute per-frame - confidence scores. - - name: The method name (str). - Supported values: - - 'max_prob' for using the maximum token probability as a confidence. - - 'entropy' for using a normalized entropy of a log-likelihood vector. - - entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`. 
- Supported values: - - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided, - the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)). - Note that for this entropy, the alpha should comply the following inequality: - (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1) - where V is the model vocabulary size. - - 'tsallis' for the Tsallis entropy with the Boltzmann constant one. - Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/Tsallis_entropy - - 'renyi' for the Rényi entropy. - Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)), - where α is a parameter. When α == 1, it works like the Gibbs entropy. - More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy - - alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0. - When the alpha equals one, scaling is not applied to 'max_prob', - and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i)) - - entropy_norm: A mapping of the entropy value to the interval [0,1]. - Supported values: - - 'lin' for using the linear mapping. - - 'exp' for using exponential mapping with linear shift. - """ - - preserve_frame_confidence: bool = False - preserve_token_confidence: bool = False - preserve_word_confidence: bool = False - exclude_blank: bool = True - aggregation: str = "min" - method_cfg: ConfidenceMethodConfig = field(default_factory=lambda: ConfidenceMethodConfig()) - - def __post_init__(self): - # OmegaConf.structured ensures that post_init check is always executed - self.method_cfg = OmegaConf.structured( - self.method_cfg - if isinstance(self.method_cfg, ConfidenceMethodConfig) - else ConfidenceMethodConfig(**self.method_cfg) - ) - if self.aggregation not in ConfidenceConstants.AGGREGATIONS: - raise ValueError( - f"`aggregation` has to be one of the following: " - f"{'`' + '`, `'.join(ConfidenceConstants.AGGREGATIONS) + '`'}. Provided: `{self.aggregation}`" - ) - - -def get_confidence_measure_bank(): - """Generate a dictionary with confidence measure functionals. - - Supported confidence measures: - max_prob: normalized maximum probability - entropy_gibbs_lin: Gibbs entropy with linear normalization - entropy_gibbs_exp: Gibbs entropy with exponential normalization - entropy_tsallis_lin: Tsallis entropy with linear normalization - entropy_tsallis_exp: Tsallis entropy with exponential normalization - entropy_renyi_lin: Rényi entropy with linear normalization - entropy_renyi_exp: Rényi entropy with exponential normalization - - Returns: - dictionary with lambda functions. 
- """ - # helper functions - # Gibbs entropy is implemented without alpha - neg_entropy_gibbs = lambda x: (x.exp() * x).sum(-1) - neg_entropy_alpha = lambda x, t: (x * t).exp().sum(-1) - neg_entropy_alpha_gibbs = lambda x, t: ((x * t).exp() * x).sum(-1) - # too big for a lambda - def entropy_tsallis_exp(x, v, t): - exp_neg_max_ent = math.exp((1 - math.pow(v, 1 - t)) / (1 - t)) - return (((1 - neg_entropy_alpha(x, t)) / (1 - t)).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) - - def entropy_gibbs_exp(x, v, t): - exp_neg_max_ent = math.pow(v, -t * math.pow(v, 1 - t)) - return ((neg_entropy_alpha_gibbs(x, t) * t).exp() - exp_neg_max_ent) / (1 - exp_neg_max_ent) - - # use Gibbs entropies for Tsallis and Rényi with t == 1.0 - entropy_gibbs_lin_baseline = lambda x, v: 1 + neg_entropy_gibbs(x) / math.log(v) - entropy_gibbs_exp_baseline = lambda x, v: (neg_entropy_gibbs(x).exp() * v - 1) / (v - 1) - # fill the measure bank - confidence_measure_bank = {} - # Maximum probability measure is implemented without alpha - confidence_measure_bank["max_prob"] = ( - lambda x, v, t: (x.max(dim=-1)[0].exp() * v - 1) / (v - 1) - if t == 1.0 - else ((x.max(dim=-1)[0] * t).exp() * math.pow(v, t) - 1) / (math.pow(v, t) - 1) - ) - confidence_measure_bank["entropy_gibbs_lin"] = ( - lambda x, v, t: entropy_gibbs_lin_baseline(x, v) - if t == 1.0 - else 1 + neg_entropy_alpha_gibbs(x, t) / math.log(v) / math.pow(v, 1 - t) - ) - confidence_measure_bank["entropy_gibbs_exp"] = ( - lambda x, v, t: entropy_gibbs_exp_baseline(x, v) if t == 1.0 else entropy_gibbs_exp(x, v, t) - ) - confidence_measure_bank["entropy_tsallis_lin"] = ( - lambda x, v, t: entropy_gibbs_lin_baseline(x, v) - if t == 1.0 - else 1 + (1 - neg_entropy_alpha(x, t)) / (math.pow(v, 1 - t) - 1) - ) - confidence_measure_bank["entropy_tsallis_exp"] = ( - lambda x, v, t: entropy_gibbs_exp_baseline(x, v) if t == 1.0 else entropy_tsallis_exp(x, v, t) - ) - confidence_measure_bank["entropy_renyi_lin"] = ( - lambda x, v, t: entropy_gibbs_lin_baseline(x, v) - if t == 1.0 - else 1 + neg_entropy_alpha(x, t).log2() / (t - 1) / math.log(v, 2) - ) - confidence_measure_bank["entropy_renyi_exp"] = ( - lambda x, v, t: entropy_gibbs_exp_baseline(x, v) - if t == 1.0 - else (neg_entropy_alpha(x, t).pow(1 / (t - 1)) * v - 1) / (v - 1) - ) - return confidence_measure_bank - - -def get_confidence_aggregation_bank(): - """Generate a dictionary with confidence aggregation functions. - - Supported confidence aggregation functions: - min: minimum - max: maximum - mean: arithmetic mean - prod: product - - Returns: - dictionary with functions. - """ - confidence_aggregation_bank = {"mean": lambda x: sum(x) / len(x), "min": min, "max": max} - # python 3.7 and earlier do not have math.prod - if hasattr(math, "prod"): - confidence_aggregation_bank["prod"] = math.prod - else: - import operator - from functools import reduce - - confidence_aggregation_bank["prod"] = lambda x: reduce(operator.mul, x, 1) - return confidence_aggregation_bank - - -class ConfidenceMethodMixin(ABC): - """Confidence Method Mixin class. - - It initializes per-frame confidence method. - """ - - def _init_confidence_method(self, confidence_method_cfg: Optional[DictConfig] = None): - """Initialize per-frame confidence method from config. 
- """ - # OmegaConf.structured ensures that post_init check is always executed - confidence_method_cfg = OmegaConf.structured( - ConfidenceMethodConfig() - if confidence_method_cfg is None - else ConfidenceMethodConfig(**confidence_method_cfg) - ) - - # set confidence calculation method - # we suppose that self.blank_id == len(vocabulary) - self.num_tokens = (self.blank_id if hasattr(self, "blank_id") else self._blank_index) + 1 - self.alpha = confidence_method_cfg.alpha - - # init confidence measure bank - self.confidence_measure_bank = get_confidence_measure_bank() - - measure = None - # construct measure_name - measure_name = "" - if confidence_method_cfg.name == "max_prob": - measure_name = "max_prob" - elif confidence_method_cfg.name == "entropy": - measure_name = '_'.join( - [confidence_method_cfg.name, confidence_method_cfg.entropy_type, confidence_method_cfg.entropy_norm] - ) - else: - raise ValueError(f"Unsupported `confidence_method_cfg.name`: `{confidence_method_cfg.name}`") - if measure_name not in self.confidence_measure_bank: - raise ValueError(f"Unsupported measure setup: `{measure_name}`") - measure = partial(self.confidence_measure_bank[measure_name], v=self.num_tokens, t=self.alpha) - self._get_confidence = lambda x: measure(torch.nan_to_num(x)).tolist() - - -class ConfidenceMixin(ABC): - """Confidence Mixin class. - - It is responsible for confidence estimation method initialization and high-level confidence score calculation. - """ - - def _init_confidence(self, confidence_cfg: Optional[DictConfig] = None): - """Initialize confidence-related fields and confidence aggregation function from config. - """ - # OmegaConf.structured ensures that post_init check is always executed - confidence_cfg = OmegaConf.structured( - ConfidenceConfig() if confidence_cfg is None else ConfidenceConfig(**confidence_cfg) - ) - self.confidence_method_cfg = confidence_cfg.method_cfg - - # extract the config - self.preserve_word_confidence = confidence_cfg.get('preserve_word_confidence', False) - # set preserve_frame_confidence and preserve_token_confidence to True - # if preserve_word_confidence is True - self.preserve_token_confidence = ( - confidence_cfg.get('preserve_token_confidence', False) | self.preserve_word_confidence - ) - # set preserve_frame_confidence to True if preserve_token_confidence is True - self.preserve_frame_confidence = ( - confidence_cfg.get('preserve_frame_confidence', False) | self.preserve_token_confidence - ) - self.exclude_blank_from_confidence = confidence_cfg.get('exclude_blank', True) - self.word_confidence_aggregation = confidence_cfg.get('aggregation', "min") - - # define aggregation functions - self.confidence_aggregation_bank = get_confidence_aggregation_bank() - self._aggregate_confidence = self.confidence_aggregation_bank[self.word_confidence_aggregation] - - # Update preserve frame confidence - if self.preserve_frame_confidence is False: - if self.cfg.strategy in ['greedy', 'greedy_batch']: - self.preserve_frame_confidence = self.cfg.greedy.get('preserve_frame_confidence', False) - # OmegaConf.structured ensures that post_init check is always executed - confidence_method_cfg = OmegaConf.structured(self.cfg.greedy).get('confidence_method_cfg', None) - self.confidence_method_cfg = ( - OmegaConf.structured(ConfidenceMethodConfig()) - if confidence_method_cfg is None - else OmegaConf.structured(ConfidenceMethodConfig(**confidence_method_cfg)) - ) - - @abstractmethod - def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothesis]: - 
"""Computes high-level (per-token and/or per-word) confidence scores for a list of hypotheses. - Assumes that `frame_confidence` is present in the hypotheses. - - Args: - hypotheses_list: List of Hypothesis. - - Returns: - A list of hypotheses with high-level confidence scores. - """ - raise NotImplementedError() - - @abstractmethod - def _aggregate_token_confidence(self, hypothesis: Hypothesis) -> List[float]: - """Implemented by subclass in order to aggregate token confidence to a word-level confidence. - - Args: - hypothesis: Hypothesis - - Returns: - A list of word-level confidence scores. - """ - raise NotImplementedError() - - def _aggregate_token_confidence_chars(self, words: List[str], token_confidence: List[float]) -> List[float]: - """Implementation of token confidence aggregation for character-based models. - - Args: - words: List of words of a hypothesis. - token_confidence: List of token-level confidence scores of a hypothesis. - - Returns: - A list of word-level confidence scores. - """ - word_confidence = [] - i = 0 - for word in words: - word_len = len(word) - word_confidence.append(self._aggregate_confidence(token_confidence[i : i + word_len])) - # we assume that there is exactly one space token between words and exclude it from word confidence - i += word_len + 1 - return word_confidence - - def _aggregate_token_confidence_subwords_sentencepiece( - self, words: List[str], token_confidence: List[float], token_ids: List[int] - ) -> List[float]: - """Implementation of token confidence aggregation for subword-based models. - - **Note**: Only supports Sentencepiece based tokenizers ! - - Args: - words: List of words of a hypothesis. - token_confidence: List of token-level confidence scores of a hypothesis. - token_ids: List of token ids of a hypothesis. - - Returns: - A list of word-level confidence scores. - """ - word_confidence = [] - # run only if there are final words - if len(words) > 0: - j = 0 - prev_unk = False - prev_underline = False - for i, token_id in enumerate(token_ids): - token = self.decode_ids_to_tokens([int(token_id)])[0] - token_text = self.decode_tokens_to_str([int(token_id)]) - # treat `` as a separate word regardless of the next token - # to match the result of `tokenizer.ids_to_text` - if (token != token_text or prev_unk) and i > j: - # do not add confidence for `▁` if the current token starts with `▁` - # to match the result of `tokenizer.ids_to_text` - if not prev_underline: - word_confidence.append(self._aggregate_confidence(token_confidence[j:i])) - j = i - prev_unk = token == '' - prev_underline = token == '▁' - if not prev_underline: - word_confidence.append(self._aggregate_confidence(token_confidence[j : len(token_ids)])) - if len(words) != len(word_confidence): - raise RuntimeError( - f"""Something went wrong with word-level confidence aggregation.\n - Please check these values for debugging:\n - len(words): {len(words)},\n - len(word_confidence): {len(word_confidence)},\n - recognized text: `{' '.join(words)}`""" - ) - return word_confidence diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_module_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_module_utils.py deleted file mode 100644 index e077d7948c0dd7770876b8604bf54d23de5c5294..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/asr_module_utils.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from omegaconf import DictConfig, open_dict - -from nemo.collections.asr.modules import conv_asr -from nemo.collections.asr.parts.submodules import jasper -from nemo.utils import logging - - -def change_conv_asr_se_context_window(model: 'ASRModel', context_window: int, update_config: bool = True): - """ - Update the context window of the SqueezeExcitation module if the provided model contains an - `encoder` which is an instance of `ConvASREncoder`. - - Args: - model: A subclass of `ASRModel`, itself a subclass of `ModelPT`. - context_window: An integer representing the number of input timeframes that will be used - to compute the context. Each timeframe corresponds to a single window stride of the - STFT features. - - Say the window_stride = 0.01s, then a context window of 128 represents 128 * 0.01 s - of context to compute the Squeeze step. - update_config: Whether to update the config or not with the new context window. - """ - if update_config and not hasattr(model.cfg, 'encoder'): - logging.info( - "Could not change the context window in SqueezeExcite module " - "since the model provided does not contain an `encoder` module in its config." - ) - return - - if not isinstance(model.encoder, conv_asr.ConvASREncoder): - logging.info( - f"Could not change the context window in SqueezeExcite module " - f"since the `encoder` module is not an instance of `ConvASREncoder`.\n" - f"Provided encoder class = {model.encoder.__class__.__name__}" - ) - return - - enc_cfg = model.cfg.encoder if update_config else None - - if enc_cfg is not None: - with open_dict(enc_cfg): - _update_se_context_window(model, context_window, cfg=enc_cfg) - else: - _update_se_context_window(model, context_window) - - # Update model config - if update_config: - model.cfg.encoder = enc_cfg - - -def _update_se_context_window(model: 'ASRModel', context_window: int, cfg: Optional[DictConfig] = None): - jasper_block_counter = -1 - for name, m in model.named_modules(): - if type(m) == jasper.JasperBlock: - jasper_block_counter += 1 - - if type(m) == jasper.MaskedConv1d: - if m.conv.stride[0] > 1 and 'mconv' in name: - context_window = context_window // m.conv.stride[0] - - if type(m) == jasper.SqueezeExcite: - m.change_context_window(context_window=context_window) - - # update config - if cfg is not None: - cfg.jasper[jasper_block_counter].se_context_size = context_window diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/audio_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/audio_utils.py deleted file mode 100644 index 8188dbed003b5ffa9595ea3ce247ba163725c078..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/audio_utils.py +++ /dev/null @@ -1,604 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Iterable, Optional, Union - -import librosa -import numpy as np -import numpy.typing as npt -import scipy -import soundfile as sf -import torch -from scipy.spatial.distance import pdist, squareform - -from nemo.utils import logging - -SOUND_VELOCITY = 343.0 # m/s -ChannelSelectorType = Union[int, Iterable[int], str] - - -def get_samples(audio_file: str, target_sr: int = 16000, dtype: str = 'float32'): - """ - Read the samples from the given audio_file path. If not specified, the input audio file is automatically - resampled to 16kHz. - - Args: - audio_file (str): - Path to the input audio file - target_sr (int): - Targeted sampling rate - Returns: - samples (numpy.ndarray): - Time-series sample data from the given audio file - """ - with sf.SoundFile(audio_file, 'r') as f: - samples = f.read(dtype=dtype) - if f.samplerate != target_sr: - samples = librosa.core.resample(samples, orig_sr=f.samplerate, target_sr=target_sr) - samples = samples.transpose() - return samples - - -def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelectorType] = None) -> npt.NDArray: - """ - Convert a multi-channel signal to a single-channel signal by averaging over channels or selecting a single channel, - or pass-through multi-channel signal when channel_selector is `None`. - - Args: - signal: numpy array with shape (..., num_channels) - channel selector: string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable - of integers denoting a subset of channels. Channel selector is using zero-based indexing. - If set to `None`, the original signal will be returned. Uses zero-based indexing. - - Returns: - numpy array - """ - if signal.ndim == 1: - # For one-dimensional input, return the input signal. - if channel_selector not in [None, 0, 'average']: - raise ValueError( - 'Input signal is one-dimensional, channel selector (%s) cannot not be used.', str(channel_selector) - ) - return signal - - num_channels = signal.shape[-1] - num_samples = signal.size // num_channels # handle multi-dimensional signals - - if num_channels >= num_samples: - logging.warning( - 'Number of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.', - num_channels, - num_samples, - ) - - # Samples are arranged as (num_channels, ...) - if channel_selector is None: - # keep the original multi-channel signal - pass - elif channel_selector == 'average': - # default behavior: downmix by averaging across channels - signal = np.mean(signal, axis=-1) - elif isinstance(channel_selector, int): - # select a single channel - if channel_selector >= num_channels: - raise ValueError(f'Cannot select channel {channel_selector} from a signal with {num_channels} channels.') - signal = signal[..., channel_selector] - elif isinstance(channel_selector, Iterable): - # select multiple channels - if max(channel_selector) >= num_channels: - raise ValueError( - f'Cannot select channel subset {channel_selector} from a signal with {num_channels} channels.' 
- ) - signal = signal[..., channel_selector] - # squeeze the channel dimension if a single-channel is selected - # this is done to have the same shape as when using integer indexing - if len(channel_selector) == 1: - signal = np.squeeze(signal, axis=-1) - else: - raise ValueError(f'Unexpected value for channel_selector ({channel_selector})') - - return signal - - -def sinc_unnormalized(x: float) -> float: - """Unnormalized sinc. - - Args: - x: input value - - Returns: - Calculates sin(x)/x - """ - return np.sinc(x / np.pi) - - -def theoretical_coherence( - mic_positions: npt.NDArray, - sample_rate: float, - field: str = 'spherical', - fft_length: int = 512, - sound_velocity: float = SOUND_VELOCITY, -) -> npt.NDArray: - """Calculate a theoretical coherence matrix for given mic positions and field type. - - Args: - mic_positions: 3D Cartesian coordinates of microphone positions, shape (num_mics, 3) - field: string denoting the type of the soundfield - sample_rate: sampling rate of the input signal in Hz - fft_length: length of the fft in samples - sound_velocity: speed of sound in m/s - - Returns: - Calculated coherence with shape (num_subbands, num_mics, num_mics) - """ - assert mic_positions.shape[1] == 3, "Expecting 3D microphone positions" - num_mics = mic_positions.shape[0] - - if num_mics < 2: - raise ValueError(f'Expecting at least 2 microphones, received {num_mics}') - - num_subbands = fft_length // 2 + 1 - angular_freq = 2 * np.pi * sample_rate * np.arange(0, num_subbands) / fft_length - desired_coherence = np.zeros((num_subbands, num_mics, num_mics)) - - mic_distance = squareform(pdist(mic_positions)) - - for p in range(num_mics): - desired_coherence[:, p, p] = 1.0 - for q in range(p + 1, num_mics): - dist_pq = mic_distance[p, q] - if field == 'spherical': - desired_coherence[:, p, q] = sinc_unnormalized(angular_freq * dist_pq / sound_velocity) - else: - raise ValueError(f'Unknown noise field {field}.') - # symmetry - desired_coherence[:, q, p] = desired_coherence[:, p, q] - - return desired_coherence - - -def estimated_coherence(S: npt.NDArray, eps: float = 1e-16) -> npt.NDArray: - """Estimate complex-valued coherence for the input STFT-domain signal. 
- - Args: - S: STFT of the signal with shape (num_subbands, num_frames, num_channels) - eps: small regularization constant - - Returns: - Estimated coherence with shape (num_subbands, num_channels, num_channels) - """ - if S.ndim != 3: - raise RuntimeError('Expecting the input STFT to be a 3D array') - - num_subbands, num_frames, num_channels = S.shape - - if num_channels < 2: - raise ValueError('Expecting at least 2 microphones') - - psd = np.mean(np.abs(S) ** 2, axis=1) - estimated_coherence = np.zeros((num_subbands, num_channels, num_channels), dtype=complex) - - for p in range(num_channels): - estimated_coherence[:, p, p] = 1.0 - for q in range(p + 1, num_channels): - cross_psd = np.mean(S[:, :, p] * np.conjugate(S[:, :, q]), axis=1) - estimated_coherence[:, p, q] = cross_psd / np.sqrt(psd[:, p] * psd[:, q] + eps) - # symmetry - estimated_coherence[:, q, p] = np.conjugate(estimated_coherence[:, p, q]) - - return estimated_coherence - - -def generate_approximate_noise_field( - mic_positions: npt.NDArray, - noise_signal: npt.NDArray, - sample_rate: float, - field: str = 'spherical', - fft_length: int = 512, - method: str = 'cholesky', - sound_velocity: float = SOUND_VELOCITY, -): - """ - Args: - mic_positions: 3D microphone positions, shape (num_mics, 3) - noise_signal: signal used to generate the approximate noise field, shape (num_samples, num_mics). - Different channels need to be independent. - sample_rate: sampling rate of the input signal - field: string denoting the type of the soundfield - fft_length: length of the fft in samples - method: coherence decomposition method - sound_velocity: speed of sound in m/s - - Returns: - Signal with coherence approximately matching the desired coherence, shape (num_samples, num_channels) - - References: - E.A.P. Habets, I. Cohen and S. Gannot, 'Generating nonstationary multisensor - signals under a spatial coherence constraint', Journal of the Acoustical Society - of America, Vol. 124, Issue 5, pp. 2911-2917, Nov. 2008. - """ - assert fft_length % 2 == 0 - num_mics = mic_positions.shape[0] - - if num_mics < 2: - raise ValueError('Expecting at least 2 microphones') - - desired_coherence = theoretical_coherence( - mic_positions=mic_positions, - field=field, - sample_rate=sample_rate, - fft_length=fft_length, - sound_velocity=sound_velocity, - ) - - return transform_to_match_coherence(signal=noise_signal, desired_coherence=desired_coherence, method=method) - - -def transform_to_match_coherence( - signal: npt.NDArray, - desired_coherence: npt.NDArray, - method: str = 'cholesky', - ref_channel: int = 0, - corrcoef_threshold: float = 0.2, -) -> npt.NDArray: - """Transform the input multichannel signal to match the desired coherence. - - Note: It's assumed that channels are independent. - - Args: - signal: independent noise signals with shape (num_samples, num_channels) - desired_coherence: desired coherence with shape (num_subbands, num_channels, num_channels) - method: decomposition method used to construct the transformation matrix - ref_channel: reference channel for power normalization of the input signal - corrcoef_threshold: used to detect input signals with high correlation between channels - - Returns: - Signal with coherence approximately matching the desired coherence, shape (num_samples, num_channels) - - References: - E.A.P. Habets, I. Cohen and S. Gannot, 'Generating nonstationary multisensor - signals under a spatial coherence constraint', Journal of the Acoustical Society - of America, Vol. 124, Issue 5, pp. 2911-2917, Nov. 2008. 
- """ - num_channels = signal.shape[1] - num_subbands = desired_coherence.shape[0] - assert desired_coherence.shape[1] == num_channels - assert desired_coherence.shape[2] == num_channels - - fft_length = 2 * (num_subbands - 1) - - # remove DC component - signal = signal - np.mean(signal, axis=0) - - # channels needs to have equal power, so normalize with the ref mic - signal_power = np.mean(np.abs(signal) ** 2, axis=0) - signal = signal * np.sqrt(signal_power[ref_channel]) / np.sqrt(signal_power) - - # input channels should be uncorrelated - # here, we just check for high correlation coefficients between channels to detect ill-constructed inputs - corrcoef_matrix = np.corrcoef(signal.transpose()) - # mask the diagonal elements - np.fill_diagonal(corrcoef_matrix, 0.0) - if np.any(np.abs(corrcoef_matrix) > corrcoef_threshold): - raise RuntimeError( - f'Input channels are correlated above the threshold {corrcoef_threshold}. Max abs off-diagonal element of the coefficient matrix: {np.abs(corrcoef_matrix).max()}.' - ) - - # analysis transform - S = librosa.stft(signal.transpose(), n_fft=fft_length) - # (channel, subband, frame) -> (subband, frame, channel) - S = S.transpose(1, 2, 0) - - # generate output signal for each subband - X = np.zeros_like(S) - - # factorize the desired coherence (skip the DC component) - if method == 'cholesky': - L = np.linalg.cholesky(desired_coherence[1:]) - A = L.swapaxes(1, 2) - elif method == 'evd': - w, V = np.linalg.eig(desired_coherence[1:]) - # scale eigenvectors - A = np.sqrt(w)[:, None, :] * V - # prepare transform matrix - A = A.swapaxes(1, 2) - else: - raise ValueError(f'Unknown method {method}') - - # transform vectors at each time step: - # x_t = A^T * s_t - # or in matrix notation: X = S * A - X[1:, ...] = np.matmul(S[1:, ...], A) - - # synthesis transform - # transpose X from (subband, frame, channel) to (channel, subband, frame) - x = librosa.istft(X.transpose(2, 0, 1), length=len(signal)) - # (channel, sample) -> (sample, channel) - x = x.transpose() - - return x - - -def rms(x: np.ndarray) -> float: - """Calculate RMS value for the input signal. - - Args: - x: input signal - - Returns: - RMS of the input signal. - """ - return np.sqrt(np.mean(np.abs(x) ** 2)) - - -def mag2db(mag: float, eps: Optional[float] = 1e-16) -> float: - """Convert magnitude ratio from linear scale to dB. - - Args: - mag: linear magnitude value - eps: small regularization constant - - Returns: - Value in dB. - """ - return 20 * np.log10(mag + eps) - - -def db2mag(db: float) -> float: - """Convert value in dB to linear magnitude ratio. - - Args: - db: magnitude ratio in dB - - Returns: - Magnitude ratio in linear scale. - """ - return 10 ** (db / 20) - - -def pow2db(power: float, eps: Optional[float] = 1e-16) -> float: - """Convert power ratio from linear scale to dB. - - Args: - power: power ratio in linear scale - eps: small regularization constant - - Returns: - Power in dB. - """ - return 10 * np.log10(power + eps) - - -def get_segment_start(signal: np.ndarray, segment: np.ndarray) -> int: - """Get starting point of `segment` in `signal`. - We assume that `segment` is a sub-segment of `signal`. - For example, `signal` may be a 10 second audio signal, - and `segment` could be the signal between 2 seconds and - 5 seconds. This function will then return the index of - the sample where `segment` starts (at 2 seconds). 
- - Args: - signal: numpy array with shape (num_samples,) - segment: numpy array with shape (num_samples,) - - Returns: - Index of the start of `segment` in `signal`. - """ - if len(signal) <= len(segment): - raise ValueError( - f'segment must be shorter than signal: len(segment) = {len(segment)}, len(signal) = {len(signal)}' - ) - cc = scipy.signal.correlate(signal, segment, mode='valid') - return np.argmax(cc) - - -def calculate_sdr_numpy( - estimate: np.ndarray, - target: np.ndarray, - scale_invariant: bool = False, - convolution_invariant: bool = False, - convolution_filter_length: Optional[int] = None, - remove_mean: bool = True, - sdr_max: Optional[float] = None, - eps: float = 1e-8, -) -> float: - """Calculate signal-to-distortion ratio. - - SDR = 10 * log10( ||t||_2^2 / (||e-t||_2^2 + alpha * ||t||^2) - - where - alpha = 10^(-sdr_max/10) - - Optionally, apply scale-invariant scaling to target signal. - - Args: - estimate: estimated signal - target: target signal - - Returns: - SDR in dB. - """ - if scale_invariant and convolution_invariant: - raise ValueError('Arguments scale_invariant and convolution_invariant cannot be used simultaneously.') - - if remove_mean: - estimate = estimate - np.mean(estimate) - target = target - np.mean(target) - - if scale_invariant or (convolution_invariant and convolution_filter_length == 1): - target = scale_invariant_target_numpy(estimate=estimate, target=target, eps=eps) - elif convolution_invariant: - target = convolution_invariant_target_numpy( - estimate=estimate, target=target, filter_length=convolution_filter_length, eps=eps - ) - - target_pow = np.mean(np.abs(target) ** 2) - distortion_pow = np.mean(np.abs(estimate - target) ** 2) - - if sdr_max is not None: - distortion_pow = distortion_pow + 10 ** (-sdr_max / 10) * target_pow - - sdr = 10 * np.log10(target_pow / (distortion_pow + eps) + eps) - return sdr - - -def wrap_to_pi(x: torch.Tensor) -> torch.Tensor: - """Wrap angle in radians to [-pi, pi] - - Args: - x: angle in radians - - Returns: - Angle in radians wrapped to [-pi, pi] - """ - pi = torch.tensor(math.pi, device=x.device) - return torch.remainder(x + pi, 2 * pi) - pi - - -def convmtx_numpy(x: np.ndarray, filter_length: int, delay: int = 0, n_steps: Optional[int] = None) -> np.ndarray: - """Construct a causal convolutional matrix from x delayed by `delay` samples. - - Args: - x: input signal, shape (N,) - filter_length: length of the filter in samples - delay: delay the signal by a number of samples - n_steps: total number of time steps (rows) for the output matrix - - Returns: - Convolutional matrix, shape (n_steps, filter_length) - """ - if x.ndim != 1: - raise ValueError(f'Expecting one-dimensional signal. Received signal with shape {x.shape}') - - if n_steps is None: - # Keep the same length as the input signal - n_steps = len(x) - - # pad as necessary - x_pad = np.hstack([np.zeros(delay), x]) - if (pad_len := n_steps - len(x_pad)) > 0: - x_pad = np.hstack([x_pad, np.zeros(pad_len)]) - else: - x_pad = x_pad[:n_steps] - - return scipy.linalg.toeplitz(x_pad, np.hstack([x_pad[0], np.zeros(filter_length - 1)])) - - -def convmtx_mc_numpy(x: np.ndarray, filter_length: int, delay: int = 0, n_steps: Optional[int] = None) -> np.ndarray: - """Construct a causal multi-channel convolutional matrix from `x` delayed by `delay` samples. 
- - Args: - x: input signal, shape (N, M) - filter_length: length of the filter in samples - delay: delay the signal by a number of samples - n_steps: total number of time steps (rows) for the output matrix - - Returns: - Multi-channel convolutional matrix, shape (n_steps, M * filter_length) - """ - if x.ndim != 2: - raise ValueError(f'Expecting two-dimensional signal. Received signal with shape {x.shape}') - - mc_mtx = [] - - for m in range(x.shape[1]): - mc_mtx.append(convmtx_numpy(x[:, m], filter_length=filter_length, delay=delay, n_steps=n_steps)) - - return np.hstack(mc_mtx) - - -def scale_invariant_target_numpy(estimate: np.ndarray, target: np.ndarray, eps: float = 1e-8) -> np.ndarray: - """Calculate convolution-invariant target for a given estimated signal. - - Calculate scaled target obtained by solving - - min_scale || scale * target - estimate ||^2 - - Args: - estimate: one-dimensional estimated signal, shape (T,) - target: one-dimensional target signal, shape (T,) - eps: regularization constans - - Returns: - Scaled target signal, shape (T,) - """ - assert target.ndim == estimate.ndim == 1, f'Only one-dimensional inputs supported' - - estimate_dot_target = np.mean(estimate * target) - target_pow = np.mean(np.abs(target) ** 2) - scale = estimate_dot_target / (target_pow + eps) - return scale * target - - -def convolution_invariant_target_numpy( - estimate: np.ndarray, target: np.ndarray, filter_length, diag_reg: float = 1e-6, eps: float = 1e-8 -) -> np.ndarray: - """Calculate convolution-invariant target for a given estimated signal. - - Calculate target filtered with a linear f obtained by solving - - min_filter || conv(filter, target) - estimate ||^2 - - Args: - estimate: one-dimensional estimated signal - target: one-dimensional target signal - filter_length: length of the (convolutive) filter - diag_reg: multiplicative factor for relative diagonal loading - eps: absolute diagonal loading - """ - assert target.ndim == estimate.ndim == 1, f'Only one-dimensional inputs supported' - - n_fft = 2 ** math.ceil(math.log2(len(target) + len(estimate) - 1)) - - T = np.fft.rfft(target, n=n_fft) - E = np.fft.rfft(estimate, n=n_fft) - - # target autocorrelation - tt_corr = np.fft.irfft(np.abs(T) ** 2, n=n_fft) - # target-estimate crosscorrelation - te_corr = np.fft.irfft(T.conj() * E, n=n_fft) - - # Use only filter_length - tt_corr = tt_corr[:filter_length] - te_corr = te_corr[:filter_length] - - if diag_reg is not None: - tt_corr[0] += diag_reg * tt_corr[0] + eps - - # Construct the Toeplitz system matrix - TT = scipy.linalg.toeplitz(tt_corr) - - # Solve the linear system for the optimal filter - filt = np.linalg.solve(TT, te_corr) - - # Calculate filtered target - T_filt = T * np.fft.rfft(filt, n=n_fft) - target_filt = np.fft.irfft(T_filt, n=n_fft) - - return target_filt[: len(target)] - - -def toeplitz(x: torch.Tensor) -> torch.Tensor: - """Create Toeplitz matrix for one-dimensional signals along the last dimension. 
- - Args: - x: tensor with shape (..., T) - - Returns: - Tensor with shape (..., T, T) - """ - length = x.size(-1) - x = torch.cat([x[..., 1:].flip(dims=(-1,)), x], dim=-1) - return x.unfold(-1, length, 1).flip(dims=(-1,)) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/confidence_metrics.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/confidence_metrics.py deleted file mode 100644 index 7d793c9df607c922c3f5f4ec941f2052ae259c61..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/confidence_metrics.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -from pathlib import Path -from typing import List, Optional, Tuple, Union - -import matplotlib.pyplot as plt -import numpy as np -from sklearn.metrics import ( - PrecisionRecallDisplay, - RocCurveDisplay, - average_precision_score, - log_loss, - precision_recall_curve, - roc_auc_score, - roc_curve, -) - - -def auc_roc(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. - - Note: If only one class is present in y_true, 0.5 is returned. - """ - y_true = np.array(y_true) - y_score = np.array(y_score) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - if np.all(y_true == 0) or np.all(y_true == 1): - return 0.5 - return roc_auc_score(y_true, y_score) - - -def auc_pr(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: - """Compute Area Under the Precision-Recall Curve (PR AUC) from prediction scores. - - Note: If only regatives are present in y_true, 0.0 is returned. - """ - y_true = np.array(y_true) - y_score = np.array(y_score) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - if np.all(y_true == 0): - return 0.0 - return average_precision_score(y_true, y_score) - - -def auc_nt(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: - """Compute Area Under the Negative Predictive Value vs. True Negative Rate Curve (NT AUC) from prediction scores. - - This metric can be thought of as a PR AUC in which errors are treated as positives. - - Note: If only positives are present in y_true, 0.0 is returned. - """ - y_true = np.array(y_true) - y_score = np.array(y_score) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - if np.all(y_true == 1): - return 0.0 - return average_precision_score(1 - y_true, 1 - y_score) - - -def nce(y_true: Union[List[int], np.ndarray], y_score: Union[List[float], np.ndarray]) -> float: - """Compute Normalized Cross Entropy (NCE) from prediction scores. Also known as the Normalized Mutual Information. 
- - NCE measures how close the correct prediction scores are to one and the incorrect prediction scores are to zero. - Negative NCE values indicate that the classifier performs worse than the setting all prediction scores - as the proportion of correct predictions. - - Note: If only one class is present in y_true, 0.5 is returned. - """ - y_true = np.array(y_true) - y_score = np.array(y_score) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - if np.all(y_true == 0) or np.all(y_true == 1): - return -math.inf - p = y_true.mean() - eps = 1e-15 - Hp = -(math.log(p + eps) * p + math.log(1 - p + eps) * (1 - p)) - return (Hp - log_loss(y_true, y_score)) / Hp - - -def ece( - y_true: Union[List[int], np.ndarray], - y_score: Union[List[float], np.ndarray], - n_bins: int = 100, - return_curve: bool = False, -) -> Union[float, Tuple[float, Tuple[List[int], List[float]]]]: - """Compute Expected Calibration Error (ECE) from prediction scores. - - ECE measures how close the correct prediction scores are to one and the incorrect prediction scores are to zero. - ECE ranges from zero to one with the best value zero (the lower the value, the better). - """ - y_true = np.array(y_true) - y_score = np.array(y_score) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - py = np.array([1 - y_score, y_score]).T - acc, conf = np.zeros(n_bins), np.zeros(n_bins) - Bm = np.zeros(n_bins) - ece_curve = [] - thresholds = [] - for m in range(n_bins): - a, b = m / n_bins, (m + 1) / n_bins - threshold = (a + b) / 2 - thresholds.append(threshold) - py_index = (py.T[1] >= threshold).astype(int) - py_value = py[np.arange(len(py_index)), py_index] - bin_range = ((py_value > a) & (py_value <= b)).nonzero()[0] - Bm[m] = len(bin_range) - if Bm[m] > 0: - acc[m] = (py_index[bin_range] == y_true[bin_range]).sum() / Bm[m] - conf[m] = py_value[bin_range].sum() / Bm[m] - ece_curve.append(Bm[m] * np.abs(acc[m] - conf[m])) - ece = sum(ece_curve) / sum(Bm) - if return_curve: - return ece, (thresholds, ece_curve) - else: - return ece - - -def auc_yc( - y_true: Union[List[int], np.ndarray], - y_score: Union[List[float], np.ndarray], - n_bins: int = 100, - return_std_maximum: bool = False, - return_curve: bool = False, -) -> Union[ - float, - Tuple[float, Tuple[List[int], List[float]]], - Tuple[float, float, float], - Tuple[float, float, float, Tuple[List[int], List[float]]], -]: - """Compute Area Under the Youden's Curve (YC AUC) from prediction scores. - - YC AUC represents the rate of the effective threshold range. - - If return_std_maximum is set to True, std and maximum values of the Youden's Curve are returned with the AUC. - - Note: If only one class is present in y_true, zeroes are returned for every entity. 
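For readers unfamiliar with calibration error, the binning idea behind `ece` can be illustrated with a much smaller standalone sketch (equal-width confidence bins, occupancy-weighted |accuracy - confidence| gap). This is a simplified illustration, not the deleted implementation:

```python
import numpy as np

def simple_ece(y_true, y_score, n_bins=10):
    """Occupancy-weighted |accuracy - confidence| gap over equal-width bins."""
    y_true = np.asarray(y_true, dtype=float)
    y_score = np.asarray(y_score, dtype=float)
    pred = (y_score >= 0.5).astype(float)              # predicted class
    conf = np.where(pred == 1, y_score, 1 - y_score)   # confidence of the predicted class
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece_val = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (conf > lo) & (conf <= hi)
        if in_bin.any():
            acc = np.mean(pred[in_bin] == y_true[in_bin])
            ece_val += in_bin.mean() * abs(acc - conf[in_bin].mean())
    return ece_val

print(simple_ece([1, 0, 1, 1, 0], [0.9, 0.2, 0.6, 0.8, 0.45]))
```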
- """ - y_true = np.array(y_true) - y_score = np.array(y_score) - thresholds = np.linspace(0, 1, n_bins + 1) - assert len(y_true) == len(y_score) - assert np.all(y_true >= 0) and np.all(y_true <= 1) - if np.all(y_true == 0) or np.all(y_true == 1): - if return_std_maximum and return_curve: - return 0.0, 0.0, 0.0, (thresholds, np.zeros(len(thresholds))) - elif return_std_maximum: - return 0.0, 0.0, 0.0 - elif return_curve: - return 0.0, (thresholds, np.zeros(len(thresholds))) - else: - return 0.0 - mask_correct = y_true == 1 - count_correct = max(len(mask_correct.nonzero()[0]), 1) - count_incorrect = max(len(y_true) - count_correct, 1) - y_score_correct = y_score[mask_correct] - y_score_incorrect = y_score[~mask_correct] - yc = [] - for threshold in thresholds: - tnr = len((y_score_incorrect < threshold).nonzero()[0]) / count_incorrect - fnr = len((y_score_correct < threshold).nonzero()[0]) / count_correct - yc.append(abs(tnr - fnr)) - yc = np.array(yc) - if return_std_maximum and return_curve: - return yc.mean(), yc.std(), yc.max(), (thresholds, yc) - elif return_std_maximum: - return yc.mean(), yc.std(), yc.max() - elif return_curve: - return yc.mean(), (thresholds, yc) - else: - return yc.mean() - - -def save_confidence_hist(y_score: Union[List[float], np.ndarray], plot_dir: Union[str, Path], name: str = "hist"): - os.makedirs(plot_dir, exist_ok=True) - plt.hist(np.array(y_score), 50, range=(0, 1)) - plt.title(name) - plt.xlabel("Confidence score") - plt.ylabel("Count") - plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) - plt.clf() - - -def save_roc_curve( - y_true: Union[List[int], np.ndarray], - y_score: Union[List[float], np.ndarray], - plot_dir: Union[str, Path], - name: str = "roc", -): - assert len(y_true) == len(y_score) - os.makedirs(plot_dir, exist_ok=True) - fpr, tpr, _ = roc_curve(1 - np.array(y_true), 1 - np.array(y_score)) - RocCurveDisplay(fpr=fpr, tpr=tpr).plot() - plt.title(name) - plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) - plt.clf() - - -def save_pr_curve( - y_true: Union[List[int], np.ndarray], - y_score: Union[List[float], np.ndarray], - plot_dir: Union[str, Path], - name: str = "pr", -): - assert len(y_true) == len(y_score) - os.makedirs(plot_dir, exist_ok=True) - precision, recall, _ = precision_recall_curve(np.array(y_true), np.array(y_score)) - PrecisionRecallDisplay(precision=precision, recall=recall).plot() - plt.title(name) - plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) - plt.clf() - - -def save_nt_curve( - y_true: Union[List[int], np.ndarray], - y_score: Union[List[float], np.ndarray], - plot_dir: Union[str, Path], - name: str = "nt", -): - assert len(y_true) == len(y_score) - os.makedirs(plot_dir, exist_ok=True) - precision, recall, _ = precision_recall_curve(1 - np.array(y_true), 1 - np.array(y_score)) - PrecisionRecallDisplay(precision=precision, recall=recall).plot() - plt.title(name) - plt.savefig(Path(plot_dir) / Path(name + ".png"), dpi=300) - plt.clf() - - -def save_custom_confidence_curve( - thresholds: Union[List[float], np.ndarray], - values: Union[List[float], np.ndarray], - plot_dir: Union[str, Path], - name: str = "my_awesome_curve", - xlabel: Optional[str] = None, - ylabel: Optional[str] = None, -): - assert len(thresholds) == len(values) - os.makedirs(plot_dir, exist_ok=True) - plt.plot(thresholds, values) - plt.xlim([0, 1]) - plt.ylim([0, 1]) - plt.title(name) - if xlabel is not None: - plt.xlabel(xlabel) - if ylabel is not None: - plt.ylabel(ylabel) - plt.savefig(Path(plot_dir) / Path(name + ".png"), 
dpi=300) - plt.clf() diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/data_simulation_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/data_simulation_utils.py deleted file mode 100644 index a9a1e10ae385811954b950ad4e6fc8b5bc4a15f0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/data_simulation_utils.py +++ /dev/null @@ -1,1127 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -import shutil -from collections import defaultdict -from typing import IO, Dict, List, Optional, Tuple - -import numpy as np -import torch -from scipy.stats import beta, gamma -from tqdm import tqdm - -from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor -from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest, write_text -from nemo.collections.asr.parts.utils.speaker_utils import labels_to_rttmfile -from nemo.utils import logging - - -def get_cleaned_base_path(output_dir: str, overwrite_output: bool = True) -> str: - """ - Delete output directory if it exists or throw warning. - - Args: - output_dir (str): Path to output directory - overwrite_output (bool): If True, delete output directory if it exists - - Returns: - basepath (str): Path to base-path directory for writing output files - """ - if os.path.isdir(output_dir) and os.listdir(output_dir): - if overwrite_output: - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - os.mkdir(output_dir) - else: - raise Exception("Output directory is nonempty and overwrite_output = false") - elif not os.path.isdir(output_dir): - os.makedirs(output_dir) - - # only add root if paths are relative - if not os.path.isabs(output_dir): - ROOT = os.getcwd() - basepath = os.path.join(ROOT, output_dir) - else: - basepath = output_dir - return basepath - - -def binary_search_alignments( - inds: List[int], max_audio_read_sec: float, min_alignment_count: int, alignments: List[float], -) -> int: - """ - Binary search to find the index of the alignment that satisfies the maximum audio read duration, - `max_audio_read_sec`. This is used to avoid reading the short audio files. - NOTE: `offset_max` should be at least 1 to avoid feeding max=0 to random sampling function. 
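The directory handling in `get_cleaned_base_path` boils down to: wipe a non-empty output directory only when overwriting is allowed, create it when missing, and anchor relative paths at the current working directory. A standalone sketch of that logic (the function name here is illustrative, not from the library):

```python
import os
import shutil

def prepare_output_dir(output_dir: str, overwrite_output: bool = True) -> str:
    """Illustrative restatement of the output-directory preparation above."""
    if os.path.isdir(output_dir) and os.listdir(output_dir):
        if not overwrite_output:
            raise RuntimeError("Output directory is nonempty and overwrite_output=False")
        shutil.rmtree(output_dir)   # wipe and recreate when overwriting is allowed
        os.mkdir(output_dir)
    elif not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # Relative paths are anchored at the current working directory.
    return output_dir if os.path.isabs(output_dir) else os.path.join(os.getcwd(), output_dir)

print(prepare_output_dir("./sim_sessions"))
```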
- - Args: - inds (list): List of indices to search from - max_audio_read_sec (float): Maximum audio read duration - min_alignment_count (int): Minimum number of alignments to read - audio_manifest (dict): Dictionary containing the audio file's alignments - - Returns: - offset_max (int) Index of the alignment that satisfies the maximum audio read duration - """ - # Start from the left end (at index 0) and -1 * min_alignment_count for the right end - left, right = 0, len(inds) - 1 - min_alignment_count - while left < right: - mid = left + (right - left) // 2 - dur_left = alignments[-1 * min_alignment_count] - alignments[inds[mid]] - if dur_left < max_audio_read_sec: - right = mid - 1 - elif dur_left > max_audio_read_sec: - left = mid + 1 - else: - break - mid_out = left + (right - left) // 2 - # If mid_out is on the boundary, move it to the left. - if alignments[-1 * min_alignment_count] - alignments[inds[mid_out]] < max_audio_read_sec: - mid_out -= 1 - offset_max = max(mid_out, 1) - return offset_max - - -def get_subset_of_audio_manifest( - audio_manifest: dict, offset_index: int, max_audio_read_sec: float, min_alignment_count: int, -) -> dict: - """ - Get a subset of `audio_manifest` for faster audio-file reading. - - Args: - audio_manifest (dict): Audio manifest dictionary. - keys: 'offset', 'duration', 'alignments', 'words' - offset_index (int): Index of the offset. - max_audio_read_sec (float): Maximum audio read duration. - min_alignment_count (int): Minimum number of alignments to read. - - Returns: - audio_manifest (dict): Subset of `audio_manifest` is returned for `words` and `alignments` keys. - """ - alignment_array = np.array(audio_manifest['alignments']) - alignment_array_pr = np.array(alignment_array[offset_index:]) - alignment_array[offset_index] - subset_alignments = alignment_array_pr[alignment_array_pr < max_audio_read_sec] - if len(subset_alignments) < min_alignment_count: - # Cases where the word next to the offset is longer than the max_audio_read_sec. - logging.warning( - f"subset_alignments of {audio_manifest['audio_filepath']} \n" - f"has subset alignment length:{len(subset_alignments)} at offset_index:{offset_index}, " - f"word:{audio_manifest['words'][offset_index:offset_index+min_alignment_count]}, " - f"alignments:{alignment_array_pr[:min_alignment_count]} which is longer than _max_audio_read_sec:{[0, max_audio_read_sec]}." - " Truncating the alignements." - ) - # Attach the `_max_audio_read_sec` to the `subset_alignments` to truncate the alignment timestamp. - subset_alignments = np.concatenate([subset_alignments, np.array([max_audio_read_sec])]) - audio_manifest['offset'], audio_manifest['duration'] = ( - alignment_array[offset_index], - subset_alignments[-1] - subset_alignments[0], - ) - audio_manifest['alignments'] = subset_alignments.tolist() - audio_manifest['words'] = audio_manifest['words'][offset_index : offset_index + len(subset_alignments)] - return audio_manifest - - -def read_audio_from_buffer( - audio_manifest: dict, - buffer_dict: dict, - offset_index: int, - device: torch.device, - max_audio_read_sec: float = 2.5, - min_alignment_count: int = 2, - read_subset: bool = True, -) -> Tuple[torch.Tensor, int, dict]: - """ - Read from the provided file path while maintaining a hash-table that saves loading time. - Also, this function only reads a subset of the audio file if `read_subset` is True for faster audio-file reading. - - Args: - audio_manifest (dict): Audio manifest dictionary. 
- keys: 'audio_filepath', 'duration', 'alignments', 'words' - buffer_dict (dict): Hash-table that saves loaded audio files. - offset_index (int): Index of the offset for the audio file. - device (torch.device): Device to load the audio file. - max_audio_read_sec (float): Maximum audio read duration. - min_alignment_count (int): Minimum number of alignments to read. - read_subset (bool): If True, read a subset of the audio file. - To control the length of the audio file, use data_simulator.session_params.max_audio_read_sec. - Note that using large value (greater than 3~4 sec) for `max_audio_read_sec` will slow down the generation process. - If False, read the entire audio file. - - Returns: - audio_file (torch.Tensor): Time-series audio data in a tensor. - sr (int): Sample rate of the audio file. - audio_manifest (dict): (modified) audio manifest dictionary. - """ - audio_file_id = f"{audio_manifest['audio_filepath']}#{offset_index}" - if audio_file_id in buffer_dict: - audio_file, sr, audio_manifest = buffer_dict[audio_file_id] - else: - if read_subset: - audio_manifest = get_subset_of_audio_manifest( - audio_manifest=audio_manifest, - offset_index=offset_index, - max_audio_read_sec=max_audio_read_sec, - min_alignment_count=min_alignment_count, - ) - segment = AudioSegment.from_file( - audio_file=audio_manifest['audio_filepath'], - offset=audio_manifest['offset'], - duration=audio_manifest['duration'], - ) - else: - segment = AudioSegment.from_file(audio_file=audio_manifest['audio_filepath']) - audio_file, sr = torch.from_numpy(segment.samples).to(device), segment.sample_rate - if read_subset and segment.duration < (audio_manifest['alignments'][-1] - audio_manifest['alignments'][0]): - audio_manifest['alignments'][-1] = min(segment.duration, audio_manifest['alignments'][-1]) - if audio_file.ndim > 1: - audio_file = torch.mean(audio_file, 1, False).to(device) - buffer_dict[audio_file_id] = (audio_file, sr, audio_manifest) - return audio_file, sr, audio_manifest - - -def perturb_audio( - audio: torch.Tensor, sr: int, augmentor: Optional[AudioAugmentor] = None, device: Optional[torch.device] = None -) -> torch.Tensor: - - """ - Perturb the audio (segment or session) using audio augmentor. - - Args: - audio (torch.Tensor): Time-series signal of the segment - sr (int): Sample rate of the original audio file - augmentor (AudioAugmentor): Audio augmentor to use - device (torch.device): Device to load the audio file - - Returns: - audio (torch.Tensor): Perturbed audio (time-series signal) of the segment - """ - if augmentor is None: - return audio - device = device if device is not None else torch.device('cpu') - if isinstance(audio, torch.Tensor): - audio = audio.cpu().numpy() - audio_segment = AudioSegment(audio, sample_rate=sr) - augmentor.perturb(audio_segment) - audio_segment = torch.from_numpy(audio_segment.samples).to(device) - return audio_segment - - -def normalize_audio(array: torch.Tensor) -> torch.Tensor: - """ - Normalize the audio signal to avoid clipping. - - Args: - array (torch.Tensor): Time-series audio data in a tensor. - - Returns: - (torch.Tensor): Normalized audio signal. - """ - return array / (1.0 * torch.max(torch.abs(array))) - - -def get_power_of_audio_file(audio_file: str, end_audio_file: int, running_len_samples: int, device: torch.device): - """ - Calculate the power of the audio signal. - - Args: - audio_file (torch.Tensor): Time-series audio data in a tensor. - end_audio_file (int): End index of the audio file. 
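`normalize_audio` above is a plain peak normalization; a two-line check makes the behaviour concrete:

```python
# Peak normalization as in normalize_audio(): dividing by the absolute maximum
# maps the waveform into [-1, 1] so it cannot clip when written out.
import torch

audio = torch.tensor([0.2, -1.6, 0.8, 0.4])
normalized = audio / (1.0 * torch.max(torch.abs(audio)))
print(normalized)               # tensor([ 0.1250, -1.0000,  0.5000,  0.2500])
print(normalized.abs().max())   # tensor(1.)
```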
- running_len_samples (int): Running length of the audio file. - device (torch.device): Device to use. - - Returns: - (float): Power of the audio signal. - """ - return torch.mean(audio_file[: end_audio_file - running_len_samples] ** 2).to(device) - - -def get_scaled_audio_signal( - audio_file: torch.Tensor, - end_audio_file: int, - running_len_samples: int, - desired_avg_power_noise: float, - device: torch.device, -): - """ - Scale the audio signal to the desired average power. - - Args: - audio_file (torch.Tensor): Time-series audio data in a tensor. - end_audio_file (int): End index of the audio file. - running_len_samples (int): Running length of the audio file. - desired_avg_power_noise (float): Desired average power of the audio file. - device (torch.device): Device to use. - - Returns: - scaled_audio_file (torch.Tensor): Scaled audio signal. - """ - pow_audio_file = get_power_of_audio_file( - audio_file=audio_file, end_audio_file=end_audio_file, running_len_samples=running_len_samples, device=device - ) - scaled_audio_file = audio_file[: end_audio_file - running_len_samples] * torch.sqrt( - desired_avg_power_noise / pow_audio_file - ).to(device) - return scaled_audio_file - - -def get_desired_avg_power_noise( - power_array: float, snr_min: float, snr_max: float, background_noise_snr: float, -): - """ - Calculate the desired average power of the noise. - - Args: - power_array (float): Power of the audio signal. - snr_min (float): Minimum SNR. - snr_max (float): Maximum SNR. - background_noise_snr (float): SNR of the background noise. - - Returns: - desired_avg_power_noise (float): Desired average power of the noise. - """ - if (snr_min is not None) and (snr_max is not None) and (snr_min <= snr_max): - desired_snr = np.random.uniform(snr_min, snr_max) - else: - desired_snr = background_noise_snr - ratio = 10 ** (desired_snr / 20) - desired_avg_power_noise = power_array / ratio - return desired_avg_power_noise, desired_snr - - -def get_background_noise( - len_array: int, - power_array: float, - noise_samples: list, - audio_read_buffer_dict: dict, - snr_min: float, - snr_max: float, - background_noise_snr: float, - seed: int, - device: torch.device, -): - """ - Augment with background noise (inserting ambient background noise up to the desired SNR for the full clip). - - Args: - len_array (int): Length of background noise required. - power_array (float): Power of the audio signal. - noise_samples (list): List of noise samples. - audio_read_buffer_dict (dict): Dictionary containing audio read buffer. - snr_min (float): Minimum SNR. - snr_max (float): Maximum SNR. - background_noise_snr (float): SNR of the background noise. - seed (int): Seed for random number generator. - device (torch.device): Device to use. - - Returns: - bg_array (tensor): Tensor containing background noise. - desired_snr (float): Desired SNR for adding background noise. 
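The noise-scaling helpers above combine into a simple recipe: pick an SNR, convert it to a ratio with `10 ** (snr / 20)` (the convention used by this code), derive the target noise power from the speech power, and rescale the noise so its mean square matches that target. A small standalone sketch with made-up numbers:

```python
import torch

speech_power = 0.04      # mean(speech ** 2), as in get_power_of_audio_file
desired_snr = 20.0       # dB, e.g. drawn uniformly from [snr_min, snr_max]

ratio = 10 ** (desired_snr / 20)            # convention used by get_desired_avg_power_noise
desired_noise_power = speech_power / ratio  # 0.004

noise = 0.3 * torch.randn(16000)
gain = torch.sqrt(desired_noise_power / torch.mean(noise ** 2))
scaled_noise = noise * gain

print(float(torch.mean(scaled_noise ** 2)))  # ~0.004, the desired noise power
```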
- """ - np.random.seed(seed) - bg_array = torch.zeros(len_array).to(device) - desired_avg_power_noise, desired_snr = get_desired_avg_power_noise( - power_array=power_array, snr_min=snr_min, snr_max=snr_max, background_noise_snr=background_noise_snr - ) - running_len_samples = 0 - - while running_len_samples < len_array: # build background audio stream (the same length as the full file) - file_id = np.random.randint(len(noise_samples)) - audio_file, sr, audio_manifest = read_audio_from_buffer( - audio_manifest=noise_samples[file_id], - buffer_dict=audio_read_buffer_dict, - offset_index=0, - device=device, - read_subset=False, - ) - if running_len_samples + len(audio_file) < len_array: - end_audio_file = running_len_samples + len(audio_file) - else: - end_audio_file = len_array - scaled_audio_file = get_scaled_audio_signal( - audio_file=audio_file, - end_audio_file=end_audio_file, - running_len_samples=running_len_samples, - desired_avg_power_noise=desired_avg_power_noise, - device=device, - ) - - bg_array[running_len_samples:end_audio_file] = scaled_audio_file - running_len_samples = end_audio_file - - return bg_array, desired_snr - - -def get_random_offset_index( - audio_manifest: dict, - audio_read_buffer_dict: dict, - offset_min: int = 0, - max_audio_read_sec: float = 2.5, - min_alignment_count: int = 2, -) -> int: - """ - Get an index for randomly accessing the silence in alignment timestamps. - - Args: - audio_manifest (dict): Audio manifest dictionary. - keys: 'audio_filepath', 'duration', 'alignments', 'words' - audio_read_buffer_dict (dict): Dictionary containing audio read buffer. - offset_min (int): Minimum offset index. (Default: 0) - max_audio_read_sec (float): Maximum audio read duration in seconds. (Default: 2.5) - min_alignment_count (int): Minimum number of alignment timestamps. (Default: 2) - - Returns: - (int): Random offset index smaller than `offset_count`. - """ - if len(audio_manifest['alignments']) <= min_alignment_count: - raise ValueError( - f"Audio file {audio_manifest['audio_filepath']} has less than {min_alignment_count} alignment timestamps." - ) - index_file_id = f"{audio_manifest['audio_filepath']}#index" - - # Avoid multiple indexings of the same audio file by using a hash-table. - if index_file_id in audio_read_buffer_dict: - (sil_inds, offset_max) = audio_read_buffer_dict[index_file_id] - else: - # Find all silence indices - sil_inds = np.where((np.array(audio_manifest['words']) == '') == True)[0] - if audio_manifest['alignments'][-1] - audio_manifest['alignments'][0] < max_audio_read_sec: - # The total duration is already short, therefore skip range search. - offset_max = 1 - else: - # Find the range that satisfies `max_audio_read_sec` duration. - offset_max = binary_search_alignments( - inds=sil_inds, - max_audio_read_sec=max_audio_read_sec, - min_alignment_count=min_alignment_count, - alignments=audio_manifest['alignments'], - ) - - audio_read_buffer_dict[index_file_id] = (sil_inds, offset_max) - - # If the audio file is shorter than the max_audio_read_sec, then we don't need to read a subset of the audio file. - if ( - len(sil_inds) <= min_alignment_count - or (audio_manifest['alignments'][-1] - audio_manifest['alignments'][0]) < max_audio_read_sec - ): - return offset_min - else: - offset_index = np.random.randint(offset_min, offset_max) - return sil_inds[offset_index] - - -def get_speaker_ids(sess_idx: int, speaker_samples: dict, permutated_speaker_inds: list) -> List[str]: - """ - Randomly select speaker IDs from the loaded manifest file. 
- - Args: - sess_idx (int): Session index in integer. - speaker_samples (dict): Dictionary mapping speaker ID to their list of samples. - permutated_speaker_inds (list): List of permutated speaker indices. - - Returns: - speaker_ids (list): List of speaker IDs - """ - all_speaker_ids = list(speaker_samples.keys()) - idx_list = permutated_speaker_inds[sess_idx, :] - speaker_ids = [all_speaker_ids[i] for i in idx_list] - return speaker_ids - - -def build_speaker_samples_map(manifest: dict) -> dict: - """ - Build a dictionary for mapping speaker ID to their list of samples - - Returns: - speaker_samples (Dict[list]): - Dictionary mapping speaker ID to their list of samples - """ - speaker_samples = defaultdict(list) - logging.info("Building speaker to samples map...") - for sample in tqdm(manifest, total=len(manifest)): - speaker_id = sample['speaker_id'] - speaker_samples[speaker_id].append(sample) - return speaker_samples - - -def read_noise_manifest(add_bg: bool, background_manifest: str): - """ - Read the noise manifest file and sample the noise manifest. - - Args: - add_bg (bool): Whether to add background noise. - background_manifest (str): Path to the background noise manifest file. - - Returns: - noise_manifest (list): List of the entire noise source samples. - """ - noise_manifest = [] - if add_bg is True: - if background_manifest is not None: - background_manifest_list = background_manifest - if isinstance(background_manifest_list, str): - background_manifest_list = [background_manifest_list] - for background_manifest in background_manifest_list: - if os.path.exists(background_manifest): - noise_manifest += read_manifest(background_manifest) - else: - raise FileNotFoundError(f"Noise manifest file: {background_manifest} file not found.") - else: - raise FileNotFoundError( - f"Noise manifest file is {background_manifest}. Please provide a valid noise manifest file/list if add_bg=True." - ) - return noise_manifest - - -def get_speaker_samples(speaker_ids: List[str], speaker_samples: dict) -> Dict[str, list]: - """ - Get a list of the samples for each of the specified speakers. - - Args: - speaker_ids (list): LibriSpeech speaker IDs for each speaker in the current session. - speaker_samples (dict): Dictionary mapping speaker ID to their list of samples. - - Returns: - speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments. - """ - speaker_wav_align_map = defaultdict(list) - for sid in speaker_ids: - speaker_wav_align_map[sid] = speaker_samples[sid] - return speaker_wav_align_map - - -def add_silence_to_alignments(audio_manifest: dict): - """ - Add silence to the beginning of the alignments and words. - - Args: - audio_manifest (dict): Audio manifest dictionary. - keys: 'audio_filepath', 'duration', 'alignments', 'words' - - Returns: - audio_manifest (dict): Audio manifest dictionary with silence added to the beginning. - """ - if type(audio_manifest['words'][0]) == str and len(audio_manifest['words'][0]) > 0: - audio_manifest['words'].insert(0, "") - audio_manifest['alignments'].insert(0, 0.0) - return audio_manifest - - -def load_speaker_sample( - speaker_wav_align_map: List[dict], speaker_ids: List[str], speaker_turn: int, min_alignment_count: int, -) -> str: - """ - Load a sample for the selected speaker ID. - The first alignment and word must be silence that determines the start of the alignments. 
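`build_speaker_samples_map` and `get_speaker_ids` above implement a two-step selection: group manifest entries by speaker, then index the speaker list with a per-session permutation. A toy sketch with made-up manifest entries:

```python
from collections import defaultdict

import numpy as np

manifest = [  # made-up entries; real ones also carry words, alignments, duration, ...
    {"audio_filepath": "a.wav", "speaker_id": "1001"},
    {"audio_filepath": "b.wav", "speaker_id": "1001"},
    {"audio_filepath": "c.wav", "speaker_id": "2002"},
    {"audio_filepath": "d.wav", "speaker_id": "3003"},
]

# Step 1: speaker ID -> list of that speaker's samples.
speaker_samples = defaultdict(list)
for sample in manifest:
    speaker_samples[sample["speaker_id"]].append(sample)

# Step 2: pick speakers for each session from a permutation of the speaker list.
rng = np.random.default_rng(0)
num_sessions, speakers_per_session = 2, 2
all_ids = list(speaker_samples.keys())
permuted = np.vstack(
    [rng.permutation(len(all_ids))[:speakers_per_session] for _ in range(num_sessions)]
)

for sess_idx in range(num_sessions):
    print([all_ids[i] for i in permuted[sess_idx, :]])
```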
- - Args: - speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments. - speaker_ids (list): LibriSpeech speaker IDs for each speaker in the current session. - speaker_turn (int): Current speaker turn. - output_precision (int): Precision of the output alignments in integer. - min_alignment_count (int): Minimum number of alignments in the audio file. - - Returns: - audio_manifest (dict): Audio manifest dictionary containing the wav filepath, words and alignments. - """ - speaker_id = speaker_ids[speaker_turn] - file_id = np.random.randint(0, max(len(speaker_wav_align_map[str(speaker_id)]) - 1, 1)) - audio_manifest = speaker_wav_align_map[str(speaker_id)][file_id] - - # Check if the alignment file has at least 2 words. - if len(audio_manifest['alignments']) < min_alignment_count: - raise ValueError( - f"Alignment file {audio_manifest['audio_filepath']} has an inappropriate length of {len(audio_manifest['alignments'])} < 2." - ) - - # Check whether the first word is silence and insert a silence token if the first token is not silence. - if audio_manifest['words'][0] != "": - audio_manifest = add_silence_to_alignments(audio_manifest) - - audio_manifest = copy.deepcopy(audio_manifest) - return audio_manifest - - -def get_split_points_in_alignments( - words: List[str], - alignments: List[float], - split_buffer: float, - sr: int, - sentence_audio_len: int, - new_start: float = 0, -): - """ - Collect split points in the alignment based on silence. - Silence is defined as a blank symbol between two words that is longer than 2 * split_buffer. - - Args: - words (List[str]): List of words in the sentence. - alignments (List[float]): List of alignment timestamps in the sentence. - split_buffer (float): Buffer length in seconds. - sr (int): Sample rate of the audio. - sentence_audio_len (int): Length of the sentence audio in samples. - new_start (float): Start of the sentence audio in seconds. - - Returns: - splits (List[List[int]]): List of integer split points in the sentence audio. - """ - splits = [] - for i in range(len(words)): - if words[i] == "" and i != 0 and i != len(words) - 1: - silence_length = alignments[i] - alignments[i - 1] - if silence_length > 2 * split_buffer: # split utterance on silence - new_end = alignments[i - 1] + split_buffer - splits.append( - [int(new_start * sr), int(new_end * sr),] - ) - new_start = alignments[i] - split_buffer - # The last split point should be added - splits.append([int(new_start * sr), sentence_audio_len]) - return splits - - -def per_speaker_normalize( - sentence_audio: torch.Tensor, splits: List[List[int]], speaker_turn: int, volume: List[float], device: torch.device -) -> torch.Tensor: - """ - Normalize time-series audio signal per speaker. - - Args: - sentence_audio (torch.Tensor): Time-series audio signal. - splits (List[List[int]]): List of integer split points in the sentence audio. - speaker_turn (int): Speaker ID of the current speaker. - volume (List[float]): List of volume levels for each speaker. - device (torch.device): Device to use for computations. - - Returns: - sentence_audio (torch.Tensor): Normalized time-series audio signal. 
- """ - split_length = torch.tensor(0).to(device).double() - split_sum = torch.tensor(0).to(device).double() - for split in splits: - split_length += len(sentence_audio[split[0] : split[1]]) - split_sum += torch.sum(sentence_audio[split[0] : split[1]] ** 2) - average_rms = torch.sqrt(split_sum * 1.0 / split_length) - sentence_audio = sentence_audio / (1.0 * average_rms) * volume[speaker_turn] - return sentence_audio - - -class DataAnnotator(object): - """ - Class containing the functions that create RTTM, CTM, JSON files. - - Arguments in config: - - data_simulator: - session_config: - num_speakers (int): Number of unique speakers per multispeaker audio session - session_params: - split_buffer (float): Split RTTM labels if greater than twice this amount of silence (to avoid long gaps between - utterances as being labelled as speech) - outputs: - output_dir (str): Output directory for audio sessions and corresponding label files - output_filename (str): Output filename for the wav and RTTM files - overwrite_output (bool): If true, delete the output directory if it exists - output_precision (int): Number of decimal places in output files - """ - - def __init__(self, cfg): - """ - Args: - cfg: OmegaConf configuration loaded from yaml file. - """ - self._params = cfg - self._files = {} - self._init_file_write() - self._init_filelist_lists() - - def _init_file_write(self): - """ - Initialize file writing arguments - """ - self._file_base_str = "synthetic" - self._file_types = ["wav", "rttm", "json", "ctm", "txt", "meta"] - self._annotation_types = ["rttm", "json", "ctm"] - - def _init_filelist_lists(self): - """ - Initialize lists to store the filelists for each file type - """ - self.annote_lists = {} - for file_type in self._file_types: - self.annote_lists[f"{file_type}_list"] = [] - - def init_annotation_lists(self): - """ - Initialize lists to store the annotations for each file type - """ - for file_type in self._file_types: - self.annote_lists[file_type] = [] - - def create_new_rttm_entry( - self, words: List[str], alignments: List[float], start: int, end: int, speaker_id: int - ) -> List[str]: - - """ - Create new RTTM entries (to write to output rttm file) - - Args: - words (list): List of words in the current audio file. - alignments (list): List of alignments (timestamps) for the current audio file. - start (int): Current start of the audio file being inserted. - end (int): End of the audio file being inserted. - speaker_id (int): LibriSpeech speaker ID for the current entry. 
- - Returns: - rttm_list (list): List of rttm entries - """ - rttm_list = [] - new_start = start - # look for split locations - for i in range(len(words)): - if words[i] == "" and i != 0 and i != len(words) - 1: - silence_length = alignments[i] - alignments[i - 1] - if ( - silence_length > 2 * self._params.data_simulator.session_params.split_buffer - ): # split utterance on silence - new_end = start + alignments[i - 1] + self._params.data_simulator.session_params.split_buffer - t_stt = round(float(new_start), self._params.data_simulator.outputs.output_precision) - t_end = round(float(new_end), self._params.data_simulator.outputs.output_precision) - rttm_list.append(f"{t_stt} {t_end} {speaker_id}") - new_start = start + alignments[i] - self._params.data_simulator.session_params.split_buffer - - t_stt = round(float(new_start), self._params.data_simulator.outputs.output_precision) - t_end = round(float(end), self._params.data_simulator.outputs.output_precision) - rttm_list.append(f"{t_stt} {t_end} {speaker_id}") - return rttm_list - - def create_new_json_entry( - self, - text: List[str], - wav_filename: str, - start: float, - length: float, - speaker_id: int, - rttm_filepath: str, - ctm_filepath: str, - ) -> dict: - """ - Create new JSON entries (to write to output json file). - - Args: - text (list): string of text for the current entry. - wav_filename (str): Filename of the wav file. - start (float): Start time of the current entry. - length (float): Length of the current entry. - speaker_id (int): speaker ID for the current entry. - rttm_filepath (str): Path to the RTTM file. - ctm_filepath (str): Path to the CTM file. - - Returns: - meta (dict): JSON entry dictionary. - """ - start = round(float(start), self._params.data_simulator.outputs.output_precision) - length = round(float(length), self._params.data_simulator.outputs.output_precision) - meta = { - "audio_filepath": wav_filename, - "offset": start, - "duration": length, - "label": speaker_id, - "text": text, - "num_speakers": self._params.data_simulator.session_config.num_speakers, - "rttm_filepath": rttm_filepath, - "ctm_filepath": ctm_filepath, - "uem_filepath": None, - } - return meta - - def create_new_ctm_entry( - self, words: List[str], alignments: List[float], session_name: str, speaker_id: int, start: int - ) -> List[str]: - """ - Create new CTM entry (to write to output ctm file) - - Args: - words (list): List of words in the current audio file. - alignments (list): List of alignments (timestamps) for the current audio file. - session_name (str): Current session name. - speaker_id (int): LibriSpeech speaker ID for the current entry. - start (int): Current start of the audio file being inserted. 
- - Returns: - arr (list): List of ctm entries - """ - arr = [] - start = float(round(start, self._params.data_simulator.outputs.output_precision)) - for i in range(len(words)): - word = words[i] - if ( - word != "" - ): # note that using the current alignments the first word is always empty, so there is no error from indexing the array with i-1 - prev_align = 0 if i == 0 else alignments[i - 1] - align1 = round(float(prev_align + start), self._params.data_simulator.outputs.output_precision) - align2 = round(float(alignments[i] - prev_align), self._params.data_simulator.outputs.output_precision) - text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n" - arr.append((align1, text)) - return arr - - def add_to_filename_lists(self, basepath: str, filename: str): - """ - Add the current filename to the list of filenames for each file type. - - Args: - basepath (str): Basepath for output files. - filename (str): Base filename for all output files. - """ - full_base_filepath = os.path.join(basepath, filename) - for file_type in self._file_types: - self.annote_lists[f"{file_type}_list"].append(f"{full_base_filepath}.{file_type}") - - def write_filelist_files(self, basepath): - """ - Write all filelist files. - - Args: - basepath (str): Basepath for output files. - """ - for file_type in self._file_types: - with open(f"{basepath}/{self._file_base_str}_{file_type}.list", "w") as list_file: - list_file.write("\n".join(self.annote_lists[f"{file_type}_list"])) - list_file.close() - - def write_annotation_files(self, basepath: str, filename: str, meta_data: dict): - """ - Write all annotation files: RTTM, JSON, CTM, TXT, and META. - - Args: - basepath (str): Basepath for output files. - filename (str): Base filename for all output files. - meta_data (dict): Metadata for the current session. - rttm_list (list): List of RTTM entries. - json_list (list): List of JSON entries. - ctm_list (list): List of CTM entries. - """ - labels_to_rttmfile(self.annote_lists['rttm'], filename, self._params.data_simulator.outputs.output_dir) - write_manifest(os.path.join(basepath, filename + '.json'), self.annote_lists['json']) - write_ctm(os.path.join(basepath, filename + '.ctm'), self.annote_lists['ctm']) - write_text(os.path.join(basepath, filename + '.txt'), self.annote_lists['ctm']) - write_manifest(os.path.join(basepath, filename + '.meta'), [meta_data]) - - -class SpeechSampler(object): - """ - Class for sampling speech samples for Multispeaker Audio Session Simulator - - Args: - cfg: OmegaConf configuration loaded from yaml file. - - Variables for sampling speech: - self.running_speech_len_samples (int): Running total of speech samples in the current audio session. - self.running_silence_len_samples (int): Running total of silence samples in the current audio session. - self.running_overlap_len_samples (int): Running total of overlap samples in the current audio session. - - self.sess_silence_mean (int) : Targeted mean number of silence samples in the current audio session. - self.per_silence_min_len (int): Minimum number of silence samples in the silence segment. - self.per_silence_max_len (int): Maximum number of silence samples in the silence segment. - - self.sess_overlap_mean (int): Targeted mean number of overlap samples in the current audio session. - self.per_overlap_min_len (int): Minimum number of overlap samples in the overlap segment. - self.per_overlap_max_len (int): Maximum number of overlap samples in the overlap segment. 
- - data_simulator: - session_params: - mean_silence (float): Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1). - mean_silence_var (float): Variance for mean silence in all audio sessions. - This value should be 0 <= mean_silence_var < mean_silence * (1 - mean_silence). - per_silence_var (float): Variance for each silence in an audio session, set large values (e.g., 20) for de-correlation. - per_silence_min (float): Minimum duration for each silence, default to 0. - per_silence_max (float): Maximum duration for each silence, default to -1 for no maximum. - - mean_overlap (float): Mean proportion of overlap in the overall non-silence duration. Should be in range [0, 1) and - recommend [0, 0.15] range for accurate results. - mean_overlap_var (float): Variance for mean overlap in all audio sessions. - This value should be 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap). - per_overlap_var (float): Variance for per overlap in each session, set large values to de-correlate silence lengths - with the latest speech segment lengths - per_overlap_min (float): Minimum per overlap duration in seconds - per_overlap_max (float): Maximum per overlap duration in seconds, set -1 for no maximum - """ - - def __init__(self, cfg): - """ - Args: - cfg: OmegaConf configuration loaded from yaml file. - """ - self._params = cfg - - self.running_speech_len_samples = 0 - self.running_silence_len_samples = 0 - self.running_overlap_len_samples = 0 - - self.sess_silence_mean = None - self.per_silence_min_len = 0 - self.per_silence_max_len = 0 - - self.sess_overlap_mean = None - self.per_overlap_min_len = 0 - self.per_overlap_max_len = 0 - - self.mean_overlap = float(self._params.data_simulator.session_params.mean_overlap) - self.mean_overlap_var = float(self._params.data_simulator.session_params.mean_overlap_var) - - self.mean_silence = float(self._params.data_simulator.session_params.mean_silence) - self.mean_silence_var = float(self._params.data_simulator.session_params.mean_silence_var) - - self.per_silence_var = float(self._params.data_simulator.session_params.per_silence_var) - self.per_overlap_var = float(self._params.data_simulator.session_params.per_overlap_var) - - self.num_noise_files = int(self._params.data_simulator.background_noise.num_noise_files) - - def _mean_var_to_a_and_b(self, mean: float, var: float) -> Tuple[float, float]: - """ - Convert mean and variance to a and b parameters for beta distribution. - - Args: - mean (float): Mean of the beta distribution. - var (float): Variance of the beta distribution. - - Returns: - Tuple[float, float]: a and b parameters for beta distribution. - """ - a = mean ** 2 * (1 - mean) / var - mean - b = mean * (1 - mean) ** 2 / var - (1 - mean) - return a, b - - def _init_silence_params(self): - """ - Initialize parameters for silence insertion in the current session. 
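The conversion in `_mean_var_to_a_and_b` is the standard Beta re-parameterization; a quick numeric check confirms that `Beta(a, b)` recovers the requested mean and variance (valid only while `0 < var < mean * (1 - mean)`):

```python
from scipy.stats import beta

m, v = 0.2, 0.01                      # e.g. mean_silence and mean_silence_var
a = m ** 2 * (1 - m) / v - m          # same formulas as _mean_var_to_a_and_b
b = m * (1 - m) ** 2 / v - (1 - m)

dist = beta(a, b)
print(a, b)                           # 3.0 12.0, both > 0 as required
print(dist.mean(), dist.var())        # 0.2 and 0.01
```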
- """ - self.running_speech_len_samples = 0 - self.running_silence_len_samples = 0 - - self.per_silence_min_len = int( - max(0, self._params.data_simulator.session_params.per_silence_min) * self._params.data_simulator.sr - ) - if self._params.data_simulator.session_params.per_silence_max > 0: - self.per_silence_max_len = int( - self._params.data_simulator.session_params.per_silence_max * self._params.data_simulator.sr - ) - else: - self.per_silence_max_len = int( - self._params.data_simulator.session_config.session_length * self._params.data_simulator.sr - ) - - def _init_overlap_params(self): - """ - Initialize parameters for overlap insertion in the current session. - """ - self.running_overlap_len_samples = 0 - - self.per_overlap_min_len = int( - max(0, self._params.data_simulator.session_params.per_overlap_min) * self._params.data_simulator.sr - ) - if self._params.data_simulator.session_params.per_overlap_max > 0: - self.per_overlap_max_len = int( - self._params.data_simulator.session_params.per_overlap_max * self._params.data_simulator.sr - ) - else: - self.per_overlap_max_len = int( - self._params.data_simulator.session_config.session_length * self._params.data_simulator.sr - ) - - def silence_vs_overlap_selector(self, running_len_samples: int, non_silence_len_samples: int) -> bool: - """ - Compare the current silence ratio to the current overlap ratio. Switch to either silence or overlap mode according - to the amount of the gap between current ratio and session mean in config. - - Args: - running_len_samples (int): Length of the current session in samples. - non_silence_len_samples (int): Length of the signal that is not silence in samples. - - Returns: - add_overlap (bool): True if the current silence ratio is less than the current overlap ratio, False otherwise. - """ - if running_len_samples > 0: - self.current_silence_ratio = (running_len_samples - self.running_speech_len_samples) / running_len_samples - self.current_overlap_ratio = self.running_overlap_len_samples / non_silence_len_samples - else: - self.current_silence_ratio, self.current_overlap_ratio = 0, 0 - - # self.silence_discrepancy = max(0, self.sess_silence_mean - self.current_silence_ratio) - # self.overlap_discrepancy = max(0, self.sess_overlap_mean - self.current_overlap_ratio) - # threshold = self.silence_discrepancy / (self.overlap_discrepancy + self.silence_discrepancy + 1e-10) - # add_overlap = np.random.rand() > threshold - self.silence_discrepancy = self.current_silence_ratio - self.sess_silence_mean - self.overlap_discrepancy = self.current_overlap_ratio - self.sess_overlap_mean - add_overlap = bool(self.overlap_discrepancy < self.silence_discrepancy) - return add_overlap - - def get_session_silence_mean(self): - """ - Get the target mean silence for current session using re-parameterized Beta distribution. - The following constraints are applied to make a > 0 and b > 0: - - 0 < mean_silence < 1 - 0 < mean_silence_var < mean_silence * (1 - mean_silence) - - Args: - silence_mean (float): - Target mean silence for the current session - """ - self._init_silence_params() - mean, var = self.mean_silence, self.mean_silence_var - if var > 0: - a, b = self._mean_var_to_a_and_b(mean, var) - if a < 0 or b < 0: - raise ValueError( - f"Beta(a, b), a = {a:.3f} and b = {b:.3f} should be both greater than 0. " - f"Invalid `mean_silence_var` value {var} for sampling from Beta distribution. " - f"`mean_silence_var` should be less than `mean_silence * (1 - mean_silence)`. 
" - f"Please check `mean_silence_var` and try again." - ) - self.sess_silence_mean = beta(a, b).rvs() - else: - self.sess_silence_mean = mean - return self.sess_silence_mean - - def get_session_overlap_mean(self): - """ - Get the target mean overlap for current session using re-parameterized Beta distribution. - The following constraints are applied to make a > 0 and b > 0: - - 0 < mean_overlap < 1 - 0 < mean_overlap_var < mean_overlap * (1 - mean_overlap) - - Returns: - overlap_mean (float): - Target mean overlap for the current session - """ - self._init_overlap_params() - mean, var = self.mean_overlap, self.mean_overlap_var - if var > 0: - a, b = self._mean_var_to_a_and_b(mean, var) - if a < 0 or b < 0: - raise ValueError( - f"Beta(a, b), a = {a:.3f} and b = {b:.3f} should be both greater than 0. " - f"Invalid `mean_overlap_var` value {var} for sampling from Beta distribution. " - f"`mean_overlap_var` should be less than `mean_overlap * (1 - mean_overlap)`. " - f"Please check `mean_overlap_var` and try again." - ) - self.sess_overlap_mean = beta(a, b).rvs() - else: - self.sess_overlap_mean = mean - return self.sess_overlap_mean - - def sample_from_silence_model(self, running_len_samples: int) -> int: - """ - Sample from the silence model to determine the amount of silence to add between sentences. - Gamma distribution is employed for modeling the highly skewed distribution of silence length distribution. - When we add silence between sentences, we want to ensure that the proportion of silence meets the `sess_silence_mean`. - Thus, [Session Silence Mean] = [Total Running Silence Time] / [Total Running Session Time] equation holds. We employ the following - formula to determine the amount of silence to add, which is `silence_mean`: - - self.sess_silence_mean = (silence_mean + self.running_silence_len_samples) / (silence_mean + running_len_samples) - - The above equation is setting `silence_mean` to yield the desired silence ratio `self.sess_silence_mean`. - We use the above `silence_mean` value to sample silence-length for each silence occurrence. - - Args: - running_len_samples (int): - Running length of the session (in terms of number of samples). - session_len_samples (int): - Targeted total session length (in terms of number of samples). - - Returns: - silence_amount (int): Amount of silence to add between sentences (in terms of number of samples). - """ - silence_mean = ((self.sess_silence_mean * running_len_samples) - self.running_silence_len_samples) / ( - 1 - self.sess_silence_mean - ) - silence_mean = max(self.per_silence_min_len, min(silence_mean, self.per_silence_max_len)) - if silence_mean > 0: - self.per_silence_var = self._params.data_simulator.session_params.per_silence_var - silence_amount = ( - int( - gamma( - a=(silence_mean ** 2) / self.per_silence_var, scale=self.per_silence_var / silence_mean - ).rvs() - ) - if self.per_silence_var > 0 - else int(silence_mean) - ) - silence_amount = max(self.per_silence_min_len, min(silence_amount, self.per_silence_max_len)) - else: - silence_amount = 0 - return silence_amount - - def sample_from_overlap_model(self, non_silence_len_samples: int): - """ - Sample from the overlap model to determine the amount of overlap between segments. - Gamma distribution is employed for modeling the highly skewed distribution of overlap length distribution. - When we add an overlap occurrence, we want to meet the desired overlap ratio defined by `self.sess_overlap_mean`. 
- Thus, [Session Overlap Mean] = [Total Running Overlap Speech Time] / [Total Running Non-Silence Speech Time]. - Let `overlap_mean` be the desired overlap amount, then the mean and variance of the gamma distribution is given by: - - self.sess_overlap_mean = (overlap_mean + self.running_overlap_len_samples) / (non_silence_len_samples - overlap_mean) - - The above equation is setting `overlap_mean` to yield the desired overlap ratio `self.sess_overlap_mean`. - We use the above `overlap_mean` value to sample overlap-length for each overlap occurrence. - - Args: - non_silence_len_samples (int): - The total amount of non-silence (speech) region regardless of overlap status - - Returns: - desired_overlap_amount (int): - Amount of overlap between segments (in terms of number of samples). - """ - overlap_mean = ((self.sess_overlap_mean * non_silence_len_samples) - self.running_overlap_len_samples) / ( - 1 + self.sess_overlap_mean - ) - overlap_mean = max(self.per_overlap_min_len, min(max(0, overlap_mean), self.per_overlap_max_len)) - - if overlap_mean > 0: - desired_overlap_amount = ( - int(gamma(a=overlap_mean ** 2 / self.per_overlap_var, scale=self.per_overlap_var / overlap_mean).rvs()) - if self.per_overlap_var > 0 - else int(overlap_mean) - ) - desired_overlap_amount = max( - self.per_overlap_min_len, min(desired_overlap_amount, self.per_overlap_max_len) - ) - else: - desired_overlap_amount = 0 - return desired_overlap_amount - - def sample_noise_manifest(self, noise_manifest: dict) -> list: - """ - Sample noise manifest to a specified count `num_noise_files` for the current simulated audio session. - - Args: - noise_manifest (list): - List of noise source samples to be sampled from. - - Returns: - sampled_noise_manifest (list): - List of noise samples to be used for the current session. - """ - num_noise_files = min(len(noise_manifest), self.num_noise_files) - sampled_noise_manifest = [] - if num_noise_files > 0: - selected_noise_ids = np.random.choice(range(len(noise_manifest)), num_noise_files, replace=False) - for k in selected_noise_ids: - sampled_noise_manifest.append(noise_manifest[k]) - return sampled_noise_manifest diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py deleted file mode 100644 index f26b0c6b701a786103ad8b1af1d49946f182cc96..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py +++ /dev/null @@ -1,779 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import math -from typing import Dict, List, Tuple, Type, Union - -import numpy as np -import torch -from omegaconf import OmegaConf - -import nemo.collections.asr as nemo_asr -from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.models import EncDecCTCModel, EncDecCTCModelBPE -from nemo.collections.asr.parts.utils.audio_utils import get_samples -from nemo.collections.asr.parts.utils.speaker_utils import audio_rttm_map, get_uniqname_from_filepath -from nemo.collections.asr.parts.utils.streaming_utils import AudioFeatureIterator, FrameBatchASR -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - -__all__ = ['ASRDecoderTimeStamps'] - -try: - from pyctcdecode import build_ctcdecoder - - PYCTCDECODE = True -except ImportError: - PYCTCDECODE = False - - -def if_none_get_default(param, default_value): - return (param, default_value)[param is None] - - -class WERBPE_TS(WERBPE): - """ - This is WERBPE_TS class that is modified for generating word_timestamps with logits. - The functions in WER class is modified to save the word_timestamps whenever BPE token - is being saved into a list. - This class is designed to support ASR models based on CTC and BPE. - Please refer to the definition of WERBPE class for more information. - """ - - def __init__( - self, - tokenizer: TokenizerSpec, - batch_dim_index=0, - use_cer=False, - ctc_decode=None, - log_prediction=True, - dist_sync_on_step=False, - ): - if ctc_decode is not None: - logging.warning(f'`ctc_decode` was set to {ctc_decode}. Note that this is ignored.') - - decoding_cfg = CTCBPEDecodingConfig(batch_dim_index=batch_dim_index) - decoding = CTCBPEDecoding(decoding_cfg, tokenizer=tokenizer) - super().__init__(decoding, use_cer, log_prediction, dist_sync_on_step) - - def ctc_decoder_predictions_tensor_with_ts( - self, time_stride, predictions: torch.Tensor, predictions_len: torch.Tensor = None - ) -> List[str]: - hypotheses, timestamps, word_timestamps = [], [], [] - # '⁇' string should be removed since it causes error during string split. 
- unk = '⁇' - prediction_cpu_tensor = predictions.long().cpu() - # iterate over batch - self.time_stride = time_stride - for ind in range(prediction_cpu_tensor.shape[self.decoding.batch_dim_index]): - prediction = prediction_cpu_tensor[ind].detach().numpy().tolist() - if predictions_len is not None: - prediction = prediction[: predictions_len[ind]] - # CTC decoding procedure - decoded_prediction, char_ts, timestamp_list = [], [], [] - previous = self.decoding.blank_id - for pdx, p in enumerate(prediction): - if (p != previous or previous == self.decoding.blank_id) and p != self.decoding.blank_id: - decoded_prediction.append(p) - char_ts.append(round(pdx * self.time_stride, 2)) - timestamp_list.append(round(pdx * self.time_stride, 2)) - - previous = p - - hypothesis = self.decode_tokens_to_str_with_ts(decoded_prediction) - hypothesis = hypothesis.replace(unk, '') - word_ts, word_seq = self.get_ts_from_decoded_prediction(decoded_prediction, hypothesis, char_ts) - - hypotheses.append(" ".join(word_seq)) - timestamps.append(timestamp_list) - word_timestamps.append(word_ts) - return hypotheses, timestamps, word_timestamps - - def decode_tokens_to_str_with_ts(self, tokens: List[int]) -> str: - hypothesis = self.decoding.tokenizer.ids_to_text(tokens) - return hypothesis - - def decode_ids_to_tokens_with_ts(self, tokens: List[int]) -> List[str]: - token_list = self.decoding.tokenizer.ids_to_tokens(tokens) - return token_list - - def get_ts_from_decoded_prediction( - self, decoded_prediction: List[str], hypothesis: str, char_ts: List[str] - ) -> Tuple[List[List[float]], List[str]]: - decoded_char_list = self.decoding.tokenizer.ids_to_tokens(decoded_prediction) - stt_idx, end_idx = 0, len(decoded_char_list) - 1 - stt_ch_idx, end_ch_idx = 0, 0 - space = '▁' - word_ts, word_seq = [], [] - word_open_flag = False - for idx, ch in enumerate(decoded_char_list): - - # If the symbol is space and not an end of the utterance, move on - if idx != end_idx and (space == ch and space in decoded_char_list[idx + 1]): - continue - - # If the word does not containg space (the start of the word token), keep counting - if (idx == stt_idx or space == decoded_char_list[idx - 1] or (space in ch and len(ch) > 1)) and ( - ch != space - ): - _stt = char_ts[idx] - stt_ch_idx = idx - word_open_flag = True - - # If this char has `word_open_flag=True` and meets any of one of the following condition: - # (1) last word (2) unknown word (3) start symbol in the following word, - # close the `word_open_flag` and add the word to the `word_seq` list. - close_cond = idx == end_idx or ch in [''] or space in decoded_char_list[idx + 1] - if (word_open_flag and ch != space) and close_cond: - _end = round(char_ts[idx] + self.time_stride, 2) - end_ch_idx = idx - word_open_flag = False - word_ts.append([_stt, _end]) - stitched_word = ''.join(decoded_char_list[stt_ch_idx : end_ch_idx + 1]).replace(space, '') - word_seq.append(stitched_word) - - assert len(word_ts) == len(hypothesis.split()), "Text hypothesis does not match word timestamps." - return word_ts, word_seq - - -class WER_TS(WER): - """ - This is WER class that is modified for generating timestamps with logits. - The functions in WER class is modified to save the timestamps whenever character - is being saved into a list. - This class is designed to support ASR models based on CTC and Character-level tokens. - Please refer to the definition of WER class for more information. 
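The decoding loops above all rely on the same greedy CTC collapse, with the frame index of each emitted token converted to seconds via the model stride. A standalone sketch of just that step:

```python
def greedy_ctc_collapse(frame_ids, blank_id, time_stride):
    """Keep a token when it differs from the previous frame (or follows a blank)
    and is not the blank id; remember its frame index scaled by the model stride."""
    tokens, timestamps = [], []
    previous = blank_id
    for pdx, p in enumerate(frame_ids):
        if (p != previous or previous == blank_id) and p != blank_id:
            tokens.append(p)
            timestamps.append(round(pdx * time_stride, 2))
        previous = p
    return tokens, timestamps

# Frame-level argmax ids with blank_id=0 and a 0.04 s model stride (Conformer-like).
print(greedy_ctc_collapse([0, 5, 5, 0, 7, 0, 0, 7, 3], blank_id=0, time_stride=0.04))
# -> ([5, 7, 7, 3], [0.04, 0.16, 0.28, 0.32])
```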
- """ - - def __init__( - self, - vocabulary, - batch_dim_index=0, - use_cer=False, - ctc_decode=None, - log_prediction=True, - dist_sync_on_step=False, - ): - if ctc_decode is not None: - logging.warning(f'`ctc_decode` was set to {ctc_decode}. Note that this is ignored.') - - decoding_cfg = CTCDecodingConfig(batch_dim_index=batch_dim_index) - decoding = CTCDecoding(decoding_cfg, vocabulary=vocabulary) - super().__init__(decoding, use_cer, log_prediction, dist_sync_on_step) - - def decode_tokens_to_str_with_ts(self, tokens: List[int], timestamps: List[int]) -> str: - """ - Take frame-level tokens and timestamp list and collect the timestamps for - start and end of each word. - """ - token_list, timestamp_list = self.decode_ids_to_tokens_with_ts(tokens, timestamps) - hypothesis = ''.join(self.decoding.decode_ids_to_tokens(tokens)) - return hypothesis, timestamp_list - - def decode_ids_to_tokens_with_ts(self, tokens: List[int], timestamps: List[int]) -> List[str]: - token_list, timestamp_list = [], [] - for i, c in enumerate(tokens): - if c != self.decoding.blank_id: - token_list.append(self.decoding.labels_map[c]) - timestamp_list.append(timestamps[i]) - return token_list, timestamp_list - - def ctc_decoder_predictions_tensor_with_ts( - self, predictions: torch.Tensor, predictions_len: torch.Tensor = None, - ) -> List[str]: - """ - A shortened version of the original function ctc_decoder_predictions_tensor(). - Replaced decode_tokens_to_str() function with decode_tokens_to_str_with_ts(). - """ - hypotheses, timestamps = [], [] - prediction_cpu_tensor = predictions.long().cpu() - for ind in range(prediction_cpu_tensor.shape[self.decoding.batch_dim_index]): - prediction = prediction_cpu_tensor[ind].detach().numpy().tolist() - if predictions_len is not None: - prediction = prediction[: predictions_len[ind]] - - # CTC decoding procedure with timestamps - decoded_prediction, decoded_timing_list = [], [] - previous = self.decoding.blank_id - for pdx, p in enumerate(prediction): - if (p != previous or previous == self.decoding.blank_id) and p != self.decoding.blank_id: - decoded_prediction.append(p) - decoded_timing_list.append(pdx) - previous = p - - text, timestamp_list = self.decode_tokens_to_str_with_ts(decoded_prediction, decoded_timing_list) - hypotheses.append(text) - timestamps.append(timestamp_list) - - return hypotheses, timestamps - - -def get_wer_feat_logit(audio_file_path, asr, frame_len, tokens_per_chunk, delay, model_stride_in_secs): - """ - Create a preprocessor to convert audio samples into raw features, - Normalization will be done per buffer in frame_bufferer. - """ - asr.reset() - asr.read_audio_file_and_return(audio_file_path, delay, model_stride_in_secs) - hyp, tokens, log_prob = asr.transcribe_with_ts(tokens_per_chunk, delay) - return hyp, tokens, log_prob - - -class FrameBatchASRLogits(FrameBatchASR): - """ - A class for streaming frame-based ASR. - Inherits from FrameBatchASR and adds new capability of returning the logit output. - Please refer to FrameBatchASR for more detailed information. 
- """ - - def __init__( - self, - asr_model: Type[EncDecCTCModelBPE], - frame_len: float = 1.6, - total_buffer: float = 4.0, - batch_size: int = 4, - ): - super().__init__(asr_model, frame_len, total_buffer, batch_size) - self.all_logprobs = [] - - def clear_buffer(self): - self.all_logprobs = [] - self.all_preds = [] - - def read_audio_file_and_return(self, audio_filepath: str, delay: float, model_stride_in_secs: float): - samples = get_samples(audio_filepath) - samples = np.pad(samples, (0, int(delay * model_stride_in_secs * self.asr_model._cfg.sample_rate))) - frame_reader = AudioFeatureIterator(samples, self.frame_len, self.raw_preprocessor, self.asr_model.device) - self.set_frame_reader(frame_reader) - - @torch.no_grad() - def _get_batch_preds(self, keep_logits): - device = self.asr_model.device - for batch in iter(self.data_loader): - feat_signal, feat_signal_len = batch - feat_signal, feat_signal_len = feat_signal.to(device), feat_signal_len.to(device) - log_probs, encoded_len, predictions = self.asr_model( - processed_signal=feat_signal, processed_signal_length=feat_signal_len - ) - preds = torch.unbind(predictions) - for pred in preds: - self.all_preds.append(pred.cpu().numpy()) - # Always keep logits in FrameBatchASRLogits - _ = keep_logits - log_probs_tup = torch.unbind(log_probs) - for log_prob in log_probs_tup: - self.all_logprobs.append(log_prob) - del log_probs, log_probs_tup - del encoded_len - del predictions - - def transcribe_with_ts( - self, tokens_per_chunk: int, delay: int, - ): - self.infer_logits() - self.unmerged = [] - self.part_logprobs = [] - for idx, pred in enumerate(self.all_preds): - decoded = pred.tolist() - _stt, _end = len(decoded) - 1 - delay, len(decoded) - 1 - delay + tokens_per_chunk - self.unmerged += decoded[len(decoded) - 1 - delay : len(decoded) - 1 - delay + tokens_per_chunk] - self.part_logprobs.append(self.all_logprobs[idx][_stt:_end, :]) - self.unmerged_logprobs = torch.cat(self.part_logprobs, 0) - assert ( - len(self.unmerged) == self.unmerged_logprobs.shape[0] - ), "Unmerged decoded result and log prob lengths are different." - return self.greedy_merge(self.unmerged), self.unmerged, self.unmerged_logprobs - - -class ASRDecoderTimeStamps: - """ - A class designed for extracting word timestamps while the ASR decoding process. - This class contains a few setups for a slew of NeMo ASR models such as QuartzNet, CitriNet and ConformerCTC models. - """ - - def __init__(self, cfg_diarizer): - self.manifest_filepath = cfg_diarizer.manifest_filepath - self.params = cfg_diarizer.asr.parameters - self.ctc_decoder_params = cfg_diarizer.asr.ctc_decoder_parameters - self.ASR_model_name = cfg_diarizer.asr.model_path - self.nonspeech_threshold = self.params.asr_based_vad_threshold - self.root_path = None - self.run_ASR = None - self.encdec_class = None - self.AUDIO_RTTM_MAP = audio_rttm_map(self.manifest_filepath) - self.audio_file_list = [value['audio_filepath'] for _, value in self.AUDIO_RTTM_MAP.items()] - - def set_asr_model(self): - """ - Initialize the parameters for the given ASR model. - Currently, the following NGC models are supported: - - stt_en_quartznet15x5, - stt_en_citrinet*, - stt_en_conformer_ctc* - - To assign a proper decoding function for generating timestamp output, - the name of .nemo file should include the architecture name such as: - 'quartznet', 'conformer', and 'citrinet'. - - decoder_delay_in_sec is the amount of delay that is compensated during the word timestamp extraction. 
- word_ts_anchor_offset is the reference point for a word and used for matching the word with diarization labels. - Each ASR model has a different optimal decoder delay and word timestamp anchor offset. - To obtain an optimized diarization result with ASR, decoder_delay_in_sec and word_ts_anchor_offset - need to be searched on a development set. - """ - if 'quartznet' in self.ASR_model_name.lower(): - self.run_ASR = self.run_ASR_QuartzNet_CTC - self.encdec_class = EncDecCTCModel - self.decoder_delay_in_sec = if_none_get_default(self.params['decoder_delay_in_sec'], 0.04) - self.word_ts_anchor_offset = if_none_get_default(self.params['word_ts_anchor_offset'], 0.12) - self.asr_batch_size = if_none_get_default(self.params['asr_batch_size'], 4) - self.model_stride_in_secs = 0.02 - - elif 'conformer' in self.ASR_model_name.lower(): - self.run_ASR = self.run_ASR_BPE_CTC - self.encdec_class = EncDecCTCModelBPE - self.decoder_delay_in_sec = if_none_get_default(self.params['decoder_delay_in_sec'], 0.08) - self.word_ts_anchor_offset = if_none_get_default(self.params['word_ts_anchor_offset'], 0.12) - self.asr_batch_size = if_none_get_default(self.params['asr_batch_size'], 16) - self.model_stride_in_secs = 0.04 - # Conformer requires buffered inference and the parameters for buffered processing. - self.chunk_len_in_sec = 5 - self.total_buffer_in_secs = 25 - - elif 'citrinet' in self.ASR_model_name.lower(): - self.run_ASR = self.run_ASR_CitriNet_CTC - self.encdec_class = EncDecCTCModelBPE - self.decoder_delay_in_sec = if_none_get_default(self.params['decoder_delay_in_sec'], 0.16) - self.word_ts_anchor_offset = if_none_get_default(self.params['word_ts_anchor_offset'], 0.2) - self.asr_batch_size = if_none_get_default(self.params['asr_batch_size'], 4) - self.model_stride_in_secs = 0.08 - - else: - raise ValueError(f"Cannot find the ASR model class for: {self.params['self.ASR_model_name']}") - - if self.ASR_model_name.endswith('.nemo'): - asr_model = self.encdec_class.restore_from(restore_path=self.ASR_model_name) - else: - asr_model = self.encdec_class.from_pretrained(model_name=self.ASR_model_name, strict=False) - - if self.ctc_decoder_params['pretrained_language_model']: - if not PYCTCDECODE: - raise ImportError( - 'LM for beam search decoding is provided but pyctcdecode is not installed. Install pyctcdecode using PyPI: pip install pyctcdecode' - ) - self.beam_search_decoder = self.load_LM_for_CTC_decoder(asr_model) - else: - self.beam_search_decoder = None - - asr_model.eval() - return asr_model - - def load_LM_for_CTC_decoder(self, asr_model: Type[Union[EncDecCTCModel, EncDecCTCModelBPE]]): - """ - Load a language model for CTC decoder (pyctcdecode). - Note that only EncDecCTCModel and EncDecCTCModelBPE models can use pyctcdecode. 
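A hedged sketch of the pyctcdecode path used by run_pyctcdecode() further below (the .arpa path, alpha/beta values and logprob array are placeholders; labels must match the ASR model's vocabulary, as prepared in load_LM_for_CTC_decoder()):

from pyctcdecode import build_ctcdecoder

decoder = build_ctcdecoder(labels, "/path/to/lm.arpa", alpha=0.5, beta=1.5)
beams = decoder.decode_beams(logprob, beam_width=32)   # logprob: [T, V] numpy array
best_text = beams[0][0]
word_frames = beams[0][2]                              # [(word, (start_frame, end_frame)), ...]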
- """ - kenlm_model = self.ctc_decoder_params['pretrained_language_model'] - logging.info(f"Loading language model : {self.ctc_decoder_params['pretrained_language_model']}") - - if 'EncDecCTCModelBPE' in str(type(asr_model)): - vocab = asr_model.tokenizer.tokenizer.get_vocab() - labels = list(vocab.keys()) - labels[0] = "" - elif 'EncDecCTCModel' in str(type(asr_model)): - labels = asr_model.decoder.vocabulary - else: - raise ValueError(f"Cannot find a vocabulary or tokenizer for: {self.params['self.ASR_model_name']}") - - decoder = build_ctcdecoder( - labels, kenlm_model, alpha=self.ctc_decoder_params['alpha'], beta=self.ctc_decoder_params['beta'] - ) - return decoder - - def run_ASR_QuartzNet_CTC(self, asr_model: Type[EncDecCTCModel]) -> Tuple[Dict, Dict]: - """ - Launch QuartzNet ASR model and collect logit, timestamps and text output. - - Args: - asr_model (class): - The loaded NeMo ASR model. - - Returns: - words_dict (dict): - Dictionary containing the sequence of words from hypothesis. - word_ts_dict (dict): - Dictionary containing the time-stamps of words. - """ - words_dict, word_ts_dict = {}, {} - - wer_ts = WER_TS( - vocabulary=asr_model.decoder.vocabulary, - batch_dim_index=0, - use_cer=asr_model._cfg.get('use_cer', False), - ctc_decode=True, - dist_sync_on_step=True, - log_prediction=asr_model._cfg.get("log_prediction", False), - ) - - with torch.cuda.amp.autocast(): - transcript_logits_list = asr_model.transcribe( - self.audio_file_list, batch_size=self.asr_batch_size, logprobs=True - ) - for idx, logit_np in enumerate(transcript_logits_list): - uniq_id = get_uniqname_from_filepath(self.audio_file_list[idx]) - if self.beam_search_decoder: - logging.info( - f"Running beam-search decoder on {uniq_id} with LM {self.ctc_decoder_params['pretrained_language_model']}" - ) - hyp_words, word_ts = self.run_pyctcdecode(logit_np) - else: - log_prob = torch.from_numpy(logit_np) - logits_len = torch.from_numpy(np.array([log_prob.shape[0]])) - greedy_predictions = log_prob.argmax(dim=-1, keepdim=False).unsqueeze(0) - text, char_ts = wer_ts.ctc_decoder_predictions_tensor_with_ts( - greedy_predictions, predictions_len=logits_len - ) - trans, char_ts_in_feature_frame_idx = self.clean_trans_and_TS(text[0], char_ts[0]) - spaces_in_sec, hyp_words = self._get_spaces( - trans, char_ts_in_feature_frame_idx, self.model_stride_in_secs - ) - word_ts = self.get_word_ts_from_spaces( - char_ts_in_feature_frame_idx, spaces_in_sec, end_stamp=logit_np.shape[0] - ) - word_ts = self.align_decoder_delay(word_ts, self.decoder_delay_in_sec) - assert len(hyp_words) == len(word_ts), "Words and word timestamp list length does not match." - words_dict[uniq_id] = hyp_words - word_ts_dict[uniq_id] = word_ts - - return words_dict, word_ts_dict - - @staticmethod - def clean_trans_and_TS(trans: str, char_ts: List[str]) -> Tuple[str, List[str]]: - """ - Remove the spaces in the beginning and the end. - The char_ts need to be changed and synced accordingly. - - Args: - trans (list): - List containing the character output (str). - char_ts (list): - List containing the timestamps (int) for each character. - - Returns: - trans (list): - List containing the cleaned character output. - char_ts (list): - List containing the cleaned timestamps for each character. 
- """ - assert (len(trans) > 0) and (len(char_ts) > 0) - assert len(trans) == len(char_ts) - - trans = trans.lstrip() - diff_L = len(char_ts) - len(trans) - char_ts = char_ts[diff_L:] - - trans = trans.rstrip() - diff_R = len(char_ts) - len(trans) - if diff_R > 0: - char_ts = char_ts[: -1 * diff_R] - return trans, char_ts - - def _get_spaces(self, trans: str, char_ts: List[str], time_stride: float) -> Tuple[float, List[str]]: - """ - Collect the space symbols with a list of words. - - Args: - trans (list): - List containing the character output (str). - char_ts (list): - List containing the timestamps of the characters. - time_stride (float): - The size of stride of the model in second. - - Returns: - spaces_in_sec (list): - List containing the ranges of spaces - word_list (list): - List containing the words from ASR inference. - """ - blank = ' ' - spaces_in_sec, word_list = [], [] - stt_idx = 0 - assert (len(trans) > 0) and (len(char_ts) > 0), "Transcript and char_ts length should not be 0." - assert len(trans) == len(char_ts), "Transcript and timestamp lengths do not match." - - # If there is a blank, update the time stamps of the space and the word. - for k, s in enumerate(trans): - if s == blank: - spaces_in_sec.append( - [round(char_ts[k] * time_stride, 2), round((char_ts[k + 1] - 1) * time_stride, 2)] - ) - word_list.append(trans[stt_idx:k]) - stt_idx = k + 1 - - # Add the last word - if len(trans) > stt_idx and trans[stt_idx] != blank: - word_list.append(trans[stt_idx:]) - return spaces_in_sec, word_list - - def run_ASR_CitriNet_CTC(self, asr_model: Type[EncDecCTCModelBPE]) -> Tuple[Dict, Dict]: - """ - Launch CitriNet ASR model and collect logit, timestamps and text output. - - Args: - asr_model (class): - The loaded NeMo ASR model. - - Returns: - words_dict (dict): - Dictionary containing the sequence of words from hypothesis. - word_ts_dict (dict): - Dictionary containing the timestamps of hypothesis words. - """ - words_dict, word_ts_dict = {}, {} - - werbpe_ts = WERBPE_TS( - tokenizer=asr_model.tokenizer, - batch_dim_index=0, - use_cer=asr_model._cfg.get('use_cer', False), - ctc_decode=True, - dist_sync_on_step=True, - log_prediction=asr_model._cfg.get("log_prediction", False), - ) - - with torch.cuda.amp.autocast(): - transcript_logits_list = asr_model.transcribe( - self.audio_file_list, batch_size=self.asr_batch_size, logprobs=True - ) - for idx, logit_np in enumerate(transcript_logits_list): - uniq_id = get_uniqname_from_filepath(self.audio_file_list[idx]) - if self.beam_search_decoder: - logging.info( - f"Running beam-search decoder with LM {self.ctc_decoder_params['pretrained_language_model']}" - ) - hyp_words, word_ts = self.run_pyctcdecode(logit_np) - else: - log_prob = torch.from_numpy(logit_np) - greedy_predictions = log_prob.argmax(dim=-1, keepdim=False).unsqueeze(0) - logits_len = torch.from_numpy(np.array([log_prob.shape[0]])) - text, char_ts, word_ts = werbpe_ts.ctc_decoder_predictions_tensor_with_ts( - self.model_stride_in_secs, greedy_predictions, predictions_len=logits_len - ) - hyp_words, word_ts = text[0].split(), word_ts[0] - word_ts = self.align_decoder_delay(word_ts, self.decoder_delay_in_sec) - assert len(hyp_words) == len(word_ts), "Words and word timestamp list length does not match." - words_dict[uniq_id] = hyp_words - word_ts_dict[uniq_id] = word_ts - - return words_dict, word_ts_dict - - def set_buffered_infer_params(self, asr_model: Type[EncDecCTCModelBPE]) -> Tuple[float, float, float]: - """ - Prepare the parameters for the buffered inference. 
- """ - cfg = copy.deepcopy(asr_model._cfg) - OmegaConf.set_struct(cfg.preprocessor, False) - - # some changes for streaming scenario - cfg.preprocessor.dither = 0.0 - cfg.preprocessor.pad_to = 0 - cfg.preprocessor.normalize = "None" - - preprocessor = nemo_asr.models.EncDecCTCModelBPE.from_config_dict(cfg.preprocessor) - preprocessor.to(asr_model.device) - - # Disable config overwriting - OmegaConf.set_struct(cfg.preprocessor, True) - - onset_delay = ( - math.ceil(((self.total_buffer_in_secs - self.chunk_len_in_sec) / 2) / self.model_stride_in_secs) + 1 - ) - mid_delay = math.ceil( - (self.chunk_len_in_sec + (self.total_buffer_in_secs - self.chunk_len_in_sec) / 2) - / self.model_stride_in_secs - ) - tokens_per_chunk = math.ceil(self.chunk_len_in_sec / self.model_stride_in_secs) - - return onset_delay, mid_delay, tokens_per_chunk - - def run_ASR_BPE_CTC(self, asr_model: Type[EncDecCTCModelBPE]) -> Tuple[Dict, Dict]: - """ - Launch CTC-BPE based ASR model and collect logit, timestamps and text output. - - Args: - asr_model (class): - The loaded NeMo ASR model. - - Returns: - words_dict (dict): - Dictionary containing the sequence of words from hypothesis. - word_ts_dict (dict): - Dictionary containing the time-stamps of words. - """ - torch.manual_seed(0) - torch.set_grad_enabled(False) - words_dict, word_ts_dict = {}, {} - - werbpe_ts = WERBPE_TS( - tokenizer=asr_model.tokenizer, - batch_dim_index=0, - use_cer=asr_model._cfg.get('use_cer', False), - ctc_decode=True, - dist_sync_on_step=True, - log_prediction=asr_model._cfg.get("log_prediction", False), - ) - - frame_asr = FrameBatchASRLogits( - asr_model=asr_model, - frame_len=self.chunk_len_in_sec, - total_buffer=self.total_buffer_in_secs, - batch_size=self.asr_batch_size, - ) - - onset_delay, mid_delay, tokens_per_chunk = self.set_buffered_infer_params(asr_model) - onset_delay_in_sec = round(onset_delay * self.model_stride_in_secs, 2) - - with torch.cuda.amp.autocast(): - logging.info(f"Running ASR model {self.ASR_model_name}") - - for idx, audio_file_path in enumerate(self.audio_file_list): - uniq_id = get_uniqname_from_filepath(audio_file_path) - logging.info(f"[{idx+1}/{len(self.audio_file_list)}] FrameBatchASR: {audio_file_path}") - frame_asr.clear_buffer() - - hyp, greedy_predictions_list, log_prob = get_wer_feat_logit( - audio_file_path, - frame_asr, - self.chunk_len_in_sec, - tokens_per_chunk, - mid_delay, - self.model_stride_in_secs, - ) - if self.beam_search_decoder: - logging.info( - f"Running beam-search decoder with LM {self.ctc_decoder_params['pretrained_language_model']}" - ) - log_prob = log_prob.unsqueeze(0).cpu().numpy()[0] - hyp_words, word_ts = self.run_pyctcdecode(log_prob, onset_delay_in_sec=onset_delay_in_sec) - else: - logits_len = torch.from_numpy(np.array([len(greedy_predictions_list)])) - greedy_predictions_list = greedy_predictions_list[onset_delay:] - greedy_predictions = torch.from_numpy(np.array(greedy_predictions_list)).unsqueeze(0) - text, char_ts, word_ts = werbpe_ts.ctc_decoder_predictions_tensor_with_ts( - self.model_stride_in_secs, greedy_predictions, predictions_len=logits_len - ) - hyp_words, word_ts = text[0].split(), word_ts[0] - - word_ts = self.align_decoder_delay(word_ts, self.decoder_delay_in_sec) - assert len(hyp_words) == len(word_ts), "Words and word timestamp list length does not match." 
- words_dict[uniq_id] = hyp_words - word_ts_dict[uniq_id] = word_ts - - return words_dict, word_ts_dict - - def get_word_ts_from_spaces(self, char_ts: List[float], spaces_in_sec: List[float], end_stamp: float) -> List[str]: - """ - Take word timestamps from the spaces from the decoded prediction. - - Args: - char_ts (list): - List containing the timestamp for each character. - spaces_in_sec (list): - List containing the start and the end time of each space token. - end_stamp (float): - The end time of the session in sec. - - Returns: - word_timestamps (list): - List containing the timestamps for the resulting words. - """ - end_stamp = min(end_stamp, (char_ts[-1] + 2)) - start_stamp_in_sec = round(char_ts[0] * self.model_stride_in_secs, 2) - end_stamp_in_sec = round(end_stamp * self.model_stride_in_secs, 2) - - # In case of one word output with no space information. - if len(spaces_in_sec) == 0: - word_timestamps = [[start_stamp_in_sec, end_stamp_in_sec]] - elif len(spaces_in_sec) > 0: - # word_timetamps_middle should be an empty list if len(spaces_in_sec) == 1. - word_timetamps_middle = [ - [round(spaces_in_sec[k][1], 2), round(spaces_in_sec[k + 1][0], 2),] - for k in range(len(spaces_in_sec) - 1) - ] - word_timestamps = ( - [[start_stamp_in_sec, round(spaces_in_sec[0][0], 2)]] - + word_timetamps_middle - + [[round(spaces_in_sec[-1][1], 2), end_stamp_in_sec]] - ) - return word_timestamps - - def run_pyctcdecode( - self, logprob: np.ndarray, onset_delay_in_sec: float = 0, beam_width: int = 32 - ) -> Tuple[List[str], List[str]]: - """ - Launch pyctcdecode with the loaded pretrained language model. - - Args: - logprob (np.ndarray): - The log probability from the ASR model inference in numpy array format. - onset_delay_in_sec (float): - The amount of delay that needs to be compensated for the timestamp outputs froM pyctcdecode. - beam_width (int): - The beam width parameter for beam search decodring. - Returns: - hyp_words (list): - List containing the words in the hypothesis. - word_ts (list): - List containing the word timestamps from the decoder. - """ - beams = self.beam_search_decoder.decode_beams(logprob, beam_width=self.ctc_decoder_params['beam_width']) - word_ts_beam, words_beam = [], [] - for idx, (word, _) in enumerate(beams[0][2]): - ts = self.get_word_ts_from_wordframes(idx, beams[0][2], self.model_stride_in_secs, onset_delay_in_sec) - word_ts_beam.append(ts) - words_beam.append(word) - hyp_words, word_ts = words_beam, word_ts_beam - return hyp_words, word_ts - - @staticmethod - def get_word_ts_from_wordframes( - idx, word_frames: List[List[float]], frame_duration: float, onset_delay: float, word_block_delay: float = 2.25 - ): - """ - Extract word timestamps from word frames generated from pyctcdecode. - """ - offset = -1 * word_block_delay * frame_duration - onset_delay - frame_begin = word_frames[idx][1][0] - if frame_begin == -1: - frame_begin = word_frames[idx - 1][1][1] if idx != 0 else 0 - frame_end = word_frames[idx][1][1] - return [ - round(max(frame_begin * frame_duration + offset, 0), 2), - round(max(frame_end * frame_duration + offset, 0), 2), - ] - - @staticmethod - def align_decoder_delay(word_ts, decoder_delay_in_sec: float): - """ - Subtract decoder_delay_in_sec from the word timestamp output. 
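Example (illustrative values): every [start, end] pair is shifted left by the decoder delay.

word_ts = [[0.12, 0.48], [0.60, 0.92]]
ASRDecoderTimeStamps.align_decoder_delay(word_ts, decoder_delay_in_sec=0.08)
# -> [[0.04, 0.4], [0.52, 0.84]]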
- """ - for k in range(len(word_ts)): - word_ts[k] = [ - round(word_ts[k][0] - decoder_delay_in_sec, 2), - round(word_ts[k][1] - decoder_delay_in_sec, 2), - ] - return word_ts diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/diarization_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/diarization_utils.py deleted file mode 100644 index f0b951eb89d33c90ada57e7c2097e1eea3c84901..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/diarization_utils.py +++ /dev/null @@ -1,1306 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import csv -import json -import os -from collections import OrderedDict as od -from datetime import datetime -from typing import Dict, List, Tuple - -import numpy as np - -from nemo.collections.asr.metrics.der import concat_perm_word_error_rate -from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.models import ClusteringDiarizer -from nemo.collections.asr.parts.utils.speaker_utils import ( - audio_rttm_map, - get_uniqname_from_filepath, - labels_to_rttmfile, - rttm_to_labels, - write_rttm2manifest, -) -from nemo.utils import logging - -try: - import arpa - - ARPA = True -except ImportError: - ARPA = False - -__all__ = ['OfflineDiarWithASR'] - - -def dump_json_to_file(file_path: str, session_trans_dict: dict): - """ - Write a json file from the session_trans_dict dictionary. - - Args: - file_path (str): - Target filepath where json file is saved - session_trans_dict (dict): - Dictionary containing transcript, speaker labels and timestamps - """ - with open(file_path, "w") as outfile: - json.dump(session_trans_dict, outfile, indent=4) - - -def write_txt(w_path: str, val: str): - """ - Write a text file from the string input. - - Args: - w_path (str): - Target path for saving a file - val (str): - String variable to be written - """ - with open(w_path, "w") as output: - output.write(val + '\n') - - -def convert_ctm_to_text(ctm_file_path: str) -> Tuple[List[str], str]: - """ - Convert ctm file into a list containing transcription (space seperated string) per each speaker. - - Args: - ctm_file_path (str): - Filepath to the reference CTM files. - - Returns: - spk_reference (list): - List containing the reference transcripts for each speaker. - - Example: - >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"] - - mix_reference (str): - Reference transcript from CTM file. This transcript has word sequence in temporal order. 
- - Example: - >>> mix_reference = "hi how are you i'm good well that's nice yeah how is your sister" - """ - mix_reference, per_spk_ref_trans_dict = [], {} - ctm_content = open(ctm_file_path).readlines() - for ctm_line in ctm_content: - ctm_split = ctm_line.split() - spk = ctm_split[1] - if spk not in per_spk_ref_trans_dict: - per_spk_ref_trans_dict[spk] = [] - per_spk_ref_trans_dict[spk].append(ctm_split[4]) - mix_reference.append(ctm_split[4]) - spk_reference = [" ".join(word_list) for word_list in per_spk_ref_trans_dict.values()] - mix_reference = " ".join(mix_reference) - return spk_reference, mix_reference - - -def convert_word_dict_seq_to_text(word_dict_seq_list: List[Dict[str, float]]) -> Tuple[List[str], str]: - """ - Convert word_dict_seq_list into a list containing transcription (space seperated string) per each speaker. - - Args: - word_dict_seq_list (list): - List containing words and corresponding word timestamps in dictionary format. - - Example: - >>> word_dict_seq_list = \ - >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'}, - {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'}, - ...], - - Returns: - spk_hypothesis (list): - Dictionary containing the hypothesis transcript for each speaker. A list containing the sequence - of words is assigned for each speaker. - - Example: - >>> spk_hypothesis= ["hi how are you well that's nice", "i'm good yeah how is your sister"] - - mix_hypothesis (str): - Hypothesis transcript from ASR output. This transcript has word sequence in temporal order. - - Example: - >>> mix_hypothesis = "hi how are you i'm good well that's nice yeah how is your sister" - """ - mix_hypothesis, per_spk_hyp_trans_dict = [], {} - for word_dict in word_dict_seq_list: - spk = word_dict['speaker'] - if spk not in per_spk_hyp_trans_dict: - per_spk_hyp_trans_dict[spk] = [] - per_spk_hyp_trans_dict[spk].append(word_dict['word']) - mix_hypothesis.append(word_dict['word']) - - # Create a list containing string formatted transcript - spk_hypothesis = [" ".join(word_list) for word_list in per_spk_hyp_trans_dict.values()] - mix_hypothesis = " ".join(mix_hypothesis) - return spk_hypothesis, mix_hypothesis - - -def convert_word_dict_seq_to_ctm( - word_dict_seq_list: List[Dict[str, float]], uniq_id: str = 'null', decimals: int = 3 -) -> Tuple[List[str], str]: - """ - Convert word_dict_seq_list into a list containing transcription in CTM format. - - Args: - word_dict_seq_list (list): - List containing words and corresponding word timestamps in dictionary format. - - Example: - >>> word_dict_seq_list = \ - >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.34, 'speaker': 'speaker_0'}, - {'word': 'and', 'start_time': 0.64, 'end_time': 0.81, 'speaker': 'speaker_1'}, - ...], - - Returns: - ctm_lines_list (list): - List containing the hypothesis transcript in CTM format. 
- - Example: - >>> ctm_lines_list= ["my_audio_01 speaker_0 0.0 0.34 right 0", - my_audio_01 speaker_0 0.64 0.81 and 0", - - - """ - ctm_lines = [] - confidence = 0 - for word_dict in word_dict_seq_list: - spk = word_dict['speaker'] - stt = word_dict['start_time'] - dur = round(word_dict['end_time'] - word_dict['start_time'], decimals) - word = word_dict['word'] - ctm_line_str = f"{uniq_id} {spk} {stt} {dur} {word} {confidence}" - ctm_lines.append(ctm_line_str) - return ctm_lines - - -def get_total_result_dict( - der_results: Dict[str, Dict[str, float]], wer_results: Dict[str, Dict[str, float]], csv_columns: List[str], -): - """ - Merge WER results and DER results into a single dictionary variable. - - Args: - der_results (dict): - Dictionary containing FA, MISS, CER and DER values for both aggregated amount and - each session. - wer_results (dict): - Dictionary containing session-by-session WER and cpWER. `wer_results` only - exists when CTM files are provided. - - Returns: - total_result_dict (dict): - Dictionary containing both DER and WER results. This dictionary contains unique-IDs of - each session and `total` key that includes average (cp)WER and DER/CER/Miss/FA values. - """ - total_result_dict = {} - for uniq_id in der_results.keys(): - if uniq_id == 'total': - continue - total_result_dict[uniq_id] = {x: "-" for x in csv_columns} - total_result_dict[uniq_id]["uniq_id"] = uniq_id - if uniq_id in der_results: - total_result_dict[uniq_id].update(der_results[uniq_id]) - if uniq_id in wer_results: - total_result_dict[uniq_id].update(wer_results[uniq_id]) - total_result_jsons = list(total_result_dict.values()) - return total_result_jsons - - -def get_audacity_label(word: str, stt_sec: float, end_sec: float, speaker: str) -> str: - """ - Get a string formatted line for Audacity label. - - Args: - word (str): - A decoded word - stt_sec (float): - Start timestamp of the word - end_sec (float): - End timestamp of the word - - Returns: - speaker (str): - Speaker label in string type - """ - spk = speaker.split('_')[-1] - return f'{stt_sec}\t{end_sec}\t[{spk}] {word}' - - -def get_num_of_spk_from_labels(labels: List[str]) -> int: - """ - Count the number of speakers in a segment label list. - Args: - labels (list): - List containing segment start and end timestamp and speaker labels. - - Example: - >>> labels = ["15.25 21.82 speaker_0", "21.18 29.51 speaker_1", ... ] - - Returns: - n_spk (int): - The number of speakers in the list `labels` - - """ - spk_set = [x.split(' ')[-1].strip() for x in labels] - return len(set(spk_set)) - - -class OfflineDiarWithASR: - """ - A class designed for performing ASR and diarization together. 
- - Attributes: - cfg_diarizer (OmegaConf): - Hydra config for diarizer key - params (OmegaConf): - Parameters config in diarizer.asr - ctc_decoder_params (OmegaConf) - Hydra config for beam search decoder - realigning_lm_params (OmegaConf): - Hydra config for realigning language model - manifest_filepath (str): - Path to the input manifest path - nonspeech_threshold (float): - Threshold for VAD logits that are used for creating speech segments - fix_word_ts_with_VAD (bool): - Choose whether to fix word timestamps by using VAD results - root_path (str): - Path to the folder where diarization results are saved - vad_threshold_for_word_ts (float): - Threshold used for compensating word timestamps with VAD output - max_word_ts_length_in_sec (float): - Maximum limit for the duration of each word timestamp - word_ts_anchor_offset (float): - Offset for word timestamps from ASR decoders - run_ASR: - Placeholder variable for an ASR launcher function - realigning_lm: - Placeholder variable for a loaded ARPA Language model - ctm_exists (bool): - Boolean that indicates whether all files have the corresponding reference CTM file - frame_VAD (dict): - Dictionary containing frame-level VAD logits - AUDIO_RTTM_MAP: - Dictionary containing the input manifest information - color_palette (dict): - Dictionary containing the ANSI color escape codes for each speaker label (speaker index) - """ - - def __init__(self, cfg_diarizer): - self.cfg_diarizer = cfg_diarizer - self.params = cfg_diarizer.asr.parameters - self.ctc_decoder_params = cfg_diarizer.asr.ctc_decoder_parameters - self.realigning_lm_params = cfg_diarizer.asr.realigning_lm_parameters - self.manifest_filepath = cfg_diarizer.manifest_filepath - self.nonspeech_threshold = self.params.asr_based_vad_threshold - self.fix_word_ts_with_VAD = self.params.fix_word_ts_with_VAD - self.root_path = cfg_diarizer.out_dir - - self.vad_threshold_for_word_ts = 0.7 - self.max_word_ts_length_in_sec = 0.6 - self.word_ts_anchor_offset = 0.0 - self.run_ASR = None - self.realigning_lm = None - self.ctm_exists = False - self.frame_VAD = {} - - self.make_file_lists() - - self.color_palette = self.get_color_palette() - self.csv_columns = self.get_csv_columns() - - @staticmethod - def get_color_palette() -> Dict[str, str]: - return { - 'speaker_0': '\033[1;32m', - 'speaker_1': '\033[1;34m', - 'speaker_2': '\033[1;30m', - 'speaker_3': '\033[1;31m', - 'speaker_4': '\033[1;35m', - 'speaker_5': '\033[1;36m', - 'speaker_6': '\033[1;37m', - 'speaker_7': '\033[1;30m', - 'speaker_8': '\033[1;33m', - 'speaker_9': '\033[0;34m', - 'white': '\033[0;37m', - } - - @staticmethod - def get_csv_columns() -> List[str]: - return [ - 'uniq_id', - 'DER', - 'CER', - 'FA', - 'MISS', - 'est_n_spk', - 'ref_n_spk', - 'cpWER', - 'WER', - 'mapping', - ] - - def make_file_lists(self): - """ - Create lists containing the filepaths of audio clips and CTM files. 
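A minimal, hypothetical cfg_diarizer sketch showing the keys consumed by __init__() and make_file_lists() (paths and parameter values are placeholders, the manifest must exist on disk, and real configs carry many more fields):

from omegaconf import OmegaConf

cfg_diarizer = OmegaConf.create({
    "manifest_filepath": "/path/to/input_manifest.json",
    "out_dir": "/path/to/diar_output",
    "asr": {
        "parameters": {"asr_based_vad_threshold": 1.0, "fix_word_ts_with_VAD": False},
        "ctc_decoder_parameters": {"pretrained_language_model": None},
        "realigning_lm_parameters": {"arpa_language_model": None},
    },
})
asr_diar = OfflineDiarWithASR(cfg_diarizer)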
- """ - self.AUDIO_RTTM_MAP = audio_rttm_map(self.manifest_filepath) - self.audio_file_list = [value['audio_filepath'] for _, value in self.AUDIO_RTTM_MAP.items()] - - self.ctm_file_list = [] - for k, audio_file_path in enumerate(self.audio_file_list): - uniq_id = get_uniqname_from_filepath(audio_file_path) - if ( - 'ctm_filepath' in self.AUDIO_RTTM_MAP[uniq_id] - and self.AUDIO_RTTM_MAP[uniq_id]['ctm_filepath'] is not None - and uniq_id in self.AUDIO_RTTM_MAP[uniq_id]['ctm_filepath'] - ): - self.ctm_file_list.append(self.AUDIO_RTTM_MAP[uniq_id]['ctm_filepath']) - - # check if all unique IDs have CTM files - if len(self.audio_file_list) == len(self.ctm_file_list): - self.ctm_exists = True - - def _load_realigning_LM(self): - """ - Load ARPA language model for realigning speaker labels for words. - """ - self.N_range = ( - self.realigning_lm_params['min_number_of_words'], - self.realigning_lm_params['max_number_of_words'], - ) - self.stt_end_tokens = ['', ''] - logging.info(f"Loading LM for realigning: {self.realigning_lm_params['arpa_language_model']}") - return arpa.loadf(self.realigning_lm_params['arpa_language_model'])[0] - - def _init_session_trans_dict(self, uniq_id: str, n_spk: int): - """ - Initialize json (in dictionary variable) formats for session level result and Gecko style json. - - Returns: - (dict): Session level result dictionary variable - """ - return od( - { - 'status': 'initialized', - 'session_id': uniq_id, - 'transcription': '', - 'speaker_count': n_spk, - 'words': [], - 'sentences': [], - } - ) - - def _init_session_gecko_dict(self): - """ - Initialize a dictionary format for Gecko style json. - - Returns: - (dict): - Gecko style json dictionary. - """ - return od({'schemaVersion': 2.0, 'monologues': []}) - - def _save_VAD_labels_list(self, word_ts_dict: Dict[str, Dict[str, List[float]]]): - """ - Take the non_speech labels from logit output. The logit output is obtained from - `run_ASR` function. - - Args: - word_ts_dict (dict): - Dictionary containing word timestamps. - """ - self.VAD_RTTM_MAP = {} - for idx, (uniq_id, word_timestamps) in enumerate(word_ts_dict.items()): - speech_labels_float = self.get_speech_labels_from_decoded_prediction( - word_timestamps, self.nonspeech_threshold - ) - speech_labels = self.get_str_speech_labels(speech_labels_float) - output_path = os.path.join(self.root_path, 'pred_rttms') - if not os.path.exists(output_path): - os.makedirs(output_path) - filename = labels_to_rttmfile(speech_labels, uniq_id, output_path) - self.VAD_RTTM_MAP[uniq_id] = {'audio_filepath': self.audio_file_list[idx], 'rttm_filepath': filename} - - @staticmethod - def get_speech_labels_from_decoded_prediction( - input_word_ts: List[float], nonspeech_threshold: float, - ) -> List[float]: - """ - Extract speech labels from the ASR output (decoded predictions) - - Args: - input_word_ts (list): - List containing word timestamps. - - Returns: - word_ts (list): - The ranges of the speech segments, which are merged ranges of input_word_ts. 
- """ - speech_labels = [] - word_ts = copy.deepcopy(input_word_ts) - if word_ts == []: - return speech_labels - else: - count = len(word_ts) - 1 - while count > 0: - if len(word_ts) > 1: - if word_ts[count][0] - word_ts[count - 1][1] <= nonspeech_threshold: - trangeB = word_ts.pop(count) - trangeA = word_ts.pop(count - 1) - word_ts.insert(count - 1, [trangeA[0], trangeB[1]]) - count -= 1 - return word_ts - - def run_diarization(self, diar_model_config, word_timestamps) -> Dict[str, List[str]]: - """ - Launch the diarization process using the given VAD timestamp (oracle_manifest). - - Args: - diar_model_config (OmegaConf): - Hydra configurations for speaker diarization - word_and_timestamps (list): - List containing words and word timestamps - - Returns: - diar_hyp (dict): - A dictionary containing rttm results which are indexed by a unique ID. - score Tuple[pyannote object, dict]: - A tuple containing pyannote metric instance and mapping dictionary between - speakers in hypotheses and speakers in reference RTTM files. - """ - - if diar_model_config.diarizer.asr.parameters.asr_based_vad: - self._save_VAD_labels_list(word_timestamps) - oracle_manifest = os.path.join(self.root_path, 'asr_vad_manifest.json') - oracle_manifest = write_rttm2manifest(self.VAD_RTTM_MAP, oracle_manifest) - diar_model_config.diarizer.vad.model_path = None - diar_model_config.diarizer.vad.external_vad_manifest = oracle_manifest - - diar_model = ClusteringDiarizer(cfg=diar_model_config) - score = diar_model.diarize() - if diar_model_config.diarizer.vad.model_path is not None and not diar_model_config.diarizer.oracle_vad: - self._get_frame_level_VAD( - vad_processing_dir=diar_model.vad_pred_dir, - smoothing_type=diar_model_config.diarizer.vad.parameters.smoothing, - ) - - diar_hyp = {} - for k, audio_file_path in enumerate(self.audio_file_list): - uniq_id = get_uniqname_from_filepath(audio_file_path) - pred_rttm = os.path.join(self.root_path, 'pred_rttms', uniq_id + '.rttm') - diar_hyp[uniq_id] = rttm_to_labels(pred_rttm) - return diar_hyp, score - - def _get_frame_level_VAD(self, vad_processing_dir, smoothing_type=False): - """ - Read frame-level VAD outputs. - - Args: - vad_processing_dir (str): - Path to the directory where the VAD results are saved. - smoothing_type (bool or str): [False, median, mean] - type of smoothing applied softmax logits to smooth the predictions. - """ - if isinstance(smoothing_type, bool) and not smoothing_type: - ext_type = 'frame' - else: - ext_type = smoothing_type - - for uniq_id in self.AUDIO_RTTM_MAP: - frame_vad = os.path.join(vad_processing_dir, uniq_id + '.' + ext_type) - frame_vad_float_list = [] - with open(frame_vad, 'r') as fp: - for line in fp.readlines(): - frame_vad_float_list.append(float(line.strip())) - self.frame_VAD[uniq_id] = frame_vad_float_list - - @staticmethod - def gather_eval_results( - diar_score, - audio_rttm_map_dict: Dict[str, Dict[str, str]], - trans_info_dict: Dict[str, Dict[str, float]], - root_path: str, - decimals: int = 4, - ) -> Dict[str, Dict[str, float]]: - """ - Gather diarization evaluation results from pyannote DiarizationErrorRate metric object. - - Args: - metric (DiarizationErrorRate metric): - DiarizationErrorRate metric pyannote object - trans_info_dict (dict): - Dictionary containing word timestamps, speaker labels and words from all sessions. - Each session is indexed by unique ID as a key. 
- mapping_dict (dict): - Dictionary containing speaker mapping labels for each audio file with key as unique name - decimals (int): - The number of rounding decimals for DER value - - Returns: - der_results (dict): - Dictionary containing scores for each audio file along with aggregated results - """ - metric, mapping_dict, _ = diar_score - results = metric.results_ - der_results = {} - count_correct_spk_counting = 0 - for result in results: - key, score = result - if 'hyp_rttm_filepath' in audio_rttm_map_dict[key]: - pred_rttm = audio_rttm_map_dict[key]['hyp_rttm_filepath'] - else: - pred_rttm = os.path.join(root_path, 'pred_rttms', key + '.rttm') - pred_labels = rttm_to_labels(pred_rttm) - - ref_rttm = audio_rttm_map_dict[key]['rttm_filepath'] - ref_labels = rttm_to_labels(ref_rttm) - ref_n_spk = get_num_of_spk_from_labels(ref_labels) - est_n_spk = get_num_of_spk_from_labels(pred_labels) - - _DER, _CER, _FA, _MISS = ( - (score['confusion'] + score['false alarm'] + score['missed detection']) / score['total'], - score['confusion'] / score['total'], - score['false alarm'] / score['total'], - score['missed detection'] / score['total'], - ) - - der_results[key] = { - "DER": round(_DER, decimals), - "CER": round(_CER, decimals), - "FA": round(_FA, decimals), - "MISS": round(_MISS, decimals), - "est_n_spk": est_n_spk, - "ref_n_spk": ref_n_spk, - "mapping": mapping_dict[key], - } - count_correct_spk_counting += int(est_n_spk == ref_n_spk) - - DER, CER, FA, MISS = ( - abs(metric), - metric['confusion'] / metric['total'], - metric['false alarm'] / metric['total'], - metric['missed detection'] / metric['total'], - ) - der_results["total"] = { - "DER": DER, - "CER": CER, - "FA": FA, - "MISS": MISS, - "spk_counting_acc": count_correct_spk_counting / len(metric.results_), - } - - return der_results - - def _get_the_closest_silence_start( - self, vad_index_word_end: float, vad_frames: np.ndarray, offset: int = 10 - ) -> float: - """ - Find the closest silence frame from the given starting position. - - Args: - vad_index_word_end (float): - The timestamp of the end of the current word. - vad_frames (numpy.array): - The numpy array containing frame-level VAD probability. - params (dict): - Contains the parameters for diarization and ASR decoding. - - Returns: - cursor (float): - A timestamp of the earliest start of a silence region from - the given time point, vad_index_word_end. - """ - - cursor = vad_index_word_end + offset - limit = int(100 * self.max_word_ts_length_in_sec + vad_index_word_end) - while cursor < len(vad_frames): - if vad_frames[cursor] < self.vad_threshold_for_word_ts: - break - else: - cursor += 1 - if cursor > limit: - break - cursor = min(len(vad_frames) - 1, cursor) - cursor = round(cursor / 100.0, 2) - return cursor - - def _compensate_word_ts_list( - self, audio_file_list: List[str], word_ts_dict: Dict[str, List[float]], - ) -> Dict[str, List[List[float]]]: - """ - Compensate the word timestamps based on the VAD output. - The length of each word is capped by self.max_word_ts_length_in_sec. - - Args: - audio_file_list (list): - List containing audio file paths. - word_ts_dict (dict): - Dictionary containing timestamps of words. - - Returns: - enhanced_word_ts_dict (dict): - Dictionary containing the enhanced word timestamp values indexed by unique-IDs. 
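Worked example (illustrative numbers, no frame-level VAD available): a word spanning [0.0, 0.2] whose successor starts at 1.5 s is extended toward the next word, but the extension is capped at max_word_ts_length_in_sec (0.6 s as set in __init__):

word_len = 0.2                                   # original duration
len_to_next_word = round(1.5 - 0.0 - 0.01, 2)    # 1.49 s until the next word starts
fixed_word_len = max(min(0.6, len_to_next_word), word_len)   # -> 0.6
enhanced_ts = [0.0, 0.0 + fixed_word_len]                    # -> [0.0, 0.6]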
- """ - enhanced_word_ts_dict = {} - for idx, (uniq_id, word_ts_seq_list) in enumerate(word_ts_dict.items()): - N = len(word_ts_seq_list) - enhanced_word_ts_buffer = [] - for k, word_ts in enumerate(word_ts_seq_list): - if k < N - 1: - word_len = round(word_ts[1] - word_ts[0], 2) - len_to_next_word = round(word_ts_seq_list[k + 1][0] - word_ts[0] - 0.01, 2) - if uniq_id in self.frame_VAD: - vad_index_word_end = int(100 * word_ts[1]) - closest_sil_stt = self._get_the_closest_silence_start( - vad_index_word_end, self.frame_VAD[uniq_id] - ) - vad_est_len = round(closest_sil_stt - word_ts[0], 2) - else: - vad_est_len = len_to_next_word - min_candidate = min(vad_est_len, len_to_next_word) - fixed_word_len = max(min(self.max_word_ts_length_in_sec, min_candidate), word_len) - enhanced_word_ts_buffer.append([word_ts[0], word_ts[0] + fixed_word_len]) - else: - enhanced_word_ts_buffer.append([word_ts[0], word_ts[1]]) - - enhanced_word_ts_dict[uniq_id] = enhanced_word_ts_buffer - return enhanced_word_ts_dict - - def get_transcript_with_speaker_labels( - self, diar_hyp: Dict[str, List[str]], word_hyp: Dict[str, List[str]], word_ts_hyp: Dict[str, List[float]] - ) -> Dict[str, Dict[str, float]]: - """ - Match the diarization result with the ASR output. - The words and the timestamps for the corresponding words are matched in a for loop. - - Args: - diar_hyp (dict): - Dictionary of the Diarization output labels in str. Indexed by unique IDs. - - Example: - >>> diar_hyp['my_audio_01'] = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...] - - word_hyp (dict): - Dictionary of words from ASR inference. Indexed by unique IDs. - - Example: - >>> word_hyp['my_audio_01'] = ['hi', 'how', 'are', ...] - - word_ts_hyp (dict): - Dictionary containing the start time and the end time of each word. - Indexed by unique IDs. - - Example: - >>> word_ts_hyp['my_audio_01'] = [[0.0, 0.04], [0.64, 0.68], [0.84, 0.88], ...] - - Returns: - trans_info_dict (dict): - Dictionary containing word timestamps, speaker labels and words from all sessions. - Each session is indexed by a unique ID. - """ - trans_info_dict = {} - if self.fix_word_ts_with_VAD: - if self.frame_VAD == {}: - logging.warning( - f"VAD timestamps are not provided. Fixing word timestamps without VAD. Please check the hydra configurations." - ) - word_ts_refined = self._compensate_word_ts_list(self.audio_file_list, word_ts_hyp) - else: - word_ts_refined = word_ts_hyp - - if self.realigning_lm_params['arpa_language_model']: - if not ARPA: - raise ImportError( - 'LM for realigning is provided but arpa is not installed. 
Install arpa using PyPI: pip install arpa' - ) - else: - self.realigning_lm = self._load_realigning_LM() - - word_dict_seq_list = [] - for k, audio_file_path in enumerate(self.audio_file_list): - uniq_id = get_uniqname_from_filepath(audio_file_path) - words, diar_labels = word_hyp[uniq_id], diar_hyp[uniq_id] - word_ts, word_rfnd_ts = word_ts_hyp[uniq_id], word_ts_refined[uniq_id] - - # Assign speaker labels to words - word_dict_seq_list = self.get_word_level_json_list( - words=words, word_ts=word_ts, word_rfnd_ts=word_rfnd_ts, diar_labels=diar_labels - ) - if self.realigning_lm: - word_dict_seq_list = self.realign_words_with_lm(word_dict_seq_list) - - # Create a transscript information json dictionary from the output variables - trans_info_dict[uniq_id] = self._make_json_output(uniq_id, diar_labels, word_dict_seq_list) - logging.info(f"Diarization with ASR output files are saved in: {self.root_path}/pred_rttms") - return trans_info_dict - - def get_word_level_json_list( - self, - words: List[str], - diar_labels: List[str], - word_ts: List[List[float]], - word_rfnd_ts: List[List[float]] = None, - decimals: int = 2, - ) -> Dict[str, Dict[str, str]]: - """ - Assign speaker labels to each word and save the hypothesis words and speaker labels to - a dictionary variable for future use. - - Args: - uniq_id (str): - A unique ID (key) that identifies each input audio file. - diar_labels (list): - List containing the Diarization output labels in str. Indexed by unique IDs. - - Example: - >>> diar_labels = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...] - - words (list): - Dictionary of words from ASR inference. Indexed by unique IDs. - - Example: - >>> words = ['hi', 'how', 'are', ...] - - word_ts (list): - Dictionary containing the start time and the end time of each word. - Indexed by unique IDs. - - Example: - >>> word_ts = [[0.0, 0.04], [0.64, 0.68], [0.84, 0.88], ...] - - word_ts_refined (list): - Dictionary containing the refined (end point fixed) word timestamps based on hypothesis - word timestamps. Indexed by unique IDs. - - Example: - >>> word_rfnd_ts = [[0.0, 0.60], [0.64, 0.80], [0.84, 0.92], ...] - - Returns: - word_dict_seq_list (list): - List containing word by word dictionary containing word, timestamps and speaker labels. - - Example: - >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'}, - {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'}, - {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'}, - ...] - """ - if word_rfnd_ts is None: - word_rfnd_ts = word_ts - start_point, end_point, speaker = diar_labels[0].split() - word_pos, turn_idx = 0, 0 - word_dict_seq_list = [] - for word_idx, (word, word_ts_stt_end, refined_word_ts_stt_end) in enumerate(zip(words, word_ts, word_rfnd_ts)): - word_pos = self._get_word_timestamp_anchor(word_ts_stt_end) - if word_pos > float(end_point): - turn_idx += 1 - turn_idx = min(turn_idx, len(diar_labels) - 1) - start_point, end_point, speaker = diar_labels[turn_idx].split() - stt_sec = round(refined_word_ts_stt_end[0], decimals) - end_sec = round(refined_word_ts_stt_end[1], decimals) - word_dict_seq_list.append({'word': word, 'start_time': stt_sec, 'end_time': end_sec, 'speaker': speaker}) - return word_dict_seq_list - - def _make_json_output( - self, uniq_id: str, diar_labels: List[str], word_dict_seq_list: List[Dict[str, float]], - ) -> Dict[str, Dict[str, str]]: - """ - Generate json output files and transcripts from the ASR and diarization results. 
- - Args: - uniq_id (str): - A unique ID (key) that identifies each input audio file. - diar_labels (list): - List containing the diarization hypothesis timestamps - - Example: - >>> diar_hyp['my_audio_01'] = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...] - - word_dict_seq_list (list): - List containing words and corresponding word timestamps in dictionary format. - - Example: - >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'}, - {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'}, - {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'}, - ...] - - Returns: - session_result_dict (dict): - A dictionary containing overall results of diarization and ASR inference. - `session_result_dict` has following keys: `status`, `session_id`, `transcription`, `speaker_count`, - `words`, `sentences`. - - Example: - >>> session_trans_dict = \ - { - 'status': 'Success', - 'session_id': 'my_audio_01', - 'transcription': 'right and i really think ...', - 'speaker_count': 2, - 'words': [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'}, - {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'}, - {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'}, - ... - ] - 'sentences': [{'sentence': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'}, - {'sentence': 'and i really think ...', - 'start_time': 0.92, 'end_time': 4.12, 'speaker': 'speaker_0'}, - ... - ] - } - """ - word_seq_list, audacity_label_words = [], [] - start_point, end_point, speaker = diar_labels[0].split() - prev_speaker = speaker - - sentences, terms_list = [], [] - sentence = {'speaker': speaker, 'start_time': start_point, 'end_time': end_point, 'text': ''} - - n_spk = get_num_of_spk_from_labels(diar_labels) - logging.info(f"Creating results for Session: {uniq_id} n_spk: {n_spk} ") - session_trans_dict = self._init_session_trans_dict(uniq_id=uniq_id, n_spk=n_spk) - gecko_dict = self._init_session_gecko_dict() - - for k, word_dict in enumerate(word_dict_seq_list): - word, speaker = word_dict['word'], word_dict['speaker'] - word_seq_list.append(word) - start_point, end_point = word_dict['start_time'], word_dict['end_time'] - if speaker != prev_speaker: - if len(terms_list) != 0: - gecko_dict['monologues'].append( - {'speaker': {'name': None, 'id': prev_speaker}, 'terms': terms_list} - ) - terms_list = [] - - # remove trailing space in text - sentence['text'] = sentence['text'].strip() - - # store last sentence - sentences.append(sentence) - - # start construction of a new sentence - sentence = {'speaker': speaker, 'start_time': start_point, 'end_time': end_point, 'text': ''} - else: - # correct the ending time - sentence['end_time'] = end_point - - stt_sec, end_sec = start_point, end_point - terms_list.append({'start': stt_sec, 'end': end_sec, 'text': word, 'type': 'WORD'}) - - # add current word to sentence - sentence['text'] += word.strip() + ' ' - - audacity_label_words.append(get_audacity_label(word, stt_sec, end_sec, speaker)) - prev_speaker = speaker - - session_trans_dict['words'] = word_dict_seq_list - - # note that we need to add the very last sentence. 
- sentence['text'] = sentence['text'].strip() - sentences.append(sentence) - gecko_dict['monologues'].append({'speaker': {'name': None, 'id': speaker}, 'terms': terms_list}) - - # Speaker independent transcription - session_trans_dict['transcription'] = ' '.join(word_seq_list) - # add sentences to transcription information dict - session_trans_dict['sentences'] = sentences - self._write_and_log(uniq_id, session_trans_dict, audacity_label_words, gecko_dict, sentences) - return session_trans_dict - - def _get_realignment_ranges(self, k: int, word_seq_len: int) -> Tuple[int, int]: - """ - Calculate word ranges for realignment operation. - N1, N2 are calculated to not exceed the start and end of the input word sequence. - - Args: - k (int): - Index of the current word - word_seq_len (int): - Length of the sentence - - Returns: - N1 (int): - Start index of the word sequence - N2 (int): - End index of the word sequence - """ - if k < self.N_range[1]: - N1 = max(k, self.N_range[0]) - N2 = min(word_seq_len - k, self.N_range[1]) - elif k > (word_seq_len - self.N_range[1]): - N1 = min(k, self.N_range[1]) - N2 = max(word_seq_len - k, self.N_range[0]) - else: - N1, N2 = self.N_range[1], self.N_range[1] - return N1, N2 - - def _get_word_timestamp_anchor(self, word_ts_stt_end: List[float]) -> float: - """ - Determine a reference point to match a word with the diarization results. - word_ts_anchor_pos determines the position of a word in relation to the given diarization labels: - - 'start' uses the beginning of the word - - 'end' uses the end of the word - - 'mid' uses the mean of start and end of the word - - word_ts_anchor_offset determines how much offset we want to add to the anchor position. - It is recommended to use the default value. - - Args: - word_ts_stt_end (list): - List containing start and end of the decoded word. - - Returns: - word_pos (float): - Floating point number that indicates temporal location of the word. - """ - if self.params['word_ts_anchor_pos'] == 'start': - word_pos = word_ts_stt_end[0] - elif self.params['word_ts_anchor_pos'] == 'end': - word_pos = word_ts_stt_end[1] - elif self.params['word_ts_anchor_pos'] == 'mid': - word_pos = (word_ts_stt_end[0] + word_ts_stt_end[1]) / 2 - else: - logging.info( - f"word_ts_anchor_pos: {self.params['word_ts_anchor']} is not a supported option. Using the default 'start' option." - ) - word_pos = word_ts_stt_end[0] - - word_pos = word_pos + self.word_ts_anchor_offset - return word_pos - - def realign_words_with_lm(self, word_dict_seq_list: List[Dict[str, float]]) -> List[Dict[str, float]]: - """ - Realign the mapping between speaker labels and words using a language model. - The realigning process calculates the probability of the certain range around the words, - especially at the boundary between two hypothetical sentences spoken by different speakers. - - Example: - k-th word: "but" - - hyp_former: - since i think like tuesday but he's coming back to albuquerque - hyp_latter: - since i think like tuesday but he's coming back to albuquerque - - The joint probabilities of words in the sentence are computed for these two hypotheses. In addition, - logprob_diff_threshold parameter is used for reducing the false positive realigning. - - Args: - word_dict_seq_list (list): - List containing words and corresponding word timestamps in dictionary format. - - Returns: - realigned_list (list): - List of dictionaries containing words, word timestamps and speaker labels. 
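A hedged sketch of the decision rule implemented below: score the word window with the speaker boundary placed before vs. after the k-th word, and only move the word when the log-probability gap exceeds logprob_diff_threshold. The boundary tokens, window words and threshold here are placeholders; lm is an ARPA model loaded as in _load_realigning_LM().

import arpa

lm = arpa.loadf("/path/to/realigning_lm.arpa")[0]
boundary = ["</s>", "<s>"]                  # placeholder sentence-boundary tokens
left = ["since", "i", "think", "like", "tuesday"]
right = ["but", "he's", "coming", "back", "to", "albuquerque"]   # right[0] is the k-th word

split_before_k = lm.log_s(" ".join(left + boundary + right))
split_after_k = lm.log_s(" ".join(left + [right[0]] + boundary + right[1:]))
logprob_diff_threshold = 1.2                # placeholder threshold
if abs(split_before_k - split_after_k) > logprob_diff_threshold:
    pass  # reassign the k-th word to the speaker on the more likely side of the boundary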
- """ - word_seq_len = len(word_dict_seq_list) - hyp_w_dict_list, spk_list = [], [] - for k, line_dict in enumerate(word_dict_seq_list): - word, spk_label = line_dict['word'], line_dict['speaker'] - hyp_w_dict_list.append(word) - spk_list.append(spk_label) - - realigned_list = [] - org_spk_list = copy.deepcopy(spk_list) - for k, line_dict in enumerate(word_dict_seq_list): - if self.N_range[0] < k < (word_seq_len - self.N_range[0]) and ( - spk_list[k] != org_spk_list[k + 1] or spk_list[k] != org_spk_list[k - 1] - ): - N1, N2 = self._get_realignment_ranges(k, word_seq_len) - hyp_former = self.realigning_lm.log_s( - ' '.join(hyp_w_dict_list[k - N1 : k] + self.stt_end_tokens + hyp_w_dict_list[k : k + N2]) - ) - hyp_latter = self.realigning_lm.log_s( - ' '.join(hyp_w_dict_list[k - N1 : k + 1] + self.stt_end_tokens + hyp_w_dict_list[k + 1 : k + N2]) - ) - log_p = [hyp_former, hyp_latter] - p_order = np.argsort(log_p)[::-1] - if log_p[p_order[0]] > log_p[p_order[1]] + self.realigning_lm_params['logprob_diff_threshold']: - if p_order[0] == 0: - spk_list[k] = org_spk_list[k + 1] - line_dict['speaker'] = spk_list[k] - realigned_list.append(line_dict) - return realigned_list - - @staticmethod - def evaluate( - audio_file_list: List[str], - hyp_trans_info_dict: Dict[str, Dict[str, float]], - hyp_ctm_file_list: List[str] = None, - ref_ctm_file_list: List[str] = None, - ) -> Dict[str, Dict[str, float]]: - """ - Evaluate the result transcripts based on the provided CTM file. WER and cpWER are calculated to assess - the performance of ASR system and diarization at the same time. - - Args: - audio_file_list (list): - List containing file path to the input audio files. - hyp_trans_info_dict (dict): - Dictionary containing the hypothesis transcriptions for all sessions. - hyp_ctm_file_list (list): - List containing file paths of the hypothesis transcriptions in CTM format for all sessions. - ref_ctm_file_list (list): - List containing file paths of the reference transcriptions in CTM format for all sessions. - - Note: Either `hyp_trans_info_dict` or `hyp_ctm_file_list` should be provided. 
- - Returns: - wer_results (dict): - Session-by-session results including DER, miss rate, false alarm rate, WER and cpWER - """ - wer_results = {} - - if ref_ctm_file_list is not None: - spk_hypotheses, spk_references = [], [] - mix_hypotheses, mix_references = [], [] - WER_values, uniq_id_list = [], [] - - for k, (audio_file_path, ctm_file_path) in enumerate(zip(audio_file_list, ref_ctm_file_list)): - uniq_id = get_uniqname_from_filepath(audio_file_path) - uniq_id_list.append(uniq_id) - if uniq_id != get_uniqname_from_filepath(ctm_file_path): - raise ValueError("audio_file_list has mismatch in uniq_id with ctm_file_path") - - # Either hypothesis CTM file or hyp_trans_info_dict should be provided - if hyp_ctm_file_list is not None: - if uniq_id == get_uniqname_from_filepath(hyp_ctm_file_list[k]): - spk_hypothesis, mix_hypothesis = convert_ctm_to_text(hyp_ctm_file_list[k]) - else: - raise ValueError("Hypothesis CTM files are provided but uniq_id is mismatched") - elif hyp_trans_info_dict is not None and uniq_id in hyp_trans_info_dict: - spk_hypothesis, mix_hypothesis = convert_word_dict_seq_to_text( - hyp_trans_info_dict[uniq_id]['words'] - ) - else: - raise ValueError("Hypothesis information is not provided in the correct format.") - - spk_reference, mix_reference = convert_ctm_to_text(ctm_file_path) - - spk_hypotheses.append(spk_hypothesis) - spk_references.append(spk_reference) - mix_hypotheses.append(mix_hypothesis) - mix_references.append(mix_reference) - - # Calculate session by session WER value - WER_values.append(word_error_rate([mix_hypothesis], [mix_reference])) - - cpWER_values, hyps_spk, refs_spk = concat_perm_word_error_rate(spk_hypotheses, spk_references) - - # Take an average of cpWER and regular WER value on all sessions - wer_results['total'] = {} - wer_results['total']['average_cpWER'] = word_error_rate(hypotheses=hyps_spk, references=refs_spk) - wer_results['total']['average_WER'] = word_error_rate(hypotheses=mix_hypotheses, references=mix_references) - - for (uniq_id, cpWER, WER) in zip(uniq_id_list, cpWER_values, WER_values): - # Save session-level cpWER and WER values - wer_results[uniq_id] = {} - wer_results[uniq_id]['cpWER'] = cpWER - wer_results[uniq_id]['WER'] = WER - - return wer_results - - @staticmethod - def get_str_speech_labels(speech_labels_float: List[List[float]]) -> List[str]: - """ - Convert floating point speech labels list to a list containing string values. - - Args: - speech_labels_float (list): - List containing start and end timestamps of the speech segments in floating point type - speech_labels (list): - List containing start and end timestamps of the speech segments in string format - """ - speech_labels = [] - for start, end in speech_labels_float: - speech_labels.append("{:.3f} {:.3f} speech".format(start, end)) - return speech_labels - - @staticmethod - def write_session_level_result_in_csv( - der_results: Dict[str, Dict[str, float]], - wer_results: Dict[str, Dict[str, float]], - root_path: str, - csv_columns: List[str], - csv_file_name: str = "ctm_eval.csv", - ): - """ - This function is for development use when a CTM file is provided. - Saves the session-level diarization and ASR result into a csv file. - - Args: - wer_results (dict): - Dictionary containing session-by-session results of ASR and diarization in terms of - WER and cpWER. 
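Illustrative comparison of the two metrics computed in evaluate() above (toy strings; word_error_rate and concat_perm_word_error_rate are the functions imported at the top of this file). WER scores the session-level word sequence, while cpWER scores speaker-attributed transcripts under the best speaker permutation, so speaker confusions raise cpWER without affecting WER:

spk_hypotheses = [["hi how are you", "i am good thanks"]]   # one session, per-speaker hypotheses
spk_references = [["hi how are you", "i am good thanks"]]
mix_hypotheses = ["hi how are you i am good thanks"]
mix_references = ["hi how are you i am good thanks"]

cpWER_values, hyps_spk, refs_spk = concat_perm_word_error_rate(spk_hypotheses, spk_references)
session_wer = word_error_rate(hypotheses=mix_hypotheses, references=mix_references)
# cpWER_values == [0.0] and session_wer == 0.0 for this exact match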
- """ - target_path = f"{root_path}/pred_rttms" - os.makedirs(target_path, exist_ok=True) - logging.info(f"Writing {target_path}/{csv_file_name}") - total_result_jsons = get_total_result_dict(der_results, wer_results, csv_columns) - try: - with open(f"{target_path}/{csv_file_name}", 'w') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=csv_columns) - writer.writeheader() - for data in total_result_jsons: - writer.writerow(data) - except IOError: - logging.info("I/O error has occurred while writing a csv file.") - - def _break_lines(self, string_out: str, max_chars_in_line: int = 90) -> str: - """ - Break the lines in the transcript. - - Args: - string_out (str): - Input transcript with speaker labels - max_chars_in_line (int): - Maximum characters in each line - - Returns: - return_string_out (str): - String variable containing line breaking - """ - color_str_len = len('\033[1;00m') if self.params['colored_text'] else 0 - split_string_out = string_out.split('\n') - return_string_out = [] - for org_chunk in split_string_out: - buffer = [] - if len(org_chunk) - color_str_len > max_chars_in_line: - color_str = org_chunk[:color_str_len] if color_str_len > 0 else '' - for i in range(color_str_len, len(org_chunk), max_chars_in_line): - trans_str = org_chunk[i : i + max_chars_in_line] - if len(trans_str.strip()) > 0: - c_trans_str = color_str + trans_str - buffer.append(c_trans_str) - return_string_out.extend(buffer) - else: - return_string_out.append(org_chunk) - return_string_out = '\n'.join(return_string_out) - return return_string_out - - def _write_and_log( - self, - uniq_id: str, - session_trans_dict: Dict[str, Dict[str, float]], - audacity_label_words: List[str], - gecko_dict: Dict[str, Dict[str, float]], - sentences: List[Dict[str, float]], - ): - """ - Write output files and display logging messages. - - Args: - uniq_id (str): - A unique ID (key) that identifies each input audio file - session_trans_dict (dict): - Dictionary containing the transcription output for a session - audacity_label_words (list): - List containing word and word timestamp information in Audacity label format - gecko_dict (dict): - Dictionary formatted to be opened in Gecko software - sentences (list): - List containing sentence dictionary - """ - # print the sentences in the .txt output - string_out = self.print_sentences(sentences) - if self.params['break_lines']: - string_out = self._break_lines(string_out) - - session_trans_dict["status"] = "success" - ctm_lines_list = convert_word_dict_seq_to_ctm(session_trans_dict['words']) - - dump_json_to_file(f'{self.root_path}/pred_rttms/{uniq_id}.json', session_trans_dict) - dump_json_to_file(f'{self.root_path}/pred_rttms/{uniq_id}_gecko.json', gecko_dict) - write_txt(f'{self.root_path}/pred_rttms/{uniq_id}.ctm', '\n'.join(ctm_lines_list)) - write_txt(f'{self.root_path}/pred_rttms/{uniq_id}.txt', string_out.strip()) - write_txt(f'{self.root_path}/pred_rttms/{uniq_id}.w.label', '\n'.join(audacity_label_words)) - - @staticmethod - def print_errors(der_results: Dict[str, Dict[str, float]], wer_results: Dict[str, Dict[str, float]]): - """ - Print a slew of error metrics for ASR and Diarization. - - Args: - der_results (dict): - Dictionary containing FA, MISS, CER and DER values for both aggregated amount and - each session. - wer_results (dict): - Dictionary containing session-by-session WER and cpWER. `wer_results` only - exists when CTM files are provided. 
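`_break_lines` above splits each transcript line at fixed character offsets and re-attaches the ANSI color prefix to every chunk. For uncolored output, the standard library can do similar wrapping on whitespace instead of at fixed offsets; a rough, hypothetical equivalent:

```python
import textwrap


def break_lines_plain(string_out: str, max_chars_in_line: int = 90) -> str:
    """Wrap each transcript line to max_chars_in_line, preserving empty lines.
    Unlike _break_lines above, this breaks on whitespace and ignores ANSI color codes."""
    wrapped = []
    for line in string_out.split('\n'):
        wrapped.extend(textwrap.wrap(line, width=max_chars_in_line) or [''])
    return '\n'.join(wrapped)


print(break_lines_plain("speaker_0: " + "lorem ipsum " * 20, max_chars_in_line=40))
```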
- """ - DER_info = f"\nDER : {der_results['total']['DER']:.4f} \ - \nFA : {der_results['total']['FA']:.4f} \ - \nMISS : {der_results['total']['MISS']:.4f} \ - \nCER : {der_results['total']['CER']:.4f} \ - \nSpk. counting acc. : {der_results['total']['spk_counting_acc']:.4f}" - if wer_results is not None and len(wer_results) > 0: - logging.info( - DER_info - + f"\ncpWER : {wer_results['total']['average_cpWER']:.4f} \ - \nWER : {wer_results['total']['average_WER']:.4f}" - ) - else: - logging.info(DER_info) - - def print_sentences(self, sentences: List[Dict[str, float]]): - """ - Print a transcript with speaker labels and timestamps. - - Args: - sentences (list): - List containing sentence-level dictionaries. - - Returns: - string_out (str): - String variable containing transcript and the corresponding speaker label. - """ - # init output - string_out = '' - - for sentence in sentences: - # extract info - speaker = sentence['speaker'] - start_point = sentence['start_time'] - end_point = sentence['end_time'] - text = sentence['text'] - - if self.params['colored_text']: - color = self.color_palette.get(speaker, '\033[0;37m') - else: - color = '' - - # cast timestamp to the correct format - datetime_offset = 16 * 3600 - if float(start_point) > 3600: - time_str = '%H:%M:%S.%f' - else: - time_str = '%M:%S.%f' - start_point, end_point = max(float(start_point), 0), max(float(end_point), 0) - start_point_str = datetime.fromtimestamp(start_point - datetime_offset).strftime(time_str)[:-4] - end_point_str = datetime.fromtimestamp(end_point - datetime_offset).strftime(time_str)[:-4] - - if self.params['print_time']: - time_str = f'[{start_point_str} - {end_point_str}] ' - else: - time_str = '' - - # string out concatenation - string_out += f'{color}{time_str}{speaker}: {text}\n' - - return string_out diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/eval_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/eval_utils.py deleted file mode 100644 index 5838f3b4035d839ec6aff193f3966c9c53cb297f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/eval_utils.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
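`print_sentences` above renders each timestamp by pushing the raw second offsets through `datetime.fromtimestamp` with a fixed 16-hour correction, which quietly depends on the local timezone. A timezone-free sketch that formats elapsed seconds directly (hypothetical helper, not the NeMo code):

```python
def format_timestamp(seconds: float) -> str:
    """Format elapsed seconds as [HH:]MM:SS.hh for transcript display."""
    seconds = max(float(seconds), 0.0)
    hours, rem = divmod(seconds, 3600)
    minutes, secs = divmod(rem, 60)
    if hours >= 1:
        return f"{int(hours):02d}:{int(minutes):02d}:{secs:05.2f}"
    return f"{int(minutes):02d}:{secs:05.2f}"


print(format_timestamp(75.3))    # 01:15.30
print(format_timestamp(3725.8))  # 01:02:05.80
```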
- -import json -from typing import Tuple - -from nemo.collections.asr.metrics.wer import word_error_rate_detail -from nemo.utils import logging - - -def clean_label(_str: str, num_to_words: bool = True, langid="en") -> str: - """ - Remove unauthorized characters in a string, lower it and remove unneeded spaces - """ - replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] - replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] - replace_with_apos = [char for char in '‘’ʻ‘’‘'] - _str = _str.strip() - _str = _str.lower() - for i in replace_with_blank: - _str = _str.replace(i, "") - for i in replace_with_space: - _str = _str.replace(i, " ") - for i in replace_with_apos: - _str = _str.replace(i, "'") - if num_to_words: - if langid == "en": - _str = convert_num_to_words(_str, langid="en") - else: - logging.info( - "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages! Skipping!" - ) - - ret = " ".join(_str.split()) - return ret - - -def convert_num_to_words(_str: str, langid: str = "en") -> str: - """ - Convert digits to corresponding words. Note this is a naive approach and could be replaced with text normalization. - """ - if langid == "en": - num_to_words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] - _str = _str.strip() - words = _str.split() - out_str = "" - num_word = [] - for word in words: - if word.isdigit(): - num = int(word) - while num: - digit = num % 10 - digit_word = num_to_words[digit] - num_word.append(digit_word) - num = int(num / 10) - if not (num): - num_str = "" - num_word = num_word[::-1] - for ele in num_word: - num_str += ele + " " - out_str += num_str + " " - num_word.clear() - else: - out_str += word + " " - out_str = out_str.strip() - else: - raise ValueError( - "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages!" - ) - return out_str - - -def cal_write_wer( - pred_manifest: str = None, - pred_text_attr_name: str = "pred_text", - clean_groundtruth_text: bool = False, - langid: str = 'en', - use_cer: bool = False, - output_filename: str = None, -) -> Tuple[str, dict, str]: - """ - Calculate wer, inserion, deletion and substitution rate based on groundtruth text and pred_text_attr_name (pred_text) - We use WER in function name as a convention, but Error Rate (ER) currently support Word Error Rate (WER) and Character Error Rate (CER) - """ - samples = [] - hyps = [] - refs = [] - eval_metric = "cer" if use_cer else "wer" - - with open(pred_manifest, 'r') as fp: - for line in fp: - sample = json.loads(line) - - if 'text' not in sample: - logging.info( - "ground-truth text is not present in manifest! Cannot calculate Word Error Rate. Returning!" 
- ) - return None, None, eval_metric - - hyp = sample[pred_text_attr_name] - ref = sample['text'] - - if clean_groundtruth_text: - ref = clean_label(ref, langid=langid) - - wer, tokens, ins_rate, del_rate, sub_rate = word_error_rate_detail( - hypotheses=[hyp], references=[ref], use_cer=use_cer - ) - sample[eval_metric] = wer # evaluatin metric, could be word error rate of character error rate - sample['tokens'] = tokens # number of word/characters/tokens - sample['ins_rate'] = ins_rate # insertion error rate - sample['del_rate'] = del_rate # deletion error rate - sample['sub_rate'] = sub_rate # substitution error rate - - samples.append(sample) - hyps.append(hyp) - refs.append(ref) - - total_wer, total_tokens, total_ins_rate, total_del_rate, total_sub_rate = word_error_rate_detail( - hypotheses=hyps, references=refs, use_cer=use_cer - ) - - if not output_filename: - output_manifest_w_wer = pred_manifest - else: - output_manifest_w_wer = output_filename - - with open(output_manifest_w_wer, 'w') as fout: - for sample in samples: - json.dump(sample, fout) - fout.write('\n') - fout.flush() - - total_res = { - "samples": len(samples), - "tokens": total_tokens, - eval_metric: total_wer, - "ins_rate": total_ins_rate, - "del_rate": total_del_rate, - "sub_rate": total_sub_rate, - } - return output_manifest_w_wer, total_res, eval_metric diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/manifest_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/manifest_utils.py deleted file mode 100644 index 39bd6a8a24e7aa0843453d5dfeee677c185f56db..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/manifest_utils.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from collections import Counter -from collections import OrderedDict as od -from pathlib import Path -from typing import Dict, List, Union - -import librosa -import numpy as np - -from nemo.collections.asr.parts.utils.speaker_utils import ( - audio_rttm_map, - get_subsegments, - get_uniqname_from_filepath, - rttm_to_labels, - segments_manifest_to_subsegments_manifest, - write_rttm2manifest, -) -from nemo.utils.data_utils import DataStoreObject - - -def rreplace(s: str, old: str, new: str) -> str: - """ - Replace end of string. - - Args: - s (str): string to operate on - old (str): ending of string to replace - new (str): replacement for ending of string - Returns: - new.join(li) (string): new string with end replaced - """ - li = s.rsplit(old, 1) - return new.join(li) - - -def get_uniq_id_with_period(path: str) -> str: - """ - Get uniq_id from path string with period in it. 
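A usage sketch for `cal_write_wer` above, assuming a hypothetical `pred.json` manifest whose lines carry both a `text` reference and a `pred_text` hypothesis; the keys read from `total_res` are the ones the function populates:

```python
from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer

# "pred.json" and "pred_with_wer.json" are hypothetical file names.
output_manifest, total_res, metric = cal_write_wer(
    pred_manifest="pred.json",
    pred_text_attr_name="pred_text",
    clean_groundtruth_text=True,
    langid="en",
    use_cer=False,
    output_filename="pred_with_wer.json",
)
if total_res is not None:
    print(f"{metric}: {total_res[metric]:.4f} over {total_res['samples']} samples")
    print(f"ins/del/sub: {total_res['ins_rate']:.4f} / {total_res['del_rate']:.4f} / {total_res['sub_rate']:.4f}")
```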
- - Args: - path (str): path to audio file - Returns: - uniq_id (str): unique speaker ID - """ - split_path = os.path.basename(path).split('.')[:-1] - uniq_id = '.'.join(split_path) if len(split_path) > 1 else split_path[0] - return uniq_id - - -def get_subsegment_dict(subsegments_manifest_file: str, window: float, shift: float, deci: int) -> Dict[str, dict]: - """ - Get subsegment dictionary from manifest file. - - Args: - subsegments_manifest_file (str): Path to subsegment manifest file - window (float): Window length for segmentation - shift (float): Shift length for segmentation - deci (int): Rounding number of decimal places - Returns: - _subsegment_dict (dict): Subsegment dictionary - """ - _subsegment_dict = {} - with open(subsegments_manifest_file, 'r') as subsegments_manifest: - segments = subsegments_manifest.readlines() - for segment in segments: - segment = segment.strip() - dic = json.loads(segment) - audio, offset, duration, label = dic['audio_filepath'], dic['offset'], dic['duration'], dic['label'] - subsegments = get_subsegments(offset=offset, window=window, shift=shift, duration=duration) - if dic['uniq_id'] is not None: - uniq_id = dic['uniq_id'] - else: - uniq_id = get_uniq_id_with_period(audio) - if uniq_id not in _subsegment_dict: - _subsegment_dict[uniq_id] = {'ts': [], 'json_dic': []} - for subsegment in subsegments: - start, dur = subsegment - _subsegment_dict[uniq_id]['ts'].append([round(start, deci), round(start + dur, deci)]) - _subsegment_dict[uniq_id]['json_dic'].append(dic) - return _subsegment_dict - - -def get_input_manifest_dict(input_manifest_path: str) -> Dict[str, dict]: - """ - Get dictionary from manifest file. - - Args: - input_manifest_path (str): Path to manifest file - Returns: - input_manifest_dict (dict): Dictionary from manifest file - """ - input_manifest_dict = {} - with open(input_manifest_path, 'r') as input_manifest_fp: - json_lines = input_manifest_fp.readlines() - for json_line in json_lines: - dic = json.loads(json_line) - dic["text"] = "-" - uniq_id = get_uniqname_from_filepath(dic["audio_filepath"]) - input_manifest_dict[uniq_id] = dic - return input_manifest_dict - - -def write_truncated_subsegments( - input_manifest_dict: Dict[str, dict], - _subsegment_dict: Dict[str, dict], - output_manifest_path: str, - step_count: int, - deci: int, -): - """ - Write subsegments to manifest filepath. - - Args: - input_manifest_dict (dict): Input manifest dictionary - _subsegment_dict (dict): Input subsegment dictionary - output_manifest_path (str): Path to output manifest file - step_count (int): Number of the unit segments you want to create per utterance - deci (int): Rounding number of decimal places - """ - with open(output_manifest_path, 'w') as output_manifest_fp: - for uniq_id, subseg_dict in _subsegment_dict.items(): - subseg_array = np.array(subseg_dict['ts']) - subseg_array_idx = np.argsort(subseg_array, axis=0) - chunked_set_count = subseg_array_idx.shape[0] // step_count - - for idx in range(chunked_set_count - 1): - chunk_index_stt = subseg_array_idx[:, 0][idx * step_count] - chunk_index_end = subseg_array_idx[:, 1][(idx + 1) * step_count] - offset_sec = subseg_array[chunk_index_stt, 0] - end_sec = subseg_array[chunk_index_end, 1] - dur = round(end_sec - offset_sec, deci) - meta = input_manifest_dict[uniq_id] - meta['offset'] = offset_sec - meta['duration'] = dur - json.dump(meta, output_manifest_fp) - output_manifest_fp.write("\n") - - -def write_file(name: str, lines: List[dict], idx: int): - """ - Write json lines to file. 
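`get_subsegment_dict` above leans on `get_subsegments` (imported from `speaker_utils`) to split each speech segment into overlapping windows. A simplified stand-in that shows only the windowing arithmetic: starts advance by `shift`, every subsegment is at most `window` long, and the tail is clipped to the segment end.

```python
def simple_subsegments(offset: float, window: float, shift: float, duration: float):
    """Split [offset, offset + duration] into (start, dur) windows.
    A simplified stand-in for speaker_utils.get_subsegments, not the NeMo implementation."""
    subsegments = []
    start, end = offset, offset + duration
    while start < end:
        subsegments.append((round(start, 3), round(min(window, end - start), 3)))
        start += shift
    return subsegments


print(simple_subsegments(offset=0.0, window=1.5, shift=0.75, duration=3.0))
# [(0.0, 1.5), (0.75, 1.5), (1.5, 1.5), (2.25, 0.75)]
```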
- - Args: - name (str): Output file path - lines (list): List of json lines - idx (int): Indices to dump to the file - """ - with open(name, 'w') as fout: - for i in idx: - dic = lines[i] - json.dump(dic, fout) - fout.write('\n') - - -def read_file(pathlist: str) -> List[str]: - """ - Read list of lines from target file. - - Args: - pathlist (str): Input file path - Returns: - sorted(pathlist) (list): List of lines - """ - with open(pathlist, 'r') as f: - pathlist = f.readlines() - return sorted(pathlist) - - -def get_dict_from_wavlist(pathlist: List[str]) -> Dict[str, str]: - """ - Read dictionaries from list of lines - - Args: - pathlist (list): List of file paths - Returns: - path_dict (dict): Dictionary containing dictionaries read from files - """ - path_dict = od() - pathlist = sorted(pathlist) - for line_path in pathlist: - uniq_id = os.path.basename(line_path).split('.')[0] - path_dict[uniq_id] = line_path - return path_dict - - -def get_dict_from_list(data_pathlist: List[str], uniqids: List[str]) -> Dict[str, str]: - """ - Create dictionaries from list of lines - - Args: - data_pathlist (list): List of file paths - uniqids (list): List of file IDs - Returns: - path_dict (dict): Dictionary containing file paths - """ - path_dict = {} - for line_path in data_pathlist: - uniq_id = os.path.basename(line_path).split('.')[0] - if uniq_id in uniqids: - path_dict[uniq_id] = line_path - else: - raise ValueError(f'uniq id {uniq_id} is not in wav filelist') - return path_dict - - -def get_path_dict(data_path: str, uniqids: List[str], len_wavs: int = None) -> Dict[str, str]: - """ - Create dictionary from list of lines (using the get_dict_from_list function) - - Args: - data_path (str): Path to file containing list of files - uniqids (list): List of file IDs - len_wavs (int): Length of file list - Returns: - data_pathdict (dict): Dictionary containing file paths - """ - if data_path is not None: - data_pathlist = read_file(data_path) - if len_wavs is not None: - assert len(data_pathlist) == len_wavs - data_pathdict = get_dict_from_list(data_pathlist, uniqids) - elif len_wavs is not None: - data_pathdict = {uniq_id: None for uniq_id in uniqids} - return data_pathdict - - -def create_segment_manifest( - input_manifest_path: str, output_manifest_path: str, window: float, shift: float, step_count: int, deci: int -): - """ - Create segmented manifest file from base manifest file - - Args: - input_manifest_path (str): Path to input manifest file - output_manifest_path (str): Path to output manifest file - window (float): Window length for segmentation - shift (float): Shift length for segmentation - step_count (int): Number of the unit segments you want to create per utterance - deci (int): Rounding number of decimal places - """ - if '.json' not in input_manifest_path: - raise ValueError("input_manifest_path file should be .json file format") - if output_manifest_path and '.json' not in output_manifest_path: - raise ValueError("output_manifest_path file should be .json file format") - elif not output_manifest_path: - output_manifest_path = rreplace(input_manifest_path, '.json', f'_{step_count}seg.json') - - input_manifest_dict = get_input_manifest_dict(input_manifest_path) - segment_manifest_path = rreplace(input_manifest_path, '.json', '_seg.json') - subsegment_manifest_path = rreplace(input_manifest_path, '.json', '_subseg.json') - min_subsegment_duration = 0.05 - step_count = int(step_count) - - AUDIO_RTTM_MAP = audio_rttm_map(input_manifest_path) - segments_manifest_file = 
write_rttm2manifest(AUDIO_RTTM_MAP, segment_manifest_path, deci) - subsegments_manifest_file = subsegment_manifest_path - segments_manifest_to_subsegments_manifest( - segments_manifest_file, subsegments_manifest_file, window, shift, min_subsegment_duration, - ) - subsegments_dict = get_subsegment_dict(subsegments_manifest_file, window, shift, deci) - write_truncated_subsegments(input_manifest_dict, subsegments_dict, output_manifest_path, step_count, deci) - os.remove(segment_manifest_path) - os.remove(subsegment_manifest_path) - - -def create_manifest( - wav_path: str, - manifest_filepath: str, - text_path: str = None, - rttm_path: str = None, - uem_path: str = None, - ctm_path: str = None, - add_duration: bool = False, -): - """ - Create base manifest file - - Args: - wav_path (str): Path to list of wav files - manifest_filepath (str): Path to output manifest file - text_path (str): Path to list of text files - rttm_path (str): Path to list of rttm files - uem_path (str): Path to list of uem files - ctm_path (str): Path to list of ctm files - add_duration (bool): Whether to add durations to the manifest file - """ - if os.path.exists(manifest_filepath): - os.remove(manifest_filepath) - wav_pathlist = read_file(wav_path) - wav_pathdict = get_dict_from_wavlist(wav_pathlist) - len_wavs = len(wav_pathlist) - uniqids = sorted(wav_pathdict.keys()) - - text_pathdict = get_path_dict(text_path, uniqids, len_wavs) - rttm_pathdict = get_path_dict(rttm_path, uniqids, len_wavs) - uem_pathdict = get_path_dict(uem_path, uniqids, len_wavs) - ctm_pathdict = get_path_dict(ctm_path, uniqids, len_wavs) - - lines = [] - for uid in uniqids: - wav, text, rttm, uem, ctm = ( - wav_pathdict[uid], - text_pathdict[uid], - rttm_pathdict[uid], - uem_pathdict[uid], - ctm_pathdict[uid], - ) - - audio_line = wav.strip() - if rttm is not None: - rttm = rttm.strip() - labels = rttm_to_labels(rttm) - num_speakers = Counter([l.split()[-1] for l in labels]).keys().__len__() - else: - num_speakers = None - - if uem is not None: - uem = uem.strip() - - if text is not None: - with open(text.strip()) as f: - text = f.readlines()[0].strip() - else: - text = "-" - - if ctm is not None: - ctm = ctm.strip() - - duration = None - if add_duration: - y, sr = librosa.load(audio_line, sr=None) - duration = librosa.get_duration(y=y, sr=sr) - meta = [ - { - "audio_filepath": audio_line, - "offset": 0, - "duration": duration, - "label": "infer", - "text": text, - "num_speakers": num_speakers, - "rttm_filepath": rttm, - "uem_filepath": uem, - "ctm_filepath": ctm, - } - ] - lines.extend(meta) - - write_file(manifest_filepath, lines, range(len(lines))) - - -def read_manifest(manifest: Union[Path, str]) -> List[dict]: - """ - Read manifest file - - Args: - manifest (str or Path): Path to manifest file - Returns: - data (list): List of JSON items - """ - manifest = DataStoreObject(str(manifest)) - - data = [] - try: - f = open(manifest.get(), 'r', encoding='utf-8') - except: - raise Exception(f"Manifest file could not be opened: {manifest}") - for line in f: - item = json.loads(line) - data.append(item) - f.close() - return data - - -def write_manifest(output_path: Union[Path, str], target_manifest: List[dict], ensure_ascii: bool = True): - """ - Write to manifest file - - Args: - output_path (str or Path): Path to output manifest file - target_manifest (list): List of manifest file entries - ensure_ascii (bool): default is True, meaning the output is guaranteed to have all incoming non-ASCII characters escaped. 
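Each entry emitted by `create_manifest` above is a flat JSON object written one per line. A minimal sketch of that schema, using invented file paths and the `write_manifest` helper defined later in this module:

```python
from nemo.collections.asr.parts.utils.manifest_utils import write_manifest

# All paths below are hypothetical and only illustrate the schema.
entry = {
    "audio_filepath": "/data/session_01.wav",
    "offset": 0,
    "duration": None,          # filled in only when add_duration=True
    "label": "infer",
    "text": "-",
    "num_speakers": 2,
    "rttm_filepath": "/data/session_01.rttm",
    "uem_filepath": None,
    "ctm_filepath": None,
}
write_manifest("input_manifest.json", [entry])
```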
If ensure_ascii is false, these characters will be output as-is. - """ - with open(output_path, "w", encoding="utf-8") as outfile: - for tgt in target_manifest: - json.dump(tgt, outfile, ensure_ascii=ensure_ascii) - outfile.write('\n') - - -def write_ctm(output_path: str, target_ctm: Dict[str, dict]): - """ - Write ctm entries from diarization session to a .ctm file. - - Args: - output_path (str): target file path - target_ctm (dict): list of ctm entries - """ - target_ctm.sort(key=lambda y: y[0]) - with open(output_path, "w") as outfile: - for pair in target_ctm: - tgt = pair[1] - outfile.write(tgt) - - -def write_text(output_path: str, target_ctm: Dict[str, dict]): - """ - Write text from diarization session to a .txt file - - Args: - output_path (str): target file path - target_ctm (dict): list of ctm entries - """ - target_ctm.sort(key=lambda y: y[0]) - with open(output_path, "w") as outfile: - for pair in target_ctm: - tgt = pair[1] - word = tgt.split(' ')[4] - outfile.write(word + ' ') - outfile.write('\n') diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/numba_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/numba_utils.py deleted file mode 100644 index 8a5048aaf7486255f4d75c6e515756bd0b5c5b1d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/numba_utils.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -from numba import jit - - -def phase_vocoder(D: np.ndarray, rate: float, phi_advance: np.ndarray, scale_buffer: np.ndarray): - """ - Optimized implementation of phase vocoder from Librosa. - Reference implementation: - - https://librosa.github.io/librosa/generated/librosa.core.phase_vocoder.html - Args: - D: Complex spectograms of shape [d, t, complex=2]. - rate: Speed rate, must be float greater than 0. - phi_advance: Precomputed phase advance buffer array of length [n_fft + 1] - scale_buffer: Precomputed numpy buffer array of length [n_fft + 1] - Returns: - Complex64 ndarray of shape [d, t / rate, complex=2] - """ - time_steps = np.arange(0, D.shape[1], rate, dtype=np.float) - - # Create an empty output array - d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F') - - # Phase accumulator; initialize to the first sample - phase_acc = np.angle(D[:, 0]) - - # Pad 0 columns to simplify boundary logic - D = np.pad(D, [(0, 0), (0, 2)], mode='constant') - - d_stretch = _phase_vocoder_kernel(D, time_steps, phi_advance, d_stretch, phase_acc, scale_buffer) - - return d_stretch - - -@jit(nopython=True, nogil=True) -def _phase_vocoder_kernel(D, time_steps, phi_advance, d_stretch, phase_acc, scale_buffer): - """ - Numba optimized kernel to compute the phase vocoder step. - Args: - D: Complex spectograms of shape [d, t, complex=2]. - rate: Speed rate, must be float greater than 0. 
- time_steps: Numpy ndarray of linearly spaced time steps, shape = [t] - phi_advance: Precomputed phase advance buffer array of length [n_fft + 1] - d_stretch: Output complex matrix of shape [d, t / rate, complex=2] - phase_acc: Phase accumulator initialized to first sample of shape [d, complex=2] - scale_buffer: Precomputed numpy buffer array of length [n_fft + 1] - Returns: - Complex64 ndarray of shape [d, t / rate, complex=2] - """ - two_pi = 2.0 * np.pi - - for (t, step) in enumerate(time_steps): - columns = D[:, int(step) : int(step + 2)] - columns_0 = columns[:, 0] - columns_1 = columns[:, 1] - - # Weighting for linear magnitude interpolation - alpha = np.mod(step, 1.0) - mag = (1.0 - alpha) * np.abs(columns_0) + alpha * np.abs(columns_1) - - # Store to output array - d_stretch[:, t] = mag * np.exp(1.0j * phase_acc) - - # Compute phase advance - dphase = np.angle(columns_1) - np.angle(columns_0) - phi_advance - - # Wrap to -pi:pi range - scale = dphase / two_pi - np.round(scale, 0, scale_buffer) - - dphase = dphase - two_pi * scale_buffer - - # Accumulate phase - phase_acc += phi_advance + dphase - - return d_stretch diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/offline_clustering.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/offline_clustering.py deleted file mode 100644 index d62ba23d8b6bbf3a06680d3c4d8c6612386ca8b3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/offline_clustering.py +++ /dev/null @@ -1,1327 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2007-2020 The scikit-learn developers. - -# BSD 3-Clause License - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# NME-SC clustering is based on the implementation from the paper -# https://arxiv.org/pdf/2003.02405.pdf and the implementation from -# https://github.com/tango4j/Auto-Tuning-Spectral-Clustering. - -from typing import Dict, List, Tuple - -import torch -from torch.linalg import eigh, eigvalsh - - -def cos_similarity(emb_a: torch.Tensor, emb_b: torch.Tensor, eps=torch.tensor(3.5e-4)) -> torch.Tensor: - """ - Calculate cosine similarities of the given two set of tensors. 
The output is an N by N - matrix where N is the number of feature vectors. - - Args: - a (Tensor): - Matrix containing speaker representation vectors. (N x embedding_dim) - b (Tensor): - Matrix containing speaker representation vectors. (N x embedding_dim) - - Returns: - res (Tensor): - N by N matrix containing the cosine similarities of the values. - """ - # If number of embedding count is 1, it creates nan values - if emb_a.shape[0] == 1 or emb_b.shape[0] == 1: - raise ValueError(f"Number of feature vectors should be greater than 1 but got {emb_a.shape} and {emb_b.shape}") - a_norm = emb_a / (torch.norm(emb_a, dim=1).unsqueeze(1) + eps) - b_norm = emb_b / (torch.norm(emb_b, dim=1).unsqueeze(1) + eps) - res = torch.mm(a_norm, b_norm.transpose(0, 1)) - res.fill_diagonal_(1) - return res - - -def ScalerMinMax(X: torch.Tensor) -> torch.Tensor: - """ - Min-max scale the input affinity matrix X, which will lead to a dynamic range of [0, 1]. - - Args: - X (Tensor): - Matrix containing cosine similarity values among embedding vectors (N x N) - - Returns: - v_norm (Tensor): - Min-max normalized value of X. - """ - v_min, v_max = X.min(), X.max() - v_norm = (X - v_min) / (v_max - v_min) - return v_norm - - -def getEuclideanDistance( - specEmbA: torch.Tensor, specEmbB: torch.Tensor, device: torch.device = torch.device('cpu') -) -> torch.Tensor: - """ - Calculate Euclidean distances from the given feature tensors. - - Args: - specEmbA (Tensor): - Matrix containing spectral embedding vectors from eigenvalue decomposition (N x embedding_dim). - specEmbB (Tensor): - Matrix containing spectral embedding vectors from eigenvalue decomposition (N x embedding_dim). - - Returns: - dis (Tensor): - Euclidean distance values of the two sets of spectral embedding vectors. - """ - specEmbA, specEmbB = specEmbA.to(device), specEmbB.to(device) - A, B = specEmbA.unsqueeze(dim=1), specEmbB.unsqueeze(dim=0) - dis = (A - B) ** 2.0 - dis = dis.sum(dim=-1).squeeze() - return dis - - -def kmeans_plusplus_torch( - X: torch.Tensor, - n_clusters: int, - random_state: int, - n_local_trials: int = 30, - device: torch.device = torch.device('cpu'), -): - """ - Choose initial centroids for initializing k-means algorithm. The performance of - k-means algorithm can vary significantly by the initial centroids. To alleviate - this problem, k-means++ algorithm chooses initial centroids based on the probability - proportional to the distance from the formally chosen centroids. The centroids - selected by k-means++ algorithm improve the chance of getting more accurate and - stable clustering results. The overall implementation of k-means++ algorithm is - inspired by the numpy based k-means++ implementation in: - https://github.com/scikit-learn/scikit-learn - - Originally, the implementation of the k-means++ algorithm in scikit-learn is based - on the following research article: - Arthur, David, and Sergei Vassilvitskii. k-means++: The advantages of careful - seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete - algorithms, Society for Industrial and Applied Mathematics (2007) - - Args: - X (Tensor): - Matrix containing cosine similarity values among embedding vectors (N x N) - n_clusters (int): - Maximum number of speakers for estimating number of speakers. - Shows stable performance under 20. - random_state (int): - Seed variable for setting up a random state. - n_local_trials (int): - Number of trials for creating initial values of the center points. - device (torch.device) - Torch device variable. 
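The affinity construction above is just L2-normalized embeddings multiplied together and then min-max rescaled. A quick shape-and-range check on random embeddings, assuming `cos_similarity` and `ScalerMinMax` are importable from the module path shown in the diff header:

```python
import torch

from nemo.collections.asr.parts.utils.offline_clustering import ScalerMinMax, cos_similarity

torch.manual_seed(0)
embs = torch.randn(6, 192)                            # 6 segment embeddings, 192-dim
affinity = ScalerMinMax(cos_similarity(embs, embs))
print(affinity.shape)                                 # torch.Size([6, 6])
print(float(affinity.min()), float(affinity.max()))   # 0.0 1.0
```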
- - Returns: - centers (Tensor): - The coordinates for center points that are used for initializing k-means algorithm. - indices (Tensor): - The indices of the best candidate center points. - """ - torch.manual_seed(random_state) - X = X.to(device) - n_samples, n_features = X.shape - - centers = torch.zeros(n_clusters, n_features, dtype=X.dtype) - center_id = torch.randint(0, n_samples, (1,)).long() - indices = torch.full([n_clusters,], -1, dtype=torch.int) - - centers[0] = X[center_id].squeeze(0) - indices[0] = center_id.squeeze(0) - - centers = centers.to(device) - closest_dist_diff = centers[0, None].repeat(1, X.shape[0]).view(X.shape[0], -1) - X - closest_dist_sq = closest_dist_diff.pow(2).sum(dim=1).unsqueeze(dim=0) - current_pot = closest_dist_sq.sum() - - for c in range(1, n_clusters): - rand_vals = torch.rand(n_local_trials) * current_pot.item() - - if len(closest_dist_sq.shape) > 1: - torch_cumsum = torch.cumsum(closest_dist_sq, dim=1)[0] - else: - torch_cumsum = torch.cumsum(closest_dist_sq, dim=0) - - candidate_ids = torch.searchsorted(torch_cumsum, rand_vals.to(device)) - - N_ci = candidate_ids.shape[0] - distance_diff = X[candidate_ids].repeat(1, X.shape[0]).view(X.shape[0] * N_ci, -1) - X.repeat(N_ci, 1) - distance = distance_diff.pow(2).sum(dim=1).view(N_ci, -1) - distance_to_candidates = torch.minimum(closest_dist_sq, distance) - candidates_pot = distance_to_candidates.sum(dim=1) - - best_candidate = torch.argmin(candidates_pot) - current_pot = candidates_pot[best_candidate] - closest_dist_sq = distance_to_candidates[best_candidate] - best_candidate = candidate_ids[best_candidate] - - centers[c] = X[best_candidate] - indices[c] = best_candidate - return centers, indices - - -def kmeans_torch( - X: torch.Tensor, - num_clusters: int, - threshold: float = 1e-4, - iter_limit: int = 15, - random_state: int = 0, - device: torch.device = torch.device('cpu'), -) -> torch.Tensor: - """ - Run k-means algorithm on the given set of spectral embeddings in X. The threshold - and iter_limit variables are set to show the best performance on speaker diarization - tasks. The overall implementation of k-means algorithm is inspired by the k-means - algorithm implemented in https://github.com/scikit-learn/scikit-learn. - - References: - Arthur, David, and Sergei Vassilvitskii. k-means++: The advantages of careful - seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete - algorithms, Society for Industrial and Applied Mathematics (2007). - - Args: - X (Tensor): - Cosine similarity matrix calculated from speaker embeddings - num_clusters (int): - The estimated number of speakers. - threshold (float): - This threshold limits the change of center values. If the square of - the center shift values are bigger than this threshold, the iteration stops. - iter_limit (int): - The maximum number of iterations that is allowed by the k-means algorithm. - device (torch.device): - Torch device variable - - Returns: - selected_cluster_indices (Tensor): - The assigned cluster labels from the k-means clustering. - """ - # Convert tensor type to float - X = X.float().to(device) - input_size = X.shape[0] - - # Initialize the cluster centers with kmeans_plusplus algorithm. 
- plusplus_init_states = kmeans_plusplus_torch(X, n_clusters=num_clusters, random_state=random_state, device=device) - centers = plusplus_init_states[0] - - selected_cluster_indices = torch.zeros(input_size).long() - - for iter_count in range(iter_limit): - euc_dist = getEuclideanDistance(X, centers, device=device) - - if len(euc_dist.shape) <= 1: - break - else: - selected_cluster_indices = torch.argmin(euc_dist, dim=1) - - center_inits = centers.clone() - - for index in range(num_clusters): - selected_cluster = torch.nonzero(selected_cluster_indices == index).squeeze().to(device) - chosen_indices = torch.index_select(X, 0, selected_cluster) - - if chosen_indices.shape[0] == 0: - chosen_indices = X[torch.randint(len(X), (1,))] - - centers[index] = chosen_indices.mean(dim=0) - - # Calculate the delta from center_inits to centers - center_delta_pow = torch.pow((centers - center_inits), 2) - center_shift_pow = torch.pow(torch.sum(torch.sqrt(torch.sum(center_delta_pow, dim=1))), 2) - - # If the cluster centers are not changing significantly, stop the loop. - if center_shift_pow < threshold: - break - - return selected_cluster_indices - - -def getTheLargestComponent(affinity_mat: torch.Tensor, seg_index: int, device: torch.device) -> torch.Tensor: - """ - Find the largest affinity_mat connected components for each given node. - This is for checking whether the affinity_mat is fully connected. - - Args: - affinity_mat (Tensor): - A square matrix (tensor) containing normalized cosine distance values - seg_index (int): - The segment index that is targeted to be explored. - - Returns: - connected_nodes (Tensor): - A tensor containing booleans that indicate whether the node is connected. - """ - num_of_segments = affinity_mat.shape[0] - - connected_nodes = torch.zeros(num_of_segments, dtype=torch.bool).to(device) - nodes_to_explore = torch.zeros(num_of_segments, dtype=torch.bool).to(device) - - nodes_to_explore[seg_index] = True - nodes_to_explore = nodes_to_explore.to(device) - for k in range(num_of_segments): - last_num_component = connected_nodes.sum() - torch.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes) - if last_num_component >= connected_nodes.sum(): - break - - indices = (nodes_to_explore == torch.tensor(True)).nonzero().t().squeeze() - if len(indices.size()) == 0: - indices = indices.unsqueeze(0) - for i in indices: - neighbors = affinity_mat[i].to(device) - torch.logical_or(nodes_to_explore, neighbors.squeeze(0), out=nodes_to_explore) - return connected_nodes - - -def isGraphFullyConnected(affinity_mat: torch.Tensor, device: torch.device) -> torch.Tensor: - """ - Check whether the given affinity matrix is a fully connected graph. - """ - return getTheLargestComponent(affinity_mat, 0, device).sum() == affinity_mat.shape[0] - - -def getKneighborsConnections(affinity_mat: torch.Tensor, p_value: int, mask_method: str = 'binary') -> torch.Tensor: - """ - Binarize top-p values for each row from the given affinity matrix. - - Args: - affinity_mat (Tensor): - A square matrix (tensor) containing normalized cosine similarity values - p_value (int): - The number of top values that are selected from each row. - mask_method (str): - The method that is used to manipulate the affinity matrix. The default method is 'binary'. - - Returns: - binarized_affinity_mat (Tensor): - A binarized affinity matrix based on the given mask method. 
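`isGraphFullyConnected` above expands outward from node 0 and reports whether that component covers every node. A toy check on an obviously disconnected versus a fully connected affinity matrix (same import assumption as before):

```python
import torch

from nemo.collections.asr.parts.utils.offline_clustering import isGraphFullyConnected

# Two disconnected 2-node blocks vs. an all-ones graph.
disconnected = torch.tensor([[1.0, 1.0, 0.0, 0.0],
                             [1.0, 1.0, 0.0, 0.0],
                             [0.0, 0.0, 1.0, 1.0],
                             [0.0, 0.0, 1.0, 1.0]])
connected = torch.ones(4, 4)

device = torch.device('cpu')
print(bool(isGraphFullyConnected(disconnected, device)))  # False
print(bool(isGraphFullyConnected(connected, device)))     # True
```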
- """ - dim = affinity_mat.shape - binarized_affinity_mat = torch.zeros_like(affinity_mat).half() - sorted_matrix = torch.argsort(affinity_mat, dim=1, descending=True)[:, :p_value] - binarized_affinity_mat[sorted_matrix.T, torch.arange(affinity_mat.shape[0])] = ( - torch.ones(1).to(affinity_mat.device).half() - ) - indices_row = sorted_matrix[:, :p_value].flatten() - indices_col = torch.arange(dim[1]).repeat(p_value, 1).T.flatten() - if mask_method == 'binary' or mask_method is None: - binarized_affinity_mat[indices_row, indices_col] = ( - torch.ones(indices_row.shape[0]).to(affinity_mat.device).half() - ) - elif mask_method == 'drop': - binarized_affinity_mat[indices_row, indices_col] = affinity_mat[indices_row, indices_col].half() - elif mask_method == 'sigmoid': - binarized_affinity_mat[indices_row, indices_col] = torch.sigmoid(affinity_mat[indices_row, indices_col]).half() - else: - raise ValueError(f'Unknown mask method: {mask_method}') - return binarized_affinity_mat - - -def getAffinityGraphMat(affinity_mat_raw: torch.Tensor, p_value: int) -> torch.Tensor: - """ - Calculate a binarized graph matrix and - symmetrize the binarized graph matrix. - """ - X = affinity_mat_raw if p_value <= 0 else getKneighborsConnections(affinity_mat_raw, p_value) - symm_affinity_mat = 0.5 * (X + X.T) - return symm_affinity_mat - - -def getMinimumConnection( - mat: torch.Tensor, max_N: torch.Tensor, n_list: torch.Tensor, device: torch.device -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Generate connections until fully connect all the nodes in the graph. - If the graph is not fully connected, it might generate inaccurate results. - """ - p_value = torch.tensor(1) - affinity_mat = getAffinityGraphMat(mat, p_value) - for i, p_value in enumerate(n_list): - fully_connected = isGraphFullyConnected(affinity_mat, device) - affinity_mat = getAffinityGraphMat(mat, p_value) - if fully_connected or p_value > max_N: - break - - return affinity_mat, p_value - - -def getRepeatedList(mapping_argmat: torch.Tensor, score_mat_size: torch.Tensor) -> torch.Tensor: - """ - Count the numbers in the mapping dictionary and create lists that contain - repeated indices that will be used for creating a repeated affinity matrix. - This repeated matrix is then used for fusing multiple affinity values. - """ - repeat_list = torch.zeros(score_mat_size, dtype=torch.int32).to(mapping_argmat.device) - idxs, counts = torch.unique(mapping_argmat, return_counts=True) - repeat_list[idxs] = counts.int().to(mapping_argmat.device) - return repeat_list - - -def get_argmin_mat(timestamps_in_scales: List[torch.Tensor]) -> List[torch.Tensor]: - """ - Calculate the mapping between the base scale and other scales. A segment from a longer scale is - repeatedly mapped to a segment from a shorter scale or the base scale. - - Args: - timestamps_in_scales (list): - List containing timestamp tensors for each scale. - Each tensor has dimensions of (Number of base segments) x 2. - - Returns: - session_scale_mapping_list (list): - List containing argmin arrays indexed by scale index. 
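`getKneighborsConnections` plus `getAffinityGraphMat` above amount to keeping the top-`p_value` entries per row and symmetrizing with `0.5 * (X + X.T)`. A compact standalone version of that pruning step, written with `scatter_` instead of the index bookkeeping above:

```python
import torch


def binarize_top_p(affinity: torch.Tensor, p_value: int) -> torch.Tensor:
    """Keep a 1 for the p_value largest entries in each row, 0 elsewhere, then symmetrize.
    A simplified sketch of getKneighborsConnections + getAffinityGraphMat above."""
    topk_idx = torch.argsort(affinity, dim=1, descending=True)[:, :p_value]
    binarized = torch.zeros_like(affinity)
    binarized.scatter_(1, topk_idx, 1.0)
    return 0.5 * (binarized + binarized.T)


torch.manual_seed(0)
aff = torch.rand(5, 5)
aff = 0.5 * (aff + aff.T)
print(binarize_top_p(aff, p_value=2))
```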
- """ - scale_list = list(range(len(timestamps_in_scales))) - segment_anchor_list = [] - for scale_idx in scale_list: - time_stamps_float = timestamps_in_scales[scale_idx] - segment_anchor_list.append(torch.mean(time_stamps_float, dim=1)) - - base_scale_idx = max(scale_list) - base_scale_anchor = segment_anchor_list[base_scale_idx] - session_scale_mapping_list = [] - for scale_idx in scale_list: - curr_scale_anchor = segment_anchor_list[scale_idx] - curr_mat = torch.tile(curr_scale_anchor, (base_scale_anchor.shape[0], 1)) - base_mat = torch.tile(base_scale_anchor, (curr_scale_anchor.shape[0], 1)).t() - argmin_mat = torch.argmin(torch.abs(curr_mat - base_mat), dim=1) - session_scale_mapping_list.append(argmin_mat) - return session_scale_mapping_list - - -def getCosAffinityMatrix(emb: torch.Tensor) -> torch.Tensor: - """ - Calculate cosine similarity values among speaker embeddings then min-max normalize - the affinity matrix. - - Args: - emb (Tensor): - Matrix containing embedding vectors. emb variable should be float(FP32) type to make the data-type - compatible with torch.mm operation for both CPU and GPU(CUDA). - dimension: (Number of embedding vectors) x (embedding dimension) - - Returns: - sim_d (Tensor): - Matrix containing cosine similarity values among the given embedding vectors. - dimension: (Number of embedding vectors) x (Number of embedding vectors) - """ - if emb.shape[0] == 1: - sim_d = torch.tensor([[1]]).to(emb.device) - else: - emb = emb.float() - sim_d = cos_similarity(emb, emb) - sim_d = ScalerMinMax(sim_d) - return sim_d - - -def get_scale_interpolated_embs( - multiscale_weights: torch.Tensor, - embeddings_in_scales: List[torch.Tensor], - timestamps_in_scales: List[torch.Tensor], - device: torch.device = torch.device('cpu'), -) -> Tuple[torch.Tensor, List[torch.Tensor]]: - """ - Generate a scale-interpolated single embedding vector by calculating the weighted sum - of the multiple embedding vectors from different scales. The output is a set of embedding - vectors corresponding to the base-scale segments. - - Args: - multiscale_weights (Tensor): - Tensor containing Multiscale weights - Dimensions: (Number of scales) x 1 - embeddings_in_scales (list): - List containing split embedding tensors by each scale - timestamps_in_scales (list): - List containing split timestamps tensors by each scale - device (torch.device): - Torch device variable - - Returns: - context_emb (torch.tensor): - A set of scale-interpolated embedding vectors. - Dimensions: (Number of base-scale segments) x (Dimensions of embedding vector) - session_scale_mapping_list (list): - List containing argmin arrays indexed by scale index. 
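`get_argmin_mat` above maps every base-scale segment to its closest segment in each longer scale by comparing segment midpoints. A small numeric check of that idea with invented timestamps:

```python
import torch

# One long scale with 2 segments and a base scale with 4 segments.
long_scale = torch.tensor([[0.0, 2.0], [2.0, 4.0]])
base_scale = torch.tensor([[0.0, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0]])

long_mid = long_scale.mean(dim=1)   # tensor([1., 3.])
base_mid = base_scale.mean(dim=1)   # tensor([0.5, 1.5, 2.5, 3.5])

# For each base-scale segment, the index of the nearest long-scale segment.
mapping = torch.argmin((long_mid.unsqueeze(0) - base_mid.unsqueeze(1)).abs(), dim=1)
print(mapping)                      # tensor([0, 0, 1, 1])
```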
- """ - rep_mat_list = [] - multiscale_weights = multiscale_weights.to(device) - session_scale_mapping_list = get_argmin_mat(timestamps_in_scales) - scale_list = list(range(len(timestamps_in_scales))) - for scale_idx in scale_list: - mapping_argmat = session_scale_mapping_list[scale_idx] - emb_t = embeddings_in_scales[scale_idx].to(device) - mapping_argmat = mapping_argmat.to(device) - repeat_list = getRepeatedList(mapping_argmat, torch.tensor(emb_t.shape[0])).to(device) - rep_emb_t = torch.repeat_interleave(emb_t, repeats=repeat_list, dim=0) - rep_mat_list.append(rep_emb_t) - stacked_scale_embs = torch.stack(rep_mat_list) - context_emb = torch.matmul(stacked_scale_embs.permute(2, 1, 0), multiscale_weights.t()).squeeze().t() - if len(context_emb.shape) < 2: - context_emb = context_emb.unsqueeze(0) - context_emb = context_emb.to(device) - return context_emb, session_scale_mapping_list - - -def getMultiScaleCosAffinityMatrix( - multiscale_weights: torch.Tensor, - embeddings_in_scales: List[torch.Tensor], - timestamps_in_scales: List[torch.Tensor], - device: torch.device = torch.device('cpu'), -) -> torch.Tensor: - """ - Calculate cosine similarity values among speaker embeddings for each scale then - apply multiscale weights to calculate the fused similarity matrix. - NOTE: Due to CUDA memory limit, the embedding vectors in embeddings_in_scales are stored in `cpu` device. - - Args: - multiscale_weights (Tensor): - Tensor containing multiscale weights - Dimensions: (Number of scales) x 1 - embeddings_in_scales (list): - List containing split embedding tensors by each scale - timestamps_in_scales (list): - List containing split timestamps tensors by each scale - device (torch.device): - Torch device variable - - Returns: - fused_sim_d (Tensor): - An affinity matrix that is obtained by calculating the weighted sum of - the multiple affinity matrices from the different scales. - """ - multiscale_weights = torch.squeeze(multiscale_weights, dim=0).to(device) - session_scale_mapping_list = get_argmin_mat(timestamps_in_scales) - scale_list = list(range(len(timestamps_in_scales))) - fused_sim_d = torch.zeros(len(timestamps_in_scales[-1]), len(timestamps_in_scales[-1])).to(device) - for scale_idx in scale_list: - mapping_argmat = session_scale_mapping_list[scale_idx] - emb_t = embeddings_in_scales[scale_idx].half().to(device) - score_mat_torch = getCosAffinityMatrix(emb_t) - repeat_list = getRepeatedList(mapping_argmat, torch.tensor(score_mat_torch.shape[0])).to(device) - repeated_tensor_0 = torch.repeat_interleave(score_mat_torch, repeats=repeat_list, dim=0).to(device) - repeated_tensor_1 = torch.repeat_interleave(repeated_tensor_0, repeats=repeat_list, dim=1).to(device) - fused_sim_d += multiscale_weights[scale_idx] * repeated_tensor_1 - return fused_sim_d - - -def getLaplacian(X: torch.Tensor) -> torch.Tensor: - """ - Calculate a laplacian matrix from an affinity matrix X. - """ - X.fill_diagonal_(0) - D = torch.sum(torch.abs(X), dim=1) - D = torch.diag_embed(D) - L = D - X - return L - - -def eigDecompose(laplacian: torch.Tensor, cuda: bool, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Calculate eigenvalues and eigenvectors from the Laplacian matrix. 
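`getLaplacian` above builds the unnormalized graph Laplacian `L = D - A` after zeroing the diagonal of the affinity matrix, and `eigValueSh` takes its eigenvalues. On a tiny block-diagonal example the number of (near-)zero eigenvalues equals the number of connected components, which is what the speaker-counting step below exploits:

```python
import torch

# Block-diagonal affinity: two disconnected pairs of nodes.
A = torch.tensor([[0.0, 1.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 1.0],
                  [0.0, 0.0, 1.0, 0.0]])
D = torch.diag(A.abs().sum(dim=1))
L = D - A
print(torch.linalg.eigvalsh(L))   # ~tensor([0., 0., 2., 2.]) -> two components
```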
- """ - if cuda: - if device is None: - device = torch.cuda.current_device() - laplacian = laplacian.float().to(device) - else: - laplacian = laplacian.float().to(torch.device('cpu')) - lambdas, diffusion_map = eigh(laplacian) - return lambdas, diffusion_map - - -def eigValueSh(laplacian: torch.Tensor, cuda: bool, device: torch.device) -> torch.Tensor: - """ - Calculate only eigenvalues from the Laplacian matrix. - """ - if cuda: - if device is None: - device = torch.cuda.current_device() - laplacian = laplacian.float().to(device) - else: - laplacian = laplacian.float().to(torch.device('cpu')) - lambdas = eigvalsh(laplacian) - return lambdas - - -def getLamdaGaplist(lambdas: torch.Tensor) -> torch.Tensor: - """ - Calculate the gaps between lambda values. - """ - if torch.is_complex(lambdas): - lambdas = torch.real(lambdas) - return lambdas[1:] - lambdas[:-1] - - -def addAnchorEmb(emb: torch.Tensor, anchor_sample_n: int, anchor_spk_n: int, sigma: float) -> torch.Tensor: - """ - Add randomly generated synthetic embeddings to make eigenanalysis more stable. - We refer to these embeddings as anchor embeddings. - - emb (Tensor): - The input embedding from the embedding extractor. - anchor_sample_n (int): - Number of embedding samples per speaker. - anchor_sample_n = 10 is recommended. - anchor_spk_n (int): - Number of speakers for synthetic embedding. - anchor_spk_n = 3 is recommended. - sigma (int): - The amplitude of synthetic noise for each embedding vector. - If the sigma value is too small, under-counting could happen. - If the sigma value is too large, over-counting could happen. - sigma = 50 is recommended. - """ - emb_dim = emb.shape[1] - std_org = torch.std(emb, dim=0) - sigma = torch.tensor(sigma).to(emb.device) - new_emb_list = [] - for _ in range(anchor_spk_n): - emb_m = torch.tile(torch.randn(1, emb_dim), (anchor_sample_n, 1)).to(emb.device) - emb_noise = torch.randn(anchor_sample_n, emb_dim).T.to(emb.device) - emb_noise = torch.matmul( - torch.diag(std_org), emb_noise / torch.max(torch.abs(emb_noise), dim=0)[0].unsqueeze(0) - ).T - emb_gen = emb_m + sigma * emb_noise - new_emb_list.append(emb_gen) - - new_emb_list.append(emb) - new_emb_np = torch.vstack(new_emb_list) - return new_emb_np - - -def getEnhancedSpeakerCount( - emb: torch.Tensor, - random_test_count: int = 5, - anchor_spk_n: int = 3, - anchor_sample_n: int = 10, - sigma: float = 50, - cuda: bool = False, -) -> torch.Tensor: - """ - Calculate the number of speakers using NME analysis with anchor embeddings. Add dummy speaker - embedding vectors and run speaker counting multiple times to enhance the speaker counting accuracy - for the short audio samples. - - Args: - emb (Tensor): - The input embedding from the embedding extractor. - cuda (bool): - Use cuda for the operations if cuda==True. - random_test_count (int): - Number of trials of the enhanced counting with randomness. - The higher the count, the more accurate the enhanced counting is. - anchor_spk_n (int): - Number of speakers for synthetic embedding. - anchor_spk_n = 3 is recommended. - anchor_sample_n (int): - Number of embedding samples per speaker. - anchor_sample_n = 10 is recommended. - sigma (float): - The amplitude of synthetic noise for each embedding vector. - If the sigma value is too small, under-counting could happen. - If the sigma value is too large, over-counting could happen. - sigma = 50 is recommended. - - Returns: - comp_est_num_of_spk (Tensor): - The estimated number of speakers. 
`anchor_spk_n` is subtracted from the estimated - number of speakers to factor out the dummy speaker embedding vectors. - """ - est_num_of_spk_list: List[int] = [] - for seed in range(random_test_count): - torch.manual_seed(seed) - emb_aug = addAnchorEmb(emb, anchor_sample_n, anchor_spk_n, sigma) - mat = getCosAffinityMatrix(emb_aug) - nmesc = NMESC( - mat, - max_num_speakers=emb.shape[0], - max_rp_threshold=0.15, - sparse_search=True, - sparse_search_volume=10, - fixed_thres=-1.0, - nme_mat_size=300, - cuda=cuda, - ) - est_num_of_spk, _ = nmesc.forward() - est_num_of_spk_list.append(est_num_of_spk.item()) - comp_est_num_of_spk = torch.tensor(max(torch.mode(torch.tensor(est_num_of_spk_list))[0].item() - anchor_spk_n, 1)) - return comp_est_num_of_spk - - -def split_input_data( - embeddings_in_scales: torch.Tensor, - timestamps_in_scales: torch.Tensor, - multiscale_segment_counts: torch.LongTensor, -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - """ - Split multiscale embeddings and multiscale timestamps and put split scale-wise data into python lists. - This formatting function is needed to make the input type as `torch.Tensor`. - - Args: - embeddings_in_scales (Tensor): - Concatenated Torch tensor containing embeddings in multiple scales - timestamps_in_scales (Tensor): - Concatenated Torch tensor containing timestamps in multiple scales - multiscale_segment_counts (LongTensor): - Concatenated Torch LongTensor containing number of segments per each scale - - Returns: - embeddings_in_scales (list): - List containing split embedding tensors by each scale - timestamps_in_scales (list): - List containing split timestamps tensors by each scale - """ - split_index: List[int] = multiscale_segment_counts.tolist() - embeddings_in_scales = torch.split(embeddings_in_scales, split_index, dim=0) - timestamps_in_scales = torch.split(timestamps_in_scales, split_index, dim=0) - embeddings_in_scales, timestamps_in_scales = list(embeddings_in_scales), list(timestamps_in_scales) - return embeddings_in_scales, timestamps_in_scales - - -def estimateNumofSpeakers( - affinity_mat: torch.Tensor, max_num_speakers: int, cuda: bool = False -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Estimate the number of speakers using eigendecomposition on the Laplacian Matrix. - - Args: - affinity_mat (Tensor): - N by N affinity matrix - max_num_speakers (int): - Maximum number of clusters to consider for each session - cuda (bool): - If cuda available eigendecomposition is computed on GPUs. - - Returns: - num_of_spk (Tensor): - The estimated number of speakers - lambdas (Tensor): - The lambda values from eigendecomposition - lambda_gap (Tensor): - The gap between the lambda values from eigendecomposition - """ - laplacian = getLaplacian(affinity_mat) - lambdas = eigValueSh(laplacian, cuda=cuda, device=affinity_mat.device) - lambdas = torch.sort(lambdas)[0] - lambda_gap = getLamdaGaplist(lambdas) - num_of_spk = torch.argmax(lambda_gap[: min(max_num_speakers, lambda_gap.shape[0])]) + 1 - return num_of_spk, lambdas, lambda_gap - - -class SpectralClustering: - """ - Perform spectral clustering by calculating spectral embeddings then run k-means clustering - algorithm on the spectral embeddings. - """ - - def __init__( - self, - n_clusters: int = 8, - random_state: int = 0, - n_random_trials: int = 1, - cuda: bool = False, - device: torch.device = torch.device('cpu'), - ): - """ - Initialize the variables needed for spectral clustering and k-means++. 
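`estimateNumofSpeakers` above picks the speaker count as the position of the largest gap in the sorted Laplacian eigenvalues. A self-contained sketch of that eigengap rule, checked on an affinity matrix with two obvious blocks:

```python
import torch


def estimate_num_speakers(affinity: torch.Tensor, max_num_speakers: int = 8) -> int:
    """Eigengap-based speaker count, mirroring estimateNumofSpeakers above."""
    A = affinity.clone()
    A.fill_diagonal_(0)
    L = torch.diag(A.abs().sum(dim=1)) - A
    lambdas = torch.sort(torch.linalg.eigvalsh(L))[0]
    lambda_gap = lambdas[1:] - lambdas[:-1]
    return int(torch.argmax(lambda_gap[: min(max_num_speakers, len(lambda_gap))]) + 1)


# Two blocks of highly similar segments -> estimated count of 2.
block = torch.tensor([[1.0, 0.9, 0.1, 0.1],
                      [0.9, 1.0, 0.1, 0.1],
                      [0.1, 0.1, 1.0, 0.9],
                      [0.1, 0.1, 0.9, 1.0]])
print(estimate_num_speakers(block))   # 2
```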
- - Args: - n_clusters (int): - Number of the estimated (or oracle) number of speakers - random_state (int): - Random seed that determines a random state of k-means initialization. - n_random_trials (int): - Number of trials with different random seeds for k-means initialization. - k-means++ algorithm is executed for multiple times then the final result - is obtained by taking a majority vote. - cuda (bool): - if cuda=True, spectral clustering is done on GPU. - device (torch.device): - Torch device variable - """ - self.n_clusters = n_clusters - self.random_state = random_state - self.n_random_trials = max(n_random_trials, 1) - self.cuda = cuda - self.device = device - - def forward(self, X) -> torch.Tensor: - """ - Call self.clusterSpectralEmbeddings() function to predict cluster labels. - - Args: - X (Tensor): - Affinity matrix input - - Returns: - labels (Tensor): - clustering label output - """ - if X.shape[0] != X.shape[1]: - raise ValueError("The affinity matrix is not a square matrix.") - labels = self.clusterSpectralEmbeddings(X, cuda=self.cuda, device=self.device) - return labels - - def clusterSpectralEmbeddings( - self, affinity: torch.Tensor, cuda: bool = False, device: torch.device = torch.device('cpu') - ) -> torch.Tensor: - """ - Perform k-means clustering on spectral embeddings. To alleviate the effect of randomness, - k-means clustering is performed for (self.n_random_trials) times then the final labels are obtained - by taking a majority vote. If speed is the major concern, self.n_random_trials should be set to 1. - n_random_trials=30 is recommended to see an improved result. - - Args: - affinity (Tensor): - Affinity matrix input - cuda (torch.bool): - Use cuda for spectral clustering if cuda=True - device (torch.device): - Torch device variable - - Returns: - labels (Tensor): - clustering label output - - """ - spectral_emb = self.getSpectralEmbeddings(affinity, n_spks=self.n_clusters, cuda=cuda) - labels_set = [] - - for random_state_seed in range(self.random_state, self.random_state + self.n_random_trials): - _labels = kmeans_torch( - X=spectral_emb, num_clusters=self.n_clusters, random_state=random_state_seed, device=device - ) - labels_set.append(_labels) - stacked_labels = torch.stack(labels_set) - label_index = torch.mode(torch.mode(stacked_labels, 0)[1])[0] - labels = stacked_labels[label_index] - return labels - - def getSpectralEmbeddings(self, affinity_mat: torch.Tensor, n_spks: int = 8, cuda: bool = False) -> torch.Tensor: - """ - Calculate eigenvalues and eigenvectors to extract spectral embeddings. - - Args: - affinity (Tensor): - Affinity matrix input - cuda (torch.bool): - Use cuda for spectral clustering if cuda=True - device (torch.device): - Torch device variable - - Returns: - labels (Tensor): - clustering label output - """ - laplacian = getLaplacian(affinity_mat) - _, diffusion_map_ = eigDecompose(laplacian, cuda=cuda, device=affinity_mat.device) - diffusion_map = diffusion_map_[:, :n_spks] - inv_idx = torch.arange(diffusion_map.size(1) - 1, -1, -1).long() - embedding = diffusion_map.T[inv_idx, :] - return embedding[:n_spks].T - - -class NMESC: - """ - Normalized Maximum Eigengap based Spectral Clustering (NME-SC) - uses Eigengap analysis to get an estimated p-value for - affinity binarization and an estimated number of speakers. - - p_value (also referred to as p_neighbors) is for taking - top p number of affinity values and convert those to 1 while - convert the rest of values to 0. 
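Putting the helpers above together, a hedged usage sketch of the spectral-clustering path: build an affinity matrix from embeddings, prune it to the top-p neighbors, then cluster. The random embeddings only illustrate shapes, and the imports assume the module path shown in the diff header:

```python
import torch

from nemo.collections.asr.parts.utils.offline_clustering import (
    SpectralClustering,
    getAffinityGraphMat,
    getCosAffinityMatrix,
)

torch.manual_seed(0)
embs = torch.randn(40, 192)                    # 40 base-scale segment embeddings

affinity = getCosAffinityMatrix(embs)          # 40 x 40, min-max scaled
affinity = getAffinityGraphMat(affinity, p_value=5)

clusterer = SpectralClustering(n_clusters=2, n_random_trials=5, cuda=False)
labels = clusterer.forward(affinity)
print(labels.shape)                            # torch.Size([40])
```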
-
-    p_value can also be tuned on a development set without performing
-    NME-analysis. Fixing p_value brings about significantly faster clustering
-    speed, but the performance is limited to the development set.
-
-    References:
-        Tae Jin Park et al., Auto-Tuning Spectral Clustering for Speaker Diarization
-        Using Normalized Maximum Eigengap, IEEE Signal Processing Letters 27 (2019),
-        https://arxiv.org/abs/2003.02405
-
-    Args:
-        Please refer to def __init__().
-
-    Methods:
-        NMEanalysis():
-            Performs NME-analysis to estimate p_value and the number of speakers
-        subsampleAffinityMat(nme_mat_size):
-            Subsamples the affinity matrix to reduce the computational load
-        getPvalueList():
-            Generates a list containing p-values that need to be examined.
-        getEigRatio(p_neighbors):
-            Calculates g_p, which is a ratio between p_neighbors and the maximum eigengap
-        getLamdaGaplist(lambdas):
-            Calculates lambda gap values from an array containing lambda values
-        estimateNumofSpeakers(affinity_mat):
-            Estimates the number of speakers using the lambda gap list
-    """
-
-    def __init__(
-        self,
-        mat: torch.Tensor,
-        max_num_speakers: int = 10,
-        max_rp_threshold: float = 0.15,
-        sparse_search: bool = True,
-        sparse_search_volume: int = 30,
-        nme_mat_size: int = 512,
-        use_subsampling_for_nme: bool = True,
-        fixed_thres: float = -1.0,
-        maj_vote_spk_count: bool = False,
-        parallelism: bool = True,
-        cuda: bool = False,
-        device: torch.device = torch.device('cpu'),
-    ):
-        """
-        Args:
-            mat (Tensor):
-                Cosine similarity matrix calculated from the provided speaker embeddings.
-            max_num_speakers (int):
-                Maximum number of speakers for estimating the number of speakers.
-                Shows stable performance for values under 20.
-            max_rp_threshold (float):
-                Limits the range of parameter search.
-                Clustering performance can vary depending on this range.
-                Default is 0.15.
-            sparse_search (bool):
-                To increase the speed of parameter estimation, sparse_search=True
-                limits the number of p_values we search.
-            sparse_search_volume (int):
-                Number of p_values we search during NME analysis.
-                Default is 30. The lower the value, the faster NME-analysis becomes.
-                However, a value lower than 20 might cause a poor parameter estimation.
-            nme_mat_size (int):
-                Targeted size of matrix for NME analysis.
-            use_subsampling_for_nme (bool):
-                Use subsampling to reduce the computational complexity.
-                Default is True.
-            fixed_thres (float or None):
-                A fixed threshold which can be used instead of estimating the
-                threshold with NME analysis. If fixed_thres is a float value,
-                the NME analysis part is skipped.
-            maj_vote_spk_count (bool):
-                If True, take a majority vote on all p-values in the given range to estimate the number of speakers.
-                The majority voting may help suppress overcounting of speakers and improve speaker
-                counting accuracy.
-            parallelism (bool):
-                If True, turn on parallelism based on the torch.jit.script library.
-            cuda (bool):
-                Use cuda for Eigen decomposition if cuda=True.
- device (torch.device): - Torch device variable - - """ - self.max_num_speakers: int = max_num_speakers - self.max_rp_threshold: float = max_rp_threshold - self.use_subsampling_for_nme: bool = use_subsampling_for_nme - self.nme_mat_size: int = nme_mat_size - self.sparse_search: bool = sparse_search - self.sparse_search_volume: int = sparse_search_volume - self.min_p_value = torch.tensor(2) - self.fixed_thres: float = fixed_thres - self.eps = 1e-10 - self.max_N = torch.tensor(0) - self.mat: torch.Tensor = mat - self.p_value_list: torch.Tensor = self.min_p_value.unsqueeze(0) - self.cuda: bool = cuda - self.device: torch.device = device - self.maj_vote_spk_count: bool = maj_vote_spk_count - self.parallelism: bool = parallelism - - def forward(self) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Subsample the input matrix to reduce the computational load. - - Returns: - est_num_of_spk (Tensor): - Estimated number of speakers from NMESC approach - p_hat_value (Tensor): - Estimated p-value (determines how many neighboring values to be selected) - """ - if self.use_subsampling_for_nme: - subsample_ratio = self.subsampleAffinityMat(self.nme_mat_size) - else: - subsample_ratio = torch.tensor(1) - - # Scans p_values and find a p_value that generates the smallest g_p value. - results: List[torch.Tensor] = [] - est_spk_n_dict: Dict[int, torch.Tensor] = {} - self.p_value_list = self.getPvalueList() - p_volume = self.p_value_list.shape[0] - eig_ratio_list = torch.zeros(p_volume,) - est_num_of_spk_list = torch.zeros(p_volume,) - - if self.parallelism: - futures: List[torch.jit.Future[torch.Tensor]] = [] - for p_idx, p_value in enumerate(self.p_value_list): - futures.append(torch.jit.fork(self.getEigRatio, p_value)) - for future in futures: - results.append(torch.jit.wait(future)) - - else: - for p_idx, p_value in enumerate(self.p_value_list): - results.append(self.getEigRatio(p_value)) - - # Retrieve the eigen analysis results - for p_idx, p_value in enumerate(self.p_value_list): - output = results[p_idx] - g_p, est_num_of_spk = output[0], output[1].int() - eig_ratio_list[p_idx] = g_p - est_spk_n_dict[p_value.item()] = est_num_of_spk - est_num_of_spk_list[p_idx] = est_num_of_spk - - index_nn = torch.argmin(eig_ratio_list) - rp_p_value = self.p_value_list[index_nn] - affinity_mat = getAffinityGraphMat(self.mat, rp_p_value) - - # Checks whether the affinity graph is fully connected. - # If not, it adds a minimum number of connections to make it fully connected. - if not isGraphFullyConnected(affinity_mat, device=self.device): - affinity_mat, rp_p_value = getMinimumConnection( - self.mat, self.max_N, self.p_value_list, device=self.device - ) - - p_hat_value = (subsample_ratio * rp_p_value).type(torch.int) - if self.maj_vote_spk_count: - est_num_of_spk = torch.mode(torch.tensor(est_num_of_spk_list))[0] - else: - est_num_of_spk = est_spk_n_dict[rp_p_value.item()] - return est_num_of_spk, p_hat_value - - def subsampleAffinityMat(self, nme_mat_size: int) -> torch.Tensor: - """ - Perform subsampling of affinity matrix. - This subsampling is for calculational complexity, not for performance. - The smaller nme_mat_size is, - - the bigger the chance of missing a speaker. - - the faster p-value estimation speed (based on eigen decomposition). - - The recommended nme_mat_size is 250~750. - However, if there are speakers who speak for very short period of time in the recording, - this subsampling might make the system miss underrepresented speakers. - Use this variable with caution. 
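
For intuition, the subsampling below is plain strided slicing of the affinity matrix; a sketch with toy sizes (not the exact NeMo call):

```python
import torch

# A 1000 x 1000 affinity matrix and nme_mat_size=300 give a subsample ratio of 3,
# so only every third row and column is kept for the NME analysis.
mat = torch.rand(1000, 1000)
nme_mat_size = 300
subsample_ratio = max(1, mat.shape[0] // nme_mat_size)
sub_mat = mat[::subsample_ratio, ::subsample_ratio]
print(subsample_ratio, sub_mat.shape)  # 3 torch.Size([334, 334])
```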
- - Args: - nme_mat_size (int): - The targeted matrix size - - Returns: - subsample_ratio (float): - The ratio between nme_mat_size and the original matrix size - """ - subsample_ratio = torch.max(torch.tensor(1), torch.tensor(self.mat.shape[0] / nme_mat_size)).type(torch.int) - self.mat = self.mat[:: subsample_ratio.item(), :: subsample_ratio.item()] - return subsample_ratio - - def getEigRatio(self, p_neighbors: int) -> torch.Tensor: - """ - For a given p_neighbors value, calculate g_p, which is a ratio between p_neighbors and the - maximum eigengap values. - References: - Tae Jin Park et al., Auto-Tuning Spectral Clustering for Speaker Diarization Using - Normalized Maximum Eigengap, IEEE Signal Processing Letters 27 (2019), - https://arxiv.org/abs/2003.02405 - - Args: - p_neighbors (int): - Determines how many binary graph connections we want to keep for each row. - - Returns: - est_num_of_spk (int): - Estimated number of speakers - g_p (float): - The ratio between p_neighbors value and the maximum eigen gap value. - """ - affinity_mat = getAffinityGraphMat(self.mat, p_neighbors) - est_num_of_spk, lambdas, lambda_gap_list = estimateNumofSpeakers( - affinity_mat, self.max_num_speakers, self.cuda - ) - arg_sorted_idx = torch.argsort(lambda_gap_list[: self.max_num_speakers], descending=True) - max_key = arg_sorted_idx[0] - max_eig_gap = lambda_gap_list[max_key] / (torch.max(lambdas).item() + self.eps) - g_p = (p_neighbors / self.mat.shape[0]) / (max_eig_gap + self.eps) - return torch.stack([g_p, est_num_of_spk]) - - def getPvalueList(self) -> torch.Tensor: - """ - Generates a p-value (p_neighbour) list for searching. p_value_list must include 2 (min_p_value) - since at least one neighboring segment should be selected other than itself. - - If fixed_thres value is specified, then only one p-value is specified. - If fixed_thres is not provided, multiple p-values are searched. - If sparse_search is True: - - Limit the number of p-values to be searched to sparse_search_volume. - - N should be at least 2 to include a number greater than 1. - If sparse_search is False: - - Scan all the p_values from 1 to max_N - - If sparse_search is False, NMESC analysis could take more time compared to sparse_search = True. - - Returns: - p_value_list (Tensor): - Tensor containing the p_values to be searched. 
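
A worked example of the resulting search grid may help: with 300 segments, `max_rp_threshold=0.15` and `sparse_search_volume=10`, the sparse search examines ten integer p-values between 1 and 45. The lines below are a sketch of that logic, not a call into the class:

```python
import torch

num_segments, max_rp_threshold, sparse_search_volume = 300, 0.15, 10
max_N = max(int(num_segments * max_rp_threshold), 2)      # 45
steps = min(max_N, max(sparse_search_volume, 2))          # 10
p_value_list = torch.linspace(start=1, end=max_N, steps=steps).type(torch.int)
print(p_value_list)  # tensor([ 1,  5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=torch.int32)
```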
- """ - if self.fixed_thres is not None and self.fixed_thres > 0.0: - self.max_N = torch.max( - torch.floor(torch.tensor(self.mat.shape[0] * self.fixed_thres)).type(torch.int), self.min_p_value - ) - p_value_list = self.max_N.unsqueeze(0).int() - else: - self.max_N = torch.max( - torch.floor(torch.tensor(self.mat.shape[0] * self.max_rp_threshold)).type(torch.int), self.min_p_value - ) - if self.sparse_search: - search_volume = torch.min(self.max_N, torch.tensor(self.sparse_search_volume).type(torch.int)) - # search at least two values - N = torch.max(search_volume, torch.tensor(2)) - # avoid repeating values by limiting the step size - steps = min(self.max_N, N) - p_value_list = torch.linspace(start=1, end=self.max_N, steps=steps).type(torch.int) - else: - p_value_list = torch.arange(1, self.max_N + 1) - if p_value_list.shape[0] == 0: - raise ValueError("p_value_list should not be empty.") - return p_value_list - - -class SpeakerClustering(torch.nn.Module): - def __init__( - self, - min_samples_for_nmesc: int = 6, - nme_mat_size: int = 512, - sparse_search: bool = True, - maj_vote_spk_count: bool = False, - parallelism: bool = False, - cuda: bool = False, - ): - """ - Clustering method for speaker diarization based on cosine similarity. - NME-SC part is converted to torch.tensor based operations in NeMo 1.9. - - Args: - min_samples_for_nmesc (int): - The minimum number of samples required for NME clustering. This avoids - zero p_neighbour_lists. If the input has fewer segments than min_samples, - it is directed to the enhanced speaker counting mode. - sparse_search (bool): - Toggle sparse search mode. If True, limit the size of p_value_list to sparse_search_volume. - maj_vote_spk_count (bool): - If True, take a majority vote on all p-values in the given range to estimate the number of speakers. - The majority voting may contribute to surpress overcounting of the speakers and improve speaker - counting accuracy. - parallelism (bool): - Use dynamic parallelism feature in torch.jit compiler to accelerate the p-value search. - cuda (bool): - Boolean variable for toggling cuda availability. - """ - super().__init__() - self.min_samples_for_nmesc: int = min_samples_for_nmesc - self.nme_mat_size: int = nme_mat_size - self.sparse_search: bool = sparse_search - self.parallelism: bool = parallelism - self.cuda: bool = cuda - self.maj_vote_spk_count: bool = maj_vote_spk_count - self.embeddings_in_scales: List[torch.Tensor] = [torch.Tensor(0)] - self.timestamps_in_scales: List[torch.Tensor] = [torch.Tensor(0)] - self.device = torch.device("cuda") if self.cuda else torch.device("cpu") - - def forward(self, param_dict: Dict[str, torch.Tensor]) -> torch.LongTensor: - """ - A function wrapper designed for inference in exported script format. - - Note: - Dict is used to allow easy inference of the exported jit model in Triton server using easy to understand - naming convention. - See https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#special-conventions-for-pytorch-backend - - Args: - param_dict (dict): - Dictionary containing the arguments for speaker clustering. - See `forward_infer` function for the argument information. - - Returns: - Y (LongTensor): - Speaker labels for the segments in the given input embeddings. 
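
A hypothetical call sketch for this dictionary-based interface is shown below; every entry is a tensor so that the same dictionary can be fed to the exported (jit-scripted) model, and the key names follow the unpacking code that comes next. The embedding and timestamp tensors are placeholders.

```python
import torch

from nemo.collections.asr.parts.utils.offline_clustering import SpeakerClustering

emb = torch.randn(392, 192)  # placeholder: concatenated multiscale embeddings (31+52+84+105+120 segments)
ts = torch.rand(392, 2)      # placeholder: concatenated multiscale timestamps

param_dict = {
    'embeddings': emb,
    'timestamps': ts,
    'multiscale_segment_counts': torch.LongTensor([31, 52, 84, 105, 120]),
    'multiscale_weights': torch.tensor([1.4, 1.3, 1.2, 1.1, 1.0]),
    'oracle_num_speakers': torch.tensor(-1),   # -1 means no oracle speaker count
    'max_num_speakers': torch.tensor(8),
    'enhanced_count_thres': torch.tensor(80),
    'sparse_search_volume': torch.tensor(30),
    'max_rp_threshold': torch.tensor(0.15),
    'fixed_thres': torch.tensor(-1.0),
}
labels = SpeakerClustering(cuda=False)(param_dict)
```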
-        """
-        embeddings_in_scales = param_dict['embeddings']
-        timestamps_in_scales = param_dict['timestamps']
-        multiscale_segment_counts = param_dict['multiscale_segment_counts']
-        multiscale_weights = param_dict['multiscale_weights']
-
-        oracle_num_speakers = int(param_dict['oracle_num_speakers'].item())
-        max_num_speakers = int(param_dict['max_num_speakers'].item())
-        enhanced_count_thres = int(param_dict['enhanced_count_thres'].item())
-        sparse_search_volume = int(param_dict['sparse_search_volume'].item())
-        max_rp_threshold = float(param_dict['max_rp_threshold'].item())
-        fixed_thres = float(param_dict['fixed_thres'].item())
-
-        return self.forward_infer(
-            embeddings_in_scales=embeddings_in_scales,
-            timestamps_in_scales=timestamps_in_scales,
-            multiscale_segment_counts=multiscale_segment_counts,
-            multiscale_weights=multiscale_weights,
-            oracle_num_speakers=oracle_num_speakers,
-            max_rp_threshold=max_rp_threshold,
-            max_num_speakers=max_num_speakers,
-            enhanced_count_thres=enhanced_count_thres,
-            sparse_search_volume=sparse_search_volume,
-            fixed_thres=fixed_thres,
-        )
-
-    def forward_infer(
-        self,
-        embeddings_in_scales: torch.Tensor,
-        timestamps_in_scales: torch.Tensor,
-        multiscale_segment_counts: torch.LongTensor,
-        multiscale_weights: torch.Tensor,
-        oracle_num_speakers: int = -1,
-        max_rp_threshold: float = 0.15,
-        max_num_speakers: int = 8,
-        enhanced_count_thres: int = 40,
-        sparse_search_volume: int = 30,
-        fixed_thres: float = -1.0,
-        kmeans_random_trials: int = 1,
-    ) -> torch.LongTensor:
-        """
-        Calculate the affinity matrix using timestamps and speaker embeddings, run NME analysis to estimate the best
-        p-value, and perform spectral clustering based on the estimated p-value and the calculated affinity matrix.
-
-        Caution:
-            For the sake of compatibility with libtorch, python boolean `False` is replaced with `torch.LongTensor(-1)`.
-
-        Args:
-            embeddings_in_scales (Tensor):
-                Concatenated Torch tensor containing embeddings in multiple scales.
-                This tensor has dimensions of (Number of base segments) x (Embedding Dimension)
-            timestamps_in_scales (Tensor):
-                Concatenated Torch tensor containing timestamps in multiple scales.
-                This tensor has dimensions of (Total number of segments in all scales) x 2
-                Example:
-                    >>> timestamps_in_scales = \
-                    >>> torch.tensor([[0.4, 1.4], [0.9, 1.9], [1.4, 2.4], ..., [121.2, 122.2]])
-
-            multiscale_segment_counts (LongTensor):
-                Concatenated Torch tensor containing the number of segments per scale.
-                This tensor has dimensions of (Number of scales)
-                Example:
-                    >>> multiscale_segment_counts = torch.LongTensor([31, 52, 84, 105, 120])
-
-            multiscale_weights (Tensor):
-                Multi-scale weights that are used when affinity scores are merged.
-                Example:
-                    >>> multiscale_weights = torch.tensor([1.4, 1.3, 1.2, 1.1, 1.0])
-
-            oracle_num_speakers (int):
-                The number of speakers in a session from the reference transcript
-            max_num_speakers (int):
-                The upper bound for the number of speakers in each session
-            max_rp_threshold (float):
-                Limits the range of parameter search.
-                Clustering performance can vary depending on this range.
-                Default is 0.15.
-            enhanced_count_thres (int):
-                For short audio recordings, the clustering algorithm cannot
-                accumulate enough speaker profile data for each cluster.
-                Thus, the function `getEnhancedSpeakerCount` employs anchor embeddings
-                (dummy representations) to mitigate the effect of cluster sparsity.
-                enhanced_count_thres = 80 is recommended.
- sparse_search_volume (int): - Number of p_values we search during NME analysis. - Default is 30. The lower the value, the faster NME-analysis becomes. - Lower than 20 might cause a poor parameter estimation. - fixed_thres (float): - If fixed_thres value is provided, NME-analysis process will be skipped. - This value should be optimized on a development set to obtain a quality result. - Default is None and performs NME-analysis to estimate the threshold. - kmeans_random_trials (int): - Number of random trials for initializing k-means clustering. More trials - will result in a more stable clustering result. Default is 1. - - Returns: - Y (LongTensor): - Speaker labels for the segments in the given input embeddings. - """ - self.embeddings_in_scales, self.timestamps_in_scales = split_input_data( - embeddings_in_scales, timestamps_in_scales, multiscale_segment_counts - ) - # Last slot is the base scale embeddings - emb = self.embeddings_in_scales[-1] - - # Cases for extreamly short sessions - if emb.shape[0] == 1: - return torch.zeros((1,), dtype=torch.int64) - elif emb.shape[0] <= max(enhanced_count_thres, self.min_samples_for_nmesc) and oracle_num_speakers < 0: - est_num_of_spk_enhanced = getEnhancedSpeakerCount(emb=emb, cuda=self.cuda) - else: - est_num_of_spk_enhanced = torch.tensor(-1) - - if oracle_num_speakers > 0: - max_num_speakers = oracle_num_speakers - - mat = getMultiScaleCosAffinityMatrix( - multiscale_weights, self.embeddings_in_scales, self.timestamps_in_scales, self.device - ) - - nmesc = NMESC( - mat, - max_num_speakers=max_num_speakers, - max_rp_threshold=max_rp_threshold, - sparse_search=self.sparse_search, - sparse_search_volume=sparse_search_volume, - fixed_thres=fixed_thres, - nme_mat_size=self.nme_mat_size, - maj_vote_spk_count=self.maj_vote_spk_count, - parallelism=self.parallelism, - cuda=self.cuda, - device=self.device, - ) - - # If there are less than `min_samples_for_nmesc` segments, est_num_of_spk is 1. - if mat.shape[0] > self.min_samples_for_nmesc: - est_num_of_spk, p_hat_value = nmesc.forward() - affinity_mat = getAffinityGraphMat(mat, p_hat_value) - else: - nmesc.fixed_thres = max_rp_threshold - est_num_of_spk, p_hat_value = nmesc.forward() - affinity_mat = mat - - # n_clusters is number of speakers estimated from spectral clustering. - if oracle_num_speakers > 0: - n_clusters = int(oracle_num_speakers) - elif est_num_of_spk_enhanced > 0: - n_clusters = int(est_num_of_spk_enhanced.item()) - else: - n_clusters = int(est_num_of_spk.item()) - - spectral_model = SpectralClustering( - n_clusters=n_clusters, n_random_trials=kmeans_random_trials, cuda=self.cuda, device=self.device - ) - Y = spectral_model.forward(affinity_mat) - return Y diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/online_clustering.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/online_clustering.py deleted file mode 100644 index 9620a87144b935d4d828eee927504eaab8bda54c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/online_clustering.py +++ /dev/null @@ -1,1191 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2007-2020 The scikit-learn developers. - -# BSD 3-Clause License - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# NME-SC clustering is based on the implementation from the paper -# https://arxiv.org/pdf/2003.02405.pdf and the implementation from -# https://github.com/tango4j/Auto-Tuning-Spectral-Clustering. - -from typing import List, Set, Tuple - -import numpy as np -import torch - -from nemo.collections.asr.parts.utils.offline_clustering import ( - NMESC, - SpectralClustering, - getAffinityGraphMat, - getCosAffinityMatrix, -) -from nemo.collections.asr.parts.utils.optimization_utils import linear_sum_assignment - - -def get_lsa_speaker_mapping( - U_set: torch.Tensor, cmm_P: torch.Tensor, cmm_Q: torch.Tensor, PandQ: torch.Tensor -) -> torch.Tensor: - """ - Find a mapping that minimizes the matching cost between the label P and Q. - One-hot encodding is employed to represent sequence and calculate the cost. - - Args: - U_set (list): - Whole set of the estimated speakers - cmm_P (Tensor): - Length-matched old sequence - cmm_Q (Tensor): - Length-matched new sequence - PandQ (Tensor): - Tensor containing the indices of the speakers that are in both old and new sequences - - Returns: - mapping_array (np.array): - Mapped labels that minimizes the cost - """ - all_spks_labels = [[x] for x in range(len(U_set))] - common_inds: List[int] = [int(x.item()) for x in PandQ] - - # Create tensors for one-hot encoding - enc_P = torch.zeros((len(cmm_P), len(all_spks_labels))).to(cmm_P.device) - enc_Q = torch.zeros((len(cmm_Q), len(all_spks_labels))).to(cmm_Q.device) - - # Create one-hot encoding - enc_P[torch.arange(len(cmm_P)), cmm_P] = 1 - enc_Q[torch.arange(len(cmm_Q)), cmm_Q] = 1 - - # Cost matrix from one-hot encoding vectors - cost = -1 * torch.matmul(enc_P.T, enc_Q).T.to(PandQ.device) - _, col_ind = linear_sum_assignment(cost) - - # If number of are speakers in each vector is not the same - mapping_array = torch.arange(0, len(U_set)).to(PandQ.device) - for x in range(col_ind.shape[0]): - if x not in common_inds: - mapping_array[x] = x - else: - mapping_array[x] = col_ind[x] - return mapping_array - - -def get_minimal_indices(Y_new: torch.Tensor) -> torch.Tensor: - """ - Force the unique indices of the labels to use the lowest numbers. 
- - Example: - >>> Y_new = [3, 3, 3, 4, 4, 5] - >>> get_minimal_indices(Y_new) - Return: - [0, 0, 0, 1, 1, 2] - - Args: - Y_new (Tensor): - Tensor containing cluster labels - - Returns: - (Tensor): Newly mapped cluster labels that has minimized indicies - """ - device = Y_new.device - Y_new_enlisted = torch.unique(Y_new).sort()[0].to(torch.long).to(device) - sequence = torch.arange(torch.max(Y_new_enlisted) + 1).to(device) - sequence[Y_new_enlisted] = torch.arange(len(Y_new_enlisted)).to(device) - return sequence[Y_new] - - -@torch.jit.script -def stitch_cluster_labels(Y_old: torch.Tensor, Y_new: torch.Tensor) -> torch.Tensor: - """ - Run Hungarian (linear sum assignment) algorithm to find the best permutation mapping between - the cumulated labels in history and the new clustering output labels. - - Args: - Y_old (Tensor): - Cumulated diarization labels. This will be concatenated with history embedding speaker label - then compared with the predicted label Y_new. - Y_new (Tensor): - Contains predicted labels for reduced history embeddings concatenated with the predicted label. - Permutation is not matched yet. - - Returns: - mapping_array[Y] (Tensor): - An output numpy array where the input Y_new is mapped with mapping_array. - """ - Y_new = get_minimal_indices(Y_new) - if len(Y_old) == 0: - matched_output = Y_new - else: - P_raw, Q_raw = Y_old.to(Y_new.device), Y_new - U_set = torch.unique(torch.cat([P_raw, Q_raw])) - PQ = torch.cat([P_raw, Q_raw]) - a_cat_b, counts = torch.unique(PQ, return_counts=True) - # Get a union set of old P and new Q labels - PandQ = a_cat_b[torch.where(counts.gt(1))[0]] - min_len = min(P_raw.shape[0], Q_raw.shape[0]) - P, Q = P_raw[:min_len], Q_raw[:min_len] - - if len(U_set) == 1: - # When two speaker vectors are exactly the same: No need to encode. - mapping_array = torch.tensor([0, 0]).to(Y_new.device) - else: - # Run Hungarian algorithm if there are more than one speaker in universal set U. - mapping_array = get_lsa_speaker_mapping(U_set=U_set, cmm_P=P, cmm_Q=Q, PandQ=PandQ) - matched_output = mapping_array[Y_new] - matched_output = get_minimal_indices(matched_output) - return matched_output - - -def calculate_removable_counts(removable_counts_mat: torch.Tensor, remain_count: int, num_clus: int) -> torch.Tensor: - """ - Calculate removable counts based on the arguments and calculate how many counts should be - removed from the each cluster. This function has `O(N)` (N = num_clus) time complexity to - return the desired `removable_counts_mat`. 
- - Example: - - The original input to `get_merge_quantity` function: - >>> pre_clus_labels = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2] - >>> num_to_be_removed = 3 - >>> min_count_per_cluster = 2 - - Histogram: (`min_count_per_cluster`=2 is removed) - 0 |***** - 1 |*** - 2 |* - - Inputs: - >>> removable_counts_mat = [5, 3, 1] - >>> remain_count = 6 - >>> num_clus = 3 - - Interim results: - >>> diff_counts - [1, 2, 2] - >>> gradual_counts - [3, 4, 2] - >>> cumsum_counts - [3, 7, 9] - - Return: - >>> removable_counts_mat - [2, 1, 0] - - Args: - removable_counts_mat (Tensor): - Tensor containing how many vectors could be removed from each cluster - remain_count (int): - Integer value that indicates the number of vectors removed from the total set - num_clus (int): - Number of clusters in the given label sequence (cardinality of a label set) - - Returns: - removable_counts_mat (Tensor): - Tensor containing the number of vectors should be removed from each cluster - """ - device = removable_counts_mat.device - zero_padded_counts = torch.cat( - [torch.tensor([0]).to(device), removable_counts_mat.sort()[0], torch.tensor([0]).to(device)], dim=0 - ) - removable_count_args = removable_counts_mat.sort(descending=True)[1] - - # Calculate the size difference between clusters - diff_counts = (zero_padded_counts[1:] - zero_padded_counts[:-1])[:num_clus] - gradual_counts = torch.arange(num_clus, 0, -1).to(device) * diff_counts - cumsum_counts = torch.cumsum(gradual_counts, dim=0) - remain_count_rem = remain_count - - # Find how many remaining counts we can use - ind: int = 0 - for ind, num in enumerate(cumsum_counts): - if remain_count < num: - break - - # Subtract the common values step by step - if ind > 0: - for knd in range(ind): - removable_counts_mat[removable_count_args[: num_clus - knd]] -= diff_counts[knd] - remain_count_rem -= int(diff_counts[knd].item()) * (num_clus - knd) - assert remain_count >= 0, "remain_count should never be negative." - - # Add remaining values - num_labels = remain_count_rem // (num_clus - ind) - rem_labels = remain_count_rem % (num_clus - ind) - removable_counts_mat[removable_count_args[: (num_clus - ind)]] -= num_labels - removable_counts_mat[removable_count_args[:rem_labels]] -= 1 - return removable_counts_mat - - -def get_merge_quantity( - num_to_be_removed: int, pre_clus_labels: torch.Tensor, min_count_per_cluster: int, -) -> torch.Tensor: - """ - Determine which embeddings we need to reduce or merge in history buffer. - We want to merge or remove the embedding in the bigger cluster first. - At the same time, we keep the minimum number of embedding per cluster - with the variable named min_count_per_cluster. - - Constraint: - - Each cluster should keep the number of vectors over `min_count_per_cluster`. - - In total, `num_to_be_removed` of vectors should be removed from the total buffer. - - While merging embeddings, minimize the gap between quantities between clusters. - - Example: - >>> num_to_be_removed = 3 - >>> pre_clus_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2] - >>> min_count_per_cluster = 2 - >>> get_merge_quantity(num_to_be_removed, pre_clus_labels, min_count_per_cluster) - Return: - torch.tensor([2, 1, 0]) - >>> # Sum should be equal to `num_to_be_removed` which is 3 - - Args: - num_to_be_removed: (int) - the quantity of the newly obtained embedding from the new stream of input. 
- pre_clus_labels: (Tensor) - the speaker labels of (the history_embedding_buffer_emb) + (the new embeddings to be added) - min_count_per_cluster: (int) - Minimum vector quantity for each cluster - - Returns: - removable_counts_mat: (Tensor) - Tensor containing the number of vectors should be removed from each cluster - """ - if num_to_be_removed > pre_clus_labels.shape[0] - 1: - raise ValueError(f"num_to_be_removed: {num_to_be_removed} should be less than pre_clus_labels length - 1") - remain_count = pre_clus_labels.shape[0] - num_to_be_removed - spk_freq_count = torch.bincount(pre_clus_labels) - num_clus = len(torch.unique(pre_clus_labels)) - if remain_count < min_count_per_cluster * num_clus: - raise ValueError(f"The remaining embedding vectors should be more than { min_count_per_cluster * num_clus }") - - # Minimum vector counts should be excluded from the removable amount - min_seg_count = torch.tensor([min_count_per_cluster] * len(spk_freq_count)).to(pre_clus_labels.device) - min_seg_count_mat = torch.stack((min_seg_count, spk_freq_count)).min(0)[0] - - # Exclude minimum quantities from the removable count matrix - remain_count -= int(torch.sum(min_seg_count_mat)) - removable_counts_mat = spk_freq_count - min_seg_count_mat - - # Calculate removable counts from `remain_count` variable - removable_counts_mat = calculate_removable_counts(removable_counts_mat, remain_count, num_clus) - if int(removable_counts_mat.sum()) != num_to_be_removed: - raise ValueError("Sum of `removable_counts_mat` is not equal to `num_to_be_removed` variable.") - if not torch.all(removable_counts_mat >= 0) or not torch.all(spk_freq_count - min_seg_count_mat >= 0): - raise ValueError( - f"Every value in `removable_counts_mat` should be always non-negative value but got {removable_counts_mat}" - ) - return removable_counts_mat - - -def merge_vectors( - selected_inds: torch.Tensor, emb_ndx: torch.Tensor, pre_cluster_labels: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Merge feature (embedding) vectors estimated to be the same cluster label. - - Args: - selected_inds (Tensor): - Selected indices for merging - emb_ndx (Tensor): - Feature (embedding) vectors - Dimension: (original vector counts) x (feature dimension) - pre_cluster_labels (Tensor): - Original cluster labels before merging - - Returns: - merged_vecs (Tensor): - Merged feature vectors that are concatenated - Dimension: (merged vector counts) x (feature dimension) - merged_clus_labels (Tensor): - Cluster labels for the merged feature vectors - Dimension: (merged vector counts) - """ - if emb_ndx.shape[0] != pre_cluster_labels.shape[0]: - raise ValueError("pre_cluster_labels and emb_ndx have mismatch in dimension") - avg_emb = torch.mean(emb_ndx[selected_inds, :], dim=0) - merged_clus_labels = pre_cluster_labels[selected_inds] - selected_inds_list: List[int] = selected_inds.tolist() - bypass_inds_list: List[int] = [] - for k in range(emb_ndx.shape[0]): - if k not in selected_inds_list: - bypass_inds_list.append(k) - bypass_inds = torch.tensor(bypass_inds_list) - selected_inds = torch.tensor(selected_inds_list) - merged_vecs = torch.vstack((emb_ndx[bypass_inds], avg_emb)) - merged_clus_labels = torch.hstack((pre_cluster_labels[bypass_inds], merged_clus_labels[0])) - return merged_vecs, merged_clus_labels - - -def get_closest_embeddings(affinity_mat: torch.Tensor, n_closest: int) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Get the indices of the embedding vectors we want to merge. 
- - Example: - >>> n_closest = 2 - >>> affinity_mat = [[1.0, 0.2, 0.8], - [0.2, 1.0, 0.4], - [0.8, 0.4, 1.0]] - >>> affinity_mat.sum(0) - [2.0, 1.6, 2.2] - - # The closest two embedding vectors are at index 0 and 2. - - Args: - affinity_mat: (Tensor) - Symmetric affinity matrix of the given embedding vector set. - n_closest (int): - The amount of vector counts that are expected to be removed from the set - Example: - Input: 10 vectors in a set - n_closest = 5 - (5+1) vectors are merged into 1 vector - Output: 5 vectors in a set - - Returns: - idx_aff_sum (torch.Tensor): - Indices of the closest `n_closest` embedding vectors - rest_inds (torch.Tensor): - Indices of the complementary set of the indices in `idx_aff_sum` - """ - comb_limit = int(affinity_mat.shape[0] - 1) - if n_closest > comb_limit: - raise ValueError(f"Got n_closest of {n_closest}: {n_closest} is bigger than comb_limit {comb_limit}") - - # Take summed values over one axis - sum_cmat = affinity_mat.sum(0) - - # `n_closest + 1` will become 1 embedding vector after merging - idx_aff_sum = torch.argsort(sum_cmat, descending=True)[: (n_closest + 1)] - rest_inds = torch.argsort(sum_cmat, descending=True)[(n_closest + 1) :] - return idx_aff_sum, rest_inds - - -def run_reducer( - pre_embs: torch.Tensor, target_spk_idx: int, merge_quantity: int, pre_clus_labels: torch.Tensor, -): - """ - Reduce the number of embedding vectors by merging the closest embedding vectors. - - This merging algorithm is based on the assumption that the closest embeddings - are the most redundant embedding vectors. - - The closest embedding vectors are chosen by selecting the highest top-N sum of - each column in a given affinity matrix. - - If merge_quantity is N, we choose (N+1) vectors into 1 embedding vector. - Thus, we reduce N embeddings in the original embedding vector set. - - Example: - >>> merge_quantity = 1 # We merge 1+1 = 2 embedding vectors - >>> affinity_mat = [[1.0, 0.2, 0.8], - [0.2, 1.0, 0.4], - [0.8, 0.4, 1.0]] - >>> affinity_mat.sum(0) - [2.0, 1.6, 2.2] - - The first and the third embedding vectors are merged into one embedding vector. - >>> index_mapping # (bypassed indices, merged indices) - ([1], [0, 2]) - - Args: - pre_embs (Tensor): - Potential Embedding vectors to be merged - affinity_mat (Tensor): - The affinity matrix of the `pre_embs` - target_spk_idx (int): - The targeted speaker index for merging - merge_quantity (int): - The count of embeddings to be reduced - pre_clus_labels (list) - The original cluster (speaker) index - - Returns: - merged_embs (torch.Tensor): - The merged embedding vectors. - merged_clus_labels (torch.Tensor): - The cluster (speaker) indices for the merged embedding vectors. - index_mapping (Tuple[torch.Tensor, torch.Tensor]): - A tuple containing the indices of the original embeddings that were not merged (`bypassed indices`) - and the indices of the new merged embeddings (`merged indices`). 
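
A toy usage sketch of this reducer follows (hypothetical sizes and values): nine embeddings, six of which belong to speaker 0, reduced by two so that the three mutually closest speaker-0 vectors are merged into their mean.

```python
import torch

from nemo.collections.asr.parts.utils.online_clustering import run_reducer

pre_embs = torch.nn.functional.normalize(torch.randn(9, 192), p=2.0, dim=1)  # assumed 192-dim embeddings
pre_clus_labels = torch.tensor([0, 0, 0, 0, 0, 0, 1, 1, 1])

merged_embs, merged_labels, index_mapping = run_reducer(
    pre_embs=pre_embs, target_spk_idx=0, merge_quantity=2, pre_clus_labels=pre_clus_labels,
)
print(merged_embs.shape[0], merged_labels)  # 4 embedding vectors remain for speaker 0
```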
- """ - if pre_embs.shape[0] != pre_clus_labels.shape[0]: - raise ValueError("Dimension mismatch between `pre_embs` and `pre_clus_labels`.") - - target_emb_index = torch.where(pre_clus_labels == target_spk_idx)[0] - org_size = target_emb_index.shape[0] - if merge_quantity > 0: - if merge_quantity > (target_emb_index.shape[0] - 1): - raise ValueError( - f"merge_quantity {merge_quantity} should not be larger than target_emb_index length: {target_emb_index.shape[0]-1}" - ) - total_affinity_mat = getCosAffinityMatrix(pre_embs) - - # Get the lower triangle of the affinity_mat array - affinity_mat = total_affinity_mat[:, target_emb_index][target_emb_index, :] - if affinity_mat.shape[0] != target_emb_index.shape[0]: - raise ValueError( - "Dimension mismatch between targeted speaker affinity `affinity_mat` and targeted speaker index `target_emb_index`." - ) - # Get the indices of the closest embedding vectors - selected_inds, rest_inds = get_closest_embeddings(affinity_mat, merge_quantity) - spk_cluster_labels, selected_embs = pre_clus_labels[target_emb_index], pre_embs[target_emb_index] - - # Note that we need to return the indices of speaker-specific indices from `target_emb_index`. - index_mapping = (target_emb_index[rest_inds.sort()[0]], target_emb_index[selected_inds]) - - # Merge the embeddings targeted by the 2-dim indices `index_2d` - merged_embs, merged_clus_labels = merge_vectors(selected_inds, selected_embs, spk_cluster_labels) - - if (org_size - merge_quantity) != merged_embs.shape[0]: - raise ValueError( - f"Reducer output {merged_embs.shape[0]} is not matched to the target quantity {org_size - merge_quantity}." - ) - - else: - merged_embs = pre_embs[target_emb_index] - merged_clus_labels = pre_clus_labels[target_emb_index] - index_mapping = (target_emb_index, torch.arange(0)) - return merged_embs, merged_clus_labels, index_mapping - - -def get_first_arg_index(mat: torch.Tensor, label: int) -> int: - """ - Get the index of the first element are specified by `index` variable. - - Args: - mat (Tensor): - Source matrix filled with indices - label (int): - Label which we want to find the first occuring index - - Returns: - (int) The first index of the given label - """ - return int(torch.where(mat == label)[0][0]) - - -class OnlineSpeakerClustering(torch.nn.Module): - """ - Online clustering method for speaker diarization based on cosine similarity. - - Regular Clustering Attributes: - - max_num_speakers (int): - The upper bound for the number of speakers in each session - max_rp_threshold (float): - Limits the range of parameter search. - Clustering performance can vary depending on this range. - Default is 0.15. - enhanced_count_thres (int): - For the short audio recordings, clustering algorithm cannot - accumulate enough amount of speaker profile for each cluster. - Thus, function `getEnhancedSpeakerCount` employs anchor embeddings - (dummy representations) to mitigate the effect of cluster sparsity. - enhanced_count_thres = 40 is recommended. - sparse_search_volume (int): - Number of p_values we search during NME analysis. - Default is 30. The lower the value, the faster NME-analysis becomes. - Lower than 20 might cause a poor parameter estimation. - fixed_thres (float): - A fixed threshold for finding p-closest neighbors in affinity matrix for clustering. - If fixed_thres value is provided, NME-analysis process will be skipped. - This value should be optimized on a development set to obtain a quality result. - Default is None and performs NME-analysis to estimate the threshold. 
- min_samples_for_nmesc (int): - The minimum number of samples required for NME clustering. This avoids - zero p_neighbour_lists. If the input has fewer segments than min_samples, - it is directed to the enhanced speaker counting mode. - sparse_search (bool): - Toggle sparse search mode. If True, limit the size of p_value_list to sparse_search_volume. - cuda (bool): - Use cuda for Eigen decomposition if cuda=True. - - Additional Online Processing Attributes: - - history_buffer_size (int): - - This is a buffer where diarization history is saved in the form of averaged speaker embedding vector. - - The values in [50, 200] range is recommended while the system requires bigger buffer size for - sessions with larger number of speakers. - current_buffer_size (int): - - This is a buffer which process the most recent speaker embedding vector inputs. - current-buffer is first-in-first-out (FIFO) queue where the embeddings accepted earlier - get to merged and saved to history buffer. - - In general, [50, 200] range is recommended and the performance can be sensitive on this buffer size. - min_spk_counting_buffer_size (int): - Integer number for speaker counting buffer. Number of speakers are estimated through a small buffer - and the number is obtained by taking majority vote. - min_frame_per_spk (int): - Below this number, the system considers the whole input segments as a single speaker. - p_update_freq (int): - Frequency (interval) of updating p_value for NMESC algorithm. - p_value_skip_frame_thres (int): - After `frame_index` passes this number, `p_value` estimation is skipped for inference speed - p_value_queue_size (int): - `p_value` buffer for major voting - use_temporal_label_major_vote (bool): - Boolean that determines whether to use temporal majorvoting for the final speaker labels - temporal_label_major_vote_buffer_size (int): - Buffer size for major-voting the - num_spk_stat (list): - List of number of speakers for major voting. Number of speakers are estimated through - majority voting of `self.num_spk_stat` list. - p_value_hist (list): - List of p_values for major voting. - To save the computation time, p_value is estimated every `p_update_freq` frames and - saved to `self.p_value_hist`. - - Attributes for counters and buffers in streaming system: - - is_online (bool): - - If self.is_online is False: - FIFO queue does not push out any speaker embedding vector - - If self.is_online is True: - FIFO queue starts push out speaker embedding vectors and saving them into - history buffer. - max_embed_count (int): - The maximum number of segments the streaming system has ever seen. - This value keeps increasing as the system processes more and more segments. - memory_margin (int): - The margin that is added to keep the segmentation data in the streaming system - minimum_segments_per_buffer (int): - Maximum number of embedding vectors kept in history buffer per speaker. - Example: - history_buffer_size (history_n) = 100 - max_num_speakers = 4 - minimum_segments_per_buffer = 25 - history_buffer_seg_end (int): - Index that indicates the boundary between history embedding sets and current processing buffer - when history embedding vectors and current input embedding vectors are concatenated into a - single matrix. 
- - Attributes for history buffer: - - history_embedding_buffer_emb (Tensor) - Tensor containing speaker embedding vectors for saving the history of the previous - speaker profile in the given audio session - history_embedding_buffer_label (Tensor) - Speaker label (cluster label) for embedding vectors saved in the history buffer - Y_fullhist (Tensor) - Tensor containing the speaker label hypothesis from start to current frame - """ - - def __init__( - self, - max_num_speakers: int = 8, - max_rp_threshold: float = 0.15, - enhanced_count_thres: float = 40, - fixed_thres: float = -1.0, - sparse_search_volume: int = 10, - history_buffer_size: int = 150, - current_buffer_size: int = 150, - min_spk_counting_buffer_size: int = 3, - min_frame_per_spk: int = 15, - p_update_freq: int = 5, - p_value_skip_frame_thres: int = 50, - p_value_queue_size: int = 3, - use_temporal_label_major_vote: bool = False, - temporal_label_major_vote_buffer_size: int = 11, - cuda: bool = False, - ): - super().__init__() - self.max_num_speakers = max_num_speakers - self.max_rp_threshold = max_rp_threshold - self.enhanced_count_thres = enhanced_count_thres - self.sparse_search_volume = sparse_search_volume - self.fixed_thres = fixed_thres - self.history_n = history_buffer_size - self.current_n = current_buffer_size - self.min_spk_counting_buffer_size = min_spk_counting_buffer_size - self.min_frame_per_spk = min_frame_per_spk - self.p_update_freq = p_update_freq - self.p_value_skip_frame_thres = p_value_skip_frame_thres - self.p_value_queue_size = p_value_queue_size - self.use_temporal_label_major_vote = use_temporal_label_major_vote - self.temporal_label_major_vote_buffer_size = temporal_label_major_vote_buffer_size - self.cuda = cuda - self.num_spk_stat: List[torch.Tensor] = [torch.tensor(1)] - self.p_value_hist: List[torch.Tensor] = [torch.tensor(2)] - - # Initialize the counters and buffers in streaming system - self.is_online = False - self.max_embed_count = 0 - self.memory_margin = 0 - self.minimum_segments_per_buffer = int(self.history_n / self.max_num_speakers) - self.history_buffer_seg_end = 0 - - # Initialize the streaming buffer tensors - self.history_embedding_buffer_emb = torch.tensor([]) - self.history_embedding_buffer_label = torch.tensor([]) - self.Y_fullhist = torch.tensor([]) - - def onlineNMEanalysis(self, mat_in: torch.Tensor, frame_index: int) -> Tuple[int, int]: - """ - To save the running time, the p-value is only estimated in the beginning of the session. - After switching to online mode, the system uses the most common estimated p-value. - Estimating p-value requires a plenty of computational resource. The less frequent estimation of - p-value can speed up the clustering algorithm by a huge margin. - - Args: - mat_in (Tensor): - Tensor containing the affinity matrix for the current segments - frame_index (int): - Unique index for each segment and embedding vector - - Returns: - est_num_of_spk: (int) - The estimated number of speakers. - p_hat_value: (int) - The estimated p-value from NMESC method. 
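
The update schedule described above can be sketched in a few lines (defaults taken from `__init__`): the p-value is re-estimated only on early frames that land on the update frequency, and later frames reuse the majority vote over the cached values.

```python
p_update_freq = 5
p_value_skip_frame_thres = 50

# Note: the actual method also re-estimates whenever its p-value history is still empty.
update_frames = [
    frame_index
    for frame_index in range(120)
    if frame_index < p_value_skip_frame_thres and frame_index % p_update_freq == 0
]
print(update_frames)  # [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]; frames >= 50 reuse cached p-values
```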
- """ - nmesc = NMESC( - mat_in, - max_num_speakers=self.max_num_speakers, - max_rp_threshold=self.max_rp_threshold, - sparse_search=True, - maj_vote_spk_count=False, - sparse_search_volume=self.sparse_search_volume, - fixed_thres=self.fixed_thres, - nme_mat_size=256, - parallelism=False, - device=mat_in.device, - cuda=self.cuda, - ) - if len(self.p_value_hist) == 0 or ( - frame_index < self.p_value_skip_frame_thres and frame_index % self.p_update_freq == 0 - ): - est_num_of_spk, p_hat_value = nmesc.forward() - self.p_value_hist.append(p_hat_value) - if len(self.p_value_hist) > self.p_value_queue_size: - self.p_value_hist.pop(0) - p_hat_int_list: List[int] = [int(p) for p in self.p_value_hist] - p_hat_value = torch.mode(torch.tensor(p_hat_int_list))[0].item() - output = nmesc.getEigRatio(p_hat_value) - g_p, est_num_of_spk = output[0], output[1].int() - return est_num_of_spk, p_hat_value - - def speaker_counter_buffer(self, est_num_of_spk: int) -> torch.Tensor: - """ - Use a queue to avoid unstable speaker counting results. - - Args: - est_num_of_spk (int): - Estimated number of speakers - - Returns: - est_num_of_spk (torch.Tensor): - Estimated number of speakers from the speaker counting buffer. - """ - est_num_of_spk = torch.tensor(est_num_of_spk) - self.num_spk_stat.append(est_num_of_spk) - if len(self.num_spk_stat) > self.min_spk_counting_buffer_size: - self.num_spk_stat.pop(0) - num_spk_stat_tensor = torch.tensor([int(s) for s in self.num_spk_stat]) - num_spks_bincount = torch.bincount(num_spk_stat_tensor) - est_num_of_spk = torch.argmax(num_spks_bincount) - return est_num_of_spk - - def limit_frames_per_speaker(self, frame_index: int, est_num_of_spk: int) -> int: - """ - Limit the estimated number of speakers in proportion to the number of speakers. - - Args: - frame_index (int): - Unique index for each segment and embedding vector - est_num_of_spk (int): - Estimated number of speakers - - Returns: - (int) Estimated number of speakers capped by `self.min_frame_per_spk` - """ - return min(est_num_of_spk, int(1 + frame_index // self.min_frame_per_spk)) - - def online_spk_num_estimation(self, mat_in: torch.Tensor, frame_index: int) -> Tuple[int, torch.Tensor]: - """ - Online version of speaker estimation involves speaker counting buffer and application of per-speaker - frame count limit. - - Args: - mat_in (Tensor): - Raw affinity matrix containing similarity values of each pair of segments - frame_index (int) - Unique frame index of online processing pipeline - - Returns: - est_num_of_spk (int): - Estimated number of speakers - affinity_mat (Tensor): - Affinity matrix after applying the affinity threshold with `p_hat_value` - """ - est_num_of_spk, p_hat_value = self.onlineNMEanalysis(mat_in, frame_index) - affinity_mat = getAffinityGraphMat(mat_in, p_hat_value) - raw_est_num_of_spk = self.speaker_counter_buffer(est_num_of_spk) - est_num_of_spk = self.limit_frames_per_speaker(frame_index, raw_est_num_of_spk.item()) - return est_num_of_spk, affinity_mat - - def prepare_embedding_update( - self, emb_in: torch.Tensor, segment_indexes_matrix: torch.Tensor - ) -> Tuple[bool, int, torch.Tensor, torch.Tensor]: - """ - This function performs the following tasks: - 1. Decide whether to extract more embeddings or not (by setting `is_update`) - (Only if we need update): - 2. Calculate how many embeddings should be updated (set `new_emb_n` variable) - 3. Update history embedding vectors and save it to `pre_embs`. - - We only save the index and clustering label of each embedding. 
- - - Case-1: The very first step - This else statement is for the very first diarization loop. - This is the very first reduction frame. - - - Case-2: Number of embedding vectors is increased, therefore we need to update. - Since there are new embeddings, we push the same amount (new_emb_n) - of old embeddings to the history buffer. - We should also update self.history_buffer_seg_end which is a pointer. - update to history emb: emb_in[emb_idx_stt:emb_idx_end] - update to history label: self.Y_fullhist[label_stt:_end] - - - Case-3: Number of embedding vectors is decreased - If the number of embeddings is decreased compared to the last trial, - then skip embedding merging. - - Variables: - hist_curr_boundary (int): - The current boundary of between history buffer and current buffer. - This is the new history-current buffer boundary while self.history_buffer_seg_end is the old one. - Thus, the new set of embedding vectors are collected from - `label_stt=self.hist_buffer_seg_end` to `label_end=hist_curr_boundary`. - total_segments_processed_count (int): - The number of segments that are processed so far in integer format. - - Args: - emb_in (Tensor): - Tensor containing embedding vectors - Dimensions: (number of embedding vectors) x (embedding dimension) - segment_indexes_matrix (Tensor): - Tensor containing unique segment (embedding vector) index - - Returns: - is_update (bool): - Boolean indicates whether to update speaker embedding vectors. - new_emb_n (int): - The amount of embedding vectors that are exceeding FIFO queue size. - new_emb_n is also an amount of embedding vectors that needs to be merged in history buffer. - pre_embs (Tensor): - Embedding vector matrix before merging. - The subset of `pre_embs` embedding vectors will be merged. - Dimensions: (number of embedding vectors) x (embedding dimension) - pre_clus_labels (Tensor): - A set of clustering labels for each embedding vector in `pre_embs`. - """ - total_segments_processed_count = int(segment_indexes_matrix[-1] + 1) - hist_curr_boundary = int(total_segments_processed_count - self.current_n) - new_emb_n: int = 0 - pre_embs: torch.Tensor = torch.empty(0) - pre_clus_labels: torch.Tensor = torch.empty(0) - is_update = True - - if total_segments_processed_count > self.max_embed_count: - # Case-1: The very first step - if len(self.history_embedding_buffer_emb) == 0: - new_emb_n = total_segments_processed_count - (self.current_n + self.history_n) - hist_curr_boundary_emb_idx = get_first_arg_index(segment_indexes_matrix, hist_curr_boundary) - pre_embs = emb_in[:hist_curr_boundary_emb_idx] - pre_clus_labels = self.Y_fullhist[:hist_curr_boundary] - - # Case-2: Number of embedding vectors is increased, need to update history and its label - else: - # Calculate the number of new embedding vectors: `new_emb_n` - label_stt, label_end = self.history_buffer_seg_end, hist_curr_boundary - new_emb_n = label_end - label_stt - # Add embedding vectors to `pre_embs` so that we can merge it with reducer function. - emb_idx_stt = int(get_first_arg_index(segment_indexes_matrix, label_stt)) - emb_idx_end = int(get_first_arg_index(segment_indexes_matrix, label_end)) - pre_embs = torch.vstack((self.history_embedding_buffer_emb, emb_in[emb_idx_stt:emb_idx_end])) - # Update labels for `pre_embs` - pre_clus_labels = torch.hstack( - (self.history_embedding_buffer_label, self.Y_fullhist[label_stt:label_end]) - ) - - if new_emb_n > self.current_n: - raise ValueError( - "new_emb_n should be less than or equal to current buffer size (self.current_n)." 
- f" Getting too many segments: {new_emb_n} for the given current buffer size {self.current_n}." - " Please either (1) increase buffer size or (2) use longer segment lengths to get less number of segments." - ) - elif new_emb_n <= 0: - raise ValueError("Segment counting error. `new_emb_n` should be a positve integer number.") - if pre_embs.shape[0] != pre_clus_labels.shape[0]: - raise ValueError( - "`pre_embs` and `pre_clus_labels` should have the same length, " - f"but got {pre_embs.shape[0]} and {pre_clus_labels.shape[0]} respectively." - ) - - # Case-3: Number of embedding vectors is not increased. - else: - # There will be no embedding update, so new_emb_n is 0, pre_embs and pre_clus_labels are empty. - is_update = False - - # Update the history buffer index for the next step - self.history_buffer_seg_end = hist_curr_boundary - self.max_embed_count = max(total_segments_processed_count, self.max_embed_count) - return is_update, new_emb_n, pre_embs, pre_clus_labels - - def make_constant_length_emb(self, emb_in: torch.Tensor, base_segment_indexes: torch.Tensor) -> torch.Tensor: - """ - This function deals with edge cases when the number of segments decreases and the number of embedding falls - short for the labels. - - - ASR decoder occasionally returns less number of words compared to the previous frame. - - In this case, we obtain fewer embedding vectors for the short period of time. To match the pre-defined - length, the last embedding vector is repeated to fill the voidness. - - The repeated embedding will be soon replaced by the actual embeddings once the system takes new frames. - - Args: - emb_in (Tensor): - If self.is_online is False: - `pre_embs` contains only current speaker embedding inputs, which is FIFO queue - If self.is_online is True: - `pre_embs` contains history buffer and FIFO queue - base_segment_indexes (Tensor): - Tensor containing unique segment (embedding vector) index - - Returns: - emb_curr (Tensor): - Length preserved speaker embedding vectors - """ - curr_clustered_segments = torch.where(base_segment_indexes >= self.history_buffer_seg_end)[0] - - # Check if the current buffer result is falling short compared to `self.current_n`. - if emb_in[curr_clustered_segments].shape[0] < self.current_n: - delta_count = self.current_n - emb_in[curr_clustered_segments].shape[0] - fill_in_emb = torch.tile(emb_in[curr_clustered_segments][-1], (delta_count, 1)) - emb_curr = torch.vstack((emb_in[curr_clustered_segments], fill_in_emb)) - else: - emb_curr = emb_in[curr_clustered_segments] - return emb_curr - - def update_speaker_history_buffer( - self, emb_in: torch.Tensor, base_segment_indexes: torch.Tensor - ) -> Tuple[torch.Tensor, bool]: - """ - Merge the given embedding vectors based on the calculate affinity matrix. - if `is_update` is True, update the history buffer . - - Args: - emb_in (Tensor): - If self.is_online is False: - `emb` contains only current speaker embedding inputs, which is FIFO queue - If self.is_online is True: - `emb` contains history buffer and FIFO queue - base_segment_indexes (Tensor): - Tensor containing unique segment (embedding vector) index - - Returns: - history_embedding_buffer_emb (Tensor): - Matrix containing merged embedding vectors of the previous frames. - This matrix is referred to as "history buffer" in this class. 
- is_update (bool): - Boolean indicates whether to update speaker - - Example: - - at the frame index where `is_online` turns to True: - - |------hist-buffer------|-----FIFO-queue-----| - - self.history_n = 20 - self.current_n = 10 - - Step (1) - |-----------------------|ABCDEF--------------| - - If we get two more segments, "NN" as in the description: - history buffer = 20 - current buffer = 12 - - Step (2) - |-----------------------|ABCDEF--------------XY| - |---------emb_in-------| - - The newly accepted embeddings go through a FIFO queue (first come, first merge) - history buffer = 22 - current buffer = 10 - - Step (3) - |-----------------------AB|CDEF--------------XY| - |---------pre_embs--------| - - After merging (reducing) the embedding set gets back to the original size: - history buffer = 20 - current buffer = 10 - - Step (4) - |======================|CDEF--------------XY| - |-----hist_emb_buff----| - - After clustering, `self.Y_fullhist` is updated as: - - |0000000000011111111111|11110000110010010011| - - The dimension of `self.Y_fullhist` is (`history_n + current_n`) x 1 - - self.history_buffer_seg_end (int): - The total number of segments that have been merged from the beginning of the session. - (=`hist_curr_boundary`) - """ - is_update, new_emb_n, pre_embs, pre_clus_labels = self.prepare_embedding_update(emb_in, base_segment_indexes) - - # Update the history/current_buffer boundary cursor - total_emb, total_cluster_labels = [], [] - - if is_update: - # Calculate how many embedding vectors should be reduced per speaker - class_target_vol = get_merge_quantity( - num_to_be_removed=new_emb_n, - pre_clus_labels=pre_clus_labels, - min_count_per_cluster=self.minimum_segments_per_buffer, - ) - - # Merge the segments in the history buffer - for spk_idx, target_num in enumerate(list(class_target_vol)): - merged_embs, merged_clus_labels, _ = run_reducer( - pre_embs=pre_embs, - target_spk_idx=spk_idx, - merge_quantity=target_num, - pre_clus_labels=pre_clus_labels, - ) - total_emb.append(merged_embs) - total_cluster_labels.append(merged_clus_labels) - - # Update the speaker history buffer - self.history_embedding_buffer_emb = torch.vstack(total_emb) - self.history_embedding_buffer_label = torch.hstack(total_cluster_labels) - if self.history_embedding_buffer_emb.shape[0] != self.history_n: - raise ValueError("History embedding size is not maintained correctly.") - if len(self.history_embedding_buffer_label) != self.history_n: - raise ValueError("History label size is not maintained correctly.") - - else: - total_emb.append(self.history_embedding_buffer_emb) - total_cluster_labels.append(self.history_embedding_buffer_label) - - # `emb_curr` is the incumbent set of embeddings which is the the latest. - emb_curr = self.make_constant_length_emb(emb_in, base_segment_indexes) - total_emb.append(emb_curr) - - # Before perform clustering, we attach the current_n number of estimated speaker labels - # from the previous clustering result. 
- total_cluster_labels.append(self.Y_fullhist[-self.current_n :]) - - history_and_current_emb = torch.vstack(total_emb) - history_and_current_labels = torch.hstack(total_cluster_labels) - if history_and_current_emb.shape[0] != len(history_and_current_labels): - raise ValueError("`history_and_current_emb` has a mismatch in length with `history_and_current_labels`.") - return history_and_current_emb, is_update - - def get_reduced_mat(self, emb_in: torch.Tensor, base_segment_indexes: torch.Tensor) -> Tuple[torch.Tensor, bool]: - """ - Choose whether we want to add embeddings to the memory or not. - The processing buffer has size of (self.current_n + self.history_n). - - Case-1: If margin_seg_n > 0, this means we have more embedding vectors than we can hold in the processing buffer. - - `is_online` should be `True` - - reduce the number of embedding vectors by merging the closest ones. - call `update_speaker_history_buffer` function - - Case-2: If margin_seg_n <= 0, this means that we can accept more embedding vectors and yet to fill the processing buffer. - - `is_online` should be `False` - - Replace `merged_emb` variable with the raw input `emb_in`. - - `add_new` is `True`, since we are adding more embedding vectors to `merged_emb` variable. - - Args: - emb_in (Tensor): - If self.is_online is False: - `emb` contains only current speaker embedding inputs - base_segment_indexes (Tensor): - Tensor containing unique segment (embedding vector) index - - Returns: - merged_emb (Tensor): - Matrix containing merged embedding vectors of the previous frames. - This matrix is referred to as "history buffer" in this class. - If self.is_online is False: - `merged_emb` contains only current speaker embedding inputs - If self.is_online is True: - `merged_emb` is a concatenated matrix with history embedding and current embedding inputs - add_new (bool): - Boolean that indicates whether there is a new set of segments. Depending on the VAD timestamps, - the number of subsegments can be ocassionally decreased. If `add_new=True`, then it adds the newly - acquired cluster labels. - - """ - margin_seg_n = emb_in.shape[0] - (self.current_n + self.history_n) - if len(self.Y_fullhist) == 0 and margin_seg_n > 0: - raise ValueError( - "The number of incoming embedding vectors is larger than the total processing buffer size." - "Please either (1) increase the history and current buffer size (2) or use longer segment lengths to reduce number of segments." - ) - if margin_seg_n > 0: - self.is_online = True - merged_emb, add_new = self.update_speaker_history_buffer( - emb_in=emb_in, base_segment_indexes=base_segment_indexes - ) - else: - self.is_online = False - merged_emb = emb_in - add_new = True - return merged_emb, add_new - - def match_labels(self, Y_merged: torch.Tensor, add_new: bool) -> torch.Tensor: - """ - This function matches the newly generated clustering label sequence with the existing speaker labels in the history buffer. - `self.history_buffer_seg_end` is an integer index that tells to which point is history embedding contains from `self.Y_fullhist`. - - If embedding reducing is done correctly, we should discard (0, self.history_n) amount and take - (self.history_n, len(Y_merged)) from the new clustering output `Y_merged`. - - Args: - Y_merged (Tensor): - The newly generated clustering label sequence that may have different permutations with the existing - speaker labels in the history buffer. - add_new (bool): - This variable indicates whether there is a new set of segments. 
Depending on the VAD timestamps, - the number of subsegments can be occasionally decreased. If `add_new=True`, then it adds the newly - acquired cluster labels. - - Returns: - Y_out (Tensor): - Permutation-matched speaker labels based on history buffer - """ - if self.is_online: - # Online clustering mode with history buffer - Y_old = torch.hstack((self.history_embedding_buffer_label, self.Y_fullhist[self.history_buffer_seg_end :])) - - # Stitch the old history and new cluster labels - Y_matched = stitch_cluster_labels(Y_old=Y_old, Y_new=Y_merged).to(Y_merged.device) - if add_new: - if Y_matched[self.history_n :].shape[0] != self.current_n: - raise ValueError("Update point sync is not correct.") - # Concatenate the newly generated speaker labels - Y_out = torch.hstack((self.Y_fullhist[: self.history_buffer_seg_end], Y_matched[self.history_n :])) - self.Y_fullhist = Y_out - else: - # Do not update cumulative labels since there are no new segments. - Y_out = self.Y_fullhist - else: - # If no memory is used, offline clustering is applied. - Y_out = stitch_cluster_labels(Y_old=self.Y_fullhist, Y_new=Y_merged).to(Y_merged.device) - self.Y_fullhist = Y_out - return Y_out - - def forward( - self, - curr_emb, - base_segment_indexes, - max_num_speakers: int, - max_rp_threshold: float, - enhanced_count_thres: int, - sparse_search_volume: int, - frame_index: int, - cuda: bool = False, - ) -> torch.Tensor: - """ - Wrapper function for torch.jit.script compatibility. - NOTE: jit scripted classes only contain the methods which are included in the computation graph in the forward pass. - """ - Y = self.forward_infer( - curr_emb=curr_emb, - base_segment_indexes=base_segment_indexes, - max_num_speakers=max_num_speakers, - max_rp_threshold=max_rp_threshold, - enhanced_count_thres=enhanced_count_thres, - sparse_search_volume=sparse_search_volume, - frame_index=frame_index, - cuda=cuda, - ) - return Y - - def forward_infer( - self, - curr_emb: torch.Tensor, - base_segment_indexes: torch.Tensor, - max_num_speakers: int = 4, - max_rp_threshold: float = 0.15, - enhanced_count_thres: int = 40, - sparse_search_volume: int = 10, - fixed_thres: float = -1.0, - frame_index: int = 0, - cuda: bool = False, - ) -> torch.Tensor: - """ - Perform speaker clustering in online mode. Embedding vector set `emb` is expected to be containing - history embeddings to count the number of speakers. - - Args: - curr_emb (Tensor): - Current embedding vector input. - base_segment_indexes (Tensor): - Tensor containing unique segment (embedding vector) index - max_num_speakers (int): - Maximum number of speakers to be detected during online diarization session - max_rp_threshold (float): - Limits the range of parameter search. - Clustering performance can vary depending on this range. - Default is 0.25. - max_rp_threshold (float): - Limits the range of parameter search. - Clustering performance can vary depending on this range. - Default is 0.15. - frame_index (int): - Unique index for each segment (also each embedding vector) - cuda (bool): - Boolean that determines whether cuda is used or not - device (torch.device): - `torch.device` variable - - Returns: - Y (Tensor): - Speaker labels for history embeddings and current embedding inputs - """ - self.max_num_speakers = max_num_speakers - self.max_rp_threshold = max_rp_threshold - self.enhanced_count_thres = enhanced_count_thres - self.sparse_search_volume = sparse_search_volume - self.fixed_thres = fixed_thres - # Merge the closest embeddings and reduce the size of the embedding count. 
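Editorial aside: before the clustering call below, `forward_infer` shrinks the embedding set via `get_reduced_mat`, and whether the session switches to online merging depends only on how many incoming segments exceed the combined history and current buffer capacity. The sketch below restates that margin check with made-up buffer sizes; `history_n` and `current_n` mirror the attribute names used in this class, but the helper itself is illustrative and not part of NeMo.

```python
def buffer_mode_sketch(num_segments: int, history_n: int = 20, current_n: int = 10) -> str:
    """Illustrative restatement of the margin check in `get_reduced_mat` (not NeMo code)."""
    # A positive margin means the incoming segments no longer fit in the
    # (history_n + current_n)-slot processing buffer, so merging into the
    # history buffer has to start and the session switches to online mode.
    margin_seg_n = num_segments - (history_n + current_n)
    return "online (merge into history buffer)" if margin_seg_n > 0 else "offline (keep raw embeddings)"


for n in (12, 30, 31):
    print(n, "->", buffer_mode_sketch(n))
# 12 and 30 still fit in the 30-slot buffer; 31 exceeds it and triggers online merging.
```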
- - if cuda and (curr_emb.device == torch.device("cpu") or base_segment_indexes.device == torch.device("cpu")): - raise ValueError(f"CUDA is enabled but the input {curr_emb} or {base_segment_indexes} is not on the GPU.") - - merged_embs, add_new = self.get_reduced_mat(emb_in=curr_emb, base_segment_indexes=base_segment_indexes,) - # Perform clustering on the embedding matrix containing history and current FIFO buffer merged_embeddings - if merged_embs.shape[0] == 1: - Y = torch.zeros((1,), dtype=torch.int32) - else: - mat = getCosAffinityMatrix(merged_embs) - est_num_of_spk, affinity_mat = self.online_spk_num_estimation(mat, frame_index) - spectral_model = SpectralClustering(n_clusters=est_num_of_spk, cuda=cuda, device=merged_embs.device) - Y = spectral_model.forward(affinity_mat).to(merged_embs.device) - # Match the permutation of the newly obtained speaker labels and the previous labels - merged_clus_labels = self.match_labels(Y_merged=Y, add_new=add_new) - return merged_clus_labels diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/optimization_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/optimization_utils.py deleted file mode 100644 index f947007e59b41b4723b0adfd131c679dbc3c179e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/optimization_utils.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The original code of Linear Sum Assignment solver is -# from: https://github.com/scipy/scipy/blob/v0.18.1/scipy/optimize/_hungarian.py -# The following is the full text of the license: - -# Hungarian algorithm (Kuhn-Munkres) for solving the linear sum assignment -# problem. Taken from scikit-learn. Based on original code by Brian Clapper, -# adapted to NumPy by Gael Varoquaux. -# Further improvements by Ben Root, Vlad Niculae and Lars Buitinck. -# Copyright (c) 2008 Brian M. Clapper , Gael Varoquaux -# Author: Brian M. Clapper, Gael Varoquaux -# License: 3-clause BSD - -import torch - - -@torch.jit.script -def unravel_index(index: int, shape: torch.Tensor): - """ - Unravel the index input to fit the given shape. - This function is needed for torch.jit.script compatibility. - - Args: - index (int): The index to unravel. - shape (Tesnor): The shape to unravel the index to. - - Returns: - Tensor: The unraveled index. - """ - out = [] - shape = torch.flip(shape, dims=(0,)) - for dim in shape: - out.append(index % dim) - index = index // dim - out = torch.tensor([int(x.item()) for x in out]) - return torch.flip(out, dims=(0,)) - - -@torch.jit.script -class LinearSumAssignmentSolver(object): - """ - A Solver class for the linear sum assignment (LSA) problem. - Designed for torch.jit.script compatibility in NeMo. - - The LSA problem is also referred to as bipartite matching problem. 
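Editorial aside: a tiny brute-force illustration of the linear sum assignment objective introduced above, enumerating every row-to-column matching of a small cost matrix and keeping the cheapest one. This is only a sketch of what the solver class computes far more efficiently with the Hungarian steps; the matrix values are invented.

```python
from itertools import permutations

import torch


def brute_force_lsa(cost_mat: torch.Tensor):
    """Exhaustively find the cheapest row-to-column assignment (illustration only)."""
    n_rows, n_cols = cost_mat.shape
    best_cols, best_cost = None, float("inf")
    for cols in permutations(range(n_cols), n_rows):
        cost = sum(cost_mat[r, c].item() for r, c in enumerate(cols))
        if cost < best_cost:
            best_cost, best_cols = cost, cols
    return list(range(n_rows)), list(best_cols), best_cost


cost = torch.tensor([[4.0, 1.0, 3.0],
                     [2.0, 0.0, 5.0],
                     [3.0, 2.0, 2.0]])
rows, cols, total = brute_force_lsa(cost)
print(rows, cols, total)  # [0, 1, 2] [1, 0, 2] 5.0  (rows 0, 1, 2 matched to columns 1, 0, 2)
```

The Hungarian procedure described in the steps below reaches an optimal assignment like this one in polynomial time, which is why it is used for permutation-invariant loss and label matching instead of enumeration.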
An LSA problem is described - by a matrix `cost_mat`, where each cost_mat[i,j] is the cost of matching vertex i of the first partite - set (e.g. a "worker") and vertex j of the second set (e.g. a "job"). - - Thus, the goal of LSA-solver is to find a complete assignment of column element to row element with - the minimal cost. Note that the solution may not be unique and there could be multiple solutions that - yield the same minimal cost. - - LSA problem solver is needed for the following tasks in NeMo: - - Permutation Invariant Loss (PIL) for diarization model training - - Label permutation matching for online speaker diarzation - - Concatenated minimum-permutation Word Error Rate (cp-WER) calculation - - This implementation is based on the LAP solver from scipy: - https://github.com/scipy/scipy/blob/v0.18.1/scipy/optimize/_hungarian.py - The scipy implementation comes with the following license: - - Copyright (c) 2008 Brian M. Clapper , Gael Varoquaux - Author: Brian M. Clapper, Gael Varoquaux - License: 3-clause BSD - - References - 1. http://csclab.murraystate.edu/bob.pilgrim/445/munkres.html - 2. https://en.wikipedia.org/wiki/Hungarian_algorithm - 3. https://github.com/scipy/scipy/blob/v0.18.1/scipy/optimize/_hungarian.py - - - Attributes: - cost_mat (Tensor): 2D matrix containing cost matrix. Number of columns must be larger than number of rows. - row_uncovered (Tensor): 1D matrix containing boolean values indicating whether a row is covered. - col_uncovered (Tensor): 1D matrix containing boolean values indicating whether a column is covered. - zero_row (Tensor): 1D matrix containing the row index of the last zero found. - zero_col (Tensor): 1D matrix containing the column index of the last zero found. - path (Tensor): 2D matrix containing the path taken through the matrix. - marked (Tensor): 2D matrix containing the marked zeros. - """ - - def __init__(self, cost_matrix: torch.Tensor): - # The main cost matrix - self.cost_mat = cost_matrix - row_len, col_len = self.cost_mat.shape - - # Initialize the solver state - self.zero_row = torch.tensor(0, dtype=torch.long).to(cost_matrix.device) - self.zero_col = torch.tensor(0, dtype=torch.long).to(cost_matrix.device) - - # Initialize the covered matrices - self.row_uncovered = torch.ones(row_len, dtype=torch.bool).to(cost_matrix.device) - self.col_uncovered = torch.ones(col_len, dtype=torch.bool).to(cost_matrix.device) - - # Initialize the path matrix and the mark matrix - self.path = torch.zeros((row_len + col_len, 2), dtype=torch.long).to(cost_matrix.device) - self.marked = torch.zeros((row_len, col_len), dtype=torch.long).to(cost_matrix.device) - - def _reset_uncovered_mat(self): - """ - Clear all covered matrix cells and assign `True` to all uncovered elements. - """ - self.row_uncovered[:] = True - self.col_uncovered[:] = True - - def _step1(self): - """ - Step 1 - - Goal: Subtract the smallest element of each row from its elements. - - All elements of the matrix are now non-negative. - - Therefore, an assignment of total cost 0 is the minimum cost assignment. - - This operation leads to at least one zero in each row. - - Procedure: - - For each row of the matrix, find the smallest element and subtract it from every element in its row. - - Go to Step 2. - """ - self.cost_mat -= torch.min(self.cost_mat, dim=1)[0].unsqueeze(1) - return 2 - - def _step2(self): - """ - Step 2 - - Goal: Make sure assignment with cost sum 0 is feasible. - - Procedure: - - Find a zero in the resulting cost matrix. 
- - If there are no marked zeros in its row or column, mark the zero. - - Repeat for each element in the matrix. - - Go to step 3. - """ - ind_out = torch.where(self.cost_mat == 0) - ind, val = list(ind_out[0]), list(ind_out[1]) - for i, j in zip(ind, val): - if self.col_uncovered[j] and self.row_uncovered[i]: - self.marked[i, j] = 1 - self.col_uncovered[j] = False - self.row_uncovered[i] = False - - self._reset_uncovered_mat() - return 3 - - def _step3(self) -> int: - """ - Step 3 - - Goal: All zeros in the matrix must be covered by marking with the least numbers of rows and columns. - - Procedure: - - Cover each column containing a marked zero. - - If n columns are covered, the marked zeros describe a complete set of unique assignments. - In this case, Go to Step 0 (Done state) - - Otherwise, Go to Step 4. - """ - marked = self.marked == 1 - self.col_uncovered[torch.any(marked, dim=0)] = False - if marked.sum() < self.cost_mat.shape[0]: - return 4 # Go to step 4 - else: - return 0 # Go to step 0 (Done state) - - def _step4(self, bypass: bool = False) -> int: - """ - Step 4 - - Goal: Cover all columns containing a marked zero. - - Procedure: - - Find a non-covered zero and put a prime mark on it. - - If there is no marked zero in the row containing this primed zero, Go to Step 5. - - Otherwise, cover this row and uncover the column containing the marked zero. - - Continue in this manner until there are no uncovered zeros left. - - Save the smallest uncovered value. - - Go to Step 6. - """ - # We convert to int as numpy operations are faster on int - cost_mat = (self.cost_mat == 0).int() - covered_cost_mat = cost_mat * self.row_uncovered.unsqueeze(1) - covered_cost_mat *= self.col_uncovered.long() - row_len, col_len = self.cost_mat.shape - if not bypass: - while True: - urv = unravel_index(torch.argmax(covered_cost_mat).item(), torch.tensor([col_len, row_len])) - row, col = int(urv[0].item()), int(urv[1].item()) - if covered_cost_mat[row, col] == 0: - return 6 - else: - self.marked[row, col] = 2 # Find the first marked element in the row - mark_col = torch.argmax((self.marked[row] == 1).int()) - if self.marked[row, mark_col] != 1: # No marked element in the row - self.zero_row = torch.tensor(row) - self.zero_col = torch.tensor(col) - return 5 - else: - col = mark_col - self.row_uncovered[row] = False - self.col_uncovered[col] = True - covered_cost_mat[:, col] = cost_mat[:, col] * self.row_uncovered - covered_cost_mat[row] = 0 - return 0 - - def _step5(self) -> int: - """ - Step 5 - - Goal: Construct a series of alternating primed and marked zeros as follows. - - Procedure: - - Let Z0 represent the uncovered primed zero found in Step 4. - - Let Z1 denote the marked zero in the column of Z0 (if any). - - Let Z2 denote the primed zero in the row of Z1 (there will always be one). - - Continue until the series terminates at a primed zero that has no marked zero in its column. - - Unmark each marked zero of the series. - - Mark each primed zero of the series. - - Erase all primes and uncover every line in the matrix. 
- - Return to Step 3 - """ - count = torch.tensor(0) - path = self.path - path[count, 0] = self.zero_row.long() - path[count, 1] = self.zero_col.long() - - while True: # Unmark each marked zero of the series - # Find the first marked element in the col defined by the path (= `val`) - row = torch.argmax((self.marked[:, path[count, 1]] == 1).int()) - - if self.marked[row, path[count, 1]] != 1: - # Could not find one - break - else: - count += 1 - path[count, 0] = row - path[count, 1] = path[count - 1, 1] - - # Find the first prime element in the row defined by the first path step - col = int(torch.argmax((self.marked[path[count, 0]] == 2).int())) - if self.marked[row, col] != 2: - col = -1 - count += 1 - path[count, 0] = path[count - 1, 0] - path[count, 1] = col - - # Convert paths - for i in range(int(count.item()) + 1): - if self.marked[path[i, 0], path[i, 1]] == 1: - self.marked[path[i, 0], path[i, 1]] = 0 - else: - self.marked[path[i, 0], path[i, 1]] = 1 - - self._reset_uncovered_mat() - - # Remove all prime markings in marked matrix - self.marked[self.marked == 2] = 0 - return 3 - - def _step6(self) -> int: - """ - Step 6 - - Goal: Prepare for another iteration by modifying the cost matrix. - - Procedure: - - Add the value found in Step 4 to every element of each covered row. - - Subtract it from every element of each uncovered column. - - Return to Step 4 without altering any marks, primes, or covered lines. - """ - if torch.any(self.row_uncovered) and torch.any(self.col_uncovered): - row_minval = torch.min(self.cost_mat[self.row_uncovered], dim=0)[0] - minval = torch.min(row_minval[self.col_uncovered]) - self.cost_mat[~self.row_uncovered] += minval - self.cost_mat[:, self.col_uncovered] -= minval - return 4 - - -@torch.jit.script -def linear_sum_assignment(cost_matrix: torch.Tensor, max_size: int = 100): - """ - Launch the linear sum assignment algorithm on a cost matrix. - - Args: - cost_matrix (Tensor): The cost matrix of shape (N, M) where M should be larger than N. - - Returns: - row_index (Tensor): The row indices of the optimal assignments. - col_index (Tensor): The column indices of the optimal assignments. - """ - cost_matrix = cost_matrix.clone().detach() - - if len(cost_matrix.shape) != 2: - raise ValueError(f"2-d tensor is expected but got a {cost_matrix.shape} tensor") - if max(cost_matrix.shape) > max_size: - raise ValueError( - f"Cost matrix size {cost_matrix.shape} is too large. The maximum supported size is {max_size}x{max_size}." - ) - - # The algorithm expects more columns than rows in the cost matrix. - if cost_matrix.shape[1] < cost_matrix.shape[0]: - cost_matrix = cost_matrix.T - transposed = True - else: - transposed = False - - lap_solver = LinearSumAssignmentSolver(cost_matrix) - f_int: int = 0 if 0 in cost_matrix.shape else 1 - - # while step is not Done (step 0): - # NOTE: torch.jit.scipt does not support getattr with string argument. 
- # Do not use getattr(lap_solver, f"_step{f_int}")() - while f_int != 0: - if f_int == 1: - f_int = lap_solver._step1() - elif f_int == 2: - f_int = lap_solver._step2() - elif f_int == 3: - f_int = lap_solver._step3() - elif f_int == 4: - f_int = lap_solver._step4() - elif f_int == 5: - f_int = lap_solver._step5() - elif f_int == 6: - f_int = lap_solver._step6() - - if transposed: - marked = lap_solver.marked.T - else: - marked = lap_solver.marked - row_index, col_index = torch.where(marked == 1) - return row_index, col_index diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/regularization_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/regularization_utils.py deleted file mode 100644 index 871b4889ad68c6338514add9313fd8778910d55d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/regularization_utils.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - - -def compute_stochastic_depth_drop_probs( - num_layers: int, - stochastic_depth_drop_prob: float = 0.0, - stochastic_depth_mode: str = "linear", - stochastic_depth_start_layer: int = 1, -) -> List[float]: - """Computes drop probabilities for stochastic depth regularization technique. - The first layer is never dropped and the starting layer needs to be greater - or equal to 1. - - Args: - num_layers (int): number of layers in the network. - stochastic_depth_drop_prob (float): if non-zero, will randomly drop - layers during training. The higher this value, the more often layers - are dropped. Defaults to 0.0. - stochastic_depth_mode (str): can be either "linear" or "uniform". If - set to "uniform", all layers have the same probability of drop. If - set to "linear", the drop probability grows linearly from 0 for the - first layer to the desired value for the final layer. Defaults to - "linear". - stochastic_depth_start_layer (int): starting layer for stochastic depth. - All layers before this will never be dropped. Note that drop - probability will be adjusted accordingly if mode is "linear" when - start layer is > 1. Defaults to 1. - Returns: - List[float]: list of drop probabilities for all layers - """ - if not (0 <= stochastic_depth_drop_prob < 1.0): - raise ValueError("stochastic_depth_drop_prob has to be in [0, 1).") - if not (1 <= stochastic_depth_start_layer <= num_layers): - raise ValueError("stochastic_depth_start_layer has to be in [1, num layers].") - - # Layers before `stochastic_depth_start_layer` are never dropped - layer_drop_probs = [0.0] * stochastic_depth_start_layer - - # Layers starting with `stochastic_depth_start_layer` may be dropped - if (L := num_layers - stochastic_depth_start_layer) > 0: - if stochastic_depth_mode == "linear": - # we start with 1/L * drop_prob and and end with the desired drop probability. 
- layer_drop_probs += [l / L * stochastic_depth_drop_prob for l in range(1, L + 1)] - elif stochastic_depth_mode == "uniform": - layer_drop_probs += [stochastic_depth_drop_prob] * L - else: - raise ValueError( - f'stochastic_depth_mode has to be one of ["linear", "uniform"]. Current value: {stochastic_depth_mode}' - ) - return layer_drop_probs diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/rnnt_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/rnnt_utils.py deleted file mode 100644 index 18fc8305cac1948fab5dc43ee288fc776487a555..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/rnnt_utils.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2017 Johns Hopkins University (Shinji Watanabe) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch - - -@dataclass -class Hypothesis: - """Hypothesis class for beam search algorithms. - - score: A float score obtained from an AbstractRNNTDecoder module's score_hypothesis method. - - y_sequence: Either a sequence of integer ids pointing to some vocabulary, or a packed torch.Tensor - behaving in the same manner. dtype must be torch.Long in the latter case. - - dec_state: A list (or list of list) of LSTM-RNN decoder states. Can be None. - - text: (Optional) A decoded string after processing via CTC / RNN-T decoding (removing the CTC/RNNT - `blank` tokens, and optionally merging word-pieces). Should be used as decoded string for - Word Error Rate calculation. - - timestep: (Optional) A list of integer indices representing at which index in the decoding - process did the token appear. Should be of same length as the number of non-blank tokens. - - alignments: (Optional) Represents the CTC / RNNT token alignments as integer tokens along an axis of - time T (for CTC) or Time x Target (TxU). - For CTC, represented as a single list of integer indices. - For RNNT, represented as a dangling list of list of integer indices. - Outer list represents Time dimension (T), inner list represents Target dimension (U). - The set of valid indices **includes** the CTC / RNNT blank token in order to represent alignments. 
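Editorial aside, before continuing with the `Hypothesis` fields: a short worked example for `compute_stochastic_depth_drop_probs` defined just above. With five layers, a target drop probability of 0.2 and the default start layer, the linear schedule ramps from 0 to 0.2 while the uniform schedule applies 0.2 to every layer after the first. The snippet restates the documented formula locally instead of importing the deleted module.

```python
def drop_probs_sketch(num_layers, drop_prob, mode="linear", start_layer=1):
    """Local restatement of the documented schedule (illustration, not the NeMo helper)."""
    probs = [0.0] * start_layer            # layers before the start layer are never dropped
    L = num_layers - start_layer
    if L > 0:
        if mode == "linear":
            probs += [l / L * drop_prob for l in range(1, L + 1)]
        else:  # "uniform"
            probs += [drop_prob] * L
    return probs


print(drop_probs_sketch(5, 0.2, "linear"))   # [0.0, 0.05, 0.1, 0.15, 0.2] up to float rounding
print(drop_probs_sketch(5, 0.2, "uniform"))  # [0.0, 0.2, 0.2, 0.2, 0.2]
```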
- - frame_confidence: (Optional) Represents the CTC / RNNT per-frame confidence scores as token probabilities - along an axis of time T (for CTC) or Time x Target (TxU). - For CTC, represented as a single list of float indices. - For RNNT, represented as a dangling list of list of float indices. - Outer list represents Time dimension (T), inner list represents Target dimension (U). - - token_confidence: (Optional) Represents the CTC / RNNT per-token confidence scores as token probabilities - along an axis of Target U. - Represented as a single list of float indices. - - word_confidence: (Optional) Represents the CTC / RNNT per-word confidence scores as token probabilities - along an axis of Target U. - Represented as a single list of float indices. - - length: Represents the length of the sequence (the original length without padding), otherwise - defaults to 0. - - y: (Unused) A list of torch.Tensors representing the list of hypotheses. - - lm_state: (Unused) A dictionary state cache used by an external Language Model. - - lm_scores: (Unused) Score of the external Language Model. - - ngram_lm_state: (Optional) State of the external n-gram Language Model. - - tokens: (Optional) A list of decoded tokens (can be characters or word-pieces. - - last_token (Optional): A token or batch of tokens which was predicted in the last step. - """ - - score: float - y_sequence: Union[List[int], torch.Tensor] - text: Optional[str] = None - dec_out: Optional[List[torch.Tensor]] = None - dec_state: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor]]] = None - timestep: Union[List[int], torch.Tensor] = field(default_factory=list) - alignments: Optional[Union[List[int], List[List[int]]]] = None - frame_confidence: Optional[Union[List[float], List[List[float]]]] = None - token_confidence: Optional[List[float]] = None - word_confidence: Optional[List[float]] = None - length: Union[int, torch.Tensor] = 0 - y: List[torch.tensor] = None - lm_state: Optional[Union[Dict[str, Any], List[Any]]] = None - lm_scores: Optional[torch.Tensor] = None - ngram_lm_state: Optional[Union[Dict[str, Any], List[Any]]] = None - tokens: Optional[Union[List[int], torch.Tensor]] = None - last_token: Optional[torch.Tensor] = None - - @property - def non_blank_frame_confidence(self) -> List[float]: - """Get per-frame confidence for non-blank tokens according to self.timestep - - Returns: - List with confidence scores. The length of the list is the same as `timestep`. - """ - non_blank_frame_confidence = [] - # self.timestep can be a dict for RNNT - timestep = self.timestep['timestep'] if isinstance(self.timestep, dict) else self.timestep - if len(self.timestep) != 0 and self.frame_confidence is not None: - if any(isinstance(i, list) for i in self.frame_confidence): # rnnt - t_prev = -1 - offset = 0 - for t in timestep: - if t != t_prev: - t_prev = t - offset = 0 - else: - offset += 1 - non_blank_frame_confidence.append(self.frame_confidence[t][offset]) - else: # ctc - non_blank_frame_confidence = [self.frame_confidence[t] for t in timestep] - return non_blank_frame_confidence - - @property - def words(self) -> List[str]: - """Get words from self.text - - Returns: - List with words (str). 
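Editorial aside: the `timestep`/`offset` bookkeeping in `non_blank_frame_confidence` above is easier to follow with a tiny RNNT-style example. The values below are invented; two tokens are emitted at frame 0 and one at frame 1, and the loop picks the matching per-frame confidences.

```python
# Invented walk-through of the RNNT branch in `non_blank_frame_confidence`.
timestep = [0, 0, 1]                # frame index of each emitted non-blank token
frame_confidence = [[0.9, 0.8],     # frame 0: confidences for the tokens emitted there
                    [0.7, 0.6]]     # frame 1

selected, t_prev, offset = [], -1, 0
for t in timestep:
    offset = 0 if t != t_prev else offset + 1  # restart the within-frame offset on a new frame
    t_prev = t
    selected.append(frame_confidence[t][offset])

print(selected)  # [0.9, 0.8, 0.7]
```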
- """ - return [] if self.text is None else self.text.split() - - -@dataclass -class NBestHypotheses: - """List of N best hypotheses""" - - n_best_hypotheses: Optional[List[Hypothesis]] - - -@dataclass -class HATJointOutput: - """HATJoint outputs for beam search decoding - - hat_logprobs: standard HATJoint outputs as for RNNTJoint - - ilm_logprobs: internal language model probabilities (for ILM subtraction) - """ - - hat_logprobs: Optional[torch.Tensor] = None - ilm_logprobs: Optional[torch.Tensor] = None - - -def is_prefix(x: List[int], pref: List[int]) -> bool: - """ - Obtained from https://github.com/espnet/espnet. - - Check if pref is a prefix of x. - - Args: - x: Label ID sequence. - pref: Prefix label ID sequence. - - Returns: - : Whether pref is a prefix of x. - """ - if len(pref) >= len(x): - return False - - for i in range(len(pref)): - if pref[i] != x[i]: - return False - - return True - - -def select_k_expansions( - hyps: List[Hypothesis], topk_idxs: torch.Tensor, topk_logps: torch.Tensor, gamma: float, beta: int, -) -> List[Tuple[int, Hypothesis]]: - """ - Obtained from https://github.com/espnet/espnet - - Return K hypotheses candidates for expansion from a list of hypothesis. - K candidates are selected according to the extended hypotheses probabilities - and a prune-by-value method. Where K is equal to beam_size + beta. - - Args: - hyps: Hypotheses. - topk_idxs: Indices of candidates hypothesis. Shape = [B, num_candidates] - topk_logps: Log-probabilities for hypotheses expansions. Shape = [B, V + 1] - gamma: Allowed logp difference for prune-by-value method. - beta: Number of additional candidates to store. - - Return: - k_expansions: Best K expansion hypotheses candidates. - """ - k_expansions = [] - - for i, hyp in enumerate(hyps): - hyp_i = [(int(k), hyp.score + float(v)) for k, v in zip(topk_idxs[i], topk_logps[i])] - k_best_exp_val = max(hyp_i, key=lambda x: x[1]) - - k_best_exp_idx = k_best_exp_val[0] - k_best_exp = k_best_exp_val[1] - - expansions = sorted(filter(lambda x: (k_best_exp - gamma) <= x[1], hyp_i), key=lambda x: x[1],) - - if len(expansions) > 0: - k_expansions.append(expansions) - else: - k_expansions.append([(k_best_exp_idx, k_best_exp)]) - - return k_expansions diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/slu_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/slu_utils.py deleted file mode 100644 index 47b2881143297cfa48510b0823c8dcaad8bfaddd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/slu_utils.py +++ /dev/null @@ -1,205 +0,0 @@ -# ! /usr/bin/python -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
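Editorial aside: with `rnnt_utils.py` closed out above, one more illustration before the SLU utilities. The prune-by-value rule in `select_k_expansions` keeps only the expansions whose score lies within `gamma` of the best one; the token ids and log-probabilities below are invented.

```python
# Prune-by-value sketch mirroring the filter in `select_k_expansions` (made-up numbers).
hyp_score = -1.0                     # score of the hypothesis being expanded
gamma = 2.0                          # allowed log-prob gap to the best expansion
candidates = [(3, -0.5), (7, -1.2), (11, -4.0), (2, -2.3)]   # (token_id, logp) pairs

scored = [(tok, hyp_score + logp) for tok, logp in candidates]
best = max(score for _, score in scored)
kept = sorted((c for c in scored if c[1] >= best - gamma), key=lambda c: c[1])

print([tok for tok, _ in kept])  # [2, 7, 3]; token 11 falls more than gamma below the best and is pruned
```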
- - -from dataclasses import dataclass -from typing import List, Optional - -import torch -from omegaconf import DictConfig - -from nemo.collections.asr.modules.transformer import ( - BeamSearchSequenceGenerator, - GreedySequenceGenerator, - TopKSequenceGenerator, -) -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.core.classes.module import NeuralModule - - -@dataclass -class SequenceGeneratorConfig: - type: str = "greedy" # choices=[greedy, topk, beam] - max_sequence_length: int = 512 - max_delta_length: int = -1 - temperature: float = 1.0 # for top-k sampling - beam_size: int = 1 # K for top-k sampling, N for beam search - len_pen: float = 0.0 # for beam-search - - -class SequenceGenerator: - """ - Wrapper class for sequence generators for NeMo transformers. - """ - - TYPE_GREEDY = "greedy" - TYPE_TOPK = "topk" - TYPE_BEAM = "beam" - SEARCHER_TYPES = [TYPE_GREEDY, TYPE_TOPK, TYPE_BEAM] - - def __init__( - self, - cfg: DictConfig, - embedding: NeuralModule, - decoder: NeuralModule, - log_softmax: NeuralModule, - tokenizer: TokenizerSpec, - ) -> None: - super().__init__() - - self._type = cfg.get("type", "greedy") - self.tokenizer = tokenizer - self.pad_id = getattr(tokenizer, "pad_id", 0) - self.eos_id = getattr(tokenizer, "eos_id", -1) - self.bos_id = getattr(tokenizer, "bos_id", -1) - common_args = { - "pad": self.pad_id, - "bos": self.bos_id, - "eos": self.eos_id, - "max_sequence_length": cfg.get("max_sequence_length", 512), - "max_delta_length": cfg.get("max_delta_length", -1), - "batch_size": cfg.get("batch_size", 1), - } - if self._type == self.TYPE_GREEDY: - self.generator = GreedySequenceGenerator(embedding, decoder, log_softmax, **common_args) - elif self._type == self.TYPE_TOPK: - beam_size = cfg.get("beam_size", 1) - temperature = cfg.get("temperature", 1.0) - self.generator = TopKSequenceGenerator( - embedding, decoder, log_softmax, beam_size, temperature, **common_args - ) - elif self._type == self.TYPE_BEAM: - beam_size = cfg.get("beam_size", 1) - len_pen = cfg.get("len_pen", 0.0) - self.generator = BeamSearchSequenceGenerator( - embedding, decoder, log_softmax, beam_size, len_pen, **common_args - ) - else: - raise ValueError( - f"Sequence Generator only supports one of {self.SEARCH_TYPES}, but got {self._type} instead." - ) - - def __call__( - self, - encoder_states: torch.Tensor, - encoder_input_mask: torch.Tensor = None, - return_beam_scores: bool = False, - pad_max_len: Optional[int] = None, - return_length: bool = False, - ): - """ - Generate sequence tokens given the input encoder states and masks. - Params: - - encoder_states: a torch Tensor of shape BxTxD - - encoder_input_mask: a binary tensor of shape BxTxD - - return_beam_scores: whether to return beam scores - - pad_max_len: optional int, set it to pad all sequence to the same length - - return_length: whether to return the lengths for generated sequences (shape B) - Returns: - - generated tokens tensor of shape BxT - """ - predictions = self.generator( - encoder_hidden_states=encoder_states, - encoder_input_mask=encoder_input_mask, - return_beam_scores=return_beam_scores, - ) - - if pad_max_len: - predictions = pad_sequence(predictions, pad_max_len, self.pad_id) - - if return_length: - return predictions, self.get_seq_length(predictions) - - return predictions - - def get_seq_length(self, seq: torch.Tensor) -> torch.Tensor: - """ - Get sequence length. 
- Params: - - seq: batched sequence tensor of shape BxTxD - Returns: - - tensor of shape B, where each element is the length of the sequence - """ - lengths = seq.size(1) * torch.ones(seq.size(0), device=seq.device).long() - pos = (seq == self.eos_id).long().nonzero() - seq_lengths = torch.scatter(lengths, dim=0, index=pos[:, 0], src=pos[:, 1]) - return seq_lengths - - def decode_semantics_from_tokens(self, seq_tokens: torch.Tensor) -> List[str]: - """ - Decode tokens into strings - Rarams: - - seq_tokens: integer tensor of shape BxT - Returns: - - list of strings - """ - semantics_list = [] - # Drop sequence tokens to CPU - seq_tokens = seq_tokens.detach().long().cpu() - seq_lengths = self.get_seq_length(seq_tokens) - # iterate over batch - for ind in range(seq_tokens.shape[0]): - tokens = seq_tokens[ind].numpy().tolist() - length = seq_lengths[ind].long().cpu().item() - tokens = tokens[:length] - text = "".join(self.tokenizer.tokenizer.decode_ids(tokens)) - semantics_list.append(text) - return semantics_list - - -def get_seq_length(seq: torch.Tensor, eos_id: int) -> torch.Tensor: - """ - Get sequence length. - Params: - - seq: batched sequence tensor of shape BxTxD - - eos_id: integer representing the end of sentence - Returns: - - tensor of shape B, where each element is the length of the sequence - """ - lengths = seq.size(1) * torch.ones(seq.size(0), device=seq.device).long() - pos = (seq == eos_id).long().nonzero() - seq_lengths = torch.scatter(lengths, dim=0, index=pos[:, 0], src=pos[:, 1]) - return seq_lengths - - -def pad_sequence(seq: torch.Tensor, max_len: int, pad_token: int = 0) -> torch.Tensor: - """ - Params: - - seq: integer token sequences of shape BxT - - max_len: integer for max sequence length - - pad_token: integer token for padding - Returns: - - padded sequence of shape B x max_len - """ - batch = seq.size(0) - curr_len = seq.size(1) - if curr_len >= max_len: - return seq - - padding = torch.zeros(batch, max_len - curr_len, dtype=seq.dtype, device=seq.device).fill_(pad_token) - return torch.cat([seq, padding], dim=1) - - -def get_seq_mask(seq: torch.Tensor, seq_lens: torch.Tensor) -> torch.Tensor: - """ - Get the sequence mask based on the actual length of each sequence - Params: - - seq: tensor of shape [BxLxD] - - seq_len: tensor of shape [B] - Returns: - - binary mask of shape [BxL] - """ - mask = torch.arange(seq.size(1))[None, :].to(seq.device) < seq_lens[:, None] - return mask.to(seq.device, dtype=bool) diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/speaker_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/speaker_utils.py deleted file mode 100644 index 413b24f97e81985a9996db65580a4856f0a82e15..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/speaker_utils.py +++ /dev/null @@ -1,1720 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
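Editorial aside: the padding and masking helpers that close `slu_utils.py` above are easy to sanity-check with toy tensors. The snippet below re-creates the same idea locally rather than importing the deleted module, and it assumes 0 as the padding token.

```python
import torch

# Pad a batch of token sequences to a fixed length, then build the boolean mask that
# marks real positions, the same idea as `pad_sequence` and `get_seq_mask` above.
seq = torch.tensor([[5, 6, 7],
                    [8, 9, 0]])      # B=2, T=3
seq_lens = torch.tensor([3, 2])      # true lengths of the two sequences

max_len, pad_token = 5, 0
padding = torch.full((seq.size(0), max_len - seq.size(1)), pad_token, dtype=seq.dtype)
padded = torch.cat([seq, padding], dim=1)

mask = torch.arange(padded.size(1))[None, :] < seq_lens[:, None]

print(padded)      # [[5, 6, 7, 0, 0], [8, 9, 0, 0, 0]]
print(mask.int())  # [[1, 1, 1, 0, 0], [1, 1, 0, 0, 0]]
```

The original `get_seq_mask` additionally moves the mask to the input tensor's device; on CPU the result is identical to the sketch above.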
- -import gc -import json -import math -import os -import shutil -from copy import deepcopy -from typing import Dict, List, Tuple, Union - -import numpy as np -import omegaconf -import soundfile as sf -import torch -from pyannote.core import Annotation, Segment -from tqdm import tqdm - -from nemo.collections.asr.data.audio_to_label import repeat_signal -from nemo.collections.asr.parts.utils.offline_clustering import SpeakerClustering, get_argmin_mat, split_input_data -from nemo.utils import logging - - -""" -This file contains all the utility functions required for speaker embeddings part in diarization scripts -""" - - -def get_uniqname_from_filepath(filepath): - """ - Return base name from provided filepath - """ - if type(filepath) is str: - uniq_id = os.path.splitext(os.path.basename(filepath))[0] - return uniq_id - else: - raise TypeError("input must be filepath string") - - -def get_uniq_id_from_manifest_line(line: str) -> str: - """ - Retrieve `uniq_id` from the `audio_filepath` in a manifest line. - """ - dic = json.loads(line.strip()) - uniq_id = get_uniqname_from_filepath(dic['audio_filepath']) - return uniq_id - - -def get_uniq_id_with_dur(meta, decimals=3): - """ - Return basename with offset and end time labels - """ - # bare_uniq_id = get_uniqname_from_filepath(meta['audio_filepath']) - bare_uniq_id = get_uniqname_from_filepath(meta['rttm_filepath']) - if meta['offset'] is None and meta['duration'] is None: - return bare_uniq_id - if meta['offset']: - offset = str(int(round(meta['offset'], decimals) * pow(10, decimals))) - else: - offset = 0 - if meta['duration']: - endtime = str(int(round(meta['offset'] + meta['duration'], decimals) * pow(10, decimals))) - else: - endtime = 'NULL' - uniq_id = f"{bare_uniq_id}_{offset}_{endtime}" - return uniq_id - - -def audio_rttm_map(manifest, attach_dur=False): - """ - This function creates AUDIO_RTTM_MAP which is used by all diarization components to extract embeddings, - cluster and unify time stamps - Args: manifest file that contains keys audio_filepath, rttm_filepath if exists, text, num_speakers if known and uem_filepath if exists - - returns: - AUDIO_RTTM_MAP (dict) : A dictionary with keys of uniq id, which is being used to map audio files and corresponding rttm files - """ - - AUDIO_RTTM_MAP = {} - with open(manifest, 'r') as inp_file: - lines = inp_file.readlines() - logging.info("Number of files to diarize: {}".format(len(lines))) - for line in lines: - line = line.strip() - dic = json.loads(line) - - meta = { - 'audio_filepath': dic['audio_filepath'], - 'rttm_filepath': dic.get('rttm_filepath', None), - 'offset': dic.get('offset', None), - 'duration': dic.get('duration', None), - 'text': dic.get('text', None), - 'num_speakers': dic.get('num_speakers', None), - 'uem_filepath': dic.get('uem_filepath', None), - 'ctm_filepath': dic.get('ctm_filepath', None), - } - if attach_dur: - uniqname = get_uniq_id_with_dur(meta) - else: - uniqname = get_uniqname_from_filepath(filepath=meta['audio_filepath']) - - if uniqname not in AUDIO_RTTM_MAP: - AUDIO_RTTM_MAP[uniqname] = meta - else: - raise KeyError( - "file {} is already part of AUDIO_RTTM_MAP, it might be duplicated, Note: file basename must be unique".format( - meta['audio_filepath'] - ) - ) - - return AUDIO_RTTM_MAP - - -def parse_scale_configs(window_lengths_in_sec, shift_lengths_in_sec, multiscale_weights): - """ - Check whether multiscale parameters are provided correctly. 
window_lengths_in_sec, shift_lengfhs_in_sec and - multiscale_weights should be all provided in omegaconf.listconfig.ListConfig type. In addition, the scales - should be provided in descending order, from the longest scale to the base scale (the shortest). - - Example: - Single-scale setting: - parameters.window_length_in_sec=1.5 - parameters.shift_length_in_sec=0.75 - parameters.multiscale_weights=null - - Multiscale setting (base scale - window_length 0.5 s and shift_length 0.25): - parameters.window_length_in_sec=[1.5,1.0,0.5] - parameters.shift_length_in_sec=[0.75,0.5,0.25] - parameters.multiscale_weights=[1,1,1] - - In addition, you can also specify session-by-session multiscale weight. In this case, each dictionary key - points to different weights. - """ - check_float_config = [isinstance(var, float) for var in (window_lengths_in_sec, shift_lengths_in_sec)] - check_list_config = [ - isinstance(var, (omegaconf.listconfig.ListConfig, list, tuple)) - for var in (window_lengths_in_sec, shift_lengths_in_sec, multiscale_weights) - ] - if all(check_list_config) or all(check_float_config): - - # If bare floating numbers are provided, convert them to list format. - if all(check_float_config): - window_lengths, shift_lengths, multiscale_weights = ( - [window_lengths_in_sec], - [shift_lengths_in_sec], - [1.0], - ) - else: - window_lengths, shift_lengths, multiscale_weights = ( - window_lengths_in_sec, - shift_lengths_in_sec, - multiscale_weights, - ) - - length_check = ( - len(set([len(window_lengths), len(shift_lengths), len(multiscale_weights)])) == 1 - and len(multiscale_weights) > 0 - ) - scale_order_check = ( - list(window_lengths) == sorted(window_lengths)[::-1] and list(shift_lengths) == sorted(shift_lengths)[::-1] - ) - - # Check whether window lengths are longer than shift lengths - if len(window_lengths) > 1: - shift_length_check = all([w > s for w, s in zip(window_lengths, shift_lengths)]) - else: - shift_length_check = window_lengths[0] > shift_lengths[0] - - multiscale_args_dict = {'use_single_scale_clustering': False} - if all([length_check, scale_order_check, shift_length_check]): - if len(window_lengths) > 1: - multiscale_args_dict['scale_dict'] = { - k: (w, s) for k, (w, s) in enumerate(zip(window_lengths, shift_lengths)) - } - else: - multiscale_args_dict['scale_dict'] = {0: (window_lengths[0], shift_lengths[0])} - multiscale_args_dict['multiscale_weights'] = multiscale_weights - return multiscale_args_dict - else: - raise ValueError('Multiscale parameters are not properly setup.') - - elif any(check_list_config): - raise ValueError( - 'You must provide a list config for all three parameters: window, shift and multiscale weights.' - ) - else: - return None - - -def get_embs_and_timestamps(multiscale_embeddings_and_timestamps, multiscale_args_dict): - """ - The embeddings and timestamps in multiscale_embeddings_and_timestamps dictionary are - indexed by scale index. This function rearranges the extracted speaker embedding and - timestamps by unique ID to make the further processing more convenient. - - Args: - multiscale_embeddings_and_timestamps (dict): - Dictionary of embeddings and timestamps for each scale. - multiscale_args_dict (dict): - Dictionary of scale information: window, shift and multiscale weights. - - Returns: - embs_and_timestamps (dict) - A dictionary containing embeddings and timestamps of each scale, indexed by unique ID. 
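Editorial aside: for reference, this is the dictionary shape that `parse_scale_configs` above returns for a valid three-scale setup taken from its docstring. The snippet rebuilds the result by hand rather than calling the deleted function, so treat it as an illustration of the layout, not as output captured from NeMo.

```python
# Three scales, longest to shortest, with equal weights: a configuration that passes
# the length, ordering and window-vs-shift checks in `parse_scale_configs`.
window_lengths_in_sec = [1.5, 1.0, 0.5]
shift_lengths_in_sec = [0.75, 0.5, 0.25]
multiscale_weights = [1, 1, 1]

multiscale_args_dict = {
    "use_single_scale_clustering": False,
    "scale_dict": {
        k: (w, s) for k, (w, s) in enumerate(zip(window_lengths_in_sec, shift_lengths_in_sec))
    },
    "multiscale_weights": multiscale_weights,
}
print(multiscale_args_dict["scale_dict"])
# {0: (1.5, 0.75), 1: (1.0, 0.5), 2: (0.5, 0.25)}
```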
- """ - embs_and_timestamps = {uniq_id: {} for uniq_id in multiscale_embeddings_and_timestamps[0][0].keys()} - if multiscale_args_dict['use_single_scale_clustering']: - _multiscale_args_dict = deepcopy(multiscale_args_dict) - _multiscale_args_dict['scale_dict'] = {0: multiscale_args_dict['scale_dict'][0]} - _multiscale_args_dict['multiscale_weights'] = multiscale_args_dict['multiscale_weights'][:1] - else: - _multiscale_args_dict = multiscale_args_dict - - embeddings, timestamps = multiscale_embeddings_and_timestamps[0] - for uniq_id in embeddings.keys(): - embeddings_list, time_stamps_list, segment_index_list = [], [], [] - for scale_idx in sorted(_multiscale_args_dict['scale_dict'].keys()): - embeddings, timestamps = multiscale_embeddings_and_timestamps[scale_idx] - if len(embeddings[uniq_id]) != len(timestamps[uniq_id]): - raise ValueError("Mismatch of counts between embedding vectors and timestamps") - time_stamps_tensor = torch.tensor(timestamps[uniq_id]) - embeddings_list.append(embeddings[uniq_id]) - segment_index_list.append(embeddings[uniq_id].shape[0]) - time_stamps_list.append(time_stamps_tensor) - - embs_and_timestamps[uniq_id]['multiscale_weights'] = ( - torch.tensor(_multiscale_args_dict['multiscale_weights']).unsqueeze(0).float() - ) - embs_and_timestamps[uniq_id]['embeddings'] = torch.cat(embeddings_list, dim=0) - embs_and_timestamps[uniq_id]['timestamps'] = torch.cat(time_stamps_list, dim=0) - embs_and_timestamps[uniq_id]['multiscale_segment_counts'] = torch.tensor(segment_index_list) - - return embs_and_timestamps - - -def get_timestamps(multiscale_timestamps, multiscale_args_dict): - """ - The timestamps in `multiscale_timestamps` dictionary are indexed by scale index. - This function rearranges the extracted speaker embedding and timestamps by unique ID to make the further processing more convenient. - - Args: - multiscale_timestamps (dict): - Dictionary of timestamps for each scale. - multiscale_args_dict (dict): - Dictionary of scale information: window, shift and multiscale weights. - - Returns: - timestamps_dict (dict) - A dictionary containing embeddings and timestamps of each scale, indexed by unique ID. - """ - timestamps_dict = {uniq_id: {'scale_dict': {}} for uniq_id in multiscale_timestamps[0].keys()} - for scale_idx in sorted(multiscale_args_dict['scale_dict'].keys()): - time_stamps = multiscale_timestamps[scale_idx] - for uniq_id in time_stamps.keys(): - timestamps_dict[uniq_id]['scale_dict'][scale_idx] = { - 'time_stamps': time_stamps[uniq_id], - } - - return timestamps_dict - - -def get_contiguous_stamps(stamps): - """ - Return contiguous time stamps - """ - lines = deepcopy(stamps) - contiguous_stamps = [] - for i in range(len(lines) - 1): - start, end, speaker = lines[i].split() - next_start, next_end, next_speaker = lines[i + 1].split() - if float(end) > float(next_start): - avg = str((float(next_start) + float(end)) / 2.0) - lines[i + 1] = ' '.join([avg, next_end, next_speaker]) - contiguous_stamps.append(start + " " + avg + " " + speaker) - else: - contiguous_stamps.append(start + " " + end + " " + speaker) - start, end, speaker = lines[-1].split() - contiguous_stamps.append(start + " " + end + " " + speaker) - return contiguous_stamps - - -def merge_stamps(lines): - """ - Merge time stamps of the same speaker. 
- """ - stamps = deepcopy(lines) - overlap_stamps = [] - for i in range(len(stamps) - 1): - start, end, speaker = stamps[i].split() - next_start, next_end, next_speaker = stamps[i + 1].split() - if float(end) == float(next_start) and speaker == next_speaker: - stamps[i + 1] = ' '.join([start, next_end, next_speaker]) - else: - overlap_stamps.append(start + " " + end + " " + speaker) - - start, end, speaker = stamps[-1].split() - overlap_stamps.append(start + " " + end + " " + speaker) - - return overlap_stamps - - -def labels_to_pyannote_object(labels, uniq_name=''): - """ - Convert the given labels to pyannote object to calculate DER and for visualization - """ - annotation = Annotation(uri=uniq_name) - for label in labels: - start, end, speaker = label.strip().split() - start, end = float(start), float(end) - annotation[Segment(start, end)] = speaker - - return annotation - - -def labels_to_rttmfile(labels, uniq_id, out_rttm_dir): - """ - Write rttm file with uniq_id name in out_rttm_dir with timestamps in labels - """ - filename = os.path.join(out_rttm_dir, uniq_id + '.rttm') - with open(filename, 'w') as f: - for line in labels: - line = line.strip() - start, end, speaker = line.split() - duration = float(end) - float(start) - start = float(start) - log = 'SPEAKER {} 1 {:.3f} {:.3f} {} \n'.format(uniq_id, start, duration, speaker) - f.write(log) - - return filename - - -def string_to_float(x, round_digits): - """ - Convert string to float then round the number. - """ - return round(float(x), round_digits) - - -def convert_rttm_line(rttm_line, round_digits=3): - """ - Convert a line in RTTM file to speaker label, start and end timestamps. - - Args: - rttm_line (str): - A line in RTTM formatted file containing offset and duration of each segment. - round_digits (int): - Number of digits to be rounded. - - Returns: - start (float) - Start timestamp in floating point number. - end (float): - End timestamp in floating point number. - speaker (str): - speaker string in RTTM lines. - """ - rttm = rttm_line.strip().split() - start = string_to_float(rttm[3], round_digits) - end = string_to_float(rttm[4], round_digits) + string_to_float(rttm[3], round_digits) - speaker = rttm[7] - return start, end, speaker - - -def rttm_to_labels(rttm_filename): - """ - Prepare time stamps label list from rttm file - """ - labels = [] - with open(rttm_filename, 'r') as f: - for line in f.readlines(): - start, end, speaker = convert_rttm_line(line, round_digits=3) - labels.append('{} {} {}'.format(start, end, speaker)) - return labels - - -def write_cluster_labels(base_scale_idx, lines_cluster_labels, out_rttm_dir): - """ - Write cluster labels that are generated from clustering into a file. - Args: - base_scale_idx (int): The base scale index which is the highest scale index. - lines_cluster_labels (list): The start and end time-stamps of each segment with the predicted cluster label. - out_rttm_dir (str): The path where output rttm files are saved. - """ - out_label_name = os.path.join( - out_rttm_dir, '../speaker_outputs', f'subsegments_scale{base_scale_idx}_cluster.label' - ) - with open(out_label_name, 'w') as f: - for clus_label_line in lines_cluster_labels: - f.write(clus_label_line) - - -def generate_cluster_labels(segment_ranges: List[str], cluster_labels: List[int]): - """ - Generate cluster (speaker labels) from the segment_range list and cluster label list. 
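Editorial aside, stepping back to `convert_rttm_line` above for a moment: the snippet below parses an invented RTTM line in the common ten-field layout, reading field 3 as the onset, field 4 as the duration and field 7 as the speaker name, which is exactly what the function indexes.

```python
# Invented RTTM line; fields 3, 4 and 7 are what `convert_rttm_line` reads.
rttm_line = "SPEAKER session_01 1 12.250 3.500 <NA> <NA> speaker_2 <NA> <NA>"

fields = rttm_line.strip().split()
start = round(float(fields[3]), 3)
end = round(float(fields[4]), 3) + round(float(fields[3]), 3)
speaker = fields[7]

print(start, end, speaker)  # 12.25 15.75 speaker_2
```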
- - Args: - segment_ranges (list): - List containing intervals (start and end timestapms, ranges) of each segment - cluster_labels (list): - List containing a cluster label sequence - - Returns: - diar_hyp (list): - List containing merged speaker-turn-level timestamps and labels in string format - Example: - >>> diar_hyp = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...] - - lines (list) - List containing raw segment-level timestamps and labels in raw digits - >>> diar_hyp = ['0.0 0.25 speaker_1', '0.25 0.5 speaker_1', ..., '4.125 4.375 speaker_1'] - """ - lines = [] - for idx, label in enumerate(cluster_labels): - tag = 'speaker_' + str(label) - stt, end = segment_ranges[idx] - lines.append(f"{stt} {end} {tag}") - cont_lines = get_contiguous_stamps(lines) - diar_hyp = merge_stamps(cont_lines) - return diar_hyp, lines - - -def perform_clustering( - embs_and_timestamps, AUDIO_RTTM_MAP, out_rttm_dir, clustering_params, device, verbose: bool = True -): - """ - Performs spectral clustering on embeddings with time stamps generated from VAD output - - Args: - embs_and_timestamps (dict): This dictionary contains the following items indexed by unique IDs. - 'embeddings' : Tensor containing embeddings. Dimensions:(# of embs) x (emb. dimension) - 'timestamps' : Tensor containing ime stamps list for each audio recording - 'multiscale_segment_counts' : Tensor containing the number of segments for each scale - AUDIO_RTTM_MAP (dict): AUDIO_RTTM_MAP for mapping unique id with audio file path and rttm path - out_rttm_dir (str): Path to write predicted rttms - clustering_params (dict): clustering parameters provided through config that contains max_num_speakers (int), - oracle_num_speakers (bool), max_rp_threshold(float), sparse_search_volume(int) and enhance_count_threshold (int) - use_torch_script (bool): Boolean that determines whether to use torch.jit.script for speaker clustering - device (torch.device): Device we are running on ('cpu', 'cuda'). - verbose (bool): Enable TQDM progress bar. - - Returns: - all_reference (list[uniq_name,Annotation]): reference annotations for score calculation - all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation - - """ - all_hypothesis = [] - all_reference = [] - no_references = False - lines_cluster_labels = [] - - cuda = True - if device.type != 'cuda': - logging.warning("cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.") - cuda = False - - speaker_clustering = SpeakerClustering(cuda=cuda) - - # If True, export torch script module and save it to the base folder. 
- if clustering_params.get('export_script_module', False): - speaker_clustering = torch.jit.script(speaker_clustering) - torch.jit.save(speaker_clustering, 'speaker_clustering_script.pt') - - for uniq_id, audio_rttm_values in tqdm(AUDIO_RTTM_MAP.items(), desc='clustering', leave=True, disable=not verbose): - uniq_embs_and_timestamps = embs_and_timestamps[uniq_id] - - if clustering_params.oracle_num_speakers: - num_speakers = audio_rttm_values.get('num_speakers', None) - if num_speakers is None: - raise ValueError("Provided option as oracle num of speakers but num_speakers in manifest is null") - else: - num_speakers = -1 - - base_scale_idx = uniq_embs_and_timestamps['multiscale_segment_counts'].shape[0] - 1 - - cluster_labels = speaker_clustering.forward_infer( - embeddings_in_scales=uniq_embs_and_timestamps['embeddings'], - timestamps_in_scales=uniq_embs_and_timestamps['timestamps'], - multiscale_segment_counts=uniq_embs_and_timestamps['multiscale_segment_counts'], - multiscale_weights=uniq_embs_and_timestamps['multiscale_weights'], - oracle_num_speakers=int(num_speakers), - max_num_speakers=int(clustering_params.max_num_speakers), - max_rp_threshold=float(clustering_params.max_rp_threshold), - sparse_search_volume=int(clustering_params.sparse_search_volume), - ) - - del uniq_embs_and_timestamps - if cuda: - torch.cuda.empty_cache() - else: - gc.collect() - - timestamps = speaker_clustering.timestamps_in_scales[base_scale_idx] - cluster_labels = cluster_labels.cpu().numpy() - if len(cluster_labels) != timestamps.shape[0]: - raise ValueError("Mismatch of length between cluster_labels and timestamps.") - - labels, lines = generate_cluster_labels(timestamps, cluster_labels) - - if out_rttm_dir: - labels_to_rttmfile(labels, uniq_id, out_rttm_dir) - lines_cluster_labels.extend([f'{uniq_id} {seg_line}\n' for seg_line in lines]) - hypothesis = labels_to_pyannote_object(labels, uniq_name=uniq_id) - all_hypothesis.append([uniq_id, hypothesis]) - - rttm_file = audio_rttm_values.get('rttm_filepath', None) - if rttm_file is not None and os.path.exists(rttm_file) and not no_references: - ref_labels = rttm_to_labels(rttm_file) - reference = labels_to_pyannote_object(ref_labels, uniq_name=uniq_id) - all_reference.append([uniq_id, reference]) - else: - no_references = True - all_reference = [] - - if out_rttm_dir: - write_cluster_labels(base_scale_idx, lines_cluster_labels, out_rttm_dir) - - return all_reference, all_hypothesis - - -def get_vad_out_from_rttm_line(rttm_line): - """ - Extract VAD timestamp from the given RTTM lines. - """ - vad_out = rttm_line.strip().split() - if len(vad_out) > 3: - start, dur, _ = float(vad_out[3]), float(vad_out[4]), vad_out[7] - else: - start, dur, _ = float(vad_out[0]), float(vad_out[1]), vad_out[2] - return start, dur - - -def get_offset_and_duration(AUDIO_RTTM_MAP, uniq_id, decimals=5): - """ - Extract offset and duration information from AUDIO_RTTM_MAP dictionary. - If duration information is not specified, a duration value is extracted from the audio file directly. - - Args: - AUDIO_RTTM_MAP (dict): - Dictionary containing RTTM file information, which is indexed by unique file id. - uniq_id (str): - Unique file id - Returns: - offset (float): - The offset value that determines the beginning of the audio stream. - duration (float): - The length of audio stream that is expected to be used. 
- """ - audio_path = AUDIO_RTTM_MAP[uniq_id]['audio_filepath'] - if AUDIO_RTTM_MAP[uniq_id].get('duration', None): - duration = round(AUDIO_RTTM_MAP[uniq_id]['duration'], decimals) - offset = round(AUDIO_RTTM_MAP[uniq_id]['offset'], decimals) - else: - sound = sf.SoundFile(audio_path) - duration = sound.frames / sound.samplerate - offset = 0.0 - return offset, duration - - -def write_overlap_segments(outfile, AUDIO_RTTM_MAP, uniq_id, overlap_range_list, decimals=5): - """ - Write the json dictionary into the specified manifest file. - - Args: - outfile: - File pointer that indicates output file path. - AUDIO_RTTM_MAP (dict): - Dictionary containing the input manifest information - uniq_id (str): - Unique file id - overlap_range_list (list): - List containing overlapping ranges between target and source. - decimals (int): - Number of decimals to round the offset and duration values. - """ - audio_path = AUDIO_RTTM_MAP[uniq_id]['audio_filepath'] - for (stt, end) in overlap_range_list: - meta = { - "audio_filepath": audio_path, - "offset": round(stt, decimals), - "duration": round(end - stt, decimals), - "label": 'UNK', - "uniq_id": uniq_id, - } - json.dump(meta, outfile) - outfile.write("\n") - - -def read_rttm_lines(rttm_file_path): - """ - Read rttm files and return the rttm information lines. - - Args: - rttm_file_path (str): - An absolute path to an RTTM file - - Returns: - lines (list): - List containing the strings from the RTTM file. - """ - if rttm_file_path and os.path.exists(rttm_file_path): - with open(rttm_file_path, 'r') as f: - lines = f.readlines() - else: - raise FileNotFoundError( - "Requested to construct manifest from rttm with oracle VAD option or from NeMo VAD but received filename as {}".format( - rttm_file_path - ) - ) - return lines - - -def validate_vad_manifest(AUDIO_RTTM_MAP, vad_manifest): - """ - This function will check the valid speech segments in the manifest file which is either - generated from NeMo voice activity detection(VAD) or oracle VAD. - If an audio file does not contain any valid speech segments, we ignore the audio file - (indexed by uniq_id) for the rest of the processing steps. - """ - vad_uniq_ids = set() - with open(vad_manifest, 'r') as vad_file: - for line in vad_file: - line = line.strip() - dic = json.loads(line) - if dic['duration'] > 0: - vad_uniq_ids.add(dic['uniq_id']) - - provided_uniq_ids = set(AUDIO_RTTM_MAP.keys()) - silence_ids = provided_uniq_ids - vad_uniq_ids - for uniq_id in silence_ids: - del AUDIO_RTTM_MAP[uniq_id] - logging.warning(f"{uniq_id} is ignored since the file does not contain any speech signal to be processed.") - - if len(AUDIO_RTTM_MAP) == 0: - raise ValueError("All files present in manifest contains silence, aborting next steps") - - -def is_overlap(rangeA: List[float], rangeB: List[float]) -> bool: - """ - Check whether two ranges have overlap. - - Args: - rangeA (list, tuple): - List or tuple containing start and end value in float. - rangeB (list, tuple): - List or tuple containing start and end value in float. - Returns: - (bool): - Boolean that indicates whether the input ranges have overlap. - """ - start1, end1 = rangeA[0], rangeA[1] - start2, end2 = rangeB[0], rangeB[1] - return end1 > start2 and end2 > start1 - - -def get_overlap_range(rangeA: List[float], rangeB: List[float]): - """ - Calculate the overlapping range between rangeA and rangeB. - - Args: - rangeA (list, tuple): - List or tuple containing start and end value in float. 
- rangeB (list, tuple): - List or tuple containing start and end value in float. - - Returns: - (list): - List containing the overlapping range between rangeA and rangeB. - """ - assert is_overlap(rangeA, rangeB), f"There is no overlap between rangeA:{rangeA} and rangeB:{rangeB}" - return [max(rangeA[0], rangeB[0]), min(rangeA[1], rangeB[1])] - - -def merge_int_intervals(intervals_in: List[List[int]]) -> List[List[int]]: - """ - Interval merging algorithm which has `O(N*logN)` time complexity. (N is number of intervals) - Merge the range pairs if there is overlap exists between the given ranges. - This algorithm needs a sorted range list in terms of the start time. - Note that neighboring numbers lead to a merged range. - - Example: - input: [(1, 10), (11, 20)] - output: [(1, 20)] - - Refer to the original code at https://stackoverflow.com/a/59378428 - - Args: - intervals_in (list): - List containing ranges. - Example: - >>> intervals_in - [(102, 103), (104, 109), (107, 120)] - - Returns: - merged_list (list): - List containing the combined ranges. - Example: - >>> merged_list - [(102, 120)] - """ - num_intervals = len(intervals_in) - if num_intervals == 0: - return [] - elif num_intervals == 1: - return intervals_in - else: - merged_list: List[List[int]] = [] - stt2: int = 0 - end2: int = 0 - - intervals_in = [[int(x[0]), int(x[1])] for x in intervals_in] - interval_tensor: torch.Tensor = torch.tensor(intervals_in) - _sorted, _ = torch.sort(interval_tensor, dim=0) - _sorted_int: List[List[int]] = [[int(x[0]), int(x[1])] for x in _sorted.cpu()] - intervals: List[List[int]] = _sorted_int - - start, end = intervals[0][0], intervals[0][1] - for i in range(1, num_intervals): - stt2, end2 = intervals[i][0], intervals[i][1] - if end >= stt2: - end = max(end2, end) - else: - start, end = int(start), int(end) - merged_list.append([start, end]) - start = stt2 - end = max(end2, end) - - start, end = int(start), int(end) - merged_list.append([start, end]) - return merged_list - - -def fl2int(x: float, decimals: int = 3) -> int: - """ - Convert floating point number to integer. - """ - return torch.round(torch.tensor([x * (10 ** decimals)]), decimals=0).int().item() - - -def int2fl(x: int, decimals: int = 3) -> float: - """ - Convert integer to floating point number. - """ - return torch.round(torch.tensor([x / (10 ** decimals)]), decimals=decimals).item() - - -def merge_float_intervals(ranges: List[List[float]], decimals: int = 5, margin: int = 2) -> List[List[float]]: - """ - Combine overlaps with floating point numbers. Since neighboring integers are considered as continuous range, - we need to add margin to the starting range before merging then subtract margin from the result range. - - Args: - ranges (list): - List containing ranges. - Example: [(10.2, 10.83), (10.42, 10.91), (10.45, 12.09)] - decimals (int): - Number of rounding decimals - margin (int): - margin for determining overlap of the two ranges when ranges are converted to integer ranges. - Default is margin=2 which follows the python index convention. - - Examples: - If margin is 0: - [(1, 10), (10, 20)] -> [(1, 20)] - [(1, 10), (11, 20)] -> [(1, 20)] - If margin is 1: - [(1, 10), (10, 20)] -> [(1, 20)] - [(1, 10), (11, 20)] -> [(1, 10), (11, 20)] - If margin is 2: - [(1, 10), (10, 20)] -> [(1, 10), (10, 20)] - [(1, 10), (11, 20)] -> [(1, 10), (11, 20)] - - Returns: - merged_list (list): - List containing the combined ranges. 
- Example: [(10.2, 12.09)] - """ - ranges_int: List[List[int]] = [] - merged_ranges_int: List[List[int]] = [] - for x in ranges: - stt, end = int(fl2int(x[0], decimals) + margin), int(fl2int(x[1], decimals)) - if stt < end: - ranges_int.append([stt, end]) - merged_ranges_int = merge_int_intervals(ranges_int) - merged_ranges_float: List[List[float]] = [] - merged_ranges_float = [[int2fl(x[0] - margin, decimals), int2fl(x[1], decimals)] for x in merged_ranges_int] - return merged_ranges_float - - -def get_sub_range_list(target_range: List[float], source_range_list: List[List[float]]) -> List[List[float]]: - """ - Get the ranges that has overlaps with the target range from the source_range_list. - - Example: - source range: - |===--======---=====---====--| - target range: - |--------================----| - out_range: - |--------===---=====---==----| - - Args: - target_range (list): - A range (a start and end value pair) that defines the target range we want to select. - target_range = [(start, end)] - source_range_list (list): - List containing the subranges that need to be selected. - source_range = [(start0, end0), (start1, end1), ...] - Returns: - out_range (list): - List containing the overlap between target_range and - source_range_list. - """ - if len(target_range) == 0: - return [] - else: - out_range: List[List[float]] = [] - for s_range in source_range_list: - if is_overlap(s_range, target_range): - ovl_range = get_overlap_range(s_range, target_range) - out_range.append(ovl_range) - return out_range - - -def write_rttm2manifest( - AUDIO_RTTM_MAP: str, manifest_file: str, include_uniq_id: bool = False, decimals: int = 5 -) -> str: - """ - Write manifest file based on rttm files (or vad table out files). This manifest file would be used by - speaker diarizer to compute embeddings and cluster them. This function takes care of overlapping VAD timestamps - and trimmed with the given offset and duration value. - - Args: - AUDIO_RTTM_MAP (dict): - Dictionary containing keys to unique names, that contains audio filepath and rttm_filepath as its contents, - these are used to extract oracle vad timestamps. - manifest (str): - The path to the output manifest file. - - Returns: - manifest (str): - The path to the output manifest file. 
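# A short worked example of the fixed-point trick used by `merge_float_intervals`
# above (simplified scaling in place of `fl2int`/`int2fl`): floats are scaled to
# integers, a small margin is added to each start so that merely touching ranges
# are not fused, and the margin is removed again after merging.
decimals, margin = 3, 2
a, b = (1.0, 10.0), (10.0, 20.0)                                       # touching, not overlapping
a_int = [int(a[0] * 10**decimals) + margin, int(a[1] * 10**decimals)]  # [1002, 10000]
b_int = [int(b[0] * 10**decimals) + margin, int(b[1] * 10**decimals)]  # [10002, 20000]
# 10000 < 10002, so the integer ranges stay separate and the two float ranges
# are kept apart, matching the margin=2 behaviour documented above.
print(a_int, b_int)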
- """ - with open(manifest_file, 'w') as outfile: - for uniq_id in AUDIO_RTTM_MAP: - rttm_file_path = AUDIO_RTTM_MAP[uniq_id]['rttm_filepath'] - rttm_lines = read_rttm_lines(rttm_file_path) - offset, duration = get_offset_and_duration(AUDIO_RTTM_MAP, uniq_id, decimals) - vad_start_end_list_raw = [] - for line in rttm_lines: - start, dur = get_vad_out_from_rttm_line(line) - vad_start_end_list_raw.append([start, start + dur]) - vad_start_end_list = merge_float_intervals(vad_start_end_list_raw, decimals) - if len(vad_start_end_list) == 0: - logging.warning(f"File ID: {uniq_id}: The VAD label is not containing any speech segments.") - elif duration <= 0: - logging.warning(f"File ID: {uniq_id}: The audio file has negative or zero duration.") - else: - overlap_range_list = get_sub_range_list( - source_range_list=vad_start_end_list, target_range=[offset, offset + duration] - ) - write_overlap_segments(outfile, AUDIO_RTTM_MAP, uniq_id, overlap_range_list, decimals) - return manifest_file - - -def segments_manifest_to_subsegments_manifest( - segments_manifest_file: str, - subsegments_manifest_file: str = None, - window: float = 1.5, - shift: float = 0.75, - min_subsegment_duration: float = 0.05, - include_uniq_id: bool = False, -): - """ - Generate subsegments manifest from segments manifest file - Args: - segments_manifest file (str): path to segments manifest file, typically from VAD output - subsegments_manifest_file (str): path to output subsegments manifest file (default (None) : writes to current working directory) - window (float): window length for segments to subsegments length - shift (float): hop length for subsegments shift - min_subsegments_duration (float): exclude subsegments smaller than this duration value - - Returns: - returns path to subsegment manifest file - """ - if subsegments_manifest_file is None: - pwd = os.getcwd() - subsegments_manifest_file = os.path.join(pwd, 'subsegments.json') - - with open(segments_manifest_file, 'r') as segments_manifest, open( - subsegments_manifest_file, 'w' - ) as subsegments_manifest: - segments = segments_manifest.readlines() - for segment in segments: - segment = segment.strip() - dic = json.loads(segment) - audio, offset, duration, label = dic['audio_filepath'], dic['offset'], dic['duration'], dic['label'] - subsegments = get_subsegments(offset=offset, window=window, shift=shift, duration=duration) - if include_uniq_id and 'uniq_id' in dic: - uniq_id = dic['uniq_id'] - else: - uniq_id = None - for subsegment in subsegments: - start, dur = subsegment - if dur > min_subsegment_duration: - meta = { - "audio_filepath": audio, - "offset": start, - "duration": dur, - "label": label, - "uniq_id": uniq_id, - } - - json.dump(meta, subsegments_manifest) - subsegments_manifest.write("\n") - - return subsegments_manifest_file - - -def get_subsegments(offset: float, window: float, shift: float, duration: float) -> List[List[float]]: - """ - Return subsegments from a segment of audio file - Args: - offset (float): start time of audio segment - window (float): window length for segments to subsegments length - shift (float): hop length for subsegments shift - duration (float): duration of segment - Returns: - subsegments (List[tuple[float, float]]): subsegments generated for the segments as list of tuple of start and duration of each subsegment - """ - subsegments: List[List[float]] = [] - start = offset - slice_end = start + duration - base = math.ceil((duration - window) / shift) - slices = 1 if base < 0 else base + 1 - for slice_id in range(slices): - 
end = start + window - if end > slice_end: - end = slice_end - subsegments.append([start, end - start]) - start = offset + (slice_id + 1) * shift - return subsegments - - -def get_target_sig(sig, start_sec: float, end_sec: float, slice_length: int, sample_rate: int,) -> torch.Tensor: - """ - Extract time-series signal from the given audio buffer based on the start and end - timestamps. - - Args: - start_sec (float): - Start of the targeted segments in second - end_sec (float): - Start of the targeted segments in second - slice_length (int): - Length of the entire audio segment that the samples are extracted from - sample_rate (int): - Sampling rate of the time-series audio signal - - Returns: - (Tensor) Trimmed ime-series audio signal samples - """ - start_idx = int(start_sec * sample_rate) - end_idx = min(int(end_sec * sample_rate), int(slice_length + start_idx)) - return sig[start_idx:end_idx] - - -def check_ranges(range_tensor): - """ - Check whether the range list has any faulty timestamp order. - - Args: - range_tensor (list): - List containing the start and end time of the segments. - Example: - >>> range_tensor = [[0.5, 3.12], [3.51, 7.26], ... ] - """ - for k in range(range_tensor.shape[0]): - range_tup = range_tensor[k] - if range_tup[1] < range_tup[0]: - raise ValueError("Range start time should be preceding the end time but we got: {range_tup}") - return True - - -def tensor_to_list(range_tensor: torch.Tensor) -> List[List[float]]: - """ - For online segmentation. Force the list elements to be float type. - """ - return [[float(range_tensor[k][0]), float(range_tensor[k][1])] for k in range(range_tensor.shape[0])] - - -def get_speech_labels_for_update( - frame_start: float, - buffer_end: float, - vad_timestamps: torch.Tensor, - cumulative_speech_labels: torch.Tensor, - cursor_for_old_segments: float, -) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Bring the new speech labels from the current buffer. Followingly: - - 1. Concatenate the old speech labels from self.cumulative_speech_labels for the overlapped region. - - This goes to new_speech_labels. - 2. Update the new 1 sec of speech label (speech_label_for_new_segments) to self.cumulative_speech_labels. - 3. Return the speech label from cursor_for_old_segments to buffer end. 
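# A standalone restatement of the sliding-window logic in `get_subsegments`
# above, with a worked example (window 1.5 s, shift 0.75 s over a 2.0 s segment).
import math


def sliding_subsegments(offset, window, shift, duration):
    out = []
    start, seg_end = offset, offset + duration
    base = math.ceil((duration - window) / shift)
    for slice_id in range(1 if base < 0 else base + 1):
        end = min(start + window, seg_end)       # clip the last window at the segment end
        out.append([start, end - start])
        start = offset + (slice_id + 1) * shift
    return out


print(sliding_subsegments(offset=0.0, window=1.5, shift=0.75, duration=2.0))
# [[0.0, 1.5], [0.75, 1.25]] -> the final subsegment is shortened to stay inside the segment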
- - Args: - frame_start (float): - Start of the middle audio chunk in the audio buffer - buffer_end (float): - End of the audio buffer - vad_timestamps (Tensor): - Tensor containing VAD intervals (start and end timestamps) - cumulative_speech_labels (torch.Tensor): - Cumulative speech/non-speech timestamps (equivalent to VAD timestamps) - cursor_for_old_segments (float): - Floating point number that indicates the point where new segments should replace - the old segments - - Returns: - speech_label_for_new_segments (Tensor): - The intervals (start and end) timestamps where the new incoming speech segments should - be collected from - cumulative_speech_labels (Tensor): - Cumulative speech/non-speech timestamps (equivalent to VAD timestamps) with newly added - speech/non-speech timestamps from the `vad_timestamps` input - """ - update_overlap_range: List[float] = [] - if cursor_for_old_segments < frame_start: - update_overlap_range = [float(cursor_for_old_segments), float(frame_start)] - - # Get VAD timestamps that are in (frame_start, buffer_end) range - vad_timestamps = tensor_to_list(vad_timestamps) - cumulative_speech_labels = tensor_to_list(cumulative_speech_labels) - new_incoming_speech_labels = get_sub_range_list( - target_range=[float(frame_start), float(buffer_end)], source_range_list=vad_timestamps - ) - - # Update the speech label by including overlapping region with the previous output - update_overlap_speech_labels = get_sub_range_list( - target_range=update_overlap_range, source_range_list=cumulative_speech_labels - ) - - # Speech segments for embedding extractions - speech_label_for_new_segments = merge_float_intervals( - update_overlap_speech_labels + new_incoming_speech_labels, margin=0 - ) - - # Keep cumulative VAD labels for the future use - cumulative_speech_labels = merge_float_intervals(cumulative_speech_labels + new_incoming_speech_labels, margin=0) - - # Convert the lists back to type torch.Tensor - speech_label_for_new_segments = torch.tensor(speech_label_for_new_segments) - cumulative_speech_labels = torch.tensor(cumulative_speech_labels) - - return speech_label_for_new_segments, cumulative_speech_labels - - -def get_new_cursor_for_update(frame_start: float, segment_range_ts: List[List[float]],) -> Tuple[float, int]: - """ - Function for updating a cursor online speaker diarization. - Remove the old segments that overlap with the new frame (self.frame_start) - cursor_for_old_segments is set to the onset of the t_range popped lastly. 
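# A numeric sketch of the range bookkeeping in `get_speech_labels_for_update`
# above, assuming cursor_for_old_segments=3.0, frame_start=4.0, buffer_end=6.0
# and plain lists in place of tensors (values are made up).
def clip(rng, lo, hi):
    s, e = max(rng[0], lo), min(rng[1], hi)
    return [s, e] if s < e else None


cumulative = [[1.0, 5.0]]                              # speech labels accumulated so far
vad_now = [[3.5, 5.8]]                                 # VAD output for the current buffer
old_part = [clip(r, 3.0, 4.0) for r in cumulative]     # overlap region kept from the old labels
new_part = [clip(r, 4.0, 6.0) for r in vad_now]        # fresh speech inside the new frame
print([r for r in old_part + new_part if r])
# [[3.0, 4.0], [4.0, 5.8]] -> merged into [3.0, 5.8] downstream by merge_float_intervals(margin=0)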
- - - Args: - frame_start (float): - Start of streaming pipeline frame - segment_range_ts (float): - Interval (start and end timestamps) of the targeted segments - - Returns: - cursor_for_old_segments (float): - Floating point number that indicates the point where new segments should replace - the old segments - cursor_index (int): - The index of the first newly accepted segments - """ - cursor_for_old_segments = frame_start - cursor_index: int = len(segment_range_ts) - count = 0 - while True and len(segment_range_ts) > 0: - t_range = segment_range_ts[-1 * (count + 1)] - if frame_start <= t_range[1]: - count += 1 - cursor_for_old_segments = t_range[0] - else: - break - cursor_index = len(segment_range_ts) - count - return cursor_for_old_segments, cursor_index - - -def get_online_segments_from_slices( - sig: torch.Tensor, - buffer_start: float, - buffer_end: float, - subsegments: List[List[float]], - ind_offset: int, - window: float, - sample_rate: int, -) -> Tuple[int, List[torch.Tensor], List[List[float]], List[int]]: - """ - Create short speech segments from slices for online processing purpose. - - Args: - sig (Tensor): - Tensor containing the raw time-series signal - buffer_start (float): - Start point of the time-series signal buffer - buffer_end (float): - End point of the time-series signal buffer - subsegments (list): - List containing the interval information (start and duration) of each segment - ind_offset (int): - Offset for index that compensates the point of the current position in the streaming session - window (float): - Window length in second - shift (float): - Shift length in second - - Returns: - sigs_list (list): - list of sliced input signal - audio_lengths (list): - list of audio sample lengths - """ - sig_rangel_list: List[List[float]] = [] - sig_indexes: List[int] = [] - sigs_list: List[torch.Tensor] = [] - slice_length: int = int(window * sample_rate) - end_sec: float = 0.0 - for subseg in subsegments: - start_sec, dur = subseg[0], subseg[1] - - if start_sec > buffer_end: - continue - ind_offset += 1 - - buffer_len = buffer_end - buffer_start - end_sec = float(start_sec + dur) - - if end_sec > buffer_len: - end_sec = float(min(end_sec, buffer_len)) - - signal = get_target_sig(sig, start_sec, end_sec, slice_length, sample_rate) - - if len(signal) == 0: - raise ValueError("len(signal) is zero. Signal length should not be zero.") - if len(signal) < slice_length: - signal = repeat_signal(signal, len(signal), slice_length) - - start_abs_sec = buffer_start + start_sec - end_abs_sec = buffer_start + end_sec - - sigs_list.append(signal) - sig_rangel_list.append([start_abs_sec, end_abs_sec]) - sig_indexes.append(ind_offset) - - if not len(sigs_list) == len(sig_rangel_list) == len(sig_indexes): - raise ValueError("Signal information lists have a mismatch.") - - return ind_offset, sigs_list, sig_rangel_list, sig_indexes - - -def get_online_subsegments_from_buffer( - buffer_start: float, - buffer_end: float, - sample_rate: int, - speech_labels_for_update: torch.Tensor, - audio_buffer: torch.Tensor, - segment_indexes: List[int], - window: float, - shift: float, -) -> Tuple[List[torch.Tensor], List[List[float]], List[int]]: - """ - Generate subsegments for online processing from the given segment information. - This function extracts subsegments (embedding vector level) time-series from the - raw time-series buffer based on the segment interval (start and end timestamps) information. 
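# A small worked example of the cursor logic in `get_new_cursor_for_update`
# above: walk backwards through the already-emitted segment ranges and drop
# every trailing segment whose end time reaches past the new frame start.
def new_cursor(frame_start, segment_range_ts):
    cursor, count = frame_start, 0
    while count < len(segment_range_ts) and frame_start <= segment_range_ts[-(count + 1)][1]:
        cursor = segment_range_ts[-(count + 1)][0]
        count += 1
    return cursor, len(segment_range_ts) - count


ranges = [[0.0, 1.5], [0.75, 2.25], [1.5, 3.0]]
print(new_cursor(2.0, ranges))  # (0.75, 1): the last two segments overlap the new frame and are replaced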
- - Args: - buffer_start (float): - Start point of the time-series signal buffer - buffer_end (float): - End point of the time-series signal buffer - sample_rate (int): - Sampling rate of the audio input - speech_labels_for_update (Tensor): - Tensor containing intervals (start and end timestamps) of the speech segments - audio_buffer (Tensor): - Tensor containing the raw time-series signal - segment_indexes (list): - List containing the unique indices of segments - window (float): - Window length in second - shift (float): - Shift length in second - - Returns: - sigs_list (list): - List containing the tensors of the old and the newly added time-series signals - sig_rangel_list (list): - List containing the old and the newly added intervals (timestamps) of the speech segments - sig_indexes (list): - List containing the old and the newly added unique indices of segments - """ - sigs_list: List[torch.Tensor] = [] - sig_rangel_list: List[List[float]] = [] - sig_indexes: List[int] = [] - if len(segment_indexes) > 0: - ind_offset = segment_indexes[-1] - else: - ind_offset = -1 - - for idx, range_spl in enumerate(speech_labels_for_update): - range_offs = [float(range_spl[0].item() - buffer_start), float(range_spl[1].item() - buffer_start)] - range_t = [max(0, range_offs[0]), range_offs[1]] - - subsegments = get_subsegments( - offset=range_t[0], window=window, shift=shift, duration=(range_t[1] - range_t[0]), - ) - ind_offset, sigs, ranges, inds = get_online_segments_from_slices( - sig=audio_buffer, - buffer_start=buffer_start, - buffer_end=buffer_end, - subsegments=subsegments, - window=window, - ind_offset=ind_offset, - sample_rate=sample_rate, - ) - - sigs_list.extend(sigs) - sig_rangel_list.extend(ranges) - sig_indexes.extend(inds) - - assert len(sigs_list) == len(sig_rangel_list) == len(sig_indexes) - return sigs_list, sig_rangel_list, sig_indexes - - -def get_scale_mapping_argmat(uniq_embs_and_timestamps: Dict[str, dict]) -> Dict[int, torch.Tensor]: - """ - Calculate cosine similarity values among speaker embeddings for each scale then - apply multiscale weights to calculate the fused similarity matrix. - - Args: - uniq_embs_and_timestamps: (dict) - The dictionary containing embeddings, timestamps and multiscale weights. - If uniq_embs_and_timestamps contains only one scale, single scale diarization - is performed. - - Returns: - scale_mapping_argmat (dict) - Dictionary containing scale mapping information matrix for each scale. - """ - scale_mapping_argmat = {} - embeddings_in_scales, timestamps_in_scales = split_input_data( - embeddings_in_scales=uniq_embs_and_timestamps['embeddings'], - timestamps_in_scales=uniq_embs_and_timestamps['timestamps'], - multiscale_segment_counts=uniq_embs_and_timestamps['multiscale_segment_counts'], - ) - session_scale_mapping_list = get_argmin_mat(timestamps_in_scales) - for scale_idx in range(len(session_scale_mapping_list)): - mapping_argmat = session_scale_mapping_list[scale_idx] - scale_mapping_argmat[scale_idx] = mapping_argmat - return scale_mapping_argmat - - -def get_overlap_stamps(cont_stamps: List[str], ovl_spk_idx: List[str]): - """ - Generate timestamps that include overlap speech. Overlap-including timestamps are created based on the segments that are - created for clustering diarizer. Overlap speech is assigned to the existing speech segments in `cont_stamps`. - - Args: - cont_stamps (list): - Non-overlapping (single speaker per segment) diarization output in string format. 
- Each line contains the start and end time of segments and corresponding speaker labels. - ovl_spk_idx (list): - List containing segment index of the estimated overlapped speech. The start and end of segments are based on the - single-speaker (i.e., non-overlap-aware) RTTM generation. - Returns: - total_ovl_cont_list (list): - Rendered diarization output in string format. Each line contains the start and end time of segments and - corresponding speaker labels. This format is identical to `cont_stamps`. - """ - ovl_spk_cont_list = [[] for _ in range(len(ovl_spk_idx))] - for spk_idx in range(len(ovl_spk_idx)): - for idx, cont_a_line in enumerate(cont_stamps): - start, end, speaker = cont_a_line.split() - if idx in ovl_spk_idx[spk_idx]: - ovl_spk_cont_list[spk_idx].append(f"{start} {end} speaker_{spk_idx}") - total_ovl_cont_list = [] - for ovl_cont_list in ovl_spk_cont_list: - if len(ovl_cont_list) > 0: - total_ovl_cont_list.extend(merge_stamps(ovl_cont_list)) - return total_ovl_cont_list - - -def get_adaptive_threshold(estimated_num_of_spks: int, min_threshold: float, overlap_infer_spk_limit: int): - """ - This function controls the magnitude of the sigmoid threshold based on the estimated number of speakers. As the number of - speakers becomes larger, diarization error rate is very sensitive on overlap speech detection. This function linearly increases - the threshold in proportion to the estimated number of speakers so more confident overlap speech results are reflected when - the number of estimated speakers are relatively high. - - Args: - estimated_num_of_spks (int): - Estimated number of speakers from the clustering result. - min_threshold (float): - Sigmoid threshold value from the config file. This threshold value is minimum threshold value when `estimated_num_of_spks=2` - overlap_infer_spk_limit (int): - If the `estimated_num_of_spks` is less then `overlap_infer_spk_limit`, overlap speech estimation is skipped. - - Returns: - adaptive_threshold (float): - Threshold value that is scaled based on the `estimated_num_of_spks`. - """ - adaptive_threshold = min_threshold - (estimated_num_of_spks - 2) * (min_threshold - 1) / ( - overlap_infer_spk_limit - 2 - ) - return adaptive_threshold - - -def generate_speaker_timestamps( - clus_labels: List[Union[float, int]], msdd_preds: List[torch.Tensor], **params -) -> Tuple[List[str], List[str]]: - ''' - Generate speaker timestamps from the segmentation information. If `use_clus_as_main=True`, use clustering result for main speaker - labels and use timestamps from the predicted sigmoid values. In this function, the main speaker labels in `maj_labels` exist for - every subsegment steps while overlap speaker labels in `ovl_labels` only exist for segments where overlap-speech is occuring. - - Args: - clus_labels (list): - List containing integer-valued speaker clustering results. - msdd_preds (list): - List containing tensors of the predicted sigmoid values. - Each tensor has shape of: (Session length, estimated number of speakers). - params: - Parameters for generating RTTM output and evaluation. Parameters include: - infer_overlap (bool): If False, overlap-speech will not be detected. - use_clus_as_main (bool): Add overlap-speech detection from MSDD to clustering results. If False, only MSDD output - is used for constructing output RTTM files. - overlap_infer_spk_limit (int): Above this limit, overlap-speech detection is bypassed. 
- use_adaptive_thres (bool): Boolean that determines whehther to use adaptive_threshold depending on the estimated - number of speakers. - max_overlap_spks (int): Maximum number of overlap speakers detected. Default is 2. - threshold (float): Sigmoid threshold for MSDD output. - - Returns: - maj_labels (list): - List containing string-formated single-speaker speech segment timestamps and corresponding speaker labels. - Example: [..., '551.685 552.77 speaker_1', '552.99 554.43 speaker_0', '554.97 558.19 speaker_0', ...] - ovl_labels (list): - List containing string-formated additional overlapping speech segment timestamps and corresponding speaker labels. - Note that `ovl_labels` includes only overlapping speech that is not included in `maj_labels`. - Example: [..., '152.495 152.745 speaker_1', '372.71 373.085 speaker_0', '554.97 555.885 speaker_1', ...] - ''' - msdd_preds.squeeze(0) - estimated_num_of_spks = msdd_preds.shape[-1] - overlap_speaker_list = [[] for _ in range(estimated_num_of_spks)] - infer_overlap = estimated_num_of_spks < int(params['overlap_infer_spk_limit']) - main_speaker_lines = [] - if params['use_adaptive_thres']: - threshold = get_adaptive_threshold( - estimated_num_of_spks, params['threshold'], params['overlap_infer_spk_limit'] - ) - else: - threshold = params['threshold'] - for seg_idx, cluster_label in enumerate(clus_labels): - msdd_preds.squeeze(0) - spk_for_seg = (msdd_preds[0, seg_idx] > threshold).int().cpu().numpy().tolist() - sm_for_seg = msdd_preds[0, seg_idx].cpu().numpy() - - if params['use_clus_as_main']: - main_spk_idx = int(cluster_label[2]) - else: - main_spk_idx = np.argsort(msdd_preds[0, seg_idx].cpu().numpy())[::-1][0] - - if sum(spk_for_seg) > 1 and infer_overlap: - idx_arr = np.argsort(sm_for_seg)[::-1] - for ovl_spk_idx in idx_arr[: params['max_overlap_spks']].tolist(): - if ovl_spk_idx != int(main_spk_idx): - overlap_speaker_list[ovl_spk_idx].append(seg_idx) - main_speaker_lines.append(f"{cluster_label[0]} {cluster_label[1]} speaker_{main_spk_idx}") - cont_stamps = get_contiguous_stamps(main_speaker_lines) - maj_labels = merge_stamps(cont_stamps) - ovl_labels = get_overlap_stamps(cont_stamps, overlap_speaker_list) - return maj_labels, ovl_labels - - -def get_uniq_id_list_from_manifest(manifest_file: str): - """Retrieve `uniq_id` values from the given manifest_file and save the IDs to a list. - """ - uniq_id_list = [] - with open(manifest_file, 'r', encoding='utf-8') as manifest: - for i, line in enumerate(manifest.readlines()): - line = line.strip() - dic = json.loads(line) - uniq_id = get_uniqname_from_filepath(dic['audio_filepath']) - uniq_id_list.append(uniq_id) - return uniq_id_list - - -def get_id_tup_dict(uniq_id_list: List[str], test_data_collection, preds_list: List[torch.Tensor]): - """ - Create session-level dictionary containing data needed to construct RTTM diarization output. - - Args: - uniq_id_list (list): - List containing the `uniq_id` values. - test_data_collection (collections.DiarizationLabelEntity): - Class instance that is containing session information such as targeted speaker indices, audio filepath and RTTM filepath. - preds_list (list): - List containing tensors of predicted sigmoid values. - - Returns: - session_dict (dict): - Dictionary containing session-level target speakers data and predicted simoid values in tensor format. 
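# A condensed sketch of the per-segment decision in `generate_speaker_timestamps`
# above: scale the sigmoid threshold with the estimated speaker count
# (`get_adaptive_threshold`), take the argmax speaker as the main label, and flag
# the remaining top-scoring speakers above the threshold as overlap. All values
# below are made up for illustration.
import numpy as np

min_threshold, overlap_infer_spk_limit, estimated_num_of_spks = 0.7, 5, 4
adaptive_thr = min_threshold - (estimated_num_of_spks - 2) * (min_threshold - 1) / (overlap_infer_spk_limit - 2)
print(round(adaptive_thr, 2))  # 0.9 -> a stricter threshold when more speakers are estimated

seg_scores = np.array([0.05, 0.95, 0.92, 0.10])  # per-speaker sigmoid outputs for one segment
main_spk = int(np.argmax(seg_scores))            # speaker_1 becomes the main label
overlap_spks = [s for s in np.argsort(seg_scores)[::-1][1:].tolist() if seg_scores[s] > adaptive_thr]
print(main_spk, overlap_spks)                    # 1 [2] -> speaker_2 is added as overlapping speech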
- """ - session_dict = {x: [] for x in uniq_id_list} - for idx, line in enumerate(test_data_collection): - uniq_id = get_uniqname_from_filepath(line.audio_file) - session_dict[uniq_id].append([line.target_spks, preds_list[idx]]) - return session_dict - - -def prepare_split_data(manifest_filepath, _out_dir, multiscale_args_dict, global_rank): - """ - This function is needed for preparing diarization training data for multiscale diarization decoder (MSDD). - Prepare multiscale timestamp data for training. Oracle VAD timestamps from RTTM files are used as VAD timestamps. - In this function, timestamps for embedding extraction are extracted without extracting the embedding vectors. - - Args: - manifest_filepath (str): - Input manifest file for creating audio-to-RTTM mapping. - _out_dir (str): - Output directory where timestamp json files are saved. - - Returns: - multiscale_args_dict (dict): - - Dictionary containing two types of arguments: multi-scale weights and subsegment timestamps for each data sample. - - Each data sample has two keys: `multiscale_weights` and `scale_dict`. - - `multiscale_weights` key contains a list containing multiscale weights. - - `scale_dict` is indexed by integer keys which are scale index. - - Each data sample is indexed by using the following naming convention: `__` - Example: `fe_03_00106_mixed_626310_642300` - """ - speaker_dir = os.path.join(_out_dir, 'speaker_outputs') - - # Only if this is for the first run of modelPT instance, remove temp folders. - if global_rank == 0: - if os.path.exists(speaker_dir): - shutil.rmtree(speaker_dir) - os.makedirs(speaker_dir) - split_audio_rttm_map = audio_rttm_map(manifest_filepath, attach_dur=True) - - # Speech Activity Detection part - _speaker_manifest_path = os.path.join(speaker_dir, f'oracle_vad_manifest.json') - logging.info(f"Extracting oracle VAD timestamps and saving at {speaker_dir}") - if not os.path.exists(_speaker_manifest_path): - write_rttm2manifest(split_audio_rttm_map, _speaker_manifest_path, include_uniq_id=True) - - multiscale_timestamps_by_scale = {} - - # Segmentation - for scale_idx, (window, shift) in multiscale_args_dict['scale_dict'].items(): - subsegments_manifest_path = os.path.join(speaker_dir, f'subsegments_scale{scale_idx}.json') - if not os.path.exists(subsegments_manifest_path): - # Sub-segmentation for the current scale (scale_idx) - segments_manifest_to_subsegments_manifest( - segments_manifest_file=_speaker_manifest_path, - subsegments_manifest_file=subsegments_manifest_path, - window=window, - shift=shift, - include_uniq_id=True, - ) - logging.info( - f"Subsegmentation for timestamp extracted for: scale-{scale_idx} at {subsegments_manifest_path}" - ) - multiscale_timestamps = extract_timestamps(subsegments_manifest_path) - multiscale_timestamps_by_scale[scale_idx] = multiscale_timestamps - - multiscale_timestamps_dict = get_timestamps(multiscale_timestamps_by_scale, multiscale_args_dict) - return multiscale_timestamps_dict - - -def extract_timestamps(manifest_file: str): - """ - This method extracts timestamps from segments passed through manifest_file. - - Args: - manifest_file (str): - Manifest file containing segmentation information. - Returns: - time_stamps (dict): - Dictionary containing lists of timestamps. 
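# A compact illustration of what `extract_timestamps` collects from a
# subsegment manifest: one [start, end] pair per JSON line, grouped by uniq_id.
# The two manifest lines below are made-up examples.
import json

manifest_lines = [
    '{"audio_filepath": "a.wav", "offset": 0.0, "duration": 1.5, "label": "UNK", "uniq_id": "a"}',
    '{"audio_filepath": "a.wav", "offset": 0.75, "duration": 1.5, "label": "UNK", "uniq_id": "a"}',
]
time_stamps = {}
for line in manifest_lines:
    dic = json.loads(line)
    time_stamps.setdefault(dic["uniq_id"], []).append([dic["offset"], dic["offset"] + dic["duration"]])
print(time_stamps)  # {'a': [[0.0, 1.5], [0.75, 2.25]]}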
- """ - logging.info(f"Extracting timestamps from {manifest_file} for multiscale subsegmentation.") - time_stamps = {} - with open(manifest_file, 'r', encoding='utf-8') as manifest: - for i, line in enumerate(manifest.readlines()): - line = line.strip() - dic = json.loads(line) - uniq_name = dic['uniq_id'] - if uniq_name not in time_stamps: - time_stamps[uniq_name] = [] - start = dic['offset'] - end = start + dic['duration'] - time_stamps[uniq_name].append([start, end]) - return time_stamps - - -def make_rttm_with_overlap( - manifest_file_path: str, - clus_label_dict: Dict[str, List[Union[float, int]]], - msdd_preds: List[torch.Tensor], - **params, -): - """ - Create RTTM files that include detected overlap speech. Note that the effect of overlap detection is only - notable when RTTM files are evaluated with `ignore_overlap=False` option. - - Args: - manifest_file_path (str): - Path to the input manifest file. - clus_label_dict (dict): - Dictionary containing subsegment timestamps in float type and cluster labels in integer type. - Indexed by `uniq_id` string. - msdd_preds (list): - List containing tensors of the predicted sigmoid values. - Each tensor has shape of: (Session length, estimated number of speakers). - params: - Parameters for generating RTTM output and evaluation. Parameters include: - infer_overlap (bool): If False, overlap-speech will not be detected. - See docstrings of `generate_speaker_timestamps` function for other variables in `params`. - - Returns: - all_hypothesis (list): - List containing Pyannote's `Annotation` objects that are created from hypothesis RTTM outputs. - all_reference - List containing Pyannote's `Annotation` objects that are created from ground-truth RTTM outputs - """ - AUDIO_RTTM_MAP = audio_rttm_map(manifest_file_path) - manifest_file_lengths_list = [] - all_hypothesis, all_reference = [], [] - no_references = False - with open(manifest_file_path, 'r', encoding='utf-8') as manifest: - for i, line in enumerate(manifest.readlines()): - uniq_id = get_uniq_id_from_manifest_line(line) - manifest_dic = AUDIO_RTTM_MAP[uniq_id] - clus_labels = clus_label_dict[uniq_id] - manifest_file_lengths_list.append(len(clus_labels)) - maj_labels, ovl_labels = generate_speaker_timestamps(clus_labels, msdd_preds[i], **params) - if params['infer_overlap']: - hyp_labels = maj_labels + ovl_labels - else: - hyp_labels = maj_labels - hypothesis = labels_to_pyannote_object(hyp_labels, uniq_name=uniq_id) - if params['out_rttm_dir']: - hyp_labels = sorted(hyp_labels, key=lambda x: float(x.split()[0])) - labels_to_rttmfile(hyp_labels, uniq_id, params['out_rttm_dir']) - all_hypothesis.append([uniq_id, hypothesis]) - rttm_file = manifest_dic.get('rttm_filepath', None) - if rttm_file is not None and os.path.exists(rttm_file) and not no_references: - ref_labels = rttm_to_labels(rttm_file) - reference = labels_to_pyannote_object(ref_labels, uniq_name=uniq_id) - all_reference.append([uniq_id, reference]) - else: - no_references = True - all_reference = [] - return all_reference, all_hypothesis - - -def embedding_normalize(embs, use_std=False, eps=1e-10): - """ - Mean and l2 length normalize the input speaker embeddings - - Args: - embs: embeddings of shape (Batch,emb_size) - Returns: - embs: normalized embeddings of shape (Batch,emb_size) - """ - embs = embs - embs.mean(axis=0) - if use_std: - embs = embs / (embs.std(axis=0) + eps) - embs_l2_norm = np.expand_dims(np.linalg.norm(embs, ord=2, axis=-1), axis=1) - embs = embs / embs_l2_norm - - return embs - - -class OnlineSegmentor: - 
""" - Online Segmentor for online (streaming) diarizer. - - The class instances created by this class takes time-series signal from the audio buffer and - creates subsegments for embedding extraction. - - Since online segmentation is based on a short audio buffer, the methods in this class extracts - a few subsegments from the given intervals for the raw time-series signal. - - Attributes: - frame_start (float): - Start of the middle chunk - buffer_start (float): - Start of the entire buffer - buffer_end (float): - End of the entire buffer - sample_rate (int): - Sampling rate of the input time-series signal - cumulative_speech_labels (Tensor): - Torch tensor matrix containing culmulative VAD (speech activity) timestamps - """ - - def __init__(self, sample_rate: int): - self.frame_start: float = 0.0 - self.buffer_start: float = 0.0 - self.buffer_end: float = 0.0 - self.sample_rate: int = sample_rate - self.cumulative_speech_labels: torch.Tensor = torch.tensor([]) - - def run_online_segmentation( - self, - audio_buffer: torch.Tensor, - vad_timestamps: torch.Tensor, - segment_raw_audio: List[torch.Tensor], - segment_range_ts: List[List[float]], - segment_indexes: List[int], - window: float, - shift: float, - ): - """ - Remove the old segments that overlap with the new frame (self.frame_start) - cursor_for_old_segments is pointing at the onset of the t_range popped most recently. - - Frame is in the middle of the buffer. - - |___Buffer___[___________]____________| - |____________[ Frame ]____________| - - | <- buffer start - |____________| <- frame start - - - Args: - audio_buffer (Tensor): - Tensor containing raw time-series signal - vad_timestamps (Tensor): - Tensor containing VAD intervals (start and end timestamps) - segment_raw_audio (list): - List containing the previously added tensors of the raw time-series signal segments - segment_range_ts (list): - List containing the previously added intervals (start and end timestamps) of each segment - segment_indexes (list): - List containing the previously added global integer indicies of the segments from - start to current cursor - window (float): - Window length in second - shift (float): - Shift length in second - - Returns: - segment_raw_audio (list): - List containing the newly added tensors of the raw time-series signal - segment_range_ts (list): - List containing the newly added interval (start and end timestamps) of each segment - segment_indexes (list): - List containing the newly added global integer indicies of the segments from - start to current cursor - """ - if self.buffer_start >= 0: - # Check if this is the very first step - if len(segment_raw_audio) == 0 and vad_timestamps.shape[0] > 0: - vad_timestamps[0][0] = max(vad_timestamps[0][0], 0.0) - speech_labels_for_update = vad_timestamps - self.cumulative_speech_labels = speech_labels_for_update - else: - # Calculate a cursor for the update point - cursor_for_old_segments, cursor_index = get_new_cursor_for_update(self.frame_start, segment_range_ts) - - segment_range_ts = segment_range_ts[:cursor_index] - segment_raw_audio = segment_raw_audio[:cursor_index] - segment_indexes = segment_indexes[:cursor_index] - - if not len(segment_raw_audio) == len(segment_range_ts) == len(segment_indexes): - raise ValueError("Scale-wise segment information has a mismatch in length.") - - speech_labels_for_update, self.cumulative_speech_labels = get_speech_labels_for_update( - self.frame_start, - self.buffer_end, - self.cumulative_speech_labels, - vad_timestamps, - cursor_for_old_segments, - ) - - 
# Collect the timeseries signal from the buffer - sigs_list, sig_rangel_list, sig_indexes = get_online_subsegments_from_buffer( - buffer_start=self.buffer_start, - buffer_end=self.buffer_end, - sample_rate=self.sample_rate, - speech_labels_for_update=speech_labels_for_update, - audio_buffer=audio_buffer, - segment_indexes=segment_indexes, - window=window, - shift=shift, - ) - - segment_raw_audio.extend(sigs_list) - segment_range_ts.extend(sig_rangel_list) - segment_indexes.extend(sig_indexes) - - if not len(segment_raw_audio) == len(segment_range_ts) == len(segment_indexes): - raise ValueError("Segment information has a mismatch in length.") - return segment_raw_audio, segment_range_ts, segment_indexes diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/streaming_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/streaming_utils.py deleted file mode 100644 index 9efb675b61752af5be4ab79cb4fe87aa9bfbd61d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/streaming_utils.py +++ /dev/null @@ -1,1556 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os - -import numpy as np -import torch -from omegaconf import OmegaConf -from torch.utils.data import DataLoader - -from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE -from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder -from nemo.collections.asr.parts.preprocessing.features import normalize_batch -from nemo.collections.asr.parts.utils.audio_utils import get_samples -from nemo.core.classes import IterableDataset -from nemo.core.neural_types import LengthsType, NeuralType - -# Minimum number of tokens required to assign a LCS merge step, otherwise ignore and -# select all i-1 and ith buffer tokens to merge. -MIN_MERGE_SUBSEQUENCE_LEN = 1 - - -def print_alignment(alignment): - """ - Print an alignment matrix of the shape (m + 1, n + 1) - - Args: - alignment: An integer alignment matrix of shape (m + 1, n + 1) - """ - m = len(alignment) - if m > 0: - n = len(alignment[0]) - for i in range(m): - for j in range(n): - if j == 0: - print(f"{i:4d} |", end=" ") - print(f"{alignment[i][j]}", end=" ") - print() - - -def write_lcs_alignment_to_pickle(alignment, filepath, extras=None): - """ - Writes out the LCS alignment to a file, along with any extras provided. - - Args: - alignment: An alignment matrix of shape [m + 1, n + 1] - filepath: str filepath - extras: Optional dictionary of items to preserve. - """ - if extras is None: - extras = {} - - extras['alignment'] = alignment - torch.save(extras, filepath) - - -def longest_common_subsequence_merge(X, Y, filepath=None): - """ - Longest Common Subsequence merge algorithm for aligning two consecutive buffers. 
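# A quick worked example of the token delay described further down in this
# docstring, assuming a 4.0 s total buffer, a 1.5 s chunk and a 0.04 s model
# stride (made-up values): the LCS merge then waits for 62 timesteps of context.
import math

total_buffer_in_secs, chunk_len_in_sec, model_stride_in_secs = 4.0, 1.5, 0.04
lcs_delay = math.floor((total_buffer_in_secs - chunk_len_in_sec) / model_stride_in_secs)
print(lcs_delay)  # 62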
- - Base alignment construction algorithm is Longest Common Subsequence (reffered to as LCS hear after) - - LCS Merge algorithm looks at two chunks i-1 and i, determins the aligned overlap at the - end of i-1 and beginning of ith chunk, and then clips the subsegment of the ith chunk. - - Assumption is that the two chunks are consecutive chunks, and there exists at least small overlap acoustically. - - It is a sub-word token merge algorithm, operating on the abstract notion of integer ids representing the subword ids. - It is independent of text or character encoding. - - Since the algorithm is merge based, and depends on consecutive buffers, the very first buffer is processes using - the "middle tokens" algorithm. - - It requires a delay of some number of tokens such that: - lcs_delay = math.floor(((total_buffer_in_secs - chunk_len_in_sec)) / model_stride_in_secs) - - Total cost of the model is O(m_{i-1} * n_{i}) where (m, n) represents the number of subword ids of the buffer. - - Args: - X: The subset of the previous chunk i-1, sliced such X = X[-(lcs_delay * max_steps_per_timestep):] - Therefore there can be at most lcs_delay * max_steps_per_timestep symbols for X, preserving computation. - Y: The entire current chunk i. - filepath: Optional filepath to save the LCS alignment matrix for later introspection. - - Returns: - A tuple containing - - - i: Start index of alignment along the i-1 chunk. - - j: Start index of alignment along the ith chunk. - - slice_len: number of tokens to slice off from the ith chunk. - The LCS alignment matrix itself (shape m + 1, n + 1) - """ - # LCSuff is the table with zero - # value initially in each cell - m = len(X) - n = len(Y) - LCSuff = [[0 for k in range(n + 1)] for l in range(m + 1)] - - # To store the length of - # longest common substring - result = 0 - result_idx = [0, 0, 0] # Contains (i, j, slice_len) - - # Following steps to build - # LCSuff[m+1][n+1] in bottom up fashion - for i in range(m + 1): - for j in range(n + 1): - if i == 0 or j == 0: - LCSuff[i][j] = 0 - elif X[i - 1] == Y[j - 1]: - LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1 - - if result <= LCSuff[i][j]: - result = LCSuff[i][j] # max(result, LCSuff[i][j]) - result_idx = [i, j, result] - - else: - LCSuff[i][j] = 0 - - # Check if perfect alignment was found or not - # Perfect alignment is found if : - # Longest common subsequence extends to the final row of of the old buffer - # This means that there exists a diagonal LCS backtracking to the beginning of the new buffer - i, j = result_idx[0:2] - is_complete_merge = i == m - - # Perfect alignment was found, slice eagerly - if is_complete_merge: - length = result_idx[-1] - - # In case the LCS was incomplete - missing a few tokens at the beginning - # Perform backtrack to find the origin point of the slice (j) and how many tokens should be sliced - while length >= 0 and i > 0 and j > 0: - # Alignment exists at the required diagonal - if LCSuff[i - 1][j - 1] > 0: - length -= 1 - i, j = i - 1, j - 1 - - else: - # End of longest alignment - i, j, length = i - 1, j - 1, length - 1 - break - - else: - # Expand hypothesis to catch partial mismatch - - # There are 3 steps for partial mismatch in alignment - # 1) Backward search for leftmost LCS - # 2) Greedy expansion of leftmost LCS to the right - # 3) Backtrack final leftmost expanded LCS to find origin point of slice - - # (1) Backward search for Leftmost LCS - # This is required for cases where multiple common subsequences exist - # We only need to select the leftmost one - since that 
corresponds - # to the last potential subsequence that matched with the new buffer. - # If we just chose the LCS (and not the leftmost LCS), then we can potentially - # slice off major sections of text which are repeated between two overlapping buffers. - - # backward linear search for leftmost j with longest subsequence - max_j = 0 - max_j_idx = n - - i_partial = m # Starting index of i for partial merge - j_partial = -1 # Index holder of j for partial merge - j_skip = 0 # Number of tokens that were skipped along the diagonal - slice_count = 0 # Number of tokens that should be sliced - - # Select leftmost LCS - for i_idx in range(m, -1, -1): # start from last timestep of old buffer - for j_idx in range(0, n + 1): # start from first token from new buffer - # Select the longest LCSuff, while minimizing the index of j (token index for new buffer) - if LCSuff[i_idx][j_idx] > max_j and j_idx <= max_j_idx: - max_j = LCSuff[i_idx][j_idx] - max_j_idx = j_idx - - # Update the starting indices of the partial merge - i_partial = i_idx - j_partial = j_idx - - # EARLY EXIT (if max subsequence length <= MIN merge length) - # Important case where there is long silence - # The end of one buffer will have many blank tokens, the beginning of new buffer may have many blank tokens - # As such, LCS will potentially be from the region of actual tokens. - # This can be detected as the max length of the suffix in LCS - # If this max length of the leftmost suffix is less than some margin, avoid slicing all together. - if max_j <= MIN_MERGE_SUBSEQUENCE_LEN: - # If the number of partiial tokens to be deleted are less than the minimum, - # dont delete any tokens at all. - - i = i_partial - j = 0 - result_idx[-1] = 0 - - else: - # Some valid long partial alignment was found - # (2) Expand this alignment along the diagonal *downwards* towards the end of the old buffer - # such that i_partial = m + 1. - # This is a common case where due to LSTM state or reduced buffer size, the alignment breaks - # in the middle but there are common subsequences between old and new buffers towards the end - # We can expand the current leftmost LCS in a diagonal manner downwards to include such potential - # merge regions. - - # Expand current partial subsequence with co-located tokens - i_temp = i_partial + 1 # diagonal next i - j_temp = j_partial + 1 # diagonal next j - - j_exp = 0 # number of tokens to expand along the diagonal - j_skip = 0 # how many diagonals didnt have the token. 
Incremented by 1 for every row i - - for i_idx in range(i_temp, m + 1): # walk from i_partial + 1 => m + 1 - j_any_skip = 0 # If the diagonal element at this location is not found, set to 1 - # j_any_skip expands the search space one place to the right - # This allows 1 diagonal misalignment per timestep i (and expands the search for the next timestep) - - # walk along the diagonal corresponding to i_idx, plus allowing diagonal skips to occur - # diagonal elements may not be aligned due to ASR model predicting - # incorrect token in between correct tokens - for j_idx in range(j_temp, j_temp + j_skip + 1): - if j_idx < n + 1: - if LCSuff[i_idx][j_idx] == 0: - j_any_skip = 1 - else: - j_exp = 1 + j_skip + j_any_skip - - # If the diagonal element existed, dont expand the search space, - # otherwise expand the search space 1 token to the right - j_skip += j_any_skip - - # Move one step to the right for the next diagonal j corresponding to i - j_temp += 1 - - # reset j_skip, augment j_partial with expansions - j_skip = 0 - j_partial += j_exp - - # (3) Given new leftmost j_partial with expansions, backtrack the partial alignments - # counting how many diagonal skips occured to compute slice length - # as well as starting point of slice. - - # Partial backward trace to find start of slice - while i_partial > 0 and j_partial > 0: - if LCSuff[i_partial][j_partial] == 0: - # diagonal skip occured, move j to left 1 extra time - j_partial -= 1 - j_skip += 1 - - if j_partial > 0: - # If there are more steps to be taken to the left, slice off the current j - # Then loop for next (i, j) diagonal to the upper left - slice_count += 1 - i_partial -= 1 - j_partial -= 1 - - # Recompute total slice length as slice count along diagonal - # plus the number of diagonal skips - i = max(0, i_partial) - j = max(0, j_partial) - result_idx[-1] = slice_count + j_skip - - # Set the value of i and j - result_idx[0] = i - result_idx[1] = j - - if filepath is not None: - extras = { - "is_complete_merge": is_complete_merge, - "X": X, - "Y": Y, - "slice_idx": result_idx, - } - write_lcs_alignment_to_pickle(LCSuff, filepath=filepath, extras=extras) - print("Wrote alignemnt to :", filepath) - - return result_idx, LCSuff - - -def lcs_alignment_merge_buffer(buffer, data, delay, model, max_steps_per_timestep: int = 5, filepath: str = None): - """ - Merges the new text from the current frame with the previous text contained in the buffer. - - The alignment is based on a Longest Common Subsequence algorithm, with some additional heuristics leveraging - the notion that the chunk size is >= the context window. In case this assumptio is violated, the results of the merge - will be incorrect (or at least obtain worse WER overall). - """ - # If delay timesteps is 0, that means no future context was used. Simply concatenate the buffer with new data. - if delay < 1: - buffer += data - return buffer - - # If buffer is empty, simply concatenate the buffer and data. 
- if len(buffer) == 0: - buffer += data - return buffer - - # Prepare a subset of the buffer that will be LCS Merged with new data - search_size = int(delay * max_steps_per_timestep) - buffer_slice = buffer[-search_size:] - - # Perform LCS Merge - lcs_idx, lcs_alignment = longest_common_subsequence_merge(buffer_slice, data, filepath=filepath) - - # Slice off new data - # i, j, slice_len = lcs_idx - slice_idx = lcs_idx[1] + lcs_idx[-1] # slice = j + slice_len - data = data[slice_idx:] - - # Concat data to buffer - buffer += data - return buffer - - -def inplace_buffer_merge(buffer, data, timesteps, model): - """ - Merges the new text from the current frame with the previous text contained in the buffer. - - The alignment is based on a Longest Common Subsequence algorithm, with some additional heuristics leveraging - the notion that the chunk size is >= the context window. In case this assumptio is violated, the results of the merge - will be incorrect (or at least obtain worse WER overall). - """ - # If delay timesteps is 0, that means no future context was used. Simply concatenate the buffer with new data. - if timesteps < 1: - buffer += data - return buffer - - # If buffer is empty, simply concatenate the buffer and data. - if len(buffer) == 0: - buffer += data - return buffer - - # Concat data to buffer - buffer += data - return buffer - - -class StreamingFeatureBufferer: - """ - Class to append each feature frame to a buffer and return an array of buffers. - This class is designed to perform a real-life streaming decoding where only a single chunk - is provided at each step of a streaming pipeline. - """ - - def __init__(self, asr_model, chunk_size, buffer_size): - ''' - Args: - asr_model: - Reference to the asr model instance for which the feature needs to be created - chunk_size (float): - Duration of the new chunk of audio - buffer_size (float): - Size of the total audio in seconds maintained in the buffer - ''' - - self.NORM_CONSTANT = 1e-5 - if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log: - self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal - else: - self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 - self.asr_model = asr_model - self.sr = asr_model.cfg.sample_rate - self.model_normalize_type = asr_model.cfg.preprocessor.normalize - self.chunk_size = chunk_size - timestep_duration = asr_model.cfg.preprocessor.window_stride - - self.n_chunk_look_back = int(timestep_duration * self.sr) - self.n_chunk_samples = int(chunk_size * self.sr) - self.buffer_size = buffer_size - total_buffer_len = int(buffer_size / timestep_duration) - self.n_feat = asr_model.cfg.preprocessor.features - self.sample_buffer = torch.zeros(int(self.buffer_size * self.sr)) - self.buffer = torch.ones([self.n_feat, total_buffer_len], dtype=torch.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - self.feature_chunk_len = int(chunk_size / timestep_duration) - self.feature_buffer_len = total_buffer_len - - self.reset() - cfg = copy.deepcopy(asr_model.cfg) - OmegaConf.set_struct(cfg.preprocessor, False) - - cfg.preprocessor.dither = 0.0 - cfg.preprocessor.pad_to = 0 - cfg.preprocessor.normalize = "None" - self.raw_preprocessor = EncDecCTCModelBPE.from_config_dict(cfg.preprocessor) - self.raw_preprocessor.to(asr_model.device) - - def reset(self): - ''' - Reset frame_history and decoder's state - ''' - self.buffer = torch.ones(self.buffer.shape, dtype=torch.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - self.frame_buffers = [] - self.sample_buffer = torch.zeros(int(self.buffer_size * self.sr)) - 
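# A toy version of the roll-and-append buffer update used by
# `_add_chunk_to_buffer` / `_update_feature_buffer` below: the oldest chunk is
# shifted out on the left and the newest chunk is written on the right.
import torch

buffer_len, chunk_len = 8, 2
sample_buffer = torch.zeros(buffer_len)
for step in range(1, 4):
    chunk = torch.full((chunk_len,), float(step))
    sample_buffer[:-chunk_len] = sample_buffer[chunk_len:].clone()
    sample_buffer[-chunk_len:] = chunk
print(sample_buffer)  # tensor([0., 0., 1., 1., 2., 2., 3., 3.])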
self.feature_buffer = ( - torch.ones([self.n_feat, self.feature_buffer_len], dtype=torch.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - ) - - def _add_chunk_to_buffer(self, chunk): - """ - Add time-series audio signal to `sample_buffer` - - Args: - chunk (Tensor): - Tensor filled with time-series audio signal - """ - self.sample_buffer[: -self.n_chunk_samples] = self.sample_buffer[self.n_chunk_samples :].clone() - self.sample_buffer[-self.n_chunk_samples :] = chunk.clone() - - def _update_feature_buffer(self, feat_chunk): - """ - Add an extracted feature to `feature_buffer` - """ - self.feature_buffer[:, : -self.feature_chunk_len] = self.feature_buffer[:, self.feature_chunk_len :].clone() - self.feature_buffer[:, -self.feature_chunk_len :] = feat_chunk.clone() - - def get_raw_feature_buffer(self): - return self.feature_buffer - - def get_normalized_feature_buffer(self): - normalized_buffer, _, _ = normalize_batch( - x=self.feature_buffer.unsqueeze(0), - seq_len=torch.tensor([len(self.feature_buffer)]), - normalize_type=self.model_normalize_type, - ) - return normalized_buffer.squeeze(0) - - def _convert_buffer_to_features(self): - """ - Extract features from the time-series audio buffer `sample_buffer`. - """ - # samples for conversion to features. - # Add look_back to have context for the first feature - samples = self.sample_buffer[: -(self.n_chunk_samples + self.n_chunk_look_back)] - device = self.asr_model.device - audio_signal = samples.unsqueeze_(0).to(device) - audio_signal_len = torch.Tensor([samples.shape[1]]).to(device) - features, features_len = self.raw_preprocessor(input_signal=audio_signal, length=audio_signal_len,) - features = features.squeeze() - self._update_feature_buffer(features[:, -self.feature_chunk_len :]) - - def update_feature_buffer(self, chunk): - """ - Update time-series signal `chunk` to the buffer then generate features out of the - signal in the audio buffer. 
- - Args: - chunk (Tensor): - Tensor filled with time-series audio signal - """ - if len(chunk) > self.n_chunk_samples: - raise ValueError(f"chunk should be of length {self.n_chunk_samples} or less") - if len(chunk) < self.n_chunk_samples: - temp_chunk = torch.zeros(self.n_chunk_samples, dtype=torch.float32) - temp_chunk[: chunk.shape[0]] = chunk - chunk = temp_chunk - self._add_chunk_to_buffer(chunk) - self._convert_buffer_to_features() - - -class AudioFeatureIterator(IterableDataset): - def __init__(self, samples, frame_len, preprocessor, device): - self._samples = samples - self._frame_len = frame_len - self._start = 0 - self.output = True - self.count = 0 - timestep_duration = preprocessor._cfg['window_stride'] - self._feature_frame_len = frame_len / timestep_duration - audio_signal = torch.from_numpy(self._samples).unsqueeze_(0).to(device) - audio_signal_len = torch.Tensor([self._samples.shape[0]]).to(device) - self._features, self._features_len = preprocessor(input_signal=audio_signal, length=audio_signal_len,) - self._features = self._features.squeeze() - - def __iter__(self): - return self - - def __next__(self): - if not self.output: - raise StopIteration - last = int(self._start + self._feature_frame_len) - if last <= self._features_len[0]: - frame = self._features[:, self._start : last].cpu() - self._start = last - else: - frame = np.zeros([self._features.shape[0], int(self._feature_frame_len)], dtype='float32') - samp_len = self._features_len[0] - self._start - frame[:, 0:samp_len] = self._features[:, self._start : self._features_len[0]].cpu() - self.output = False - self.count += 1 - return frame - - -def speech_collate_fn(batch): - """collate batch of audio sig, audio len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 1d torch tensors (i.e. mono audio). - """ - _, audio_lengths = zip(*batch) - max_audio_len = 0 - has_audio = audio_lengths[0] is not None - if has_audio: - max_audio_len = max(audio_lengths).item() - - audio_signal = [] - for sig, sig_len in batch: - if has_audio: - sig_len = sig_len.item() - if sig_len < max_audio_len: - pad = (0, max_audio_len - sig_len) - sig = torch.nn.functional.pad(sig, pad) - audio_signal.append(sig) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.stack(audio_lengths) - else: - audio_signal, audio_lengths = None, None - - return audio_signal, audio_lengths - - -# simple data layer to pass buffered frames of audio samples -class AudioBuffersDataLayer(IterableDataset): - @property - def output_types(self): - return { - "processed_signal": NeuralType(('B', 'D', 'T'), MelSpectrogramType()), - "processed_length": NeuralType(tuple('B'), LengthsType()), - } - - def __init__(self): - super().__init__() - - def __iter__(self): - return self - - def __next__(self): - if self._buf_count == len(self.signal): - raise StopIteration - self._buf_count += 1 - return ( - torch.as_tensor(self.signal[self._buf_count - 1], dtype=torch.float32), - torch.as_tensor(self.signal_shape[1], dtype=torch.int64), - ) - - def set_signal(self, signals): - self.signal = signals - self.signal_shape = self.signal[0].shape - self._buf_count = 0 - - def __len__(self): - return 1 - - -class FeatureFrameBufferer: - """ - Class to append each feature frame to a buffer and return - an array of buffers. 
- """ - - def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0): - ''' - Args: - frame_len: frame's duration, seconds - frame_overlap: duration of overlaps before and after current frame, seconds - offset: number of symbols to drop for smooth streaming - ''' - if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log: - self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal - else: - self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 - self.asr_model = asr_model - self.sr = asr_model._cfg.sample_rate - self.frame_len = frame_len - timestep_duration = asr_model._cfg.preprocessor.window_stride - self.n_frame_len = int(frame_len / timestep_duration) - - total_buffer_len = int(total_buffer / timestep_duration) - self.n_feat = asr_model._cfg.preprocessor.features - self.buffer = np.ones([self.n_feat, total_buffer_len], dtype=np.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - - self.batch_size = batch_size - - self.signal_end = False - self.frame_reader = None - self.feature_buffer_len = total_buffer_len - - self.feature_buffer = ( - np.ones([self.n_feat, self.feature_buffer_len], dtype=np.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - ) - self.frame_buffers = [] - self.buffered_features_size = 0 - self.reset() - self.buffered_len = 0 - - def reset(self): - ''' - Reset frame_history and decoder's state - ''' - self.buffer = np.ones(shape=self.buffer.shape, dtype=np.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - self.prev_char = '' - self.unmerged = [] - self.frame_buffers = [] - self.buffered_len = 0 - self.feature_buffer = ( - np.ones([self.n_feat, self.feature_buffer_len], dtype=np.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - ) - - def get_batch_frames(self): - if self.signal_end: - return [] - batch_frames = [] - for frame in self.frame_reader: - batch_frames.append(np.copy(frame)) - if len(batch_frames) == self.batch_size: - return batch_frames - self.signal_end = True - - return batch_frames - - def get_frame_buffers(self, frames): - # Build buffers for each frame - self.frame_buffers = [] - for frame in frames: - self.buffer[:, : -self.n_frame_len] = self.buffer[:, self.n_frame_len :] - self.buffer[:, -self.n_frame_len :] = frame - self.buffered_len += frame.shape[1] - self.frame_buffers.append(np.copy(self.buffer)) - return self.frame_buffers - - def set_frame_reader(self, frame_reader): - self.frame_reader = frame_reader - self.signal_end = False - - def _update_feature_buffer(self, feat_frame): - self.feature_buffer[:, : -feat_frame.shape[1]] = self.feature_buffer[:, feat_frame.shape[1] :] - self.feature_buffer[:, -feat_frame.shape[1] :] = feat_frame - self.buffered_features_size += feat_frame.shape[1] - - def get_norm_consts_per_frame(self, batch_frames): - norm_consts = [] - for i, frame in enumerate(batch_frames): - self._update_feature_buffer(frame) - mean_from_buffer = np.mean(self.feature_buffer, axis=1) - stdev_from_buffer = np.std(self.feature_buffer, axis=1) - norm_consts.append((mean_from_buffer.reshape(self.n_feat, 1), stdev_from_buffer.reshape(self.n_feat, 1))) - return norm_consts - - def normalize_frame_buffers(self, frame_buffers, norm_consts): - CONSTANT = 1e-5 - for i, frame_buffer in enumerate(frame_buffers): - frame_buffers[i] = (frame_buffer - norm_consts[i][0]) / (norm_consts[i][1] + CONSTANT) - - def get_buffers_batch(self): - batch_frames = self.get_batch_frames() - - while len(batch_frames) > 0: - - frame_buffers = self.get_frame_buffers(batch_frames) - norm_consts = self.get_norm_consts_per_frame(batch_frames) - if len(frame_buffers) == 
0: - continue - self.normalize_frame_buffers(frame_buffers, norm_consts) - return frame_buffers - return [] - - -# class for streaming frame-based ASR -# 1) use reset() method to reset FrameASR's state -# 2) call transcribe(frame) to do ASR on -# contiguous signal's frames -class FrameBatchASR: - """ - class for streaming frame-based ASR use reset() method to reset FrameASR's - state call transcribe(frame) to do ASR on contiguous signal's frames - """ - - def __init__( - self, asr_model, frame_len=1.6, total_buffer=4.0, batch_size=4, - ): - ''' - Args: - frame_len: frame's duration, seconds - frame_overlap: duration of overlaps before and after current frame, seconds - offset: number of symbols to drop for smooth streaming - ''' - self.frame_bufferer = FeatureFrameBufferer( - asr_model=asr_model, frame_len=frame_len, batch_size=batch_size, total_buffer=total_buffer - ) - - self.asr_model = asr_model - self.decoder = asr_model.decoder - - self.batch_size = batch_size - self.all_logits = [] - self.all_preds = [] - - self.unmerged = [] - - if hasattr(asr_model.decoder, "vocabulary"): - self.blank_id = len(asr_model.decoder.vocabulary) - else: - self.blank_id = len(asr_model.joint.vocabulary) - self.tokenizer = asr_model.tokenizer - self.toks_unmerged = [] - self.frame_buffers = [] - self.reset() - cfg = copy.deepcopy(asr_model._cfg) - self.cfg = cfg - self.frame_len = frame_len - OmegaConf.set_struct(cfg.preprocessor, False) - - # some changes for streaming scenario - cfg.preprocessor.dither = 0.0 - cfg.preprocessor.pad_to = 0 - cfg.preprocessor.normalize = "None" - self.raw_preprocessor = EncDecCTCModelBPE.from_config_dict(cfg.preprocessor) - self.raw_preprocessor.to(asr_model.device) - self.preprocessor = self.raw_preprocessor - - def reset(self): - """ - Reset frame_history and decoder's state - """ - self.prev_char = '' - self.unmerged = [] - self.data_layer = AudioBuffersDataLayer() - self.data_loader = DataLoader(self.data_layer, batch_size=self.batch_size, collate_fn=speech_collate_fn) - self.all_logits = [] - self.all_preds = [] - self.toks_unmerged = [] - self.frame_buffers = [] - self.frame_bufferer.reset() - - def read_audio_file(self, audio_filepath: str, delay, model_stride_in_secs): - samples = get_samples(audio_filepath) - samples = np.pad(samples, (0, int(delay * model_stride_in_secs * self.asr_model._cfg.sample_rate))) - frame_reader = AudioFeatureIterator(samples, self.frame_len, self.raw_preprocessor, self.asr_model.device) - self.set_frame_reader(frame_reader) - - def set_frame_reader(self, frame_reader): - self.frame_bufferer.set_frame_reader(frame_reader) - - @torch.no_grad() - def infer_logits(self, keep_logits=False): - frame_buffers = self.frame_bufferer.get_buffers_batch() - - while len(frame_buffers) > 0: - self.frame_buffers += frame_buffers[:] - self.data_layer.set_signal(frame_buffers[:]) - self._get_batch_preds(keep_logits) - frame_buffers = self.frame_bufferer.get_buffers_batch() - - @torch.no_grad() - def _get_batch_preds(self, keep_logits=False): - device = self.asr_model.device - for batch in iter(self.data_loader): - - feat_signal, feat_signal_len = batch - feat_signal, feat_signal_len = feat_signal.to(device), feat_signal_len.to(device) - forward_outs = self.asr_model(processed_signal=feat_signal, processed_signal_length=feat_signal_len) - - if len(forward_outs) == 2: # hybrid ctc rnnt model - encoded, encoded_len = forward_outs - log_probs = self.asr_model.ctc_decoder(encoder_output=encoded) - predictions = log_probs.argmax(dim=-1, keepdim=False) - else: 
- log_probs, encoded_len, predictions = forward_outs - - preds = torch.unbind(predictions) - for pred in preds: - self.all_preds.append(pred.cpu().numpy()) - if keep_logits: - log_probs = torch.unbind(log_probs) - for log_prob in log_probs: - self.all_logits.append(log_prob.cpu()) - else: - del log_probs - del encoded_len - del predictions - - def transcribe(self, tokens_per_chunk: int, delay: int, keep_logits=False): - self.infer_logits(keep_logits) - self.unmerged = [] - for pred in self.all_preds: - decoded = pred.tolist() - self.unmerged += decoded[len(decoded) - 1 - delay : len(decoded) - 1 - delay + tokens_per_chunk] - hypothesis = self.greedy_merge(self.unmerged) - if not keep_logits: - return hypothesis - - all_logits = [] - for log_prob in self.all_logits: - T = log_prob.shape[0] - log_prob = log_prob[T - 1 - delay : T - 1 - delay + tokens_per_chunk, :] - all_logits.append(log_prob) - all_logits = torch.concat(all_logits, 0) - return hypothesis, all_logits - - def greedy_merge(self, preds): - decoded_prediction = [] - previous = self.blank_id - for p in preds: - if (p != previous or previous == self.blank_id) and p != self.blank_id: - decoded_prediction.append(p) - previous = p - hypothesis = self.tokenizer.ids_to_text(decoded_prediction) - return hypothesis - - -class BatchedFeatureFrameBufferer(FeatureFrameBufferer): - """ - Batched variant of FeatureFrameBufferer where batch dimension is the independent audio samples. - """ - - def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0): - ''' - Args: - frame_len: frame's duration, seconds - frame_overlap: duration of overlaps before and after current frame, seconds - offset: number of symbols to drop for smooth streaming - ''' - super().__init__(asr_model, frame_len=frame_len, batch_size=batch_size, total_buffer=total_buffer) - - # OVERRIDES OF BASE CLASS - timestep_duration = asr_model._cfg.preprocessor.window_stride - total_buffer_len = int(total_buffer / timestep_duration) - self.buffer = ( - np.ones([batch_size, self.n_feat, total_buffer_len], dtype=np.float32) * self.ZERO_LEVEL_SPEC_DB_VAL - ) - - # Preserve list of buffers and indices, one for every sample - self.all_frame_reader = [None for _ in range(self.batch_size)] - self.signal_end = [False for _ in range(self.batch_size)] - self.signal_end_index = [None for _ in range(self.batch_size)] - self.buffer_number = 0 # preserve number of buffers returned since reset. - - self.reset() - del self.buffered_len - del self.buffered_features_size - - def reset(self): - ''' - Reset frame_history and decoder's state - ''' - super().reset() - self.feature_buffer = ( - np.ones([self.batch_size, self.n_feat, self.feature_buffer_len], dtype=np.float32) - * self.ZERO_LEVEL_SPEC_DB_VAL - ) - self.all_frame_reader = [None for _ in range(self.batch_size)] - self.signal_end = [False for _ in range(self.batch_size)] - self.signal_end_index = [None for _ in range(self.batch_size)] - self.buffer_number = 0 - - def get_batch_frames(self): - # Exit if all buffers of all samples have been processed - if all(self.signal_end): - return [] - - # Otherwise sequentially process frames of each sample one by one. 
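For reference, the greedy collapse rule used by `greedy_merge` above (deduplicate repeated ids, drop blanks) can be exercised on plain integer ids. This minimal sketch replaces `tokenizer.ids_to_text` with the raw id list so it runs standalone:

```python
# Minimal sketch of the CTC-style greedy collapse performed in greedy_merge:
# repeated ids are deduplicated and blank ids are removed.
from typing import List


def greedy_collapse(pred_ids: List[int], blank_id: int) -> List[int]:
    decoded: List[int] = []
    previous = blank_id
    for p in pred_ids:
        if (p != previous or previous == blank_id) and p != blank_id:
            decoded.append(p)
        previous = p
    return decoded


if __name__ == "__main__":
    # with blank_id = 0, the frame-wise argmax [1, 1, 0, 2, 2, 2, 0, 1] collapses to [1, 2, 1]
    print(greedy_collapse([1, 1, 0, 2, 2, 2, 0, 1], blank_id=0))
```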
- batch_frames = [] - for idx, frame_reader in enumerate(self.all_frame_reader): - try: - frame = next(frame_reader) - frame = np.copy(frame) - - batch_frames.append(frame) - except StopIteration: - # If this sample has finished all of its buffers - # Set its signal_end flag, and assign it the id of which buffer index - # did it finish the sample (if not previously set) - # This will let the alignment module know which sample in the batch finished - # at which index. - batch_frames.append(None) - self.signal_end[idx] = True - - if self.signal_end_index[idx] is None: - self.signal_end_index[idx] = self.buffer_number - - self.buffer_number += 1 - return batch_frames - - def get_frame_buffers(self, frames): - # Build buffers for each frame - self.frame_buffers = [] - # Loop over all buffers of all samples - for idx in range(self.batch_size): - frame = frames[idx] - # If the sample has a buffer, then process it as usual - if frame is not None: - self.buffer[idx, :, : -self.n_frame_len] = self.buffer[idx, :, self.n_frame_len :] - self.buffer[idx, :, -self.n_frame_len :] = frame - # self.buffered_len += frame.shape[1] - # WRAP the buffer at index idx into a outer list - self.frame_buffers.append([np.copy(self.buffer[idx])]) - else: - # If the buffer does not exist, the sample has finished processing - # set the entire buffer for that sample to 0 - self.buffer[idx, :, :] *= 0.0 - self.frame_buffers.append([np.copy(self.buffer[idx])]) - - return self.frame_buffers - - def set_frame_reader(self, frame_reader, idx): - self.all_frame_reader[idx] = frame_reader - self.signal_end[idx] = False - self.signal_end_index[idx] = None - - def _update_feature_buffer(self, feat_frame, idx): - # Update the feature buffer for given sample, or reset if the sample has finished processing - if feat_frame is not None: - self.feature_buffer[idx, :, : -feat_frame.shape[1]] = self.feature_buffer[idx, :, feat_frame.shape[1] :] - self.feature_buffer[idx, :, -feat_frame.shape[1] :] = feat_frame - # self.buffered_features_size += feat_frame.shape[1] - else: - self.feature_buffer[idx, :, :] *= 0.0 - - def get_norm_consts_per_frame(self, batch_frames): - for idx, frame in enumerate(batch_frames): - self._update_feature_buffer(frame, idx) - - mean_from_buffer = np.mean(self.feature_buffer, axis=2, keepdims=True) # [B, self.n_feat, 1] - stdev_from_buffer = np.std(self.feature_buffer, axis=2, keepdims=True) # [B, self.n_feat, 1] - - return (mean_from_buffer, stdev_from_buffer) - - def normalize_frame_buffers(self, frame_buffers, norm_consts): - CONSTANT = 1e-8 - for i in range(len(frame_buffers)): - frame_buffers[i] = (frame_buffers[i] - norm_consts[0][i]) / (norm_consts[1][i] + CONSTANT) - - def get_buffers_batch(self): - batch_frames = self.get_batch_frames() - - while len(batch_frames) > 0: - # while there exists at least one sample that has not been processed yet - frame_buffers = self.get_frame_buffers(batch_frames) - norm_consts = self.get_norm_consts_per_frame(batch_frames) - - self.normalize_frame_buffers(frame_buffers, norm_consts) - return frame_buffers - return [] - - -class BatchedFrameASRRNNT(FrameBatchASR): - """ - Batched implementation of FrameBatchASR for RNNT models, where the batch dimension is independent audio samples. - """ - - def __init__( - self, - asr_model, - frame_len=1.6, - total_buffer=4.0, - batch_size=32, - max_steps_per_timestep: int = 5, - stateful_decoding: bool = False, - ): - ''' - Args: - asr_model: An RNNT model. - frame_len: frame's duration, seconds. 
- total_buffer: duration of total audio chunk size, in seconds. - batch_size: Number of independent audio samples to process at each step. - max_steps_per_timestep: Maximum number of tokens (u) to process per acoustic timestep (t). - stateful_decoding: Boolean whether to enable stateful decoding for preservation of state across buffers. - ''' - super().__init__(asr_model, frame_len=frame_len, total_buffer=total_buffer, batch_size=batch_size) - - # OVERRIDES OF THE BASE CLASS - self.max_steps_per_timestep = max_steps_per_timestep - self.stateful_decoding = stateful_decoding - - self.all_alignments = [[] for _ in range(self.batch_size)] - self.all_preds = [[] for _ in range(self.batch_size)] - self.all_timestamps = [[] for _ in range(self.batch_size)] - self.previous_hypotheses = None - self.batch_index_map = { - idx: idx for idx in range(self.batch_size) - } # pointer from global batch id : local sub-batch id - - try: - self.eos_id = self.asr_model.tokenizer.eos_id - except Exception: - self.eos_id = -1 - - print("Performing Stateful decoding :", self.stateful_decoding) - - # OVERRIDES - self.frame_bufferer = BatchedFeatureFrameBufferer( - asr_model=asr_model, frame_len=frame_len, batch_size=batch_size, total_buffer=total_buffer - ) - - self.reset() - - def reset(self): - """ - Reset frame_history and decoder's state - """ - super().reset() - - self.all_alignments = [[] for _ in range(self.batch_size)] - self.all_preds = [[] for _ in range(self.batch_size)] - self.all_timestamps = [[] for _ in range(self.batch_size)] - self.previous_hypotheses = None - self.batch_index_map = {idx: idx for idx in range(self.batch_size)} - - self.data_layer = [AudioBuffersDataLayer() for _ in range(self.batch_size)] - self.data_loader = [ - DataLoader(self.data_layer[idx], batch_size=1, collate_fn=speech_collate_fn) - for idx in range(self.batch_size) - ] - - def read_audio_file(self, audio_filepath: list, delay, model_stride_in_secs): - assert len(audio_filepath) == self.batch_size - - # Read in a batch of audio files, one by one - for idx in range(self.batch_size): - samples = get_samples(audio_filepath[idx]) - samples = np.pad(samples, (0, int(delay * model_stride_in_secs * self.asr_model._cfg.sample_rate))) - frame_reader = AudioFeatureIterator(samples, self.frame_len, self.raw_preprocessor, self.asr_model.device) - self.set_frame_reader(frame_reader, idx) - - def set_frame_reader(self, frame_reader, idx): - self.frame_bufferer.set_frame_reader(frame_reader, idx) - - @torch.no_grad() - def infer_logits(self): - frame_buffers = self.frame_bufferer.get_buffers_batch() - - while len(frame_buffers) > 0: - # While at least 1 sample has a buffer left to process - self.frame_buffers += frame_buffers[:] - - for idx, buffer in enumerate(frame_buffers): - self.data_layer[idx].set_signal(buffer[:]) - - self._get_batch_preds() - frame_buffers = self.frame_bufferer.get_buffers_batch() - - @torch.no_grad() - def _get_batch_preds(self): - """ - Perform dynamic batch size decoding of frame buffers of all samples. - - Steps: - - Load all data loaders of every sample - - For all samples, determine if signal has finished. - - If so, skip calculation of mel-specs. - - If not, compute mel spec and length - - Perform Encoder forward over this sub-batch of samples. Maintain the indices of samples that were processed. - - If performing stateful decoding, prior to decoder forward, remove the states of samples that were not processed. - - Perform Decoder + Joint forward for samples that were processed. 
- - For all output RNNT alignment matrix of the joint do: - - If signal has ended previously (this was last buffer of padding), skip alignment - - Otherwise, recalculate global index of this sample from the sub-batch index, and preserve alignment. - - Same for preds - - Update indices of sub-batch with global index map. - - Redo steps until all samples were processed (sub-batch size == 0). - """ - device = self.asr_model.device - - data_iters = [iter(data_loader) for data_loader in self.data_loader] - - feat_signals = [] - feat_signal_lens = [] - - new_batch_keys = [] - # while not all(self.frame_bufferer.signal_end): - for idx in range(self.batch_size): - if self.frame_bufferer.signal_end[idx]: - continue - - batch = next(data_iters[idx]) - feat_signal, feat_signal_len = batch - feat_signal, feat_signal_len = feat_signal.to(device), feat_signal_len.to(device) - - feat_signals.append(feat_signal) - feat_signal_lens.append(feat_signal_len) - - # preserve batch indeices - new_batch_keys.append(idx) - - if len(feat_signals) == 0: - return - - feat_signal = torch.cat(feat_signals, 0) - feat_signal_len = torch.cat(feat_signal_lens, 0) - - del feat_signals, feat_signal_lens - - encoded, encoded_len = self.asr_model(processed_signal=feat_signal, processed_signal_length=feat_signal_len) - - # filter out partial hypotheses from older batch subset - if self.stateful_decoding and self.previous_hypotheses is not None: - new_prev_hypothesis = [] - for new_batch_idx, global_index_key in enumerate(new_batch_keys): - old_pos = self.batch_index_map[global_index_key] - new_prev_hypothesis.append(self.previous_hypotheses[old_pos]) - self.previous_hypotheses = new_prev_hypothesis - - best_hyp, _ = self.asr_model.decoding.rnnt_decoder_predictions_tensor( - encoded, encoded_len, return_hypotheses=True, partial_hypotheses=self.previous_hypotheses - ) - - if self.stateful_decoding: - # preserve last state from hypothesis of new batch indices - self.previous_hypotheses = best_hyp - - for idx, hyp in enumerate(best_hyp): - global_index_key = new_batch_keys[idx] # get index of this sample in the global batch - - has_signal_ended = self.frame_bufferer.signal_end[global_index_key] - if not has_signal_ended: - self.all_alignments[global_index_key].append(hyp.alignments) - - preds = [hyp.y_sequence for hyp in best_hyp] - for idx, pred in enumerate(preds): - global_index_key = new_batch_keys[idx] # get index of this sample in the global batch - - has_signal_ended = self.frame_bufferer.signal_end[global_index_key] - if not has_signal_ended: - self.all_preds[global_index_key].append(pred.cpu().numpy()) - - timestamps = [hyp.timestep for hyp in best_hyp] - for idx, timestep in enumerate(timestamps): - global_index_key = new_batch_keys[idx] # get index of this sample in the global batch - - has_signal_ended = self.frame_bufferer.signal_end[global_index_key] - if not has_signal_ended: - self.all_timestamps[global_index_key].append(timestep) - - if self.stateful_decoding: - # State resetting is being done on sub-batch only, global index information is not being updated - reset_states = self.asr_model.decoder.initialize_state(encoded) - - for idx, pred in enumerate(preds): - if len(pred) > 0 and pred[-1] == self.eos_id: - # reset states : - self.previous_hypotheses[idx].y_sequence = self.previous_hypotheses[idx].y_sequence[:-1] - self.previous_hypotheses[idx].dec_state = self.asr_model.decoder.batch_select_state( - reset_states, idx - ) - - # Position map update - if len(new_batch_keys) != len(self.batch_index_map): - for 
new_batch_idx, global_index_key in enumerate(new_batch_keys): - self.batch_index_map[global_index_key] = new_batch_idx # let index point from global pos -> local pos - - del encoded, encoded_len - del best_hyp, pred - - def transcribe( - self, tokens_per_chunk: int, delay: int, - ): - """ - Performs "middle token" alignment prediction using the buffered audio chunk. - """ - self.infer_logits() - - self.unmerged = [[] for _ in range(self.batch_size)] - for idx, alignments in enumerate(self.all_alignments): - - signal_end_idx = self.frame_bufferer.signal_end_index[idx] - if signal_end_idx is None: - raise ValueError("Signal did not end") - - for a_idx, alignment in enumerate(alignments): - if delay == len(alignment): # chunk size = buffer size - offset = 0 - else: # all other cases - offset = 1 - - alignment = alignment[ - len(alignment) - offset - delay : len(alignment) - offset - delay + tokens_per_chunk - ] - - ids, toks = self._alignment_decoder(alignment, self.asr_model.tokenizer, self.blank_id) - - if len(ids) > 0 and a_idx < signal_end_idx: - self.unmerged[idx] = inplace_buffer_merge(self.unmerged[idx], ids, delay, model=self.asr_model,) - - output = [] - for idx in range(self.batch_size): - output.append(self.greedy_merge(self.unmerged[idx])) - return output - - def _alignment_decoder(self, alignments, tokenizer, blank_id): - s = [] - ids = [] - - for t in range(len(alignments)): - for u in range(len(alignments[t])): - _, token_id = alignments[t][u] # (logprob, token_id) - token_id = int(token_id) - if token_id != blank_id: - token = tokenizer.ids_to_tokens([token_id])[0] - s.append(token) - ids.append(token_id) - - else: - # blank token - pass - - return ids, s - - def greedy_merge(self, preds): - decoded_prediction = [p for p in preds] - hypothesis = self.asr_model.tokenizer.ids_to_text(decoded_prediction) - return hypothesis - - -class LongestCommonSubsequenceBatchedFrameASRRNNT(BatchedFrameASRRNNT): - """ - Implements a token alignment algorithm for text alignment instead of middle token alignment. - - For more detail, read the docstring of longest_common_subsequence_merge(). - """ - - def __init__( - self, - asr_model, - frame_len=1.6, - total_buffer=4.0, - batch_size=4, - max_steps_per_timestep: int = 5, - stateful_decoding: bool = False, - alignment_basepath: str = None, - ): - ''' - Args: - asr_model: An RNNT model. - frame_len: frame's duration, seconds. - total_buffer: duration of total audio chunk size, in seconds. - batch_size: Number of independent audio samples to process at each step. - max_steps_per_timestep: Maximum number of tokens (u) to process per acoustic timestep (t). - stateful_decoding: Boolean whether to enable stateful decoding for preservation of state across buffers. - alignment_basepath: Str path to a directory where alignments from LCS will be preserved for later analysis. 
- ''' - super().__init__(asr_model, frame_len, total_buffer, batch_size, max_steps_per_timestep, stateful_decoding) - self.sample_offset = 0 - self.lcs_delay = -1 - - self.alignment_basepath = alignment_basepath - - def transcribe( - self, tokens_per_chunk: int, delay: int, - ): - if self.lcs_delay < 0: - raise ValueError( - "Please set LCS Delay valus as `(buffer_duration - chunk_duration) / model_stride_in_secs`" - ) - - self.infer_logits() - - self.unmerged = [[] for _ in range(self.batch_size)] - for idx, alignments in enumerate(self.all_alignments): - - signal_end_idx = self.frame_bufferer.signal_end_index[idx] - if signal_end_idx is None: - raise ValueError("Signal did not end") - - for a_idx, alignment in enumerate(alignments): - - # Middle token first chunk - if a_idx == 0: - # len(alignment) - 1 - delay + tokens_per_chunk - alignment = alignment[len(alignment) - 1 - delay :] - ids, toks = self._alignment_decoder(alignment, self.asr_model.tokenizer, self.blank_id) - - if len(ids) > 0: - self.unmerged[idx] = inplace_buffer_merge( - self.unmerged[idx], ids, delay, model=self.asr_model, - ) - - else: - ids, toks = self._alignment_decoder(alignment, self.asr_model.tokenizer, self.blank_id) - if len(ids) > 0 and a_idx < signal_end_idx: - - if self.alignment_basepath is not None: - basepath = self.alignment_basepath - sample_offset = self.sample_offset + idx - alignment_offset = a_idx - path = os.path.join(basepath, str(sample_offset)) - - os.makedirs(path, exist_ok=True) - path = os.path.join(path, "alignment_" + str(alignment_offset) + '.pt') - - filepath = path - else: - filepath = None - - self.unmerged[idx] = lcs_alignment_merge_buffer( - self.unmerged[idx], - ids, - self.lcs_delay, - model=self.asr_model, - max_steps_per_timestep=self.max_steps_per_timestep, - filepath=filepath, - ) - - output = [] - for idx in range(self.batch_size): - output.append(self.greedy_merge(self.unmerged[idx])) - return output - - -class CacheAwareStreamingAudioBuffer: - """ - A buffer to be used for cache-aware streaming. It can load a single or multiple audio files/processed signals, split them in chunks and return one on one. - It can be used to simulate streaming audio or audios. - """ - - def __init__(self, model, online_normalization=None, pad_and_drop_preencoded=False): - ''' - Args: - model: An ASR model. - online_normalization (bool): whether to perform online normalization per chunk or normalize the whole audio before chunking - pad_and_drop_preencoded (bool): if true pad first audio chunk and always drop preencoded - ''' - self.model = model - self.buffer = None - self.buffer_idx = 0 - self.streams_length = None - self.step = 0 - self.pad_and_drop_preencoded = pad_and_drop_preencoded - - self.online_normalization = online_normalization - if not isinstance(model.encoder, StreamingEncoder): - raise ValueError( - "The model's encoder is not inherited from StreamingEncoder, and likely not to support streaming!" 
- ) - if model.encoder.streaming_cfg is None: - model.encoder.setup_streaming_params() - self.streaming_cfg = model.encoder.streaming_cfg - - self.input_features = model.encoder._feat_in - - self.preprocessor = self.extract_preprocessor() - - if hasattr(model.encoder, "pre_encode") and hasattr(model.encoder.pre_encode, "get_sampling_frames"): - self.sampling_frames = model.encoder.pre_encode.get_sampling_frames() - else: - self.sampling_frames = None - - def __iter__(self): - while True: - if self.buffer_idx >= self.buffer.size(-1): - return - - if self.buffer_idx == 0 and isinstance(self.streaming_cfg.chunk_size, list): - if self.pad_and_drop_preencoded: - chunk_size = self.streaming_cfg.chunk_size[1] - else: - chunk_size = self.streaming_cfg.chunk_size[0] - else: - chunk_size = ( - self.streaming_cfg.chunk_size[1] - if isinstance(self.streaming_cfg.chunk_size, list) - else self.streaming_cfg.chunk_size - ) - - if self.buffer_idx == 0 and isinstance(self.streaming_cfg.shift_size, list): - if self.pad_and_drop_preencoded: - shift_size = self.streaming_cfg.shift_size[1] - else: - shift_size = self.streaming_cfg.shift_size[0] - else: - shift_size = ( - self.streaming_cfg.shift_size[1] - if isinstance(self.streaming_cfg.shift_size, list) - else self.streaming_cfg.shift_size - ) - - audio_chunk = self.buffer[:, :, self.buffer_idx : self.buffer_idx + chunk_size] - - if self.sampling_frames is not None: - # checking to make sure the audio chunk has enough frames to produce at least one output after downsampling - if self.buffer_idx == 0 and isinstance(self.sampling_frames, list): - cur_sampling_frames = self.sampling_frames[0] - else: - cur_sampling_frames = ( - self.sampling_frames[1] if isinstance(self.sampling_frames, list) else self.sampling_frames - ) - if audio_chunk.size(-1) < cur_sampling_frames: - return - - # Adding the cache needed for the pre-encoder part of the model to the chunk - # if there is not enough frames to be used as the pre-encoding cache, zeros would be added - zeros_pads = None - if self.buffer_idx == 0 and isinstance(self.streaming_cfg.pre_encode_cache_size, list): - if self.pad_and_drop_preencoded: - cache_pre_encode_num_frames = self.streaming_cfg.pre_encode_cache_size[1] - else: - cache_pre_encode_num_frames = self.streaming_cfg.pre_encode_cache_size[0] - cache_pre_encode = torch.zeros( - (audio_chunk.size(0), self.input_features, cache_pre_encode_num_frames), - device=audio_chunk.device, - dtype=audio_chunk.dtype, - ) - else: - if isinstance(self.streaming_cfg.pre_encode_cache_size, list): - pre_encode_cache_size = self.streaming_cfg.pre_encode_cache_size[1] - else: - pre_encode_cache_size = self.streaming_cfg.pre_encode_cache_size - - start_pre_encode_cache = self.buffer_idx - pre_encode_cache_size - if start_pre_encode_cache < 0: - start_pre_encode_cache = 0 - cache_pre_encode = self.buffer[:, :, start_pre_encode_cache : self.buffer_idx] - if cache_pre_encode.size(-1) < pre_encode_cache_size: - zeros_pads = torch.zeros( - ( - audio_chunk.size(0), - audio_chunk.size(-2), - pre_encode_cache_size - cache_pre_encode.size(-1), - ), - device=audio_chunk.device, - dtype=audio_chunk.dtype, - ) - - added_len = cache_pre_encode.size(-1) - audio_chunk = torch.cat((cache_pre_encode, audio_chunk), dim=-1) - - if self.online_normalization: - audio_chunk, x_mean, x_std = normalize_batch( - x=audio_chunk, - seq_len=torch.tensor([audio_chunk.size(-1)] * audio_chunk.size(0)), - normalize_type=self.model_normalize_type, - ) - - if zeros_pads is not None: - # TODO: check here when 
zero_pads is not None and added_len is already non-zero - audio_chunk = torch.cat((zeros_pads, audio_chunk), dim=-1) - added_len += zeros_pads.size(-1) - - max_chunk_lengths = self.streams_length - self.buffer_idx - max_chunk_lengths = max_chunk_lengths + added_len - chunk_lengths = torch.clamp(max_chunk_lengths, min=0, max=audio_chunk.size(-1)) - - self.buffer_idx += shift_size - self.step += 1 - yield audio_chunk, chunk_lengths - - def is_buffer_empty(self): - if self.buffer_idx >= self.buffer.size(-1): - return True - else: - return False - - def __len__(self): - return len(self.buffer) - - def reset_buffer(self): - self.buffer = None - self.buffer_idx = 0 - self.streams_length = None - self.step = 0 - - def reset_buffer_pointer(self): - self.buffer_idx = 0 - self.step = 0 - - def extract_preprocessor(self): - cfg = copy.deepcopy(self.model._cfg) - self.model_normalize_type = cfg.preprocessor.normalize - OmegaConf.set_struct(cfg.preprocessor, False) - cfg.preprocessor.dither = 0.0 - cfg.preprocessor.pad_to = 0 - if self.online_normalization: - cfg.preprocessor.normalize = "None" - - preprocessor = self.model.from_config_dict(cfg.preprocessor) - return preprocessor.to(self.get_model_device()) - - def append_audio_file(self, audio_filepath, stream_id=-1): - audio = get_samples(audio_filepath) - processed_signal, processed_signal_length, stream_id = self.append_audio(audio, stream_id) - return processed_signal, processed_signal_length, stream_id - - def append_audio(self, audio, stream_id=-1): - processed_signal, processed_signal_length = self.preprocess_audio(audio) - processed_signal, processed_signal_length, stream_id = self.append_processed_signal( - processed_signal, stream_id - ) - return processed_signal, processed_signal_length, stream_id - - def append_processed_signal(self, processed_signal, stream_id=-1): - processed_signal_length = torch.tensor(processed_signal.size(-1), device=processed_signal.device) - if stream_id >= 0 and (self.streams_length is not None and stream_id >= len(self.streams_length)): - raise ValueError("Not valid stream_id!") - if self.buffer is None: - if stream_id >= 0: - raise ValueError("stream_id can not be specified when there is no stream.") - self.buffer = processed_signal - self.streams_length = torch.tensor([processed_signal_length], device=processed_signal.device) - else: - if self.buffer.size(1) != processed_signal.size(1): - raise ValueError("Buffer and the processed signal have different dimensions!") - if stream_id < 0: - self.buffer = torch.nn.functional.pad(self.buffer, pad=(0, 0, 0, 0, 0, 1)) - self.streams_length = torch.cat( - (self.streams_length, torch.tensor([0], device=self.streams_length.device)), dim=-1 - ) - stream_id = len(self.streams_length) - 1 - needed_len = self.streams_length[stream_id] + processed_signal_length - if needed_len > self.buffer.size(-1): - self.buffer = torch.nn.functional.pad(self.buffer, pad=(0, needed_len - self.buffer.size(-1))) - - self.buffer[ - stream_id, :, self.streams_length[stream_id] : self.streams_length[stream_id] + processed_signal_length - ] = processed_signal - self.streams_length[stream_id] = self.streams_length[stream_id] + processed_signal.size(-1) - - if self.online_normalization: - processed_signal, x_mean, x_std = normalize_batch( - x=processed_signal, - seq_len=torch.tensor([processed_signal_length]), - normalize_type=self.model_normalize_type, - ) - return processed_signal, processed_signal_length, stream_id - - def get_model_device(self): - return self.model.device - - def 
preprocess_audio(self, audio, device=None): - if device is None: - device = self.get_model_device() - audio_signal = torch.from_numpy(audio).unsqueeze_(0).to(device) - audio_signal_len = torch.Tensor([audio.shape[0]]).to(device) - processed_signal, processed_signal_length = self.preprocessor( - input_signal=audio_signal, length=audio_signal_len - ) - return processed_signal, processed_signal_length - - def get_all_audios(self): - processed_signal = self.buffer - if self.online_normalization: - processed_signal, x_mean, x_std = normalize_batch( - x=processed_signal, - seq_len=torch.tensor(self.streams_length), - normalize_type=self.model_normalize_type, - ) - return processed_signal, self.streams_length diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/transcribe_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/transcribe_utils.py deleted file mode 100644 index 8d80396dd82ebd46b2d77545a750d1f06f542531..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/transcribe_utils.py +++ /dev/null @@ -1,612 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import glob -import json -import os -import re -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig -from tqdm.auto import tqdm - -import nemo.collections.asr as nemo_asr -from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel -from nemo.collections.asr.parts.utils import rnnt_utils -from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR -from nemo.collections.common.metrics.punct_er import OccurancePunctuationErrorRate -from nemo.collections.common.parts.preprocessing.manifest import get_full_path -from nemo.utils import logging, model_utils - - -def get_buffered_pred_feat_rnnt( - asr: FrameBatchASR, - tokens_per_chunk: int, - delay: int, - model_stride_in_secs: int, - batch_size: int, - manifest: str = None, - filepaths: List[list] = None, -) -> List[rnnt_utils.Hypothesis]: - """ - Moved from examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py - Write all information presented in input manifest to output manifest and removed WER calculation. 
- """ - hyps = [] - refs = [] - - if filepaths and manifest: - raise ValueError("Please select either filepaths or manifest") - if filepaths is None and manifest is None: - raise ValueError("Either filepaths or manifest shoud not be None") - - if manifest: - filepaths = [] - with open(manifest, "r", encoding='utf_8') as mfst_f: - print("Parsing manifest files...") - for l in mfst_f: - row = json.loads(l.strip()) - audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) - filepaths.append(audio_file) - if 'text' in row: - refs.append(row['text']) - - with torch.inference_mode(): - with torch.cuda.amp.autocast(): - batch = [] - asr.sample_offset = 0 - for idx in tqdm(range(len(filepaths)), desc='Sample:', total=len(filepaths)): - batch.append((filepaths[idx])) - - if len(batch) == batch_size: - audio_files = [sample for sample in batch] - - asr.reset() - asr.read_audio_file(audio_files, delay, model_stride_in_secs) - hyp_list = asr.transcribe(tokens_per_chunk, delay) - hyps.extend(hyp_list) - - batch.clear() - asr.sample_offset += batch_size - - if len(batch) > 0: - asr.batch_size = len(batch) - asr.frame_bufferer.batch_size = len(batch) - asr.reset() - - audio_files = [sample for sample in batch] - asr.read_audio_file(audio_files, delay, model_stride_in_secs) - hyp_list = asr.transcribe(tokens_per_chunk, delay) - hyps.extend(hyp_list) - - batch.clear() - asr.sample_offset += len(batch) - - if os.environ.get('DEBUG', '0') in ('1', 'y', 't'): - if len(refs) == 0: - print("ground-truth text does not present!") - for hyp in hyps: - print("hyp:", hyp) - else: - for hyp, ref in zip(hyps, refs): - print("hyp:", hyp) - print("ref:", ref) - - wrapped_hyps = wrap_transcription(hyps) - return wrapped_hyps - - -def get_buffered_pred_feat( - asr: FrameBatchASR, - frame_len: float, - tokens_per_chunk: int, - delay: int, - preprocessor_cfg: DictConfig, - model_stride_in_secs: int, - device: Union[List[int], int], - manifest: str = None, - filepaths: List[list] = None, -) -> List[rnnt_utils.Hypothesis]: - """ - Moved from examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py - Write all information presented in input manifest to output manifest and removed WER calculation. 
- """ - # Create a preprocessor to convert audio samples into raw features, - # Normalization will be done per buffer in frame_bufferer - # Do not normalize whatever the model's preprocessor setting is - preprocessor_cfg.normalize = "None" - preprocessor = nemo_asr.models.EncDecCTCModelBPE.from_config_dict(preprocessor_cfg) - preprocessor.to(device) - hyps = [] - refs = [] - - if filepaths and manifest: - raise ValueError("Please select either filepaths or manifest") - if filepaths is None and manifest is None: - raise ValueError("Either filepaths or manifest shoud not be None") - - if filepaths: - for l in tqdm(filepaths, desc="Sample:"): - asr.reset() - asr.read_audio_file(l, delay, model_stride_in_secs) - hyp = asr.transcribe(tokens_per_chunk, delay) - hyps.append(hyp) - else: - with open(manifest, "r", encoding='utf_8') as mfst_f: - for l in tqdm(mfst_f, desc="Sample:"): - asr.reset() - row = json.loads(l.strip()) - if 'text' in row: - refs.append(row['text']) - audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) - # do not support partial audio - asr.read_audio_file(audio_file, delay, model_stride_in_secs) - hyp = asr.transcribe(tokens_per_chunk, delay) - hyps.append(hyp) - - if os.environ.get('DEBUG', '0') in ('1', 'y', 't'): - if len(refs) == 0: - print("ground-truth text does not present!") - for hyp in hyps: - print("hyp:", hyp) - else: - for hyp, ref in zip(hyps, refs): - print("hyp:", hyp) - print("ref:", ref) - - wrapped_hyps = wrap_transcription(hyps) - return wrapped_hyps - - -def wrap_transcription(hyps: List[str]) -> List[rnnt_utils.Hypothesis]: - """ Wrap transcription to the expected format in func write_transcription """ - wrapped_hyps = [] - for hyp in hyps: - hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], text=hyp) - wrapped_hyps.append(hypothesis) - return wrapped_hyps - - -def setup_model(cfg: DictConfig, map_location: torch.device) -> Tuple[ASRModel, str]: - """ Setup model from cfg and return model and model name for next step """ - if cfg.model_path is not None and cfg.model_path != "None": - # restore model from .nemo file path - model_cfg = ASRModel.restore_from(restore_path=cfg.model_path, return_config=True) - classpath = model_cfg.target # original class path - imported_class = model_utils.import_class_by_path(classpath) # type: ASRModel - logging.info(f"Restoring model : {imported_class.__name__}") - asr_model = imported_class.restore_from( - restore_path=cfg.model_path, map_location=map_location, - ) # type: ASRModel - model_name = os.path.splitext(os.path.basename(cfg.model_path))[0] - else: - # restore model by name - asr_model = ASRModel.from_pretrained( - model_name=cfg.pretrained_name, map_location=map_location, - ) # type: ASRModel - model_name = cfg.pretrained_name - - if hasattr(cfg, "model_change"): - asr_model.change_attention_model( - self_attention_model=cfg.model_change.conformer.get("self_attention_model", None), - att_context_size=cfg.model_change.conformer.get("att_context_size", None), - ) - - return asr_model, model_name - - -def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: - """ Prepare audio data and decide whether it's partial_audio condition. 
""" - # this part may need refactor alongsides with refactor of transcribe - partial_audio = False - - if cfg.audio_dir is not None and not cfg.append_pred: - filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)) - else: - # get filenames from manifest - filepaths = [] - if os.stat(cfg.dataset_manifest).st_size == 0: - logging.error(f"The input dataset_manifest {cfg.dataset_manifest} is empty. Exiting!") - return None - - with open(cfg.dataset_manifest, 'r', encoding='utf_8') as f: - has_two_fields = [] - for line in f: - item = json.loads(line) - if "offset" in item and "duration" in item: - has_two_fields.append(True) - else: - has_two_fields.append(False) - audio_key = cfg.get('audio_key', 'audio_filepath') - audio_file = get_full_path(audio_file=item[audio_key], manifest_file=cfg.dataset_manifest) - filepaths.append(audio_file) - partial_audio = all(has_two_fields) - logging.info(f"\nTranscribing {len(filepaths)} files...\n") - - return filepaths, partial_audio - - -def compute_output_filename(cfg: DictConfig, model_name: str) -> DictConfig: - """ Compute filename of output manifest and update cfg""" - if cfg.output_filename is None: - # create default output filename - if cfg.audio_dir is not None: - cfg.output_filename = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + '.json' - elif cfg.pred_name_postfix is not None: - cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{cfg.pred_name_postfix}.json') - else: - cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{model_name}.json') - return cfg - - -def normalize_timestamp_output(timestamps: dict): - """ - Normalize the dictionary of timestamp values to JSON serializable values. - Expects the following keys to exist - - "start_offset": int-like object that represents the starting index of the token - in the full audio after downsampling. - "end_offset": int-like object that represents the ending index of the token - in the full audio after downsampling. - - Args: - timestamps: Nested dict. - - Returns: - Normalized `timestamps` dictionary (in-place normalized) - """ - for val_idx in range(len(timestamps)): - timestamps[val_idx]['start_offset'] = int(timestamps[val_idx]['start_offset']) - timestamps[val_idx]['end_offset'] = int(timestamps[val_idx]['end_offset']) - return timestamps - - -def write_transcription( - transcriptions: Union[List[rnnt_utils.Hypothesis], List[List[rnnt_utils.Hypothesis]], List[str]], - cfg: DictConfig, - model_name: str, - filepaths: List[str] = None, - compute_langs: bool = False, - compute_timestamps: bool = False, -) -> Tuple[str, str]: - """ Write generated transcription to output file. 
""" - if cfg.append_pred: - logging.info(f'Transcripts will be written in "{cfg.output_filename}" file') - if cfg.pred_name_postfix is not None: - pred_by_model_name = cfg.pred_name_postfix - else: - pred_by_model_name = model_name - pred_text_attr_name = 'pred_text_' + pred_by_model_name - else: - pred_text_attr_name = 'pred_text' - - return_hypotheses = True - if isinstance(transcriptions[0], str): # List[str]: - best_hyps = transcriptions - return_hypotheses = False - elif isinstance(transcriptions[0], rnnt_utils.Hypothesis): # List[rnnt_utils.Hypothesis] - best_hyps = transcriptions - assert cfg.decoding.beam.return_best_hypothesis, "Works only with return_best_hypothesis=true" - elif isinstance(transcriptions[0], list) and isinstance( - transcriptions[0][0], rnnt_utils.Hypothesis - ): # List[List[rnnt_utils.Hypothesis]] NBestHypothesis - best_hyps, beams = [], [] - for hyps in transcriptions: - best_hyps.append(hyps[0]) - if not cfg.decoding.beam.return_best_hypothesis: - beam = [] - for hyp in hyps: - beam.append((hyp.text, hyp.score)) - beams.append(beam) - else: - raise TypeError - - with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f: - if cfg.audio_dir is not None: - for idx, transcription in enumerate(best_hyps): # type: rnnt_utils.Hypothesis or str - if not return_hypotheses: # transcription is str - item = {'audio_filepath': filepaths[idx], pred_text_attr_name: transcription} - else: # transcription is Hypothesis - item = {'audio_filepath': filepaths[idx], pred_text_attr_name: transcription.text} - - if compute_timestamps: - timestamps = transcription.timestep - if timestamps is not None and isinstance(timestamps, dict): - timestamps.pop( - 'timestep', None - ) # Pytorch tensor calculating index of each token, not needed. - for key in timestamps.keys(): - values = normalize_timestamp_output(timestamps[key]) - item[f'timestamps_{key}'] = values - - if compute_langs: - item['pred_lang'] = transcription.langs - item['pred_lang_chars'] = transcription.langs_chars - if not cfg.decoding.beam.return_best_hypothesis: - item['beams'] = beams[idx] - f.write(json.dumps(item) + "\n") - else: - with open(cfg.dataset_manifest, 'r', encoding='utf-8') as fr: - for idx, line in enumerate(fr): - item = json.loads(line) - if not return_hypotheses: # transcription is str - item[pred_text_attr_name] = best_hyps[idx] - else: # transcription is Hypothesis - item[pred_text_attr_name] = best_hyps[idx].text - - if compute_timestamps: - timestamps = best_hyps[idx].timestep - if timestamps is not None and isinstance(timestamps, dict): - timestamps.pop( - 'timestep', None - ) # Pytorch tensor calculating index of each token, not needed. 
- for key in timestamps.keys(): - values = normalize_timestamp_output(timestamps[key]) - item[f'timestamps_{key}'] = values - - if compute_langs: - item['pred_lang'] = best_hyps[idx].langs - item['pred_lang_chars'] = best_hyps[idx].langs_chars - - if not cfg.decoding.beam.return_best_hypothesis: - item['beams'] = beams[idx] - f.write(json.dumps(item) + "\n") - - return cfg.output_filename, pred_text_attr_name - - -def transcribe_partial_audio( - asr_model, - path2manifest: str = None, - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - num_workers: int = 0, - channel_selector: Optional[int] = None, - augmentor: DictConfig = None, - decoder_type: Optional[str] = None, -) -> List[str]: - """ - See description of this function in trancribe() in nemo/collections/asr/models/ctc_models.py and nemo/collections/asr/models/rnnt_models.py - """ - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." - ) - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # We will store transcriptions here - hypotheses = [] - # Model's mode and device - mode = asr_model.training - device = next(asr_model.parameters()).device - dither_value = asr_model.preprocessor.featurizer.dither - pad_to_value = asr_model.preprocessor.featurizer.pad_to - - if decoder_type is not None: # Hybrid model - decode_function = ( - asr_model.decoding.rnnt_decoder_predictions_tensor - if decoder_type == 'rnnt' - else asr_model.ctc_decoding.ctc_decoder_predictions_tensor - ) - elif hasattr(asr_model, 'joint'): # RNNT model - decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor - else: # CTC model - decode_function = asr_model.decoding.ctc_decoder_predictions_tensor - - try: - asr_model.preprocessor.featurizer.dither = 0.0 - asr_model.preprocessor.featurizer.pad_to = 0 - # Switch model to evaluation mode - asr_model.eval() - # Freeze the encoder and decoder modules - asr_model.encoder.freeze() - asr_model.decoder.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - - config = { - 'manifest_filepath': path2manifest, - 'batch_size': batch_size, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = asr_model._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - outputs = asr_model.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - logits, logits_len = outputs[0], outputs[1] - - if isinstance(asr_model, EncDecHybridRNNTCTCModel) and decoder_type == "ctc": - logits = asr_model.ctc_decoder(encoder_output=logits) - - logits = logits.cpu() - - if logprobs: - logits = logits.numpy() - # dump log probs per file - for idx in range(logits.shape[0]): - lg = logits[idx][: logits_len[idx]] - hypotheses.append(lg) - else: - current_hypotheses, _ = decode_function(logits, logits_len, return_hypotheses=return_hypotheses,) - - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] - if current_hypotheses[idx].alignments is None: - current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence - - hypotheses += current_hypotheses - - del logits - del test_batch - - finally: - # set mode back to its original 
value - asr_model.train(mode=mode) - asr_model.preprocessor.featurizer.dither = dither_value - asr_model.preprocessor.featurizer.pad_to = pad_to_value - if mode is True: - asr_model.encoder.unfreeze() - asr_model.decoder.unfreeze() - logging.set_verbosity(logging_level) - return hypotheses - - -def compute_metrics_per_sample( - manifest_path: str, - reference_field: str = "text", - hypothesis_field: str = "pred_text", - metrics: list[str] = ["wer"], - punctuation_marks: list[str] = [".", ",", "?"], - output_manifest_path: str = None, -) -> dict: - - ''' - Computes metrics per sample for given manifest - - Args: - manifest_path: str, Required - path to dataset JSON manifest file (in NeMo format) - reference_field: str, Optional - name of field in .json manifest with the reference text ("text" by default). - hypothesis_field: str, Optional - name of field in .json manifest with the hypothesis text ("pred_text" by default). - metrics: list[str], Optional - list of metrics to be computed (currently supported "wer", "cer", "punct_er") - punctuation_marks: list[str], Optional - list of punctuation marks for computing punctuation error rate ([".", ",", "?"] by default). - output_manifest_path: str, Optional - path where .json manifest with calculated metrics will be saved. - - Returns: - samples: dict - Dict of samples with calculated metrics - ''' - - supported_metrics = ["wer", "cer", "punct_er"] - - if len(metrics) == 0: - raise AssertionError( - f"'metrics' list is empty. \ - Select the metrics from the supported: {supported_metrics}." - ) - - for metric in metrics: - if metric not in supported_metrics: - raise AssertionError( - f"'{metric}' metric is not supported. \ - Currently supported metrics are {supported_metrics}." - ) - - if "punct_er" in metrics: - if len(punctuation_marks) == 0: - raise AssertionError("punctuation_marks list can't be empty when 'punct_er' metric is enabled.") - else: - oper_obj = OccurancePunctuationErrorRate(punctuation_marks=punctuation_marks) - - use_wer = "wer" in metrics - use_cer = "cer" in metrics - use_punct_er = "punct_er" in metrics - - with open(manifest_path, 'r') as manifest: - lines = manifest.readlines() - samples = [json.loads(line) for line in lines] - samples_with_metrics = [] - - logging.info(f"Computing {', '.join(metrics)} per sample") - - for sample in tqdm(samples): - reference = sample[reference_field] - hypothesis = sample[hypothesis_field] - - if use_wer: - sample_wer = word_error_rate(hypotheses=[hypothesis], references=[reference], use_cer=False) - sample["wer"] = round(100 * sample_wer, 2) - - if use_cer: - sample_cer = word_error_rate(hypotheses=[hypothesis], references=[reference], use_cer=True) - sample["cer"] = round(100 * sample_cer, 2) - - if use_punct_er: - operation_amounts, substitution_amounts, punctuation_rates = oper_obj.compute( - reference=reference, hypothesis=hypothesis - ) - sample["punct_correct_rate"] = round(100 * punctuation_rates.correct_rate, 2) - sample["punct_deletions_rate"] = round(100 * punctuation_rates.deletions_rate, 2) - sample["punct_insertions_rate"] = round(100 * punctuation_rates.insertions_rate, 2) - sample["punct_substitutions_rate"] = round(100 * punctuation_rates.substitutions_rate, 2) - sample["punct_error_rate"] = round(100 * punctuation_rates.punct_er, 2) - - samples_with_metrics.append(sample) - - if output_manifest_path is not None: - with open(output_manifest_path, 'w') as output: - for sample in samples_with_metrics: - line = json.dumps(sample) - output.writelines(f'{line}\n') - 
logging.info(f'Output manifest saved: {output_manifest_path}') - - return samples_with_metrics - - -class PunctuationCapitalization: - def __init__(self, punctuation_marks: str): - """ - Class for text processing with punctuation and capitalization. Can be used with class TextProcessingConfig. - - Args: - punctuation_marks (str): String with punctuation marks to process. - Example: punctuation_marks = '.,?' - """ - if punctuation_marks: - self.regex_punctuation = re.compile(fr"([{''.join(punctuation_marks)}])") - self.regex_extra_space = re.compile('\s{2,}') - else: - self.regex_punctuation = None - - def separate_punctuation(self, lines: List[str]) -> List[str]: - if self.regex_punctuation is not None: - return [ - self.regex_extra_space.sub(' ', self.regex_punctuation.sub(r' \1 ', line)).strip() for line in lines - ] - else: - return lines - - def do_lowercase(self, lines: List[str]) -> List[str]: - return [line.lower() for line in lines] - - def rm_punctuation(self, lines: List[str]) -> List[str]: - if self.regex_punctuation is not None: - return [self.regex_extra_space.sub(' ', self.regex_punctuation.sub(' ', line)).strip() for line in lines] - else: - return lines - - -@dataclass -class TextProcessingConfig: - # Punctuation marks to process. Example: ".,?" - punctuation_marks: str = "" - - # Whether to apply lower case conversion on the training text. - do_lowercase: bool = False - - # Whether to remove punctuation marks from text. - rm_punctuation: bool = False - - # Whether to separate punctuation with the previouse word by space. - separate_punctuation: bool = True diff --git a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/vad_utils.py b/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/vad_utils.py deleted file mode 100644 index 68dfaf3d6c7674b60f18645f32ebdf2d08840406..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/asr/parts/utils/vad_utils.py +++ /dev/null @@ -1,1718 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
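Before moving on to the VAD utilities below, here is a brief, hypothetical illustration of the text-processing helpers defined above. The import path is an assumption based on the module layout shown in this diff, and the sample sentence is made up:

```python
# Illustrative usage of PunctuationCapitalization / TextProcessingConfig.
from nemo.collections.asr.parts.utils.transcribe_utils import (
    PunctuationCapitalization,
    TextProcessingConfig,
)

pc = PunctuationCapitalization(punctuation_marks=".,?")
lines = ["Hello, world. How are you?"]

print(pc.separate_punctuation(lines))  # ['Hello , world . How are you ?']
print(pc.do_lowercase(lines))          # ['hello, world. how are you?']
print(pc.rm_punctuation(lines))        # ['Hello world How are you']

# The same options can be carried around as a config object:
text_cfg = TextProcessingConfig(
    punctuation_marks=".,?", do_lowercase=True, rm_punctuation=True, separate_punctuation=False
)
```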
- -import glob -import json -import math -import multiprocessing -import os -import shutil -from itertools import repeat -from math import ceil, floor -from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union - -import IPython.display as ipd -import librosa -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import torch -from omegaconf import DictConfig -from pyannote.core import Annotation, Segment -from pyannote.metrics import detection -from sklearn.metrics import roc_auc_score -from sklearn.model_selection import ParameterGrid -from tqdm import tqdm - -from nemo.collections.asr.models import EncDecClassificationModel, EncDecFrameClassificationModel -from nemo.collections.common.parts.preprocessing.manifest import get_full_path -from nemo.utils import logging - -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - -""" -This file contains all the utility functions required for voice activity detection. -""" - - -def prepare_manifest(config: dict) -> str: - """ - Perform VAD on long audio snippet might cause CUDA out of memory issue. - Automatically split manifest entry by split_duration to avoid the potential memory issue. - """ - if 'prepared_manifest_vad_input' in config and config['prepared_manifest_vad_input']: - manifest_vad_input = config['prepared_manifest_vad_input'] - else: - default_path = "manifest_vad_input.json" - manifest_vad_input = os.path.join(config["out_dir"], default_path) if "out_dir" in config else default_path - - # input_list is a list of variable ['audio_filepath': i, "offset": xxx, "duration": xxx]) - if type(config['input']) == str: - input_list = [] - with open(config['input'], 'r', encoding='utf-8') as manifest: - for line in manifest.readlines(): - input_list.append(json.loads(line.strip())) - elif type(config['input']) == list: - input_list = config['input'] - else: - raise ValueError( - "The input for manifest preparation would either be a string of the filepath to manifest or a list of {'audio_filepath': i, 'offset': 0, 'duration': null} " - ) - - args_func = { - 'label': 'infer', - 'split_duration': config['split_duration'], - 'window_length_in_sec': config['window_length_in_sec'], - 'manifest_dir': Path(config['input']).parent if type(config['input']) == str else '', - } - - if config.get('num_workers') is not None and config['num_workers'] > 1: - with multiprocessing.Pool(processes=config['num_workers']) as p: - inputs = zip(input_list, repeat(args_func)) - results = list( - tqdm( - p.imap(write_vad_infer_manifest_star, inputs), - total=len(input_list), - desc='splitting manifest', - leave=True, - ) - ) - else: - results = [ - write_vad_infer_manifest(input_el, args_func) - for input_el in tqdm(input_list, desc='splitting manifest', leave=True) - ] - - if os.path.exists(manifest_vad_input): - logging.info("The prepared manifest file exists. Overwriting!") - os.remove(manifest_vad_input) - - with open(manifest_vad_input, 'a', encoding='utf-8') as fout: - for res in results: - for r in res: - json.dump(r, fout) - fout.write('\n') - fout.flush() - return manifest_vad_input - - -def write_vad_infer_manifest_star(args): - """ - A workaround for tqdm with starmap of multiprocessing - """ - return write_vad_infer_manifest(*args) - - -def write_vad_infer_manifest(file: dict, args_func: dict) -> list: - """ - Used by prepare_manifest. 
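For orientation, `prepare_manifest` above is driven by a plain config dict. A hypothetical call might look like this (all paths and values below are placeholders, not values taken from this repository):

```python
# Hypothetical prepare_manifest call; every path and value here is a placeholder.
config = {
    "input": "manifests/vad_input.json",  # or a list of {'audio_filepath': ..., 'offset': ..., 'duration': ...}
    "split_duration": 400,                # maximum seconds per split entry
    "window_length_in_sec": 0.63,         # window used later for frame generation
    "num_workers": 4,                     # >1 enables the multiprocessing path above
    "out_dir": "vad_out",                 # optional; controls where the prepared manifest is written
}
manifest_vad_input = prepare_manifest(config)
print(manifest_vad_input)  # e.g. "vad_out/manifest_vad_input.json"
```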
- Given a list of files, split them with maximum split_duration and write them to the manifest. - Args: - files (dict) : file to be processed - args_func: - label (str): label for audio snippet.y - split_duration (float): max duration of each audio clip (each line in json) - window_length_in_sec (float) : length of window for generating the frame. Used for taking care of joint. - Returns: - res (list) : list of generated metadata line of json for file - """ - res = [] - label = args_func['label'] - split_duration = args_func['split_duration'] - window_length_in_sec = args_func['window_length_in_sec'] - filepath = file['audio_filepath'] - in_duration = file.get('duration', None) - in_offset = file.get('offset', 0) - - # if filepath is not found, try to find it in the dir of manifest - if not Path(filepath).is_file(): - new_filepath = Path(args_func['manifest_dir']) / filepath - if new_filepath.is_file(): - filepath = new_filepath.absolute().as_posix() - - try: - sr = 16000 - x, _sr = librosa.load(filepath, sr=sr, offset=in_offset, duration=in_duration) - duration = librosa.get_duration(y=x, sr=sr) - left = duration - current_offset = in_offset - - status = 'single' - while left > 0: - if left <= split_duration: - if status == 'single': - write_duration = left - current_offset = 0 - else: - status = 'end' - write_duration = left + window_length_in_sec - current_offset -= window_length_in_sec - offset_inc = left - left = 0 - else: - if status == 'start' or status == 'next': - status = 'next' - else: - status = 'start' - - if status == 'start': - write_duration = split_duration - offset_inc = split_duration - else: - write_duration = split_duration + window_length_in_sec - current_offset -= window_length_in_sec - offset_inc = split_duration + window_length_in_sec - - left -= split_duration - - metadata = { - 'audio_filepath': filepath, - 'duration': write_duration, - 'label': label, - 'text': '_', - 'offset': current_offset, - } - res.append(metadata) - - current_offset += offset_inc - - except Exception as e: - err_file = "error.log" - with open(err_file, 'w', encoding='utf-8') as fout: - fout.write(filepath + ":" + str(e)) - return res - - -def get_vad_stream_status(data: list) -> list: - """ - Generate a list of status for each snippet in manifest. A snippet should be in single, start, next or end status. - Used for concatenating to full audio file. - Args: - data (list): list of filepath of audio snippet - Returns: - status (list): list of status of each snippet. 
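A concrete example of the status assignment described here, assuming `get_vad_stream_status` is importable from this module; the file names are made up:

```python
# Consecutive snippets of the same file are tagged start/next/end,
# isolated snippets are tagged single.
data = ["audio_a", "audio_a", "audio_a", "audio_b"]
print(get_vad_stream_status(data))
# ['start', 'next', 'end', 'single']
```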
- """ - if len(data) == 1: - return ['single'] - - status = [None] * len(data) - for i in range(len(data)): - if i == 0: - status[i] = 'start' if data[i] == data[i + 1] else 'single' - elif i == len(data) - 1: - status[i] = 'end' if data[i] == data[i - 1] else 'single' - else: - if data[i] != data[i - 1] and data[i] == data[i + 1]: - status[i] = 'start' - elif data[i] == data[i - 1] and data[i] == data[i + 1]: - status[i] = 'next' - elif data[i] == data[i - 1] and data[i] != data[i + 1]: - status[i] = 'end' - else: - status[i] = 'single' - return status - - -def load_tensor_from_file(filepath: str) -> Tuple[torch.Tensor, str]: - """ - Load torch.Tensor and the name from file - """ - frame = [] - with open(filepath, "r", encoding='utf-8') as f: - for line in f.readlines(): - frame.append(float(line)) - - name = Path(filepath).stem - return torch.tensor(frame), name - - -def generate_overlap_vad_seq( - frame_pred_dir: str, - smoothing_method: str, - overlap: float, - window_length_in_sec: float, - shift_length_in_sec: float, - num_workers: int, - out_dir: str = None, -) -> str: - """ - Generate predictions with overlapping input windows/segments. Then a smoothing filter is applied to decide the label for a frame spanned by multiple windows. - Two common smoothing filters are supported: majority vote (median) and average (mean). - This function uses multiprocessing to speed up. - Args: - frame_pred_dir (str): Directory of frame prediction file to be processed. - smoothing_method (str): median or mean smoothing filter. - overlap (float): amounts of overlap of adjacent windows. - window_length_in_sec (float): length of window for generating the frame. - shift_length_in_sec (float): amount of shift of window for generating the frame. - out_dir (str): directory of generated predictions. - num_workers(float): number of process for multiprocessing - Returns: - overlap_out_dir(str): directory of the generated predictions. - """ - - frame_filepathlist = glob.glob(frame_pred_dir + "/*.frame") - if out_dir: - overlap_out_dir = out_dir - else: - overlap_out_dir = os.path.join( - frame_pred_dir, "overlap_smoothing_output" + "_" + smoothing_method + "_" + str(overlap) - ) - - if not os.path.exists(overlap_out_dir): - os.mkdir(overlap_out_dir) - - per_args = { - "overlap": overlap, - "window_length_in_sec": window_length_in_sec, - "shift_length_in_sec": shift_length_in_sec, - "out_dir": overlap_out_dir, - "smoothing_method": smoothing_method, - } - if num_workers is not None and num_workers > 1: - with multiprocessing.Pool(processes=num_workers) as p: - inputs = zip(frame_filepathlist, repeat(per_args)) - results = list( - tqdm( - p.imap(generate_overlap_vad_seq_per_file_star, inputs), - total=len(frame_filepathlist), - desc='generating preds', - leave=True, - ) - ) - - else: - for frame_filepath in tqdm(frame_filepathlist, desc='generating preds', leave=False): - generate_overlap_vad_seq_per_file(frame_filepath, per_args) - - return overlap_out_dir - - -def generate_overlap_vad_seq_per_file_star(args): - """ - A workaround for tqdm with starmap of multiprocessing - """ - return generate_overlap_vad_seq_per_file(*args) - - -@torch.jit.script -def generate_overlap_vad_seq_per_tensor( - frame: torch.Tensor, per_args: Dict[str, float], smoothing_method: str -) -> torch.Tensor: - """ - Use generated frame prediction (generated by shifting window of shift_length_in_sec (10ms)) to generate prediction with overlapping input window/segments - See description in generate_overlap_vad_seq. 
- Use this for single instance pipeline. - """ - # This function will be refactor for vectorization but this is okay for now - - overlap = per_args['overlap'] - window_length_in_sec = per_args['window_length_in_sec'] - shift_length_in_sec = per_args['shift_length_in_sec'] - frame_len = per_args.get('frame_len', 0.01) - - shift = int(shift_length_in_sec / frame_len) # number of units of shift - seg = int((window_length_in_sec / frame_len + 1)) # number of units of each window/segment - - jump_on_target = int(seg * (1 - overlap)) # jump on target generated sequence - jump_on_frame = int(jump_on_target / shift) # jump on input frame sequence - - if jump_on_frame < 1: - raise ValueError( - f"Note we jump over frame sequence to generate overlapping input segments. \n \ - Your input makes jump_on_frame={jump_on_frame} < 1 which is invalid because it cannot jump and will stuck.\n \ - Please try different window_length_in_sec, shift_length_in_sec and overlap choices. \n \ - jump_on_target = int(seg * (1 - overlap)) \n \ - jump_on_frame = int(jump_on_frame/shift) " - ) - - target_len = int(len(frame) * shift) - - if smoothing_method == 'mean': - preds = torch.zeros(target_len) - pred_count = torch.zeros(target_len) - - for i, og_pred in enumerate(frame): - if i % jump_on_frame != 0: - continue - start = i * shift - end = start + seg - preds[start:end] = preds[start:end] + og_pred - pred_count[start:end] = pred_count[start:end] + 1 - - preds = preds / pred_count - last_non_zero_pred = preds[pred_count != 0][-1] - preds[pred_count == 0] = last_non_zero_pred - - elif smoothing_method == 'median': - preds = [torch.empty(0) for _ in range(target_len)] - for i, og_pred in enumerate(frame): - if i % jump_on_frame != 0: - continue - - start = i * shift - end = start + seg - for j in range(start, end): - if j <= target_len - 1: - preds[j] = torch.cat((preds[j], og_pred.unsqueeze(0)), 0) - - preds = torch.stack([torch.nanquantile(l, q=0.5) for l in preds]) - nan_idx = torch.isnan(preds) - last_non_nan_pred = preds[~nan_idx][-1] - preds[nan_idx] = last_non_nan_pred - - else: - raise ValueError("smoothing_method should be either mean or median") - - return preds - - -def generate_overlap_vad_seq_per_file(frame_filepath: str, per_args: dict) -> str: - """ - A wrapper for generate_overlap_vad_seq_per_tensor. - """ - - out_dir = per_args['out_dir'] - smoothing_method = per_args['smoothing_method'] - frame, name = load_tensor_from_file(frame_filepath) - - per_args_float: Dict[str, float] = {} - for i in per_args: - if type(per_args[i]) == float or type(per_args[i]) == int: - per_args_float[i] = per_args[i] - - preds = generate_overlap_vad_seq_per_tensor(frame, per_args_float, smoothing_method) - - overlap_filepath = os.path.join(out_dir, name + "." + smoothing_method) - with open(overlap_filepath, "w", encoding='utf-8') as f: - for pred in preds: - f.write(f"{pred:.4f}\n") - - return overlap_filepath - - -@torch.jit.script -def merge_overlap_segment(segments: torch.Tensor) -> torch.Tensor: - """ - Merged the given overlapped segments. 
- For example: - torch.Tensor([[0, 1.5], [1, 3.5]]) -> torch.Tensor([0, 3.5]) - """ - if ( - segments.shape == torch.Size([0]) - or segments.shape == torch.Size([0, 2]) - or segments.shape == torch.Size([1, 2]) - ): - return segments - - segments = segments[segments[:, 0].sort()[1]] - merge_boundary = segments[:-1, 1] >= segments[1:, 0] - head_padded = torch.nn.functional.pad(merge_boundary, [1, 0], mode='constant', value=0.0) - head = segments[~head_padded, 0] - tail_padded = torch.nn.functional.pad(merge_boundary, [0, 1], mode='constant', value=0.0) - tail = segments[~tail_padded, 1] - merged = torch.stack((head, tail), dim=1) - return merged - - -@torch.jit.script -def filter_short_segments(segments: torch.Tensor, threshold: float) -> torch.Tensor: - """ - Remove segments which duration is smaller than a threshold. - For example, - torch.Tensor([[0, 1.5], [1, 3.5], [4, 7]]) and threshold = 2.0 - -> - torch.Tensor([[1, 3.5], [4, 7]]) - """ - return segments[segments[:, 1] - segments[:, 0] >= threshold] - - -def percentile(data: torch.Tensor, perc: int) -> float: - """ - Calculate percentile given data - """ - size = len(data) - return float(sorted(data)[int(math.ceil((size * perc) / 100)) - 1]) - - -def cal_vad_onset_offset( - scale: str, onset: float, offset: float, sequence: torch.Tensor = None -) -> Tuple[float, float]: - """ - Calculate onset and offset threshold given different scale. - """ - if scale == "absolute": - mini = 0 - maxi = 1 - elif scale == "relative": - mini = min(sequence) - maxi = max(sequence) - elif scale == "percentile": - mini = percentile(sequence, 1) - maxi = percentile(sequence, 99) - - onset = mini + onset * (maxi - mini) - offset = mini + offset * (maxi - mini) - return float(onset), float(offset) - - -@torch.jit.script -def binarization(sequence: torch.Tensor, per_args: Dict[str, float]) -> torch.Tensor: - """ - Binarize predictions to speech and non-speech - - Reference - Paper: Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of RNN-based Voice Activity Detection", InterSpeech 2015. - Implementation: https://github.com/pyannote/pyannote-audio/blob/master/pyannote/audio/utils/signal.py - - Args: - sequence (torch.Tensor) : A tensor of frame level predictions. - per_args: - onset (float): onset threshold for detecting the beginning and end of a speech - offset (float): offset threshold for detecting the end of a speech. - pad_onset (float): adding durations before each speech segment - pad_offset (float): adding durations after each speech segment; - frame_length_in_sec (float): length of frame. - - Returns: - speech_segments(torch.Tensor): A tensor of speech segment in torch.Tensor([[start1, end1], [start2, end2]]) format. 
- """ - frame_length_in_sec = per_args.get('frame_length_in_sec', 0.01) - - onset = per_args.get('onset', 0.5) - offset = per_args.get('offset', 0.5) - pad_onset = per_args.get('pad_onset', 0.0) - pad_offset = per_args.get('pad_offset', 0.0) - - speech = False - start = 0.0 - i = 0 - - speech_segments = torch.empty(0) - - for i in range(0, len(sequence)): - # Current frame is speech - if speech: - # Switch from speech to non-speech - if sequence[i] < offset: - if i * frame_length_in_sec + pad_offset > max(0, start - pad_onset): - new_seg = torch.tensor( - [max(0, start - pad_onset), i * frame_length_in_sec + pad_offset] - ).unsqueeze(0) - speech_segments = torch.cat((speech_segments, new_seg), 0) - - start = i * frame_length_in_sec - speech = False - - # Current frame is non-speech - else: - # Switch from non-speech to speech - if sequence[i] > onset: - start = i * frame_length_in_sec - speech = True - - # if it's speech at the end, add final segment - if speech: - new_seg = torch.tensor([max(0, start - pad_onset), i * frame_length_in_sec + pad_offset]).unsqueeze(0) - speech_segments = torch.cat((speech_segments, new_seg), 0) - - # Merge the overlapped speech segments due to padding - speech_segments = merge_overlap_segment(speech_segments) # not sorted - return speech_segments - - -@torch.jit.script -def remove_segments(original_segments: torch.Tensor, to_be_removed_segments: torch.Tensor) -> torch.Tensor: - """ - Remove speech segments list in to_be_removed_segments from original_segments. - For example, - remove torch.Tensor([[start2, end2],[start4, end4]]) from torch.Tensor([[start1, end1],[start2, end2],[start3, end3], [start4, end4]]), - -> - torch.Tensor([[start1, end1],[start3, end3]]) - """ - for y in to_be_removed_segments: - original_segments = original_segments[original_segments.eq(y).all(dim=1).logical_not()] - return original_segments - - -@torch.jit.script -def get_gap_segments(segments: torch.Tensor) -> torch.Tensor: - """ - Get the gap segments. - For example, - torch.Tensor([[start1, end1], [start2, end2], [start3, end3]]) -> torch.Tensor([[end1, start2], [end2, start3]]) - """ - segments = segments[segments[:, 0].sort()[1]] - return torch.column_stack((segments[:-1, 1], segments[1:, 0])) - - -@torch.jit.script -def filtering(speech_segments: torch.Tensor, per_args: Dict[str, float]) -> torch.Tensor: - - """ - Filter out short non_speech and speech segments. - - Reference - Paper: Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of RNN-based Voice Activity Detection", InterSpeech 2015. - Implementation: https://github.com/pyannote/pyannote-audio/blob/master/pyannote/audio/utils/signal.py - Args: - speech_segments (torch.Tensor): A tensor of speech segment in torch.Tensor([[start1, end1], [start2, end2]]) format. - per_args: - min_duration_on (float): threshold for small non_speech deletion - min_duration_off (float): threshold for short speech segment deletion - filter_speech_first (float): Whether to perform short speech segment deletion first. Use 1.0 to represent True. - - Returns: - speech_segments(torch.Tensor): A tensor of filtered speech segment in torch.Tensor([[start1, end1], [start2, end2]]) format. 
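To make the onset/offset hysteresis in `binarization` concrete, a small worked example with illustrative values (10 ms frames, symmetric thresholds, no padding):

```python
import torch

# Frame-level speech probabilities at 10 ms resolution (illustrative values).
sequence = torch.tensor([0.2, 0.7, 0.8, 0.3, 0.1])
per_args = {"onset": 0.5, "offset": 0.5, "frame_length_in_sec": 0.01,
            "pad_onset": 0.0, "pad_offset": 0.0}

# Speech starts when the score rises above onset (frame 1) and ends when it
# falls below offset (frame 3), giving a single segment of [0.01, 0.03] seconds.
print(binarization(sequence, per_args))
# tensor([[0.0100, 0.0300]])
```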
- """ - if speech_segments.shape == torch.Size([0]): - return speech_segments - - min_duration_on = per_args.get('min_duration_on', 0.0) - min_duration_off = per_args.get('min_duration_off', 0.0) - filter_speech_first = per_args.get('filter_speech_first', 1.0) - - if filter_speech_first == 1.0: - # Filter out the shorter speech segments - if min_duration_on > 0.0: - speech_segments = filter_short_segments(speech_segments, min_duration_on) - # Filter out the shorter non-speech segments and return to be as speech segments - if min_duration_off > 0.0: - # Find non-speech segments - non_speech_segments = get_gap_segments(speech_segments) - # Find shorter non-speech segments - short_non_speech_segments = remove_segments( - non_speech_segments, filter_short_segments(non_speech_segments, min_duration_off) - ) - # Return shorter non-speech segments to be as speech segments - speech_segments = torch.cat((speech_segments, short_non_speech_segments), 0) - - # Merge the overlapped speech segments - speech_segments = merge_overlap_segment(speech_segments) - else: - if min_duration_off > 0.0: - # Find non-speech segments - non_speech_segments = get_gap_segments(speech_segments) - # Find shorter non-speech segments - short_non_speech_segments = remove_segments( - non_speech_segments, filter_short_segments(non_speech_segments, min_duration_off) - ) - - speech_segments = torch.cat((speech_segments, short_non_speech_segments), 0) - - # Merge the overlapped speech segments - speech_segments = merge_overlap_segment(speech_segments) - if min_duration_on > 0.0: - speech_segments = filter_short_segments(speech_segments, min_duration_on) - - return speech_segments - - -def prepare_gen_segment_table(sequence: torch.Tensor, per_args: dict) -> Tuple[str, dict]: - """ - Preparing for generating segment table. - """ - out_dir = per_args.get('out_dir', None) - - # calculate onset offset based on scale selection - per_args['onset'], per_args['offset'] = cal_vad_onset_offset( - per_args.get('scale', 'absolute'), per_args['onset'], per_args['offset'], sequence - ) - - # cast 'filter_speech_first' for torch.jit.script - if 'filter_speech_first' in per_args: - if per_args['filter_speech_first']: - per_args['filter_speech_first'] = 1.0 - else: - per_args['filter_speech_first'] = 0.0 - - per_args_float: Dict[str, float] = {} - for i in per_args: - if type(per_args[i]) == float or type(per_args[i]) == int: - per_args_float[i] = per_args[i] - - return out_dir, per_args_float - - -@torch.jit.script -def generate_vad_segment_table_per_tensor(sequence: torch.Tensor, per_args: Dict[str, float]) -> torch.Tensor: - """ - See description in generate_overlap_vad_seq. - Use this for single instance pipeline. 
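A worked example of the gap-filling behaviour of `filtering` above, with illustrative numbers:

```python
import torch

# Two speech segments separated by a 0.2 s gap.
speech_segments = torch.tensor([[0.0, 1.0], [1.2, 3.0]])
per_args = {"min_duration_on": 0.0, "min_duration_off": 0.5, "filter_speech_first": 1.0}

# The 0.2 s non-speech gap is shorter than min_duration_off, so it is
# relabelled as speech and the two segments are merged into one.
print(filtering(speech_segments, per_args))
# tensor([[0., 3.]])
```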
- """ - UNIT_FRAME_LEN = 0.01 - - speech_segments = binarization(sequence, per_args) - speech_segments = filtering(speech_segments, per_args) - - if speech_segments.shape == torch.Size([0]): - return speech_segments - - speech_segments, _ = torch.sort(speech_segments, 0) - - dur = speech_segments[:, 1:2] - speech_segments[:, 0:1] + UNIT_FRAME_LEN - speech_segments = torch.column_stack((speech_segments, dur)) - - return speech_segments - - -def generate_vad_segment_table_per_file(pred_filepath: str, per_args: dict) -> str: - """ - A wrapper for generate_vad_segment_table_per_tensor - """ - sequence, name = load_tensor_from_file(pred_filepath) - out_dir, per_args_float = prepare_gen_segment_table(sequence, per_args) - - preds = generate_vad_segment_table_per_tensor(sequence, per_args_float) - ext = ".rttm" if per_args.get("use_rttm", False) else ".txt" - save_name = name + ext - save_path = os.path.join(out_dir, save_name) - - if preds.shape[0] == 0: - with open(save_path, "w", encoding='utf-8') as fp: - if per_args.get("use_rttm", False): - fp.write(f"SPEAKER 1 0 0 speech \n") - else: - fp.write(f"0 0 speech\n") - else: - with open(save_path, "w", encoding='utf-8') as fp: - for i in preds: - if per_args.get("use_rttm", False): - fp.write(f"SPEAKER {name} 1 {i[0]:.4f} {i[2]:.4f} speech \n") - else: - fp.write(f"{i[0]:.4f} {i[2]:.4f} speech\n") - - return save_path - - -def generate_vad_segment_table( - vad_pred_dir: str, - postprocessing_params: dict, - frame_length_in_sec: float, - num_workers: int, - out_dir: str = None, - use_rttm: bool = False, -) -> str: - """ - Convert frame level prediction to speech segment in start and end times format. - And save to csv file in rttm-like format - 0, 10, speech - 17,18, speech - Args: - vad_pred_dir (str): directory of prediction files to be processed. - postprocessing_params (dict): dictionary of thresholds for prediction score. See details in binarization and filtering. - frame_length_in_sec (float): frame length. - out_dir (str): output dir of generated table/csv file. - num_workers(float): number of process for multiprocessing - Returns: - out_dir(str): directory of the generated table. 
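For reference, with `use_rttm=False` the per-file table written by `generate_vad_segment_table_per_file` is a plain text file with one `<start> <duration> speech` line per segment, while `use_rttm=True` writes `SPEAKER <name> 1 <start> <duration> speech` lines instead. The numbers below are made up:

```
0.3100 2.4500 speech
5.0200 1.1000 speech
```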
- """ - - suffixes = ("frame", "mean", "median") - vad_pred_filepath_list = [os.path.join(vad_pred_dir, x) for x in os.listdir(vad_pred_dir) if x.endswith(suffixes)] - - if not out_dir: - out_dir_name = "seg_output" - for key in postprocessing_params: - out_dir_name = out_dir_name + "-" + str(key) + str(postprocessing_params[key]) - - out_dir = os.path.join(vad_pred_dir, out_dir_name) - - if not os.path.exists(out_dir): - os.mkdir(out_dir) - - per_args = { - "frame_length_in_sec": frame_length_in_sec, - "out_dir": out_dir, - "use_rttm": use_rttm, - } - per_args = {**per_args, **postprocessing_params} - num_workers = None - if num_workers is not None and num_workers > 1: - with multiprocessing.Pool(num_workers) as p: - inputs = zip(vad_pred_filepath_list, repeat(per_args)) - list( - tqdm( - p.imap(generate_vad_segment_table_per_file_star, inputs), - total=len(vad_pred_filepath_list), - desc='creating speech segments', - leave=True, - ) - ) - else: - for vad_pred_filepath in tqdm(vad_pred_filepath_list, desc='creating speech segments', leave=True): - generate_vad_segment_table_per_file(vad_pred_filepath, per_args) - - return out_dir - - -def generate_vad_segment_table_per_file_star(args): - """ - A workaround for tqdm with starmap of multiprocessing - """ - return generate_vad_segment_table_per_file(*args) - - -def vad_construct_pyannote_object_per_file( - vad_table_filepath: str, groundtruth_RTTM_file: str -) -> Tuple[Annotation, Annotation]: - """ - Construct a Pyannote object for evaluation. - Args: - vad_table_filepath(str) : path of vad rttm-like table. - groundtruth_RTTM_file(str): path of groundtruth rttm file. - Returns: - reference(pyannote.Annotation): groundtruth - hypothesis(pyannote.Annotation): prediction - """ - - pred = pd.read_csv(vad_table_filepath, sep=" ", header=None) - label = pd.read_csv(groundtruth_RTTM_file, sep=" ", delimiter=None, header=None) - label = label.rename(columns={3: "start", 4: "dur", 7: "speaker"}) - - # construct reference - reference = Annotation() - for index, row in label.iterrows(): - reference[Segment(row['start'], row['start'] + row['dur'])] = row['speaker'] - - # construct hypothsis - hypothesis = Annotation() - for index, row in pred.iterrows(): - hypothesis[Segment(float(row[0]), float(row[0]) + float(row[1]))] = 'Speech' - return reference, hypothesis - - -def get_parameter_grid(params: dict) -> list: - """ - Get the parameter grid given a dictionary of parameters. - """ - has_filter_speech_first = False - if 'filter_speech_first' in params: - filter_speech_first = params['filter_speech_first'] - has_filter_speech_first = True - params.pop("filter_speech_first") - - params_grid = list(ParameterGrid(params)) - - if has_filter_speech_first: - for i in params_grid: - i['filter_speech_first'] = filter_speech_first - return params_grid - - -def vad_tune_threshold_on_dev( - params: dict, - vad_pred: str, - groundtruth_RTTM: str, - result_file: str = "res", - vad_pred_method: str = "frame", - focus_metric: str = "DetER", - frame_length_in_sec: float = 0.01, - num_workers: int = 20, -) -> Tuple[dict, dict]: - """ - Tune thresholds on dev set. Return best thresholds which gives the lowest detection error rate (DetER) in thresholds. - Args: - params (dict): dictionary of parameters to be tuned on. - vad_pred_method (str): suffix of prediction file. Use to locate file. Should be either in "frame", "mean" or "median". - groundtruth_RTTM_dir (str): directory of ground-truth rttm files or a file contains the paths of them. 
- focus_metric (str): metrics we care most when tuning threshold. Should be either in "DetER", "FA", "MISS" - frame_length_in_sec (float): frame length. - num_workers (int): number of workers. - Returns: - best_threshold (float): threshold that gives lowest DetER. - """ - min_score = 100 - all_perf = {} - try: - check_if_param_valid(params) - except: - raise ValueError("Please check if the parameters are valid") - - paired_filenames, groundtruth_RTTM_dict, vad_pred_dict = pred_rttm_map(vad_pred, groundtruth_RTTM, vad_pred_method) - metric = detection.DetectionErrorRate() - params_grid = get_parameter_grid(params) - - for param in params_grid: - for i in param: - if type(param[i]) == np.float64 or type(param[i]) == np.int64: - param[i] = float(param[i]) - try: - # Generate speech segments by performing binarization on the VAD prediction according to param. - # Filter speech segments according to param and write the result to rttm-like table. - vad_table_dir = generate_vad_segment_table( - vad_pred, param, frame_length_in_sec=frame_length_in_sec, num_workers=num_workers - ) - # add reference and hypothesis to metrics - for filename in paired_filenames: - groundtruth_RTTM_file = groundtruth_RTTM_dict[filename] - vad_table_filepath = os.path.join(vad_table_dir, filename + ".txt") - reference, hypothesis = vad_construct_pyannote_object_per_file( - vad_table_filepath, groundtruth_RTTM_file - ) - metric(reference, hypothesis) # accumulation - - # delete tmp table files - shutil.rmtree(vad_table_dir, ignore_errors=True) - - report = metric.report(display=False) - DetER = report.iloc[[-1]][('detection error rate', '%')].item() - FA = report.iloc[[-1]][('false alarm', '%')].item() - MISS = report.iloc[[-1]][('miss', '%')].item() - - assert ( - focus_metric == "DetER" or focus_metric == "FA" or focus_metric == "MISS" - ), "Metric we care most should be only in 'DetER', 'FA' or 'MISS'!" - all_perf[str(param)] = {'DetER (%)': DetER, 'FA (%)': FA, 'MISS (%)': MISS} - logging.info(f"parameter {param}, {all_perf[str(param)] }") - - score = all_perf[str(param)][focus_metric + ' (%)'] - - del report - metric.reset() # reset internal accumulator - - # save results for analysis - with open(result_file + ".txt", "a", encoding='utf-8') as fp: - fp.write(f"{param}, {all_perf[str(param)] }\n") - - if score < min_score: - best_threshold = param - optimal_scores = all_perf[str(param)] - min_score = score - print("Current best", best_threshold, optimal_scores) - - except RuntimeError as e: - print(f"Pass {param}, with error {e}") - except pd.errors.EmptyDataError as e1: - print(f"Pass {param}, with error {e1}") - - return best_threshold, optimal_scores - - -def check_if_param_valid(params: dict) -> bool: - """ - Check if the parameters are valid. - """ - for i in params: - if i == "filter_speech_first": - if not type(params["filter_speech_first"]) == bool: - raise ValueError("Invalid inputs! filter_speech_first should be either True or False!") - elif i == "pad_onset": - continue - elif i == "pad_offset": - continue - else: - for j in params[i]: - if not j >= 0: - raise ValueError( - "Invalid inputs! All float parameters except pad_onset and pad_offset should be larger than 0!" - ) - - if not (all(i <= 1 for i in params['onset']) and all(i <= 1 for i in params['offset'])): - raise ValueError("Invalid inputs! 
The onset and offset thresholds should be in range [0, 1]!") - - return True - - -def pred_rttm_map(vad_pred: str, groundtruth_RTTM: str, vad_pred_method: str = "frame") -> Tuple[set, dict, dict]: - """ - Find paired files in vad_pred and groundtruth_RTTM - """ - groundtruth_RTTM_dict = {} - if os.path.isfile(groundtruth_RTTM): - with open(groundtruth_RTTM, "r", encoding='utf-8') as fp: - groundtruth_RTTM_files = fp.read().splitlines() - elif os.path.isdir(groundtruth_RTTM): - groundtruth_RTTM_files = glob.glob(os.path.join(groundtruth_RTTM, "*.rttm")) - else: - raise ValueError( - "groundtruth_RTTM should either be a directory contains rttm files or a file contains paths to them!" - ) - for f in groundtruth_RTTM_files: - filename = os.path.basename(f).rsplit(".", 1)[0] - groundtruth_RTTM_dict[filename] = f - - vad_pred_dict = {} - if os.path.isfile(vad_pred): - with open(vad_pred, "r", encoding='utf-8') as fp: - vad_pred_files = fp.read().splitlines() - elif os.path.isdir(vad_pred): - vad_pred_files = glob.glob(os.path.join(vad_pred, "*." + vad_pred_method)) - else: - raise ValueError( - "vad_pred should either be a directory containing vad pred files or a file contains paths to them!" - ) - for f in vad_pred_files: - filename = os.path.basename(f).rsplit(".", 1)[0] - vad_pred_dict[filename] = f - - paired_filenames = groundtruth_RTTM_dict.keys() & vad_pred_dict.keys() - return paired_filenames, groundtruth_RTTM_dict, vad_pred_dict - - -def plot( - path2audio_file: str, - path2_vad_pred: Optional[str] = None, - path2groundtruth_rttm: Optional[str] = None, - groundtruth_labels: Optional[str] = None, - sample_rate: int = 16000, - offset: float = 0, - duration: float = None, - threshold: float = None, - per_args: dict = None, - unit_frame_len: float = 0.01, - label_repeat: int = 1, - xticks_step: int = 5, -) -> ipd.Audio: - """ - Plot Audio and/or VAD output and/or groundtruth labels for visualization - Args: - path2audio_file (str): path to audio file. - path2_vad_pred (str): path to vad prediction file, - path2groundtruth_rttm(str): path to groundtruth RTTM file. - ground_truth_labels(str): a list of groundtruth label. - sample_rate (int): sample rate of audio file. - offset (float): offset in seconds. - duration (float): duration in seconds. - threshold (float): threshold for prediction score (from 0 to 1). - per_args(dict): a dict that stores the thresholds for postprocessing. - unit_frame_len (float): unit frame length in seconds for VAD predictions. - label_repeat (int): repeat the label for this number of times to match different frame lengths in preds and labels. - xticks_step (int): step size for xticks. 
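Stepping back to the threshold tuning above: `vad_tune_threshold_on_dev` expects every tunable parameter as a list of candidate values, which `get_parameter_grid` expands into a full grid. A hypothetical call (the directories below are placeholders):

```python
# Hypothetical tuning call; the directories below are placeholders.
params = {
    "onset": [0.4, 0.5, 0.6],
    "offset": [0.3, 0.4],
    "min_duration_on": [0.1, 0.2],
    "min_duration_off": [0.1, 0.2],
    "pad_onset": [0.1],
    "pad_offset": [0.1],
    "filter_speech_first": True,  # popped from the grid and applied to every combination
}

best_threshold, optimal_scores = vad_tune_threshold_on_dev(
    params=params,
    vad_pred="vad_preds/",           # directory of *.frame / *.mean / *.median files
    groundtruth_RTTM="rttm_files/",  # directory of ground-truth *.rttm files
    vad_pred_method="median",
    focus_metric="DetER",
)
print(best_threshold, optimal_scores)
```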
- """ - plt.figure(figsize=[20, 2]) - - audio, sample_rate = librosa.load( - path=path2audio_file, sr=sample_rate, mono=True, offset=offset, duration=duration - ) - dur = librosa.get_duration(y=audio, sr=sample_rate) - - time = np.arange(offset, offset + dur, unit_frame_len) - len_pred = int(dur / unit_frame_len) + 1 - - frame_snippet = None - if path2_vad_pred: - frame, _ = load_tensor_from_file(path2_vad_pred) - frame_snippet = frame[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] - len_pred = len(frame_snippet) - - ax1 = plt.subplot() - ax1.plot(np.arange(audio.size) / sample_rate, audio, 'gray') - ax1.set_xlim([0, int(dur) + 1]) - ax1.tick_params(axis='y', labelcolor='b') - ax1.set_ylabel('Signal') - ax1.set_ylim([-1, 1]) - ax2 = ax1.twinx() - - if threshold and per_args: - raise ValueError("threshold and per_args cannot be used at same time!") - if not threshold and not per_args: - raise ValueError("One and only one of threshold and per_args must have been used!") - - if threshold and frame_snippet is not None: - pred_snippet = np.where(frame_snippet >= threshold, 1, 0) - elif per_args and frame_snippet is not None: - _, per_args_float = prepare_gen_segment_table( - frame, per_args - ) # take whole frame here for calculating onset and offset - speech_segments = generate_vad_segment_table_per_tensor(frame, per_args_float) - pred = gen_pred_from_speech_segments(speech_segments, frame) - pred_snippet = pred[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] - else: - pred_snippet = None - - if path2groundtruth_rttm and path2groundtruth_rttm.endswith('.rttm'): - label = extract_labels(path2groundtruth_rttm, time) - elif groundtruth_labels: - label = [float(x) for x in groundtruth_labels] - if label_repeat > 1: - label = np.repeat(label, label_repeat) - label = label[int(offset / unit_frame_len) : int((offset + dur) / unit_frame_len)] - else: - label = None - - if label is not None: - ax2.plot(np.arange(len_pred) * unit_frame_len, label, 'r', label='label') - if pred_snippet is not None: - ax2.plot(np.arange(len_pred) * unit_frame_len, pred_snippet, 'b', label='pred') - if frame_snippet is not None: - ax2.plot(np.arange(len_pred) * unit_frame_len, frame_snippet, 'g--', label='speech prob') - - ax2.tick_params(axis='y', labelcolor='r') - ax2.legend(loc='lower right', shadow=True) - ax2.set_ylabel('Preds and Probas') - ax2.set_ylim([-0.1, 1.1]) - ax2.set_xticks(np.arange(0, int(dur) + 1, xticks_step)) - return ipd.Audio(audio, rate=sample_rate) - - -def gen_pred_from_speech_segments( - speech_segments: torch.Tensor, prob: float, shift_length_in_sec: float = 0.01 -) -> np.array: - """ - Generate prediction arrays like 000111000... from speech segments {[0,1][2,4]} - """ - pred = np.zeros(prob.shape) - speech_segments = [list(i) for i in speech_segments] - speech_segments.sort(key=lambda x: x[0]) - - for seg in speech_segments: - start = int(seg[0] / shift_length_in_sec) - end = int(seg[1] / shift_length_in_sec) - pred[start:end] = 1 - return pred - - -def extract_labels(path2ground_truth_label: str, time: list) -> list: - """ - Extract ground-truth label for given time period. - path2ground_truth_label (str): path of groundtruth RTTM file - time (list) : a list of array representing time period. 
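A small worked example of `gen_pred_from_speech_segments` above (clip length and segments are illustrative):

```python
import torch

# Frame-level probabilities for a 3-second clip at 10 ms resolution.
prob = torch.zeros(300)
speech_segments = torch.tensor([[0.5, 1.0], [2.0, 2.5]])

pred = gen_pred_from_speech_segments(speech_segments, prob)
# Frames 50-99 and 200-249 are marked as speech, everything else stays 0.
print(int(pred.sum()))  # 100
```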
- """ - - data = pd.read_csv(path2ground_truth_label, sep="\s+", delimiter=None, header=None) - data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"}) - labels = [] - for pos in time: - line = data[(data["start"] <= pos) & (data["start"] + data["dur"] > pos)] - if len(line) >= 1: - labels.append(1) - else: - labels.append(0) - return labels - - -def generate_vad_frame_pred( - vad_model, - window_length_in_sec: float, - shift_length_in_sec: float, - manifest_vad_input: str, - out_dir: str, - use_feat: bool = False, -) -> str: - """ - Generate VAD frame level prediction and write to out_dir - """ - time_unit = int(window_length_in_sec / shift_length_in_sec) - trunc = int(time_unit / 2) - trunc_l = time_unit - trunc - all_len = 0 - - data = [] - with open(manifest_vad_input, 'r', encoding='utf-8') as f: - for line in f: - file = json.loads(line)['audio_filepath'].split("/")[-1] - data.append(file.split(".wav")[0]) - logging.info(f"Inference on {len(data)} audio files/json lines!") - - status = get_vad_stream_status(data) - for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): - test_batch = [x.to(vad_model.device) for x in test_batch] - with autocast(): - if use_feat: - log_probs = vad_model(processed_signal=test_batch[0], processed_signal_length=test_batch[1]) - else: - log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) - probs = torch.softmax(log_probs, dim=-1) - if len(probs.shape) == 3 and probs.shape[0] == 1: - # squeeze the batch dimension, since batch size is 1 for frame-VAD - probs = probs.squeeze(0) # [1,T,C] -> [T,C] - pred = probs[:, 1] - - if window_length_in_sec == 0: - to_save = pred - elif status[i] == 'start': - to_save = pred[:-trunc] - elif status[i] == 'next': - to_save = pred[trunc:-trunc_l] - elif status[i] == 'end': - to_save = pred[trunc_l:] - else: - to_save = pred - - to_save = to_save.cpu().tolist() - all_len += len(to_save) - outpath = os.path.join(out_dir, data[i] + ".frame") - with open(outpath, "a", encoding='utf-8') as fout: - for f in range(len(to_save)): - fout.write('{0:0.4f}\n'.format(to_save[f])) - - del test_batch - if status[i] == 'end' or status[i] == 'single': - logging.debug(f"Overall length of prediction of {data[i]} is {all_len}!") - all_len = 0 - return out_dir - - -def init_vad_model(model_path: str): - """ - Initiate VAD model with model path - """ - if model_path.endswith('.nemo'): - logging.info(f"Using local VAD model from {model_path}") - vad_model = EncDecClassificationModel.restore_from(restore_path=model_path) - elif model_path.endswith('.ckpt'): - vad_model = EncDecClassificationModel.load_from_checkpoint(checkpoint_path=model_path) - else: - logging.info(f"Using NGC cloud VAD model {model_path}") - vad_model = EncDecClassificationModel.from_pretrained(model_name=model_path) - return vad_model - - -def init_frame_vad_model(model_path: str): - """ - Initiate VAD model with model path - """ - if model_path.endswith('.nemo'): - logging.info(f"Using local VAD model from {model_path}") - vad_model = EncDecFrameClassificationModel.restore_from(restore_path=model_path) - elif model_path.endswith('.ckpt'): - vad_model = EncDecFrameClassificationModel.load_from_checkpoint(checkpoint_path=model_path) - else: - logging.info(f"Using NGC cloud VAD model {model_path}") - vad_model = EncDecFrameClassificationModel.from_pretrained(model_name=model_path) - return vad_model - - -def stitch_segmented_asr_output( - segmented_output_manifest: str, - 
speech_segments_tensor_dir: str = "speech_segments", - stitched_output_manifest: str = "asr_stitched_output_manifest.json", -) -> str: - """ - Stitch the prediction of speech segments. - """ - if not os.path.exists(speech_segments_tensor_dir): - os.mkdir(speech_segments_tensor_dir) - - segmented_output = [] - with open(segmented_output_manifest, 'r', encoding='utf-8') as f: - for line in f: - file = json.loads(line) - segmented_output.append(file) - - with open(stitched_output_manifest, 'w', encoding='utf-8') as fout: - speech_segments = torch.Tensor() - all_pred_text = "" - if len(segmented_output) > 1: - for i in range(1, len(segmented_output)): - start, end = ( - segmented_output[i - 1]['offset'], - segmented_output[i - 1]['offset'] + segmented_output[i - 1]['duration'], - ) - new_seg = torch.tensor([start, end]).unsqueeze(0) - speech_segments = torch.cat((speech_segments, new_seg), 0) - pred_text = segmented_output[i - 1]['pred_text'] - all_pred_text += pred_text - name = segmented_output[i - 1]['audio_filepath'].split("/")[-1].rsplit(".", 1)[0] - - if segmented_output[i - 1]['audio_filepath'] != segmented_output[i]['audio_filepath']: - - speech_segments_tensor_path = os.path.join(speech_segments_tensor_dir, name + '.pt') - torch.save(speech_segments, speech_segments_tensor_path) - meta = { - 'audio_filepath': segmented_output[i - 1]['audio_filepath'], - 'speech_segments_filepath': speech_segments_tensor_path, - 'pred_text': all_pred_text, - } - - json.dump(meta, fout) - fout.write('\n') - fout.flush() - speech_segments = torch.Tensor() - all_pred_text = "" - else: - all_pred_text += " " - else: - i = -1 - - start, end = segmented_output[i]['offset'], segmented_output[i]['offset'] + segmented_output[i]['duration'] - new_seg = torch.tensor([start, end]).unsqueeze(0) - speech_segments = torch.cat((speech_segments, new_seg), 0) - pred_text = segmented_output[i]['pred_text'] - all_pred_text += pred_text - name = segmented_output[i]['audio_filepath'].split("/")[-1].rsplit(".", 1)[0] - speech_segments_tensor_path = os.path.join(speech_segments_tensor_dir, name + '.pt') - torch.save(speech_segments, speech_segments_tensor_path) - - meta = { - 'audio_filepath': segmented_output[i]['audio_filepath'], - 'speech_segments_filepath': speech_segments_tensor_path, - 'pred_text': all_pred_text, - } - json.dump(meta, fout) - fout.write('\n') - fout.flush() - - logging.info( - f"Finish stitch segmented ASR output to {stitched_output_manifest}, the speech segments info has been stored in directory {speech_segments_tensor_dir}" - ) - return stitched_output_manifest - - -def construct_manifest_eval( - input_manifest: str, stitched_output_manifest: str, aligned_vad_asr_output_manifest: str = "vad_asr_out.json" -) -> str: - - """ - Generate aligned manifest for evaluation. - Because some pure noise samples might not appear in stitched_output_manifest. 
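For reference, each line that `stitch_segmented_asr_output` writes to the stitched manifest is a JSON object of the following shape (paths and text are made up):

```python
# Shape of one line in asr_stitched_output_manifest.json (illustrative values only).
meta = {
    "audio_filepath": "audio/session_01.wav",
    "speech_segments_filepath": "speech_segments/session_01.pt",
    "pred_text": "hello world this is a test",
}
```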
- """ - stitched_output = dict() - with open(stitched_output_manifest, 'r', encoding='utf-8') as f: - for line in f: - file = json.loads(line) - stitched_output[file["audio_filepath"]] = file - - out = [] - with open(input_manifest, 'r', encoding='utf-8') as f: - for line in f: - file = json.loads(line) - sample = file["audio_filepath"] - if sample in stitched_output: - file["pred_text"] = stitched_output[sample]["pred_text"] - file["speech_segments_filepath"] = stitched_output[sample]["speech_segments_filepath"] - else: - file["pred_text"] = "" - file["speech_segments_filepath"] = "" - - out.append(file) - - with open(aligned_vad_asr_output_manifest, 'w', encoding='utf-8') as fout: - for i in out: - json.dump(i, fout) - fout.write('\n') - fout.flush() - - return aligned_vad_asr_output_manifest - - -def load_rttm_file(filepath: str) -> pd.DataFrame: - """ - Load rttm file and extract speech segments - """ - if not Path(filepath).exists(): - raise ValueError(f"File not found: {filepath}") - data = pd.read_csv(filepath, sep="\s+", delimiter=None, header=None) - data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"}) - - data['start'] = data['start'].astype(float) - data['dur'] = data['dur'].astype(float) - data['end'] = data['start'] + data['dur'] - - data = data.sort_values(by=['start']) - data['segment'] = list(zip(data['start'], data['end'])) - - return data - - -def merge_intervals(intervals: List[List[float]]) -> List[List[float]]: - """ - Merge speech segments into non-overlapping segments - """ - intervals.sort(key=lambda x: x[0]) - merged = [] - for interval in intervals: - # if the list of merged intervals is empty or if the current - # interval does not overlap with the previous, simply append it. - if not merged or merged[-1][1] < interval[0]: - merged.append(interval) - else: - # otherwise, there is overlap, so we merge the current and previous - # intervals. - merged[-1][1] = max(merged[-1][1], interval[1]) - return merged - - -def load_speech_segments_from_rttm(rttm_file: str) -> List[List[float]]: - """ - load speech segments from rttm file, where each segment is represented - as [start, end] interval - """ - speech_segments = list(load_rttm_file(rttm_file)['segment']) - speech_segments = [list(x) for x in speech_segments] - speech_segments = merge_intervals(speech_segments) - return speech_segments - - -def load_speech_overlap_segments_from_rttm(rttm_file: str) -> Tuple[List[List[float]], List[List[float]]]: - """ - Load speech segments from RTTM file, merge and extract possible overlaps - - Args: - rttm_file (str): Path to RTTM file - - Returns: - merged (List[List[float]]): merged speech intervals without overlaps - overlaps (List[List[float]]): intervals with overlap speech - """ - speech_segments = list(load_rttm_file(rttm_file)['segment']) - speech_segments = [list(x) for x in speech_segments] - speech_segments.sort(key=lambda x: x[0]) # sort by start time - merged = [] - overlaps = [] - for interval in speech_segments: - # if the list of merged intervals is empty or if the current - # interval does not overlap with the previous, simply append it. - if not merged or merged[-1][1] < interval[0]: - merged.append(interval) - else: - # otherwise, there is overlap, so we merge the current and previous - # intervals. 
- overlaps.append([interval[0], min(merged[-1][1], interval[1])]) - merged[-1][1] = max(merged[-1][1], interval[1]) - return merged, overlaps - - -def get_nonspeech_segments( - speech_segments: List[List[float]], max_duration: Optional[float] = None -) -> List[List[float]]: - """ - Get non-speech segments from given speech segments and maximum duration - - Args: - speech_segments (List[List[float]]): speech segment intervals loaded by load_speech_segments() - max_duration (Optional[float]): maximum duration of the audio, used to calculate the last silence segment - - Returns: - nonspeech_segments (List[List[float]]): intervals of non-speech segments - """ - nonspeech_segments = [] - start = 0.0 - for sp_seg in speech_segments: - end = sp_seg[0] - nonspeech_segments.append([start, end]) - start = sp_seg[1] - - if max_duration is not None and start < max_duration: - nonspeech_segments.append([start, max_duration]) - - return nonspeech_segments - - -def get_frame_labels( - segments: List[List[float]], frame_length: float, offset: float, duration: float, as_str: bool = True -) -> str: - """ - Generate frame-level binary labels for audio, '0' for non-speech and '1' for speech - - Args: - segments (List[List[float]]): speech segments loaded by load_speech_segments_from_rttm - frame_length (float): frame length in seconds, e.g. 0.01 for 10ms frames - offset (float): Offset of the audio clip - duration (float): duration of the audio clip - """ - labels = [] - n_frames = int(np.ceil(duration / frame_length)) - sid = 0 - for i in range(n_frames): - t = offset + i * frame_length - while sid < len(segments) - 1 and segments[sid][1] < t: - sid += 1 - if segments[sid][1] != 0 and segments[sid][0] <= t <= segments[sid][1]: - labels.append(1) - else: - labels.append(0) - if as_str: - return ' '.join([str(x) for x in labels]) - return [float(x) for x in labels] - - -def plot_sample_from_rttm( - audio_file: str, - rttm_file: str, - max_duration: Optional[float] = None, - save_path: str = "", - show: bool = True, - offset: float = 0.0, - unit_frame_len: float = 0.01, -): - """ - Plot audio signal and frame-level labels from RTTM file - """ - plt.figure(figsize=[20, 2]) - - audio, sample_rate = librosa.load(path=audio_file, sr=16000, mono=True, offset=offset, duration=max_duration) - dur = librosa.get_duration(y=audio, sr=sample_rate) - - segments = load_speech_segments_from_rttm(rttm_file) - labels = get_frame_labels(segments, unit_frame_len, offset, dur) - labels = [float(x) for x in labels.split()] - - length = len(labels) - ax1 = plt.subplot() - ax1.set_title(audio_file) - ax1.plot(np.arange(audio.size) / sample_rate, audio, 'gray') - ax1.set_xlim([0, int(dur) + 1]) - ax1.tick_params(axis='y', labelcolor='b') - ax1.set_ylabel('Signal') - ax1.set_ylim([-1, 1]) - ax2 = ax1.twinx() - - ax2.plot(np.arange(length) * unit_frame_len, labels, 'r', label='label') - ax2.tick_params(axis='y', labelcolor='r') - ax2.legend(loc='lower right', shadow=True) - ax2.set_ylabel('Labels') - ax2.set_ylim([-0.1, 1.1]) - if show: - plt.show() - if save_path: - plt.savefig(save_path) - return ipd.Audio(audio, rate=16000) - - -def align_labels_to_frames(probs, labels, threshold=0.2): - """ - Aligns labels to frames when the frame length (e.g., 10ms) is different from the label length (e.g., 20ms). - The threshold 0.2 is not important, since the actual ratio will always be close to an integer unless using frame/label - lengths that are not multiples of each other (e.g., 15ms frame length and 20ms label length), which is not valid. 
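Returning to `get_nonspeech_segments` above, a quick worked example:

```python
# Speech intervals in seconds, plus the total clip duration.
speech_segments = [[1.0, 2.0], [4.0, 5.0]]
print(get_nonspeech_segments(speech_segments, max_duration=10.0))
# [[0.0, 1.0], [2.0, 4.0], [5.0, 10.0]]
```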
- The value 0.2 here is just for easier unit testing. - Args: - probs (List[float]): list of probabilities - labels (List[int]): list of labels - threshold (float): threshold for rounding ratio to integer - Returns: - labels (List[int]): list of labels aligned to frames - """ - frames_len = len(probs) - labels_len = len(labels) - probs = torch.tensor(probs).float() - labels = torch.tensor(labels).long() - - if frames_len < labels_len: - # pad labels with zeros until labels_len is a multiple of frames_len - ratio = labels_len / frames_len - res = labels_len % frames_len - if ( - ceil(ratio) - ratio < threshold - ): # e.g., ratio = 2.9, ceil(ratio) = 3, then we pad labels to make it a multiple of 3 - # pad labels with zeros until labels_max_len is a multiple of logits_max_len - labels = labels.tolist() - if len(labels) % ceil(ratio) != 0: - labels += [0] * (ceil(ratio) - len(labels) % ceil(ratio)) - labels = torch.tensor(labels).long() - labels = labels.view(-1, ceil(ratio)).amax(1) - return align_labels_to_frames(probs.tolist(), labels.long().tolist()) - # otherwise, truncate additional labels until labels_max_len is a multiple of logits_max_len - if res > 0: - labels = labels[:-res] - labels = labels.view(-1, floor(ratio)).amax(1) - return labels.long().tolist() - elif frames_len > labels_len: - # repeat labels until labels_len is a multiple of frames_len - ratio = frames_len / labels_len - res = frames_len % labels_len - if ceil(ratio) - ratio < threshold: - # e.g., ratio is 1.83, ceil(ratio) = 2, then we repeat labels to make it a multiple of 2, and discard the redundant labels - labels = labels.repeat_interleave(ceil(ratio), dim=0).long().tolist() - labels = labels[:frames_len] - else: - # e.g., ratio is 2.02, floor(ratio) = 2, then we repeat labels to make it a multiple of 2 and add additional labels - labels = labels.repeat_interleave(floor(ratio), dim=0).long().tolist() - if res > 0: - labels += labels[-res:] - return labels - else: - return labels.long().tolist() - - -def read_rttm_as_pyannote_object(rttm_file: str, speaker_override: Optional[str] = None) -> Annotation: - """ - Read rttm file and construct a Pyannote object. - Args: - rttm_file(str) : path of rttm file. - speaker_override(str) : if not None, all speakers will be replaced by this value. - Returns: - annotation(pyannote.Annotation): annotation object - """ - annotation = Annotation() - data = pd.read_csv(rttm_file, sep="\s+", delimiter=None, header=None) - data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"}) - for index, row in data.iterrows(): - if speaker_override is not None: - annotation[Segment(row['start'], row['start'] + row['dur'])] = speaker_override - else: - annotation[Segment(row['start'], row['start'] + row['dur'])] = row['speaker'] - return annotation - - -def convert_labels_to_speech_segments(labels: List[float], frame_length_in_sec: float = 0.01): - """ - Convert a list of labels to a list of speech segments. 
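To illustrate `align_labels_to_frames` above: when there are exactly twice as many frames as labels (for instance 10 ms frames scored against 20 ms labels), each label is simply repeated. A minimal example:

```python
# 10 frame-level probabilities but only 5 labels.
probs = [0.1] * 10
labels = [0, 1, 1, 0, 1]
print(align_labels_to_frames(probs, labels))
# [0, 0, 1, 1, 1, 1, 0, 0, 1, 1]
```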
- Args: - labels (List[float]): list of labels - frame_length_in_sec (float): frame length in seconds - Returns: - segments (List[Tuple[float, float]]): list of speech segments - """ - segments = [] - start = -1 - for i, label in enumerate(labels): - if label == 1: - if start == -1: - start = i * frame_length_in_sec - else: - if start > -1: - segments.append([start, (i - 1) * frame_length_in_sec]) - start = -1 - if start != -1: - segments.append([start, (len(labels) - 1) * frame_length_in_sec]) - return segments - - -def frame_vad_construct_pyannote_object_per_file( - prediction: Union[str, List[float]], groundtruth: Union[str, List[float]], frame_length_in_sec: float = 0.01 -) -> Tuple[Annotation, Annotation]: - """ - Construct a Pyannote object for evaluation. - Args: - prediction (str) : path of VAD predictions stored as RTTM or CSV-like txt. - groundtruth (str): path of groundtruth rttm file. - frame_length_in_sec(float): frame length in seconds - Returns: - reference(pyannote.Annotation): groundtruth - hypothesis(pyannote.Annotation): prediction - """ - - hypothesis = Annotation() - if isinstance(groundtruth, str) and prediction.endswith('.rttm'): - hypothesis = read_rttm_as_pyannote_object(prediction, speaker_override='speech') - elif isinstance(groundtruth, str) and prediction.endswith('.txt'): - pred = pd.read_csv(prediction, sep=" ", header=None) - for index, row in pred.iterrows(): - hypothesis[Segment(float(row[0]), float(row[0]) + float(row[1]))] = 'speech' - elif isinstance(groundtruth, list): - segments = convert_labels_to_speech_segments(prediction, frame_length_in_sec) - for segment in segments: - hypothesis[Segment(segment[0], segment[1])] = 'speech' - else: - raise ValueError('prediction must be a path to rttm file or a list of frame labels.') - - reference = Annotation() - if isinstance(groundtruth, str) and groundtruth.endswith('.rttm'): - reference = read_rttm_as_pyannote_object(groundtruth, speaker_override='speech') - elif isinstance(groundtruth, list): - segments = convert_labels_to_speech_segments(groundtruth, frame_length_in_sec) - for segment in segments: - reference[Segment(segment[0], segment[1])] = 'speech' - else: - raise ValueError('groundtruth must be a path to rttm file or a list of frame labels.') - return reference, hypothesis - - -def frame_vad_infer_load_manifest(cfg: DictConfig): - """ - Load manifest file and prepare label/rttm mapping - Args: - cfg: config file - Returns: - manifest_orig (List[Dict]): original manifest data - key_labels_map (Dict): mapping from unique_audio_name to its labels - key_rttm_map (Dict): mapping from unique_audio_name to its rttm file - """ - unique_audio_names = set() - key_labels_map = {} - key_rttm_map = {} - manifest_orig = [] - manifest_file = Path(cfg.dataset).absolute().as_posix() - with open(manifest_file, 'r') as fin: - for line in fin.readlines(): - entry = json.loads(line.strip()) - audio_filepath = get_full_path(audio_file=entry['audio_filepath'], manifest_file=manifest_file) - entry['audio_filepath'] = str(audio_filepath) - uniq_audio_name = Path(audio_filepath).stem - - if uniq_audio_name in unique_audio_names: - raise ValueError("Please make sure each line is with different audio_filepath! 
") - else: - unique_audio_names.add(uniq_audio_name) - - manifest_orig.append(entry) - - # always prefer RTTM labels if exist - if "label" not in entry and ("rttm_filepath" in entry or "rttm_file" in entry): - rttm_key = "rttm_filepath" if "rttm_filepath" in entry else "rttm_file" - segments = load_speech_segments_from_rttm(entry[rttm_key]) - label_str = get_frame_labels( - segments=segments, - frame_length=cfg.vad.parameters.shift_length_in_sec, - duration=entry['duration'], - offset=entry['offset'], - ) - key_rttm_map[uniq_audio_name] = entry[rttm_key] - key_labels_map[uniq_audio_name] = [float(x) for x in label_str.split()] - elif entry.get("label", None) is not None: - key_labels_map[uniq_audio_name] = [float(x) for x in entry["label"].split()] - elif cfg.evaluate: - raise ValueError("Must have either `label` or `rttm_filepath` in manifest when evaluate=True") - - return manifest_orig, key_labels_map, key_rttm_map - - -def frame_vad_eval_detection_error( - pred_dir: str, key_labels_map: dict, key_rttm_map: dict, key_pred_rttm_map: dict, frame_length_in_sec: float -): - """ - Perform evaluation on frame-VAD results - Args: - pred_dir: directory of frame-VAD prediction files with in `.frame` format - key_labels_map: dictionary of mapping each to its labels - key_rttm_map: dictionary of mapping each to its GROUNDTRUTH rttm file - key_pred_rttm_map: dictionary of mapping each to its PREDICTED rttm file - frame_length_in_sec: frame length in seconds, e.g. 0.02s - Returns: - auroc: AUROC score in 0~100% - report: Pyannote detection.DetectionErrorRate() report - """ - all_probs = [] - all_labels = [] - metric = detection.DetectionErrorRate() - key_probs_map = {} - predictions_list = list(Path(pred_dir).glob("*.frame")) - for frame_pred in tqdm(predictions_list, desc="Evaluating VAD results", total=len(predictions_list)): - pred_probs = [] - with frame_pred.open("r") as fin: - for line in fin.readlines(): - line = line.strip() - if not line: - continue - pred_probs.append(float(line)) - key = frame_pred.stem - key_probs_map[key] = pred_probs - key_labels_map[key] = align_labels_to_frames(probs=pred_probs, labels=key_labels_map[key]) - all_probs.extend(key_probs_map[key]) - all_labels.extend(key_labels_map[key]) - - if key in key_rttm_map: - groundtruth = key_rttm_map[key] - else: - groundtruth = key_labels_map[key] - - reference, hypothesis = frame_vad_construct_pyannote_object_per_file( - prediction=key_pred_rttm_map[key], groundtruth=groundtruth, frame_length_in_sec=frame_length_in_sec, - ) - metric(reference, hypothesis) - - auroc = roc_auc_score(y_true=all_labels, y_score=all_probs) - report = metric.report(display=False) - return auroc, report diff --git a/SoundScribe/SpeakerID/nemo/collections/common/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/__init__.py deleted file mode 100644 index 9c203622104b054550ec59232bdfad328f147e13..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import nemo.collections.common.callbacks -from nemo.collections.common import data, losses, parts, tokenizers -from nemo.package_info import __version__ - -# Set collection version equal to NeMo version. -__version = __version__ - -# Authorship. -__author__ = "NVIDIA Corporation" - -# Set collection name. -__description__ = "Common collection" diff --git a/SoundScribe/SpeakerID/nemo/collections/common/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c17a39b309844b3005873ca30c2d189d6ffdb17f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__init__.py deleted file mode 100644 index 0cf495d946960af72276cde51d1f546385356a1d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
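The frame-label helpers deleted earlier in this diff (align_labels_to_frames and convert_labels_to_speech_segments) are easiest to follow with a toy example. The sketch below uses made-up frame counts and labels, and assumes the helpers are importable from their upstream NeMo location (nemo.collections.asr.parts.utils.vad_utils); it is illustrative only and not part of the deleted code.

```
# Toy walk-through of the frame-label helpers; all numbers are made up.
# The import path is the upstream NeMo location of these helpers (an assumption here).
from nemo.collections.asr.parts.utils.vad_utils import (
    align_labels_to_frames,
    convert_labels_to_speech_segments,
)

probs = [0.1, 0.8, 0.9, 0.7, 0.2, 0.1]           # 6 prediction frames
labels = [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]    # 12 reference frames (2x the frame rate)

# Reference labels are max-pooled 2:1 onto the 6 prediction frames.
aligned = align_labels_to_frames(probs, labels)  # -> [1, 1, 0, 0, 0, 0]

# Frame labels become (start, end) times at 20 ms per frame.
segments = convert_labels_to_speech_segments(aligned, frame_length_in_sec=0.02)
# -> [[0.0, 0.02]]
```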
- -from nemo.collections.common.callbacks.callbacks import LogEpochTimeCallback -from nemo.collections.common.callbacks.ema import EMA diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8250298ee1895e112fb8cabce17a140e8428fa1f..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/callbacks.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/callbacks.cpython-310.pyc deleted file mode 100644 index a936a1c1a81bb0a1dfadc9a282a68b03dd1ce4e8..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/callbacks.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/ema.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/ema.cpython-310.pyc deleted file mode 100644 index 87b5c73bfc64e6acef5b5256ff480b547287ccae..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/__pycache__/ema.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/callbacks.py b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/callbacks.py deleted file mode 100644 index 1a6c011c38dfaf292108d9ca903b2f133b991f92..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/callbacks.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time - -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.utilities import rank_zero_only - -# from sacrebleu import corpus_bleu - - -class LogEpochTimeCallback(Callback): - """Simple callback that logs how long each epoch takes, in seconds, to a pytorch lightning log - """ - - @rank_zero_only - def on_train_epoch_start(self, trainer, pl_module): - self.epoch_start = time.time() - - @rank_zero_only - def on_train_epoch_end(self, trainer, pl_module): - curr_time = time.time() - duration = curr_time - self.epoch_start - trainer.logger.log_metrics({"epoch_time": duration}, step=trainer.global_step) - - -# class MachineTranslationLogEvalCallback(Callback): -# def _on_eval_end(self, trainer, pl_module, mode): -# counts = np.array(self._non_pad_tokens) -# eval_loss = np.sum(np.array(self._losses) * counts) / np.sum(counts) -# sacre_bleu = corpus_bleu(self._translations, [self._ground_truths], tokenize="13a") -# print(f"{mode} results for process with global rank {pl_module.global_rank}".upper()) -# for i in range(pl_module.num_examples[mode]): -# print('\u0332'.join(f"EXAMPLE {i}:")) # Underline output -# sent_id = np.random.randint(len(self._translations)) -# print(f"Ground truth: {self._ground_truths[sent_id]}\n") -# print(f"Translation: {self._translations[sent_id]}\n") -# print() -# print("-" * 50) -# print(f"loss: {eval_loss:.3f}") -# print(f"SacreBLEU: {sacre_bleu}") -# print("-" * 50) - -# @rank_zero_only -# def on_test_end(self, trainer, pl_module): -# self._on_eval_end(trainer, pl_module, "test") - -# @rank_zero_only -# def on_validation_end(self, trainer, pl_module): -# self._on_eval_end(trainer, pl_module, "val") - -# @rank_zero_only -# def on_sanity_check_end(self, trainer, pl_module): -# self._on_eval_end(trainer, pl_module, "val") - -# def _on_eval_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx, mode): -# self._translations.extend(outputs['translations']) -# self._ground_truths.extend(outputs['ground_truths']) -# self._non_pad_tokens.append(outputs['num_non_pad_tokens']) -# self._losses.append(outputs[f'{mode}_loss']) - -# @rank_zero_only -# def on_test_batch_end(self, trainer, pl_module, batch, outputs, batch_idx, dataloader_idx): -# self._on_eval_batch_end(trainer, pl_module, batch, outputs, batch_idx, dataloader_idx, 'test') - -# @rank_zero_only -# def on_validation_batch_end(self, trainer, pl_module, batch, outputs, batch_idx, dataloader_idx): -# self._on_eval_batch_end(trainer, pl_module, batch, outputs, batch_idx, dataloader_idx, 'val') - -# def _on_eval_start(self, trainer, pl_module): -# self._translations = [] -# self._ground_truths = [] -# self._losses = [] -# self._non_pad_tokens = [] - -# @rank_zero_only -# def on_test_start(self, trainer, pl_module): -# self._on_eval_start(trainer, pl_module) - -# @rank_zero_only -# def on_validation_start(self, trainer, pl_module): -# self._on_eval_start(trainer, pl_module) - -# @rank_zero_only -# def on_sanity_check_start(self, trainer, pl_module): -# self._on_eval_start(trainer, pl_module) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/ema.py b/SoundScribe/SpeakerID/nemo/collections/common/callbacks/ema.py deleted file mode 100644 index ec53b61e17c0d8de4d90e6afe14dd2ee6535a8b9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/callbacks/ema.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import copy -import os -import threading -from typing import Any, Dict, Iterable - -import pytorch_lightning as pl -import torch -from pytorch_lightning import Callback -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.rank_zero import rank_zero_info - - -class EMA(Callback): - """ - Implements Exponential Moving Averaging (EMA). - - When training a model, this callback will maintain moving averages of the trained parameters. - When evaluating, we use the moving averages copy of the trained parameters. - When saving, we save an additional set of parameters with the prefix `ema`. - - Args: - decay: The exponential decay used when calculating the moving average. Has to be between 0-1. - validate_original_weights: Validate the original weights, as apposed to the EMA weights. - every_n_steps: Apply EMA every N steps. - cpu_offload: Offload weights to CPU. - """ - - def __init__( - self, decay: float, validate_original_weights: bool = False, every_n_steps: int = 1, cpu_offload: bool = False, - ): - if not (0 <= decay <= 1): - raise MisconfigurationException("EMA decay value must be between 0 and 1") - self.decay = decay - self.validate_original_weights = validate_original_weights - self.every_n_steps = every_n_steps - self.cpu_offload = cpu_offload - - def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - device = pl_module.device if not self.cpu_offload else torch.device('cpu') - trainer.optimizers = [ - EMAOptimizer( - optim, - device=device, - decay=self.decay, - every_n_steps=self.every_n_steps, - current_step=trainer.global_step, - ) - for optim in trainer.optimizers - if not isinstance(optim, EMAOptimizer) - ] - - def on_validation_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if self._should_validate_ema_weights(trainer): - self.swap_model_weights(trainer) - - def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if self._should_validate_ema_weights(trainer): - self.swap_model_weights(trainer) - - def on_test_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if self._should_validate_ema_weights(trainer): - self.swap_model_weights(trainer) - - def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if self._should_validate_ema_weights(trainer): - self.swap_model_weights(trainer) - - def _should_validate_ema_weights(self, trainer: "pl.Trainer") -> bool: - return not self.validate_original_weights and self._ema_initialized(trainer) - - def _ema_initialized(self, trainer: "pl.Trainer") -> bool: - return any(isinstance(optimizer, EMAOptimizer) for optimizer in trainer.optimizers) - - def swap_model_weights(self, trainer: "pl.Trainer", saving_ema_model: bool = False): - for optimizer in trainer.optimizers: - assert isinstance(optimizer, EMAOptimizer) - 
optimizer.switch_main_parameter_weights(saving_ema_model) - - @contextlib.contextmanager - def save_ema_model(self, trainer: "pl.Trainer"): - """ - Saves an EMA copy of the model + EMA optimizer states for resume. - """ - self.swap_model_weights(trainer, saving_ema_model=True) - try: - yield - finally: - self.swap_model_weights(trainer, saving_ema_model=False) - - @contextlib.contextmanager - def save_original_optimizer_state(self, trainer: "pl.Trainer"): - for optimizer in trainer.optimizers: - assert isinstance(optimizer, EMAOptimizer) - optimizer.save_original_optimizer_state = True - try: - yield - finally: - for optimizer in trainer.optimizers: - optimizer.save_original_optimizer_state = False - - def on_load_checkpoint( - self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any] - ) -> None: - checkpoint_callback = trainer.checkpoint_callback - - # use the connector as NeMo calls the connector directly in the exp_manager when restoring. - connector = trainer._checkpoint_connector - # Replace connector._ckpt_path with below to avoid calling into lightning's protected API - ckpt_path = trainer.ckpt_path - - if ckpt_path and checkpoint_callback is not None and 'NeMo' in type(checkpoint_callback).__name__: - ext = checkpoint_callback.FILE_EXTENSION - if ckpt_path.endswith(f'-EMA{ext}'): - rank_zero_info( - "loading EMA based weights. " - "The callback will treat the loaded EMA weights as the main weights" - " and create a new EMA copy when training." - ) - return - ema_path = ckpt_path.replace(ext, f'-EMA{ext}') - if os.path.exists(ema_path): - ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu')) - - checkpoint['optimizer_states'] = ema_state_dict['optimizer_states'] - del ema_state_dict - rank_zero_info("EMA state has been restored.") - else: - raise MisconfigurationException( - "Unable to find the associated EMA weights when re-loading, " - f"training will start with new EMA weights. Expected them to be at: {ema_path}", - ) - - -@torch.no_grad() -def ema_update(ema_model_tuple, current_model_tuple, decay): - torch._foreach_mul_(ema_model_tuple, decay) - torch._foreach_add_( - ema_model_tuple, current_model_tuple, alpha=(1.0 - decay), - ) - - -def run_ema_update_cpu(ema_model_tuple, current_model_tuple, decay, pre_sync_stream=None): - if pre_sync_stream is not None: - pre_sync_stream.synchronize() - - ema_update(ema_model_tuple, current_model_tuple, decay) - - -class EMAOptimizer(torch.optim.Optimizer): - r""" - EMAOptimizer is a wrapper for torch.optim.Optimizer that computes - Exponential Moving Average of parameters registered in the optimizer. - - EMA parameters are automatically updated after every step of the optimizer - with the following formula: - - ema_weight = decay * ema_weight + (1 - decay) * training_weight - - To access EMA parameters, use ``swap_ema_weights()`` context manager to - perform a temporary in-place swap of regular parameters with EMA - parameters. - - Notes: - - EMAOptimizer is not compatible with APEX AMP O2. 
- - Args: - optimizer (torch.optim.Optimizer): optimizer to wrap - device (torch.device): device for EMA parameters - decay (float): decay factor - - Returns: - returns an instance of torch.optim.Optimizer that computes EMA of - parameters - - Example: - model = Model().to(device) - opt = torch.optim.Adam(model.parameters()) - - opt = EMAOptimizer(opt, device, 0.9999) - - for epoch in range(epochs): - training_loop(model, opt) - - regular_eval_accuracy = evaluate(model) - - with opt.swap_ema_weights(): - ema_eval_accuracy = evaluate(model) - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - device: torch.device, - decay: float = 0.9999, - every_n_steps: int = 1, - current_step: int = 0, - ): - self.optimizer = optimizer - self.decay = decay - self.device = device - self.current_step = current_step - self.every_n_steps = every_n_steps - self.save_original_optimizer_state = False - - self.first_iteration = True - self.rebuild_ema_params = True - self.stream = None - self.thread = None - - self.ema_params = () - self.in_saving_ema_model_context = False - - def all_parameters(self) -> Iterable[torch.Tensor]: - return (param for group in self.param_groups for param in group['params']) - - def step(self, closure=None, **kwargs): - self.join() - - if self.first_iteration: - if any(p.is_cuda for p in self.all_parameters()): - self.stream = torch.cuda.Stream() - - self.first_iteration = False - - if self.rebuild_ema_params: - opt_params = list(self.all_parameters()) - - self.ema_params += tuple( - copy.deepcopy(param.data.detach()).to(self.device) for param in opt_params[len(self.ema_params) :] - ) - self.rebuild_ema_params = False - - loss = self.optimizer.step(closure) - - if self._should_update_at_step(): - self.update() - self.current_step += 1 - return loss - - def _should_update_at_step(self) -> bool: - return self.current_step % self.every_n_steps == 0 - - @torch.no_grad() - def update(self): - if self.stream is not None: - self.stream.wait_stream(torch.cuda.current_stream()) - - with torch.cuda.stream(self.stream): - current_model_state = tuple( - param.data.to(self.device, non_blocking=True) for param in self.all_parameters() - ) - - if self.device.type == 'cuda': - ema_update(self.ema_params, current_model_state, self.decay) - - if self.device.type == 'cpu': - self.thread = threading.Thread( - target=run_ema_update_cpu, args=(self.ema_params, current_model_state, self.decay, self.stream,), - ) - self.thread.start() - - def swap_tensors(self, tensor1, tensor2): - tmp = torch.empty_like(tensor1) - tmp.copy_(tensor1) - tensor1.copy_(tensor2) - tensor2.copy_(tmp) - - def switch_main_parameter_weights(self, saving_ema_model: bool = False): - self.join() - self.in_saving_ema_model_context = saving_ema_model - for param, ema_param in zip(self.all_parameters(), self.ema_params): - self.swap_tensors(param.data, ema_param) - - @contextlib.contextmanager - def swap_ema_weights(self, enabled: bool = True): - r""" - A context manager to in-place swap regular parameters with EMA - parameters. - It swaps back to the original regular parameters on context manager - exit. 
- - Args: - enabled (bool): whether the swap should be performed - """ - - if enabled: - self.switch_main_parameter_weights() - try: - yield - finally: - if enabled: - self.switch_main_parameter_weights() - - def __getattr__(self, name): - return getattr(self.optimizer, name) - - def join(self): - if self.stream is not None: - self.stream.synchronize() - - if self.thread is not None: - self.thread.join() - - def state_dict(self): - self.join() - - if self.save_original_optimizer_state: - return self.optimizer.state_dict() - - # if we are in the context of saving an EMA model, the EMA weights are in the modules' actual weights - ema_params = self.ema_params if not self.in_saving_ema_model_context else list(self.all_parameters()) - state_dict = { - 'opt': self.optimizer.state_dict(), - 'ema': ema_params, - 'current_step': self.current_step, - 'decay': self.decay, - 'every_n_steps': self.every_n_steps, - } - return state_dict - - def load_state_dict(self, state_dict): - self.join() - - self.optimizer.load_state_dict(state_dict['opt']) - self.ema_params = tuple(param.to(self.device) for param in copy.deepcopy(state_dict['ema'])) - self.current_step = state_dict['current_step'] - self.decay = state_dict['decay'] - self.every_n_steps = state_dict['every_n_steps'] - self.rebuild_ema_params = False - - def add_param_group(self, param_group): - self.optimizer.add_param_group(param_group) - self.rebuild_ema_params = True diff --git a/SoundScribe/SpeakerID/nemo/collections/common/data/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/data/__init__.py deleted file mode 100644 index ecc67ef05ea564e23eb42ad4de2922a555bd9c3e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/data/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
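As a usage illustration for the EMA callback and EMAOptimizer removed above, here is a minimal sketch. MyModel, train_loader, and evaluate are hypothetical placeholders and the decay value is arbitrary; none of this is taken from the deleted code.

```
# Minimal sketch: attach the EMA and LogEpochTimeCallback callbacks to a Lightning Trainer.
# MyModel, train_loader, and evaluate are hypothetical placeholders.
import pytorch_lightning as pl

from nemo.collections.common.callbacks import EMA, LogEpochTimeCallback

model = MyModel()                          # any pl.LightningModule
callbacks = [
    EMA(decay=0.999, every_n_steps=1),     # keep an exponential moving average of the weights
    LogEpochTimeCallback(),                # log seconds per epoch
]
trainer = pl.Trainer(max_epochs=10, callbacks=callbacks)
trainer.fit(model, train_loader)

# Validation and test runs then use the EMA weights (unless validate_original_weights=True);
# they can also be swapped in manually through the wrapped optimizers:
# for opt in trainer.optimizers:
#     with opt.swap_ema_weights():
#         evaluate(model)
```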
- -from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset, ConcatMapDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 918e9cd85ae547890cf886efe68821f070403193..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/dataset.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/dataset.cpython-310.pyc deleted file mode 100644 index bf351eef3333ddd3f7c66674e9c880d74afc4c6e..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/data/__pycache__/dataset.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/data/dataset.py b/SoundScribe/SpeakerID/nemo/collections/common/data/dataset.py deleted file mode 100644 index 5b4fba5ef24a5ff07eca61809af39662119f947c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/data/dataset.py +++ /dev/null @@ -1,659 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import logging -from typing import Any, List, Optional, Tuple, Union - -import numpy as np -import torch -import torch.utils.data as pt_data -from torch.utils.data import Dataset, IterableDataset - -__all__ = ['ConcatDataset', 'ConcatMapDataset', 'CodeSwitchedDataset'] - - -class ConcatDataset(IterableDataset): - """ - A dataset that accepts as argument multiple datasets and then samples from them based on the specified - sampling technique. - Args: - datasets (list): A list of datasets to sample from. - shuffle (bool): Whether to shuffle individual datasets. Only works with non-iterable datasets. - Defaults to True. - sampling_technique (str): Sampling technique to choose which dataset to draw a sample from. - Defaults to 'temperature'. Currently supports 'temperature', 'random' and 'round-robin'. - sampling_temperature (int): Temperature value for sampling. Only used when sampling_technique = 'temperature'. - Defaults to 5. - sampling_scale: Gives you the ability to upsample / downsample the dataset. Defaults to 1. - sampling_probabilities (list): Probability values for sampling. Only used when sampling_technique = 'random'. - seed: Optional value to seed the numpy RNG. - global_rank (int): Worker rank, used for partitioning map style datasets. Defaults to 0. - world_size (int): Total number of processes, used for partitioning map style datasets. Defaults to 1. 
- """ - - def __init__( - self, - datasets: List[Any], - shuffle: bool = True, - sampling_technique: str = 'temperature', - sampling_temperature: int = 5, - sampling_scale: int = 1, - sampling_probabilities: List[float] = None, - seed: Optional[int] = None, - global_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - - supported_sampling_techniques = ['temperature', 'random', 'round-robin'] - self.datasets = datasets - self.iterables = [None] * len(datasets) - self.shuffle = shuffle - self.global_rank = global_rank - self.world_size = world_size - self.sampling_kwargs = {} - self.sampling_scale = sampling_scale - - if sampling_technique == 'temperature': - self.index_generator = ConcatDataset.temperature_generator - self.sampling_kwargs['temperature'] = sampling_temperature - self.sampling_kwargs['seed'] = seed - elif sampling_technique == 'random': - self.index_generator = ConcatDataset.random_generator - self.sampling_kwargs['p'] = sampling_probabilities - self.sampling_kwargs['seed'] = seed - elif sampling_technique == 'round-robin': - self.index_generator = ConcatDataset.round_robin_generator - else: - raise ValueError(f"Currently we only support sampling techniques in {supported_sampling_techniques}.") - self.length = 0 - - if isinstance(datasets[0], IterableDataset): - self.kind = 'iterable' - else: - self.kind = 'map' - - for idx, dataset in enumerate(datasets): - isiterable = isinstance(dataset, IterableDataset) - if (isiterable and not self.kind == 'iterable') or (not isiterable and self.kind == 'iterable'): - raise ValueError("All datasets in ConcatDataset must be of the same kind (Iterable or Map).") - - if self.kind == 'map': - self.length += len(dataset) // world_size - else: - self.length += len(dataset) - - if self.sampling_scale != 1: - self.length = int(self.length * self.sampling_scale) - logging.info(f'applying {sampling_scale} sampling scale, concat ds len: {self.length}') - - def get_iterable(self, dataset): - if isinstance(dataset, IterableDataset): - return dataset.__iter__() - else: - indices = np.arange(len(dataset)) - if self.shuffle: - np.random.shuffle(indices) - return iter(indices) - - def __iter__(self): - worker_info = pt_data.get_worker_info() - if worker_info is None: - max_elements = self.length - wid = 0 - wnum = 1 - else: - wid = worker_info.id - wnum = worker_info.num_workers - max_elements = len(range(wid, self.length, wnum)) - - if self.kind == 'map': - for idx in range(len(self.datasets)): - start_idx = (len(self.datasets[idx]) // self.world_size) * self.global_rank - end_idx = start_idx + (len(self.datasets[idx]) // self.world_size) - if self.global_rank == self.world_size - 1: - end_idx = len(self.datasets[idx]) - indices = range(start_idx + wid, end_idx, wnum) - self.datasets[idx] = pt_data.Subset(self.datasets[idx], indices) - - for idx, dataset in enumerate(self.datasets): - iterable = self.get_iterable(dataset) - self.iterables[idx] = iterable - - n = 0 - ind_gen = self.index_generator(self.datasets, **self.sampling_kwargs) - while n < max_elements: - n += 1 - try: - ind = next(ind_gen) - except StopIteration: - return - try: - val = next(self.iterables[ind]) - if self.kind == 'map': - val = self.datasets[ind][val] - yield val - except StopIteration: - self.iterables[ind] = self.get_iterable(self.datasets[ind]) - n -= 1 - - def __len__(self): - return self.length - - @staticmethod - def temperature_generator(datasets, **kwargs): - temp = kwargs.get('temperature') - if not temp: - raise ValueError("Temperature generator expects a 
'temperature' keyword argument.") - - seed = kwargs.get('seed', None) - np_rng = np.random.RandomState(seed) - lengths = [] - num = len(datasets) - for dataset in datasets: - lengths.append(len(dataset)) - - p = np.array(lengths) / np.sum(lengths) - p = np.power(p, 1 / temp) - p = p / np.sum(p) - - while True: - ind = np_rng.choice(np.arange(num), p=p) - yield ind - - @staticmethod - def round_robin_generator(datasets, **kwargs): - num = len(datasets) - while True: - for i in range(num): - yield i - - @staticmethod - def random_generator(datasets, **kwargs): - p = kwargs.get('p') - if not p: - raise ValueError("Random generator expects a 'p' keyowrd argument for sampling probabilities.") - - seed = kwargs.get('seed', None) - np_rng = np.random.RandomState(seed) - num = len(datasets) - if len(p) != num: - raise ValueError("Length of probabilities list must be equal to the number of datasets.") - - while True: - ind = np_rng.choice(np.arange(num), p=p) - yield ind - - -class ConcatMapDataset(Dataset): - """ - A dataset that accepts as argument multiple datasets and then samples from them based on the specified - sampling technique. - Args: - datasets (list): A list of datasets to sample from. - sampling_technique (str): Sampling technique to choose which dataset to draw a sample from. - Defaults to 'temperature'. Currently supports 'temperature', 'random' and 'round-robin'. - sampling_temperature (int): Temperature value for sampling. Only used when sampling_technique = 'temperature'. - Defaults to 5. - sampling_probabilities (list): Probability values for sampling. Only used when sampling_technique = 'random'. - seed: Optional value to seed the numpy RNG. - """ - - def __init__( - self, - datasets: List[Any], - sampling_technique: str = 'temperature', - sampling_temperature: int = 5, - sampling_probabilities: Optional[List[float]] = None, - seed: Optional[int] = None, - ): - super().__init__() - self.datasets = datasets - self.lengths = [len(x) for x in self.datasets] - self.sampling_technique = sampling_technique - self.sampling_temperature = sampling_temperature - self.sampling_probabilities = sampling_probabilities - self.np_rng = np.random.RandomState(seed) - - # Build a list of size `len(self)`. Each tuple contains (dataset_id, dataset_index) - self.indices: List[Tuple[int, int]] = [] - # Current position as we consume indices from each data set - dataset_positions = [0] * len(self.datasets) - # Random permutation of each dataset. Will be regenerated when exhausted. 
- shuffled_indices = [self.np_rng.permutation(len(x)) for x in self.datasets] - # Build the list of randomly-chosen datasets spanning the entire length, adhering to sampling technique - if self.sampling_technique == "round-robin": - # To exhaust longest dataset, need to draw `num_datasets * max_dataset_len` samples - total_length = max(self.lengths) * len(self.lengths) - # For round robin, iterate through each dataset - dataset_ids = np.arange(total_length) % len(self.datasets) - for dataset_id in dataset_ids: - position = dataset_positions[dataset_id] - index = shuffled_indices[dataset_id][position] - self.indices.append((dataset_id, index)) - dataset_positions[dataset_id] += 1 - if dataset_positions[dataset_id] == len(shuffled_indices[dataset_id]): - dataset_positions[dataset_id] = 0 - shuffled_indices[dataset_id] = self.np_rng.permutation(len(self.datasets[dataset_id])) - else: - # Resolve probabilities of drawing from each data set - if self.sampling_technique == "random": - if sampling_probabilities is None or len(sampling_probabilities) != len(self.datasets): - raise ValueError( - f"Need {len(self.datasets)} probabilities; got " - f"{len(sampling_probabilities) if sampling_probabilities is not None else 'None'}" - ) - p = np.array(self.sampling_probabilities) - elif self.sampling_technique == "temperature": - p = np.array([len(x) for x in self.datasets]) - p = np.power(p, 1 / self.sampling_temperature) - else: - raise ValueError(f"Couldn't interpret sampling technique: {sampling_technique}") - # Normalize probabilities - p = p / np.sum(p) - # Will randomly choose from datasets - choices = np.arange(len(self.datasets)) - # Keep going until largest dataset is exhausted. - exhausted_datasets = set() - while len(exhausted_datasets) < len(self.datasets): - # Randomly choose a dataset for each position in accordance with p - dataset_id = self.np_rng.choice(a=choices, p=p) - dataset = self.datasets[dataset_id] - # Pick next index from dataset - position = dataset_positions[dataset_id] - index = shuffled_indices[dataset_id][position] - self.indices.append((dataset_id, index)) - # Maybe reset this dataset's permutation - dataset_positions[dataset_id] += 1 - if dataset_positions[dataset_id] >= len(dataset): - shuffled_indices[dataset_id] = self.np_rng.permutation(len(dataset)) - dataset_positions[dataset_id] = 0 - exhausted_datasets.add(dataset_id) - - def __len__(self): - return len(self.indices) - - def __getitem__(self, idx): - dataset_id, dataset_index = self.indices[idx] - return self.datasets[dataset_id][dataset_index] - - -class CodeSwitchedDataset(IterableDataset): - """ - A dataset that accepts as argument multiple sub-datasets (usually from different languages, but that's not required) and then - samples from them in order to create synthetic code-switched samples of up to N different sub-datasets - Args: - datasets (list): A list of datasets - lang_probs (list): A list of probabilities (which must sum to 1) corresponding to the sampling probability for each dataset - shuffle (bool): Whether to shuffle individual datasets. Only works with non-iterable datasets. - Defaults to True. - min_duration (int): the minimum duration (secs) of each synthetic code-switched sample. Will draw randomly until this is hit. - Defaults to 4 - max_duration (int): the maximum duration (secs) of each synthetic code-switched sample. 
- Defaults to 20 - min_monolingual (float): this percentage of the dataset will be original monolingual samples - Defaults to 0.3 - means 30% - db_norm (float): will normalise the composite CS sample to this DB level - Defaults to -25.0 - pause_start (int): inserts silence equal to this value (msecs) at the start of each CS sample - Defaults to 0 - pause_join (int): inserts silence equal to this value (msecs) between all language changes in the CS sample - Defaults to 0 - pause_end (int): terminates all CS samples with silence equal to this value (msecs) - Defaults to 0 - sampling_scales (list or float): gives you the ability to upsample/downsample each individual dataset - seed: Optional value to seed the numpy RNG. - global_rank (int): Worker rank, used for partitioning map style datasets. Defaults to 0. - world_size (int): Total number of processes, used for partitioning map style datasets. Defaults to 1. - pure_random (bool): If true, then always draw random sample from lang_probs. If false, you only draw from those datasets - which you haven't sampled from yet for the composite sample - force_monochannel (bool): If true, then all output audio will be mono-channel - infinity_mode (bool): If true, then the dataset iterable will generate an infinite amount of samples - sample_rate (int): the sample rate of all audio being sent to this Dataset - augmentor (AudioAugmentor): The any perturbations you wish to have applied on the CS samples - """ - - def __init__( - self, - datasets: List[Any], - lang_probs: Optional[List[float]] = None, - shuffle: bool = True, - min_duration: int = 4, - max_duration: int = 20, - min_monolingual: float = 0.3, - db_norm: float = -25.0, - pause_start: int = 0, - pause_join: int = 0, - pause_end: int = 0, - sampling_scales: Optional[Union[float, List[float]]] = None, - seed: Optional[int] = None, - global_rank: int = 0, - world_size: int = 1, - pure_random: bool = False, - force_monochannel: bool = True, - infinity_mode: bool = False, - sample_rate: int = 16000, - augmentor: Optional['AudioAugmentor'] = None, - ): - super().__init__() - - if len(datasets) == 0: - raise ValueError("CodeSwitchedDataset must receive a non-zero length datasets dict object") - - self.datasets = datasets - self.langs = list(range(len(datasets))) - self.langs_set = set(self.langs) - self.lang_iterables = {k: None for k in self.langs} - self.lang_kind = {k: None for k in self.langs} - self.shuffle = shuffle - self.min_duration = min_duration - self.max_duration = max_duration - self.min_monolingual = min_monolingual - self.db_norm = db_norm - self.pause_start = pause_start - self.pause_join = pause_join - self.pause_end = pause_end - self.pure_random = pure_random - self.force_monochannel = force_monochannel - self.infinity_mode = infinity_mode - self.global_rank = global_rank - self.world_size = world_size - self.augmentor = augmentor - self.sample_rate = sample_rate - self.length = 0 - if lang_probs is None: - self.prob_dict = {l: 1.0 / len(self.langs) for l in self.langs} - else: - assert len(self.langs) == len( - lang_probs - ), "Size mismatch between languages and respective probs in CodeSwitchedDataset" - self.prob_dict = {l: lang_probs[l] for l in self.langs} - self.lang_probs = np.array(list(self.prob_dict.values())) - if sampling_scales is not None and not isinstance(sampling_scales, list): - self.sampling_scales = {k: sampling_scales for k in self.langs} - elif ( - sampling_scales is not None - and isinstance(sampling_scales, list) - and len(sampling_scales) == len(self.langs) 
- ): - self.sampling_scales = {k: v for k, v in zip(self.langs, sampling_scales)} - else: - self.sampling_scales = {k: 1 for k in self.langs} - - for lang, dataset in enumerate(self.datasets): - isiterable = isinstance(dataset, IterableDataset) - - if isiterable: - self.lang_kind[lang] = 'iterable' - self.length += int(len(dataset) * self.sampling_scales[lang]) - else: - self.lang_kind[lang] = 'map' - self.length += int((len(dataset) // world_size) * self.sampling_scales[lang]) - - if seed is not None: - np.random.seed(seed) - - # set this to ensure compatibility with models searching for the collate_fn - # since this class stores datasets as a dict, not list - # self.collate_fn = self.datasets[self.langs[0]].collate_fn - if hasattr(self.datasets[self.langs[0]], 'collate_fn'): - self.collate_fn = self.datasets[self.langs[0]].collate_fn - elif ( - hasattr(self.datasets[self.langs[0]], 'datasets') - and isinstance(self.datasets[self.langs[0]].datasets, list) - and len(self.datasets[self.langs[0]].datasets) > 0 - and hasattr(self.datasets[self.langs[0]].datasets[0], 'collate_fn') - ): - # support datasets that are lists of entries - self.collate_fn = self.datasets[self.langs[0]].datasets[0].collate_fn - elif ( - hasattr(self.datasets[self.langs[0]], 'datasets') - and isinstance(self.datasets[self.langs[0]].datasets, list) - and len(self.datasets[self.langs[0]].datasets) > 0 - and hasattr(self.datasets[self.langs[0]].datasets[0], 'datasets') - and isinstance(self.datasets[self.langs[0]].datasets[0].datasets, list) - and len(self.datasets[self.langs[0]].datasets[0].datasets) > 0 - and hasattr(self.datasets[self.langs[0]].datasets[0].datasets[0], 'collate_fn') - ): - # support datasets that are lists of lists - self.collate_fn = self.datasets[self.langs[0]].datasets[0].datasets[0].collate_fn - else: - raise RuntimeError("CodeSwitchedDataset could not locate a valid dataset collate_fn to bind to") - - # this method returns an iterator object for a given language ID - # it correctly handles whether the underlying dataset is IterableDataset or mappable - def get_iterable_by_lang(self, lang): - dataset = self.datasets[lang] - - if isinstance(dataset, IterableDataset): - return dataset.__iter__() - else: - indices = np.arange(len(dataset)) - if self.shuffle: - np.random.shuffle(indices) - return iter(indices) - - # this method is the main function which builds and returns a composite, synthetic code-switched - # utterance on the fly. 
It automatically works with all of the class-based variables stored to create - # the synthetic utterance - def build_single_CS_sample(self): - # get_sample_from_language returns a LongTensor for the transcripts so we create a LongTensor to hold - # all returned transcripts - comp_text = torch.LongTensor([]) - created_sample_duration_sec = 0 - created_sample_langs = [] - created_sample_audios = [] - - # if min_monolingual fires, it means we will just return a single, original monolingual utterance - # from one of our languages based on that language's probability - pure_mono = np.random.rand() <= self.min_monolingual - - # we continue to add to the composite utterance until we hit the min_duration - while created_sample_duration_sec < self.min_duration: - # we sample from only those languages which haven't already been sampled for this particular - # synthetic utterance, unless pure_random=True, in which case, you just sample with replacement - # every time - if (self.pure_random and not pure_mono) or ( - len(set(created_sample_langs)) == 0 or len(set(created_sample_langs)) == len(self.langs) - ): - lang_id = np.random.choice(self.langs, p=self.lang_probs) - # elif pure_mono: - # use this approach if you want synthetic utterances which are all monolingual - # lang_id = created_sample_langs[0] - else: - # this code is for when we need to sample from only those languages which haven't been sampled - # yet for this utterance - p = np.array(list(map(self.prob_dict.get, list(self.langs_set - set(created_sample_langs))))) - p = p / p.sum() - lang_id = np.random.choice(list(self.langs_set - set(created_sample_langs)), p=p) - - audio, audio_len, labels, labels_len, *_ = self.get_sample_from_language(lang_id) - - # in case you get an audio which is all silence we keep sampling - if audio.count_nonzero().item() == 0: - continue - - sample_duration = len(audio) / self.sample_rate - if (created_sample_duration_sec + sample_duration) > self.max_duration: - continue - - if comp_text.device != labels.device: - comp_text = comp_text.to(labels.device) - - if audio.ndim > 1 and self.force_monochannel: - audio = audio.mean(dim=-1) - - created_sample_duration_sec += sample_duration - created_sample_langs.append(lang_id) - # need to use numpy instead of torch here because we need numpy's trim_zeros function - created_sample_audios.append(audio.cpu().numpy()) - comp_text = torch.cat([comp_text, labels], dim=0) - - # we want a real, non-synth pure_mono sample so we break soon as we have one - if pure_mono: - break - - # check that all samples have the same number of channels - sample_channels = list(set([s.ndim for s in created_sample_audios])) - if len(sample_channels) > 1: - raise RuntimeError( - "Mixture of audios with different number of channels in CodeSwitchedDataset. All sources must be same number of channels." 
- ) - - multichannel = sample_channels[0] > 1 - - # we start with pause_start amount of silence (zero array) which needs the correct shape for multi/mono channel - if multichannel: - comp_audio = np.zeros( - shape=(int(self.pause_start * self.sample_rate / 1000.0), created_sample_audios[0].shape[-1]), - dtype=created_sample_audios[0].dtype, - ) - else: - comp_audio = np.zeros( - shape=(int(self.pause_start * self.sample_rate / 1000.0),), dtype=created_sample_audios[0].dtype - ) - - # iterate over all mono-lingual samples to build the final composite - for idx, wav in enumerate(created_sample_audios): - if not multichannel: - # this function only works if mono-channel - wav = np.trim_zeros(wav) - - # normalise to provided DB level - wav_norm = wav * (10.0 ** (self.db_norm / 20.0) / np.maximum(0.01, (wav ** 2).mean(axis=0) ** 0.5)) - - # this part appends the normed waveform to the existing waveform, and inserts pause_join amount of silence - # if necessary, otherwise just a straight append - if idx < len(created_sample_audios) - 1: - if multichannel: - wav_norm = np.append( - wav_norm, - np.zeros( - shape=( - int(self.pause_join * self.sample_rate / 1000.0), - created_sample_audios[0].shape[-1], - ), - dtype=comp_audio.dtype, - ), - axis=0, - ) - else: - wav_norm = np.append( - wav_norm, - np.zeros(shape=(int(self.pause_join * self.sample_rate / 1000.0),), dtype=comp_audio.dtype), - axis=0, - ) - - # this is the penultimate composite wavform, just need to add pause_end silence - comp_audio = np.append(comp_audio, wav_norm, axis=0) - - # here we add the pause_end amount of silence, in correct channel shape - if multichannel: - comp_audio = np.append( - comp_audio, - np.zeros( - shape=(int(self.pause_end * self.sample_rate / 1000.0), created_sample_audios[0].shape[-1]), - dtype=comp_audio.dtype, - ), - axis=0, - ) - else: - comp_audio = np.append( - comp_audio, - np.zeros(shape=(int(self.pause_end * self.sample_rate / 1000.0),), dtype=comp_audio.dtype), - axis=0, - ) - - # we only want augmentation to happen on the final, synthetic utterance, and not on any of the individual - # languages, which is why we set augmentor=None when building the individual language datasets in audio_to_text_dataset.get_code_switched_dataset - # here we now apply augmentation to the final, synthetic utterance only - # all of this logic here happens in-memory, nothing is written to disk - if self.augmentor is not None: - # import here to avoid circular import error - # import here because otherwise CI test-nlp-imports fails since soundfile is only in requirements_asr and not in requirements_common - import soundfile as sf - - from nemo.collections.asr.parts.preprocessing import AudioSegment - - mb = io.BytesIO() - sf.write(mb, comp_audio, self.sample_rate, format='WAV') - mb.seek(0) - comp_audio_as = AudioSegment.from_file(mb, target_sr=self.sample_rate) - self.augmentor.perturb(comp_audio_as) - comp_audio = comp_audio_as.samples - - return ( - torch.tensor(comp_audio, dtype=audio.dtype, device=audio.device), - torch.tensor(len(comp_audio), device=audio_len.device).long(), - comp_text, - torch.tensor(len(comp_text), device=labels_len.device).long(), - ) - - # this is a helper method which prepares all of the iterator objects for all languages - # based on whether that language's underlying dataset is a map or an IterableDataset - def prep_underlying_datasets(self): - worker_info = pt_data.get_worker_info() - if worker_info is None: - max_elements = self.length - wid = 0 - wnum = 1 - else: - wid = worker_info.id - 
wnum = worker_info.num_workers - max_elements = len(range(wid, self.length, wnum)) - - for lang in self.langs: - if self.lang_kind[lang] == 'map': - start_idx = (len(self.datasets[lang]) // self.world_size) * self.global_rank - end_idx = start_idx + (len(self.datasets[lang]) // self.world_size) - if self.global_rank == self.world_size - 1: - end_idx = len(self.datasets[lang]) - indices = range(start_idx + wid, end_idx, wnum) - self.datasets[lang] = pt_data.Subset(self.datasets[lang], indices) - - self.lang_iterables[lang] = self.get_iterable_by_lang(lang) - - return max_elements - - # returns a sample (audio and transcript) from any underlying language stored by the class on instantiation - # the sample returned is a tensor for the audio and a tensor of ints for the transcript - # this method automatically handles StopIteration errors for the underyling language and rebuilds - # the iterator if necessary - def get_sample_from_language(self, lang): - while True: - try: - val = next(self.lang_iterables[lang]) - if self.lang_kind[lang] == 'map': - val = self.datasets[lang][val] - return val - except StopIteration: - self.lang_iterables[lang] = self.get_iterable_by_lang(lang) - - def __iter__(self): - # we create primed iterators for all languages and return the grand total of samples for each - # underlying language as a sum - max_elements = self.prep_underlying_datasets() - - if self.infinity_mode: - while True: - yield self.build_single_CS_sample() - else: - n = 0 - while n < max_elements: - yield self.build_single_CS_sample() - n += 1 - - def __len__(self): - return self.length diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/__init__.py deleted file mode 100644 index 4d780d9df4d6e8c5b056de3b4435bac40d5935f6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
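To make the temperature-based sampling used by ConcatDataset and ConcatMapDataset above concrete, here is a small numeric sketch; the dataset sizes, temperature, and seed are made up.

```
# Numeric sketch of 'temperature' sampling across datasets (made-up sizes).
import numpy as np

lengths = np.array([100_000, 10_000, 1_000])  # sizes of three hypothetical datasets
temperature = 5

p = lengths / lengths.sum()       # raw proportions: ~[0.90, 0.09, 0.01]
p = np.power(p, 1 / temperature)  # temperature > 1 flattens the distribution
p = p / p.sum()                   # renormalized: ~[0.49, 0.31, 0.20]

rng = np.random.RandomState(42)
next_dataset = rng.choice(len(lengths), p=p)  # index of the dataset to draw the next sample from
```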
- -from nemo.collections.common.losses.aggregator import AggregatorLoss -from nemo.collections.common.losses.bce_logits_loss import BCEWithLogitsLoss -from nemo.collections.common.losses.cross_entropy import CrossEntropyLoss, NLLLoss -from nemo.collections.common.losses.mse_loss import MSELoss -from nemo.collections.common.losses.multi_similarity_loss import MultiSimilarityLoss -from nemo.collections.common.losses.smoothed_cross_entropy import SmoothedCrossEntropyLoss, SmoothedNLLLoss -from nemo.collections.common.losses.spanning_loss import SpanningLoss diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 54000b12cdae3dd228bed31687e7ce2ce26db3bc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/aggregator.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/aggregator.cpython-310.pyc deleted file mode 100644 index cb436be04325d40bec28e793eb0b0effd2952fcb..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/aggregator.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/bce_logits_loss.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/bce_logits_loss.cpython-310.pyc deleted file mode 100644 index 5e210993738971c5bb15962f54c682d74f07ade8..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/bce_logits_loss.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/cross_entropy.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/cross_entropy.cpython-310.pyc deleted file mode 100644 index d3729504260d753eaee700cd3d043cde6aeba0f9..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/cross_entropy.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/mse_loss.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/mse_loss.cpython-310.pyc deleted file mode 100644 index 13d958b641f09e0967f2914ba4d3099f7f337c99..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/mse_loss.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/multi_similarity_loss.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/multi_similarity_loss.cpython-310.pyc deleted file mode 100644 index 2e9a2b29e1c8bd17ed71a78721275e9de579e0a3..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/multi_similarity_loss.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/smoothed_cross_entropy.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/smoothed_cross_entropy.cpython-310.pyc deleted file mode 100644 index 
afd696dae0cf40ca6854d1aee9730fb679062501..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/smoothed_cross_entropy.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/spanning_loss.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/spanning_loss.cpython-310.pyc deleted file mode 100644 index c3e0dfd04f3e7ddeee3f7d0d27cd1fe12f6f371b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/losses/__pycache__/spanning_loss.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/aggregator.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/aggregator.py deleted file mode 100644 index 1987ddd22bae30d6455adf87a7cc4313d9cdff1c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/aggregator.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import torch - -from nemo.core.classes import Loss, typecheck -from nemo.core.neural_types import LossType, NeuralType - -__all__ = ['AggregatorLoss'] - - -class AggregatorLoss(Loss): - """ - Sums several losses into one. - - Args: - num_inputs: number of input losses - weights: a list of coefficient for merging losses - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - input_types = {} - for i in range(self._num_losses): - input_types["loss_" + str(i + 1)] = NeuralType(elements_type=LossType()) - - return input_types - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, num_inputs: int = 2, weights: List[float] = None): - super().__init__() - self._num_losses = num_inputs - if weights is not None and len(weights) != num_inputs: - raise ValueError("Length of weights should be equal to the number of inputs (num_inputs)") - - self._weights = weights - - @typecheck() - def forward(self, **kwargs): - values = [kwargs[x] for x in sorted(kwargs.keys())] - loss = torch.zeros_like(values[0]) - for loss_idx, loss_value in enumerate(values): - if self._weights is not None: - loss = loss.add(loss_value, alpha=self._weights[loss_idx]) - else: - loss = loss.add(loss_value) - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/bce_logits_loss.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/bce_logits_loss.py deleted file mode 100644 index b65c41985afc047ad1c82489363cbfceff4e9247..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/bce_logits_loss.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import torch -from torch import nn - -from nemo.core.classes import Serialization, Typing, typecheck -from nemo.core.neural_types import LabelsType, LogitsType, LossType, MaskType, NeuralType - -__all__ = ["BCEWithLogitsLoss"] - - -class BCEWithLogitsLoss(nn.BCEWithLogitsLoss, Serialization, Typing): - """ - BCEWithLogitsLoss - - https://pytorch.org/docs/1.9.1/generated/torch.nn.BCEWithLogitsLoss.html - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "logits": NeuralType(["B"] + ["ANY"] * (self._logits_dim - 1), LogitsType()), - "labels": [NeuralType(["B"] + ["ANY"] * (self._logits_dim - 2), LabelsType())], - "loss_mask": NeuralType(["B"] + ["ANY"] * (self._logits_dim - 2), MaskType(), optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, - logits_ndim: int = 2, - weight: torch.Tensor = None, - reduction: str = "mean", - pos_weight: torch.Tensor = None, - ): - """ - Args: - logits_ndim: number of dimensions (or rank) of the logits tensor - weight: list of rescaling weight given to each class - reduction: type of the reduction over the batch - pos_weight: weight given to positive samples - """ - if pos_weight is not None and not torch.is_tensor(pos_weight): - pos_weight = torch.FloatTensor(pos_weight) - - super().__init__(weight=weight, pos_weight=pos_weight, reduction=reduction) - self._logits_dim = logits_ndim - - @typecheck() - def forward(self, logits: float, labels: List[int], loss_mask: torch.Tensor = None): - """ - Args: - logits: output of the classifier - labels: ground truth labels - """ - labels = torch.stack(labels) - labels = labels.t().float() - - return super().forward(logits, labels) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/cross_entropy.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/cross_entropy.py deleted file mode 100644 index 753cc089a981002878b1d6aaa8d002a222cc368d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/cross_entropy.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
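For orientation, a minimal, hypothetical usage sketch of the AggregatorLoss defined above; the import path follows the losses __init__ shown earlier, while the loss values are invented placeholders and not taken from this repository.

import torch
from nemo.collections.common.losses import AggregatorLoss

# Combine two already-computed scalar losses with fixed coefficients (sketch only).
agg = AggregatorLoss(num_inputs=2, weights=[0.7, 0.3])
ce_loss = torch.tensor(1.25)   # placeholder classification loss
mse_loss = torch.tensor(0.40)  # placeholder regression loss
total = agg(loss_1=ce_loss, loss_2=mse_loss)  # weighted sum: 0.7 * ce_loss + 0.3 * mse_loss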
- -import torch -from torch import nn - -from nemo.core.classes import Serialization, Typing, typecheck -from nemo.core.neural_types import LabelsType, LogitsType, LogprobsType, LossType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['CrossEntropyLoss', 'NLLLoss'] - - -class CrossEntropyLoss(nn.CrossEntropyLoss, Serialization, Typing): - """ - CrossEntropyLoss - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "logits": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 1), LogitsType()), - "labels": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), LabelsType()), - "loss_mask": NeuralType(['B'] + ['ANY'] * (self._logits_dim - 2), MaskType(), optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, logits_ndim=2, weight=None, reduction='mean', ignore_index=-100): - """ - Args: - logits_ndim (int): number of dimensions (or rank) of the logits tensor - weight (list): list of rescaling weight given to each class - reduction (str): type of the reduction over the batch - """ - if weight is not None and not torch.is_tensor(weight): - weight = torch.FloatTensor(weight) - logging.info(f"Weighted Cross Entropy loss with weight {weight}") - super().__init__(weight=weight, reduction=reduction, ignore_index=ignore_index) - self._logits_dim = logits_ndim - - @typecheck() - def forward(self, logits, labels, loss_mask=None): - """ - Args: - logits (float): output of the classifier - labels (long): ground truth labels - loss_mask (bool/float/int): tensor to specify the masking - """ - logits_flatten = torch.flatten(logits, start_dim=0, end_dim=-2) - labels_flatten = torch.flatten(labels, start_dim=0, end_dim=-1) - - if loss_mask is not None: - if loss_mask.dtype is not torch.bool: - loss_mask = loss_mask > 0.5 - loss_mask_flatten = torch.flatten(loss_mask, start_dim=0, end_dim=-1) - logits_flatten = logits_flatten[loss_mask_flatten] - labels_flatten = labels_flatten[loss_mask_flatten] - - if len(labels_flatten) == 0: - return super().forward(logits, torch.argmax(logits, dim=-1)) - - loss = super().forward(logits_flatten, labels_flatten) - return loss - - -class NLLLoss(nn.NLLLoss, Serialization, Typing): - """ - NLLLoss - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "log_probs": NeuralType(("B", "T", "D"), LogprobsType()), - "labels": NeuralType(("B", "T"), LabelsType()), - "output_mask": NeuralType(("B", "T"), MaskType(), optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, log_probs_ndim=2, weight=None, reduction='mean', ignore_index=-100): - """ - Args: - log_probs_ndim (int): number of dimensions (or rank) of the logprobs tensor - weight (list): list of rescaling weight given to each class - reduction (str): type of the reduction over the batch - ignore_index (int): mask out loss computation where labels = ignore_index - """ - if weight is not None and not torch.is_tensor(weight): - weight = torch.FloatTensor(weight) - super().__init__(weight=weight, reduction=reduction, ignore_index=ignore_index) - self._log_probs_dim = log_probs_ndim - - @typecheck() - def forward(self, log_probs, labels, loss_mask=None): - """ - Args: - log_probs (float): output log probability tensor - labels (long): ground truth labels - loss_mask (bool/float/int): tensor to specify the masking - """ - log_probs_flatten = torch.flatten(log_probs, start_dim=0, end_dim=-2) - labels_flatten = torch.flatten(labels, start_dim=0, end_dim=-1) - - if loss_mask is not None: - if loss_mask.dtype is not torch.bool: - loss_mask = loss_mask > 0.5 - loss_mask_flatten = torch.flatten(loss_mask, start_dim=0, end_dim=-1) - log_probs_flatten = log_probs_flatten[loss_mask_flatten] - labels_flatten = labels_flatten[loss_mask_flatten] - - if len(labels_flatten) == 0: - return super().forward(log_probs, torch.argmax(log_probs, dim=-1)) - - loss = super().forward(log_probs_flatten, labels_flatten) - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/mse_loss.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/mse_loss.py deleted file mode 100644 index 802e8ca49204a9bf9d439b114d4f3315d453e00f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/mse_loss.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from torch import Tensor, nn - -from nemo.core.classes import Serialization, Typing, typecheck -from nemo.core.neural_types import LabelsType, LossType, NeuralType, RegressionValuesType - -__all__ = ['MSELoss'] - - -class MSELoss(nn.MSELoss, Serialization, Typing): - """ - MSELoss - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "preds": NeuralType(tuple('B'), RegressionValuesType()), - "labels": NeuralType(tuple('B'), LabelsType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, reduction: str = 'mean'): - """ - Args: - reduction: type of the reduction over the batch - """ - super().__init__(reduction=reduction) - - @typecheck() - def forward(self, preds: Tensor, labels: Tensor) -> Tensor: - """ - Args: - preds: output of the classifier - labels: ground truth labels - """ - return super().forward(preds, labels) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/multi_similarity_loss.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/multi_similarity_loss.py deleted file mode 100644 index 022f6d6de6912043920e5df0fad9658f4eb7996c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/multi_similarity_loss.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -import torch - -from nemo.core.classes import Loss -from nemo.core.classes.common import typecheck -from nemo.core.neural_types import LabelsType, LogitsType, LossType, NeuralType -from nemo.utils import logging - -__all__ = ['MultiSimilarityLoss'] - - -class MultiSimilarityLoss(Loss): - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return {"logits": NeuralType(('B', 'D'), LogitsType()), "labels": NeuralType(('B'), LabelsType())} - - @property - def output_types(self): - """Returns definitions of module output ports. 
- """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, - scale_pos: Optional[float] = 2.0, # Params found to work best in our experiments - scale_neg: Optional[float] = 40.0, - offset: Optional[float] = 0.5, - margin: Optional[float] = 0.1, - ): - super().__init__() - self._scale_pos = scale_pos - self._scale_neg = scale_neg - self._offset = offset - self._margin = margin - self._epsilon = 1e-5 - - @typecheck() - def forward(self, logits, labels): - cos_sim = torch.matmul(logits, torch.t(logits)) - losses = [] - - for i in range(logits.size(0)): - # mine hard pairs relative to anchor i - positive_sims = cos_sim[i][labels.eq(labels[i])] - positive_sims = positive_sims[positive_sims.lt(1 - self._epsilon)] # omit identical pairs - negative_sims = cos_sim[i][labels.ne(labels[i])] - - if len(negative_sims) == 0 or len(positive_sims) == 0: - continue - - # negatives that are more similar than the least-similar positive - hard_negatives = negative_sims[negative_sims.gt(min(positive_sims) - self._margin)] - - # positives that are less similar than the most-similar negative - hard_positives = positive_sims[positive_sims.lt(max(negative_sims) + self._margin)] - - if len(hard_negatives) == 0 or len(hard_positives) == 0: - continue - - pos_term = ( - 1.0 - / self._scale_pos - * torch.log(1 + torch.sum(torch.exp(-self._scale_pos * (hard_positives - self._offset)))) - ) - neg_term = ( - 1.0 - / self._scale_neg - * torch.log(1 + torch.sum(torch.exp(self._scale_neg * (hard_negatives - self._offset)))) - ) - losses.append(pos_term + neg_term) - - if len(losses) == 0: - loss = torch.zeros([], requires_grad=True).cuda() - logging.info(f'Encountered zero loss in multisimloss, loss = {loss}. No hard examples found in the batch') - else: - loss = torch.sum(torch.stack(losses)) / logits.size(0) - - return loss diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/smoothed_cross_entropy.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/smoothed_cross_entropy.py deleted file mode 100644 index 265251acc390d1d488eedb5f74a736de231073a7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/smoothed_cross_entropy.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -import torch - -from nemo.core.classes import Exportable, Loss, NeuralModule, typecheck -from nemo.core.neural_types import LabelsType, LogprobsType, LossType, MaskType, NeuralType - -__all__ = ["SmoothedCrossEntropyLoss", "SmoothedNLLLoss"] - - -class SmoothedCrossEntropyLoss(Loss): - """ - Calculates Cross-entropy loss with label smoothing for a batch of sequences. 
- - SmoothedCrossEntropyLoss: - 1) excludes padding tokens from loss calculation - 2) allows to use label smoothing regularization - 3) allows to calculate loss for the desired number of last tokens - 4) per_token_reduction - if False disables reduction per token - - Args: - label_smoothing (float): label smoothing regularization coefficient - predict_last_k (int): parameter which sets the number of last tokens to calculate the loss for, for example - 0: (default) calculate loss on the entire sequence (e.g., NMT) - 1: calculate loss on the last token only (e.g., LM evaluation) - Intermediate values allow to control the trade-off between eval - time (proportional to the number of batches) and eval performance - (proportional to the number of context tokens) - pad_id (int): padding id - eps (float): the small eps number to avoid division buy zero - """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "log_probs": NeuralType(("B", "T", "D"), LogprobsType()), - "labels": NeuralType(("B", "T"), LabelsType()), - "output_mask": NeuralType(("B", "T"), MaskType(), optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__( - self, - pad_id: Optional[int] = None, - label_smoothing: Optional[float] = 0.0, - predict_last_k: Optional[int] = 0, - eps: float = 1e-6, - per_token_reduction: bool = True, - ): - super().__init__() - self._pad_id = pad_id - self._eps = eps - self._predict_last_k = predict_last_k - self._label_smoothing = label_smoothing - self._per_token_reduction = per_token_reduction - - @typecheck() - def forward(self, log_probs, labels, output_mask=None): - """ - Args: - log_probs: float tensor of shape batch_size x seq_len x vocab_size, values should be log probabilities - labels: int tensor of shape batch_size x seq_len - output_mask: binary tensor of shape batch_size x seq_len - eps: epsilon param to avoid divide by zero in loss calculation - """ - if output_mask is None and self._pad_id is None: - raise ValueError("Both output_mask and pad_id are None") - if output_mask is None and self._pad_id is not None: - output_mask = (labels != self._pad_id).to(log_probs.dtype) - - if output_mask.dtype is not log_probs.dtype: - output_mask = output_mask.to(log_probs.dtype) - - batch_size, seq_len, vocab_size = log_probs.size() - smoothing = vocab_size * self._label_smoothing / (vocab_size - 1) - target_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2) - - smoothing_log_probs = log_probs.mean(dim=-1) - neg_log_likelihood = (1.0 - smoothing) * target_log_probs + smoothing * smoothing_log_probs - neg_log_likelihood = neg_log_likelihood[:, -self._predict_last_k :] - output_mask = output_mask[:, -self._predict_last_k :] - - # when False avoid per token reduction - if self._per_token_reduction: - neg_log_likelihood = -torch.sum(neg_log_likelihood * output_mask) - neg_log_likelihood = neg_log_likelihood / (output_mask.sum() + self._eps) - else: - neg_log_likelihood = -(neg_log_likelihood * output_mask) - - return neg_log_likelihood - - -class SmoothedNLLLoss(NeuralModule, Exportable): - """ - Calculate negative log likelihodd for sequence input, also applies label smoothing (if set). - """ - - @property - def input_types(self): - """Returns definitions of module input ports. 
- """ - return { - "log_probs": NeuralType(("B", "T", "D"), LogprobsType()), - "labels": NeuralType(("B", "T"), LabelsType()), - "output_mask": NeuralType(("B", "T"), MaskType(), optional=True), - "lengths": NeuralType(("B"), LabelsType(), optional=True), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return {"loss": NeuralType(elements_type=LossType())} - - def __init__(self, reduction='mean', label_smoothing=0.0, eps=1e-8, **kwargs): - super().__init__() - self.reduction = reduction - self.label_smoothing = label_smoothing - self.nll_loss = torch.nn.NLLLoss(reduction='none', **kwargs) - self.eps = eps # small constant to avoid divide by zero - - @typecheck() - def forward(self, log_probs, labels, output_mask=None, lengths=None): - """ - Params: - - log_probs: BxTxC - - labels: B - - output_mask: BxT - - lengths: B - """ - - if output_mask is None and lengths is None: - output_mask = torch.ones_like(log_probs).float() - elif output_mask is None and lengths is not None: - output_mask = torch.arange(log_probs.size(1), device=log_probs.device)[None, :] < lengths[:, None] - output_mask = output_mask.float() - - log_probs = log_probs.transpose(1, 2) # BxTxC -> BxCxT - - loss = output_mask * self.nll_loss(log_probs, labels) - batch_size = loss.size(0) - if self.reduction == "mean": - loss = loss.sum() / (torch.sum(output_mask) + self.eps) - elif self.reduction == "batchmean": - loss = loss.sum() / batch_size - elif self.reduction == "batch": - loss = loss.reshape(batch_size, -1).sum(1) / (output_mask.reshape(batch_size, -1).sum(1) + self.eps) - - if self.label_smoothing == 0.0: - return loss - else: - # Regularizing Neural Networks by Penalizing Confident Output Distributions. - # https://arxiv.org/abs/1701.06548 - loss_reg = torch.mean(log_probs, dim=1) * output_mask - if self.reduction == "mean": - loss_reg = torch.sum(loss_reg) / torch.sum(output_mask) - elif self.reduction == "batchmean": - loss_reg = torch.sum(loss_reg) / labels.shape[0] - elif self.reduction == "batch": - loss_reg = loss_reg.sum(1) / output_mask.sum(1) - - return -self.label_smoothing * loss_reg + (1 - self.label_smoothing) * loss diff --git a/SoundScribe/SpeakerID/nemo/collections/common/losses/spanning_loss.py b/SoundScribe/SpeakerID/nemo/collections/common/losses/spanning_loss.py deleted file mode 100644 index a12dab64afd00f83003e8fb9fa25af3d72a360c7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/losses/spanning_loss.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from torch import nn - -from nemo.core.classes import Loss, typecheck -from nemo.core.neural_types import ChannelType, LogitsType, LossType, NeuralType - -__all__ = ['SpanningLoss'] - - -class SpanningLoss(Loss): - """ - implements start and end loss of a span e.g. for Question Answering. 
- """ - - @property - def input_types(self): - """Returns definitions of module input ports. - """ - return { - "logits": NeuralType(('B', 'T', 'D'), LogitsType()), - "start_positions": NeuralType(tuple('B'), ChannelType()), - "end_positions": NeuralType(tuple('B'), ChannelType()), - } - - @property - def output_types(self): - """Returns definitions of module output ports. - """ - return { - "loss": NeuralType(elements_type=LossType()), - "start_logits": NeuralType(('B', 'T'), LogitsType()), - "end_logits": NeuralType(('B', 'T'), LogitsType()), - } - - def __init__(self,): - super().__init__() - - @typecheck() - def forward(self, logits, start_positions, end_positions): - """ - Args: - logits: Output of question answering head, which is a token classfier. - start_positions: Ground truth start positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - end_positions: Ground truth end positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - """ - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss, start_logits, end_logits diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__init__.py deleted file mode 100644 index 322e62214ead6e482f8375852d396337359b1bd8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.common.metrics.classification_accuracy import TopKClassificationAccuracy -from nemo.collections.common.metrics.global_average_loss_metric import GlobalAverageLossMetric -from nemo.collections.common.metrics.metric_string_to_torchmetric import MetricStringToTorchMetric -from nemo.collections.common.metrics.perplexity import Perplexity diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index ffa62cef35166eb45fbfcce0aec7fe261ff3e7d7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/classification_accuracy.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/classification_accuracy.cpython-310.pyc deleted file mode 100644 index d12374098e7953c480924b1b795615246b4d80d5..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/classification_accuracy.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/global_average_loss_metric.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/global_average_loss_metric.cpython-310.pyc deleted file mode 100644 index 8fa3dd51175696900a44471047d5e28dea4b5925..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/global_average_loss_metric.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/metric_string_to_torchmetric.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/metric_string_to_torchmetric.cpython-310.pyc deleted file mode 100644 index 2595079572db8fa227ebaf00e07dfa6c5aa6ac7b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/metric_string_to_torchmetric.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/perplexity.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/perplexity.cpython-310.pyc deleted file mode 100644 index d8c03e034a940dee85519298e9d65c6dbdf038f0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/metrics/__pycache__/perplexity.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/classification_accuracy.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/classification_accuracy.py deleted file mode 100644 index 46eca7474a5ebd3af9234ffa3688afcffb291e32..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/classification_accuracy.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import re -import string -from collections import Counter -from typing import List, Union - -import torch -from torchmetrics import Metric - -__all__ = ['TopKClassificationAccuracy'] - - -class TopKClassificationAccuracy(Metric): - """ - This metric computes numerator and denominator for Overall Accuracy between logits and labels. - When doing distributed training/evaluation the result of res=TopKClassificationAccuracy(logits, labels) calls - will be all-reduced between all workers using SUM operations. - Here contains two numbers res=[correctly_predicted, total_samples]. Accuracy=correctly_predicted/total_samples. - - If used with PytorchLightning LightningModule, include correct_count and total_count inside validation_step results. - Then aggregate (sum) then at the end of validation epoch to correctly compute validation WER. - - Example: - def validation_step(self, batch, batch_idx): - ... - correct_count, total_count = self._accuracy(logits, labels) - self.val_outputs = {'val_loss': loss_value, 'val_correct_count': correct_count, 'val_total_count': total_count} - return self.val_outputs - - def on_validation_epoch_end(self): - ... - val_loss_mean = torch.stack([x['val_loss'] for x in self.val_outputs]).mean() - correct_counts = torch.stack([x['val_correct_counts'] for x in self.val_outputs]) - total_counts = torch.stack([x['val_total_counts'] for x in self.val_outputs]) - - topk_scores = compute_topk_accuracy(correct_counts, total_counts) - - tensorboard_log = {'val_loss': val_loss_mean} - for top_k, score in zip(self._accuracy.top_k, topk_scores): - tensorboard_log['val_epoch_top@{}'.format(top_k)] = score - - self.val_outputs.clear() # free memory - return {'log': tensorboard_log} - - Args: - top_k: Optional list of integers. Defaults to [1]. - - Returns: - res: a torch.Tensor object with two elements: [correct_count, total_count]. 
To correctly compute average - accuracy, compute acc=correct_count/total_count - """ - - full_state_update = True - - def __init__(self, top_k=None, dist_sync_on_step=False): - super().__init__(dist_sync_on_step=dist_sync_on_step) - - if top_k is None: - top_k = [1] - - self.top_k = top_k - self.add_state( - "correct_counts_k", default=torch.zeros(len(self.top_k)), dist_reduce_fx='sum', persistent=False - ) - self.add_state("total_counts_k", default=torch.zeros(len(self.top_k)), dist_reduce_fx='sum', persistent=False) - - @torch.no_grad() - def top_k_predicted_labels(self, logits: torch.Tensor) -> torch.Tensor: - max_k = max(self.top_k) - _, predictions = logits.topk(max_k, dim=1, largest=True, sorted=True) - return predictions - - def update(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - predictions = self.top_k_predicted_labels(logits) - predictions = predictions.t() - correct = predictions.eq(labels.view(1, -1)).expand_as(predictions) - - correct_counts_k = [] - total_counts_k = [] - - for k in self.top_k: - correct_k = correct[:k].reshape(-1).long().sum() - total_k = labels.shape[0] - - correct_counts_k.append(correct_k) - total_counts_k.append(total_k) - - self.correct_counts_k = torch.tensor(correct_counts_k, dtype=labels.dtype, device=labels.device) - self.total_counts_k = torch.tensor(total_counts_k, dtype=labels.dtype, device=labels.device) - - def compute(self): - """ - Computes the top-k accuracy. - - Returns: - A list of length `K`, such that k-th index corresponds to top-k accuracy - over all distributed processes. - """ - if not len(self.correct_counts_k) == len(self.top_k) == len(self.total_counts_k): - raise ValueError("length of counts must match to topk length") - - if self.top_k == [1]: - return [self.correct_counts_k.float() / self.total_counts_k] - - else: - top_k_scores = compute_topk_accuracy(self.correct_counts_k, self.total_counts_k) - - return top_k_scores - - @property - def top_k(self) -> List[int]: - return self._top_k - - @top_k.setter - def top_k(self, value: List[int]): - if value is None: - value = [1] - - if type(value) == int: - value = [value] - - if type(value) != list: - value = list(value) - - self._top_k = value - - -def compute_topk_accuracy(correct_counts_k, total_counts_k): - """ - Computes the top-k accuracy - Args: - correct_counts: Tensor of shape [K], K being the top-k parameter. - total_counts: Tensor of shape [K], and K being the top-k parameter. - Returns: - A list of length `K`, such that k-th index corresponds to top-k accuracy - over all distributed processes. 
- """ - top_k_scores = [] - - for ki in range(len(correct_counts_k)): - correct_count = correct_counts_k[ki].item() - total_count = total_counts_k[ki].item() - top_k_scores.append(correct_count / float(total_count)) - - return top_k_scores - - -class ExactStringPerCategoryMatchMetric(Metric): - def __init__(self, categories=[], dist_sync_on_step=False, *args, **kwargs): - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.categories = set(categories) - - self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - for category in categories: - self.add_state(f"{category}_total", default=torch.tensor(0), dist_reduce_fx="sum") - self.add_state(f"{category}_correct", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, pred: str, target: str, category: str = None): - if pred == target: - self.correct += 1 - self.total += 1 - if category is None: - return - if category in self.categories: - val = getattr(self, f"{category}_total") - setattr(self, f"{category}_total", val + 1) - if pred == target: - val = getattr(self, f"{category}_correct") - setattr(self, f"{category}_correct", val + 1) - else: - logging.warn(f'{category} is not in the pre-defined list') - - def compute(self): - results = {} - results['acc'] = self.correct.float() / self.total - for category in self.categories: - results[category] = getattr(self, f"{category}_correct") / getattr(self, f"{category}_total") - for category in self.categories: - results[f"{category}_total"] = getattr(self, f"{category}_total") - return results - - -class ExactStringMatchMetric(Metric): - def __init__(self, dist_sync_on_step=False, *args, **kwargs): - super().__init__(dist_sync_on_step=dist_sync_on_step) - - self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, pred: str, target: str): - if pred == target: - self.correct += 1 - self.total += 1 - - def compute(self): - return self.correct.float() / self.total - - -class TokenF1Score(Metric): - """Taken from the official evaluation script for v1.1 of the SQuAD dataset""" - - def __init__(self, dist_sync_on_step=False, *args, **kwargs): - super().__init__(dist_sync_on_step=dist_sync_on_step) - - self.add_state("correct", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, pred: str, target: Union[str, List[str]]): - if isinstance(target, str): - self.correct += self.f1_score(pred, target) - elif isinstance(target, list): - self.correct += max([self.f1_score(pred, tgt) for tgt in target]) - self.total += 1 - - def compute(self): - return self.correct.float() / self.total - - def f1_score(self, prediction, ground_truth): - prediction_tokens = self.normalize(prediction).split() - ground_truth_tokens = self.normalize(ground_truth).split() - common = Counter(prediction_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0.0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - def normalize(self, s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r"\b(a|an|the)\b", " ", text) - - def white_space_fix(text): - return " ".join(text.split()) - - def 
remove_punc(text): - exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/global_average_loss_metric.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/global_average_loss_metric.py deleted file mode 100644 index 3bbd4d13abf4a6aed345f53e5f173a0cf038408c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/global_average_loss_metric.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torchmetrics import Metric - -__all__ = ['GlobalAverageLossMetric'] - - -class GlobalAverageLossMetric(Metric): - """ - This class is for averaging loss across multiple processes if a distributed backend is used. True average is - computed not running average. It does not accumulate gradients so the averaged loss cannot be used for optimization. - If ``take_avg_loss`` is ``True``, the :meth:`update` method ``loss`` argument has to be a mean loss. If - ``take_avg_loss`` is ``False`` then the :meth:`update` method ``loss`` argument has to be a sum of losses. - - See :doc:`PyTorch Lightning Metrics` for the metric usage instruction. - - Args: - dist_sync_on_step: - Synchronize metric state across processes at each method :meth:`forward` call before returning the - value at the step - process_group: - Specify the process group on which synchronization is called. default: ``None`` (which selects the entire - world) - take_avg_loss: - If ``True`` values of :meth:`update` method ``loss`` argument has to be a mean loss. If ``False`` - values of :meth:`update` method ``loss`` argument has to be a sum of losses. default: ``True`` - """ - - full_state_update = True - - def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True): - super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) - self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') - self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') - self.take_avg_loss = take_avg_loss - - def update(self, loss, num_measurements): - """ - Updates :attr:`loss_sum` and :attr:`num_measurements`. - - Args: - loss: A float zero dimensional ``torch.Tensor`` which is either sum or average of losses for processed - examples. See ``take_avg_loss`` parameter of :meth:`__init__`. - num_measurements: An integer zero dimensional ``torch.Tensor`` which contains a number of loss measurements. - The sum or mean of the results of these measurements are in the ``loss`` parameter. - """ - if self.take_avg_loss: - self.loss_sum += loss.detach() * num_measurements - else: - self.loss_sum += loss.detach() - self.num_measurements += num_measurements - - def compute(self): - """ - Returns mean loss. 
- """ - if self.num_measurements.eq(0): - return torch.tensor(float('nan')) - return self.loss_sum / self.num_measurements diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/metric_string_to_torchmetric.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/metric_string_to_torchmetric.py deleted file mode 100644 index b38047b576cc304f77b1604678eeb5b36dd32267..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/metric_string_to_torchmetric.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from torchmetrics import Accuracy, AveragePrecision, F1Score, MatthewsCorrCoef, PearsonCorrCoef, SpearmanCorrCoef -from torchmetrics.text.rouge import ROUGEScore - -from nemo.collections.common.metrics.classification_accuracy import ExactStringMatchMetric, TokenF1Score - -__all__ = ['MetricStringToTorchMetric'] - -# Dictionary that maps a metric string name to its corresponding torchmetric class. - -MetricStringToTorchMetric = { - 'accuracy': Accuracy, - 'average_precision': AveragePrecision, - 'f1': F1Score, - 'token_f1': TokenF1Score, - 'pearson_corr_coef': PearsonCorrCoef, - 'spearman_corr_coef': SpearmanCorrCoef, - 'matthews_corr_coef': MatthewsCorrCoef, - 'exact_string_match': ExactStringMatchMetric, - 'rouge': ROUGEScore, -} diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/perplexity.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/perplexity.py deleted file mode 100644 index 9e1c21737ec8caea46c8850c32dd23fe499f6394..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/perplexity.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch.distributions.categorical import Categorical -from torchmetrics import Metric - -__all__ = ['Perplexity'] - - -class Perplexity(Metric): - """ - This class computes mean perplexity of distributions in the last dimension of inputs. It is a wrapper around - :doc:`torch.distributions.Categorical.perplexity` method. You have to provide either - ``probs`` or ``logits`` to the :meth:`update` method. The class computes perplexities for distributions passed to - :meth:`update` method in ``probs`` or ``logits`` arguments and averages the perplexities. Reducing results between - all workers is done via SUM operations. 
- See `PyTorch Lightning Metrics `_ for the metric usage instructions. - - Args: - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. default: ``None`` (which selects the entire - world) - validate_args: - If ``True`` values of :meth:`update` method parameters are checked. ``logits`` has to not contain NaNs and - ``probs`` last dim has to be valid probability distribution. - """ - - full_state_update = True - - def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True): - super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group) - self.validate_args = validate_args - self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum') - # Total number of distributions seen since last reset - self.add_state('num_distributions', torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum') - - def update(self, probs=None, logits=None): - """ - Updates :attr:`perplexities_sum` and :attr:`num_distributions`. - Args: - probs: A ``torch.Tensor`` which innermost dimension is valid probability distribution. - logits: A ``torch.Tensor`` without NaNs. - """ - d = Categorical( - None if probs is None else probs.detach(), - None if logits is None else logits.detach(), - validate_args=self.validate_args, - ) - ppl = d.perplexity() - self.num_distributions += ppl.numel() - self.perplexities_sum += ppl.sum() - - def compute(self): - """ - Returns perplexity across all workers and resets to 0 :attr:`perplexities_sum` and :attr:`num_distributions`. - """ - if self.num_distributions.eq(0): - return None - return self.perplexities_sum / self.num_distributions diff --git a/SoundScribe/SpeakerID/nemo/collections/common/metrics/punct_er.py b/SoundScribe/SpeakerID/nemo/collections/common/metrics/punct_er.py deleted file mode 100644 index 933c1581f016dc73fdb4be001aa2b45b0c3079f0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/metrics/punct_er.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
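As a small, hedged illustration of two of the metrics above (tensor shapes and strings are invented; Perplexity is re-exported by the metrics __init__ shown earlier, TokenF1Score lives in classification_accuracy):

import torch
from nemo.collections.common.metrics import Perplexity
from nemo.collections.common.metrics.classification_accuracy import TokenF1Score

ppl = Perplexity()
ppl.update(logits=torch.randn(2, 6, 100))   # [batch, seq_len, vocab]: one distribution per step
print(ppl.compute())                        # mean perplexity over the 12 distributions seen so far

f1 = TokenF1Score()
f1.update(pred="the cat sat on a mat", target="The cat sat on the mat.")
print(f1.compute())                         # 1.0 here, since normalization drops case, punctuation and articles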
- -import re -from collections import namedtuple -from tqdm import tqdm - -from nemo.utils import logging - -try: - import pandas as pd - from tabulate import tabulate - - HAVE_TABLUATE_AND_PANDAS = True -except (ImportError, ModuleNotFoundError): - HAVE_TABLUATE_AND_PANDAS = False - - -def punctuation_error_rate( - references: list[str], hypotheses: list[str], punctuation_marks: list[str], punctuation_mask: str = "[PUNCT]", -) -> None: - - """ - Computes Punctuation Error Rate - - Args: - references (list[str]) - list of references - hypotheses (list[str]) - list of hypotheses - punctuation_marks (list[str]) - list of punctuation marks for computing metrics - punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to - given punctuation marks while edit distance calculation - - Return: - punct_er (float) - Punctuation Error Rate - """ - - dper_obj = DatasetPunctuationErrorRate( - references=references, - hypotheses=hypotheses, - punctuation_marks=punctuation_marks, - punctuation_mask=punctuation_mask, - ) - - dper_obj.compute() - - return dper_obj.punct_er - - -class OccurancePunctuationErrorRate: - """ - Class for computation puncutation-related absolute amounts of operations and thier rates - between reference and hypothesis strings: - - Absolute amounts of correct predictions, deletions, insertions - and substitutions for each given punctuation mark - - Rates of correct predictions, deletions, insertions - and substitutions for each given punctuation mark - - Overall rates of correct predictions, deletions, insertions - and substiturions between reference and hypothesis string - - Punctuation Error Rate - - Args to init: - punctuation_marks (list[str]) - list of punctuation marks for computing metrics - punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to - given punctuation marks while edit distance calculation - - How to use: - 1. Create object of OccurancePunctuationErrorRate class. - Example: - punctuation_marks = [".", ",", "!", "?"] - oper_obj = OccurancePunctuationErrorRate(punctuation_marks) - - 2. To compute punctuation metrics, pass reference and hypothesis string to the "compute" method - of created object. - Example: - reference_str = "Hi, dear! Nice to see you. What's" - hypothesis_str = "Hi dear! Nice to see you! What's?" - oper_obj.compute(reference_str, hypothesis_str) - - Output (listed in order of output): - 1. Dict of absolute operations amounts for each given punctuation mark: - Example: - {'.': {'Correct': 0, 'Deletions': 0, 'Insertions': 0, 'Substitutions': 1}, - ',': {'Correct': 0, 'Deletions': 1, 'Insertions': 0, 'Substitutions': 0}, - '!': {'Correct': 1, 'Deletions': 0, 'Insertions': 0, 'Substitutions': 0}, - '?': {'Correct': 0, 'Deletions': 0, 'Insertions': 1, 'Substitutions': 0}} - - 2. Dict of substitutions absolute amounts between given punctuation marks: - Example: - {'.': {'.': 0, ',': 0, '!': 1, '?': 0}, - ',': {'.': 0, ',': 0, '!': 0, '?': 0}, - '!': {'.': 0, ',': 0, '!': 0, '?': 0}, - '?': {'.': 0, ',': 0, '!': 0, '?': 0}} - - 3. namedtuple "PunctuationRates" of punctuation operation rates (in range from 0 to 1): - 3.1. correct_rate - overall correct rate - Example: correct_rate=0.25 - 3.2. deletions_rate - overall deletions rate - Example: deletions_rate=0.25 - 3.3. insertions_rate - overall insertions rate - Example: insertions_rate=0.25 - 3.4. substitutions_rate - overall substitutions_rate - Example: substitutions_rate=0.25 - 3.5. punct_er - Punctuation Error Rate - Example: punct_er=0.75 - 3.6. 
operation_rates - dict of operations rates for each given punctuation mark - Example: - operation_rates={ - '.': {'Correct': 0.0, 'Deletions': 0.0, 'Insertions': 0.0, 'Substitutions': 1.0}, - ',': {'Correct': 0.0, 'Deletions': 1.0, 'Insertions': 0.0, 'Substitutions': 0.0}, - '!': {'Correct': 1.0, 'Deletions': 0.0, 'Insertions': 0.0, 'Substitutions': 0.0}, - '?': {'Correct': 0.0, 'Deletions': 0.0, 'Insertions': 1.0, 'Substitutions': 0.0} - } - - 3.7. substitution_rates - dict of substitution rates for each given punctuation mark - Example: - substitution_rates={ - '.': {'.': 0.0, ',': 0.0, '!': 1.0, '?': 0.0}, - ',': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0}, - '!': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0}, - '?': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0} - } - """ - - def __init__(self, punctuation_marks: list[str], punctuation_mask: str = "[PUNCT]") -> None: - - assert len(punctuation_marks) != 0, f"List of punctuation marks is empty" - - self.punctuation_marks = punctuation_marks - self.punctuation_mask = punctuation_mask - - self.operations = ["Correct", "Deletions", "Insertions", "Substitutions"] - - def compute_rates(self, operation_amounts: dict, substitution_amounts: dict): - operation_rates = {pm: {operation: 0 for operation in self.operations} for pm in self.punctuation_marks} - substitution_rates = {pm: {pm: 0 for pm in self.punctuation_marks} for pm in self.punctuation_marks} - - for pm in self.punctuation_marks: - operations_amount_by_pm = sum(operation_amounts[pm].values()) - - if operations_amount_by_pm == 0: - continue - - operation_rates[pm] = { - operation: (operation_amounts[pm][operation] / operations_amount_by_pm) - for operation in self.operations - } - - substitution_rates[pm] = { - _pm: (substitution_amounts[pm][_pm] / operations_amount_by_pm) - for _pm in substitution_amounts[pm].keys() - } - - _operation_amounts = { - operation: {pm: operation_amounts[operation] for pm, operation_amounts in operation_amounts.items()} - for operation in self.operations - } - - overall_amounts_by_operation = { - operation: sum(_operation_amounts[operation].values()) for operation in _operation_amounts - } - overall_operations_amount = sum(overall_amounts_by_operation.values()) - - punctuation_rates = namedtuple( - 'PunctuationRates', - [ - 'correct_rate', - 'deletions_rate', - 'insertions_rate', - 'substitutions_rate', - 'punct_er', - 'operation_rates', - 'substitution_rates', - ], - ) - - if overall_operations_amount == 0: - rates = punctuation_rates(0, 0, 0, 0, 0, operation_rates, substitution_rates) - else: - correct_rate = overall_amounts_by_operation["Correct"] / overall_operations_amount - deletions_rate = overall_amounts_by_operation["Deletions"] / overall_operations_amount - insertions_rate = overall_amounts_by_operation["Insertions"] / overall_operations_amount - substitutions_rate = overall_amounts_by_operation["Substitutions"] / overall_operations_amount - punct_er = deletions_rate + insertions_rate + substitutions_rate - - rates = punctuation_rates( - correct_rate, - deletions_rate, - insertions_rate, - substitutions_rate, - punct_er, - operation_rates, - substitution_rates, - ) - - return rates - - def compute_operation_amounts(self, reference: str, hypothesis: str): - operation_amounts = {pm: {operation: 0 for operation in self.operations} for pm in self.punctuation_marks} - substitution_amounts = {pm: {pm: 0 for pm in self.punctuation_marks} for pm in self.punctuation_marks} - - def tokenize(text: str, punctuation_marks: list[str]): - punctuation_marks = "\\" + 
"\\".join(self.punctuation_marks) - tokens = re.findall(rf"[\w']+|[{punctuation_marks}]", text) - return tokens - - def mask_punct_tokens(tokens: list[str], punctuation_marks: list[str], punctuation_mask: str): - masked = [punctuation_mask if token in punctuation_marks else token for token in tokens] - return masked - - r_tokens = tokenize(reference, self.punctuation_marks) - h_tokens = tokenize(hypothesis, self.punctuation_marks) - - r_masked = mask_punct_tokens(r_tokens, self.punctuation_marks, self.punctuation_mask) - h_masked = mask_punct_tokens(h_tokens, self.punctuation_marks, self.punctuation_mask) - - r_punct_amount = r_masked.count(self.punctuation_mask) - h_punct_amount = h_masked.count(self.punctuation_mask) - - if r_punct_amount + h_punct_amount == 0: - return operation_amounts, substitution_amounts - - r_len = len(r_masked) - h_len = len(h_masked) - - costs = [[0 for inner in range(h_len + 1)] for outer in range(r_len + 1)] - backtrace = [[0 for inner in range(h_len + 1)] for outer in range(r_len + 1)] - - COR = 'C' - DEL, DEL_PENALTY = 'D', 1 - INS, INS_PENALTY = 'I', 1 - SUB, SUB_PENALTY = 'S', 1 - - for i in range(1, r_len + 1): - costs[i][0] = DEL_PENALTY * i - backtrace[i][0] = DEL - - for j in range(1, h_len + 1): - costs[0][j] = INS_PENALTY * j - backtrace[0][j] = INS - - for j in range(1, h_len + 1): - costs[0][j] = INS_PENALTY * j - backtrace[0][j] = INS - - for i in range(1, r_len + 1): - for j in range(1, h_len + 1): - if r_masked[i - 1] == h_masked[j - 1]: - costs[i][j] = costs[i - 1][j - 1] - backtrace[i][j] = COR - else: - substitution_cost = costs[i - 1][j - 1] + SUB_PENALTY - insertion_cost = costs[i][j - 1] + INS_PENALTY - deletion_cost = costs[i - 1][j] + DEL_PENALTY - - costs[i][j] = min(substitution_cost, insertion_cost, deletion_cost) - if costs[i][j] == substitution_cost: - backtrace[i][j] = SUB - elif costs[i][j] == insertion_cost: - backtrace[i][j] = INS - else: - backtrace[i][j] = DEL - - i = r_len - j = h_len - - while i > 0 or j > 0: - if backtrace[i][j] == COR: - if r_masked[i - 1] == self.punctuation_mask or h_masked[j - 1] == self.punctuation_mask: - r_token = r_tokens[i - 1] - h_token = h_tokens[j - 1] - - if r_token == h_token: - operation_amounts[r_token]['Correct'] += 1 - else: - operation_amounts[r_token]['Substitutions'] += 1 - substitution_amounts[r_token][h_token] += 1 - i -= 1 - j -= 1 - - elif backtrace[i][j] == SUB: - i -= 1 - j -= 1 - - elif backtrace[i][j] == INS: - j -= 1 - - elif backtrace[i][j] == DEL: - i -= 1 - - for pm in self.punctuation_marks: - num_of_correct = operation_amounts[pm]['Correct'] - - num_substitutions_of_pm = operation_amounts[pm]['Substitutions'] - num_substitutions_to_pm = sum([substitution_amounts[_pm][pm] for _pm in self.punctuation_marks]) - - num_of_deletions = r_tokens.count(pm) - (num_of_correct + num_substitutions_of_pm) - operation_amounts[pm]['Deletions'] = num_of_deletions - - num_of_insertions = h_tokens.count(pm) - (num_of_correct + num_substitutions_to_pm) - operation_amounts[pm]['Insertions'] = num_of_insertions - - return operation_amounts, substitution_amounts - - def compute(self, reference: str, hypothesis: str): - operation_amounts, substitution_amounts = self.compute_operation_amounts(reference, hypothesis) - punctuation_rates = self.compute_rates(operation_amounts, substitution_amounts) - return operation_amounts, substitution_amounts, punctuation_rates - - -class DatasetPunctuationErrorRate: - """ - Class for computation the total puncutation-related absolute amounts of operations and 
their rates - in pairs of reference and hypothesis strins: - - Absolute amounts of correct predictions, deletions, insertions - and substitutions for each given punctuation mark - - Rates of correct predictions, deletions, insertions - and substitutions for each given punctuation mark - - Total rates of correct predictions, deletions, insertions - and substiturions in pairs of reference and hypothesis strings - - Punctuation Error Rate - - Args to init: - references (list[str]) - list of references - hypotheses (list[str]) - list of hypotheses - punctuation_marks (list[str]) - list of punctuation marks for computing metrics - punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to - given punctuation marks while edit distance calculation - - How to use: - 1. Create object of DatasetPunctuationErrorRate class. - Example: - references = ["Hi, dear! Nice to see you. What's"] - hypotheses = ["Hi dear! Nice to see you! What's?"] - punctuation_marks = [".", ",", "!", "?"] - - dper_obj = DatasetPunctuationErrorRate(references, hypotheses, punctuation_marks) - - 2. To compute punctuation metrics, call the class method "compute()". - Example: - dper_obj.compute() - - Result: - The following atributes of class object will be updated with calculated metrics values. - The values are available with calling the atributes: - - dper_obj.operation_rates - dict, rates of correctness and errors for each punctuation mark - from `preset dper_obj.punctuation_marks` list. - - dper_obj.substitution_rates - dict, substitution rates between puncutation marks from - `preset dper_obj.punctuation_marks` list. - - dper_obj.correct_rate - float, total rate of correctness between provided pairs of - references and hypotheses. - - dper_obj.deletions_rate - float, total rate of deletions between provided pairs of - references and hypotheses. - - dper_obj.insertions_rate - float, total rate of insertions between provided pairs of - references and hypotheses. - - dper_obj.substitutions_rate - float, total rate of substitutions between provided pairs of - references and hypotheses. - - dper_obj.punct_er - float, total Punctuation Error Rate between provided pairs of - references and hypotheses. 
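Following the usage notes above, a hedged end-to-end sketch of the punctuation_error_rate convenience wrapper; the example strings mirror the ones in these docstrings, and the import path simply follows this module's location in the tree.

from nemo.collections.common.metrics.punct_er import punctuation_error_rate

references = ["Hi, dear! Nice to see you. What's"]
hypotheses = ["Hi dear! Nice to see you! What's?"]
per = punctuation_error_rate(references, hypotheses, punctuation_marks=[".", ",", "!", "?"])
print(per)   # dataset-level Punctuation Error Rate: deletion + insertion + substitution rates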
- """ - - def __init__( - self, - references: list[str], - hypotheses: list[str], - punctuation_marks: list[str], - punctuation_mask: str = "[PUNCT]", - ) -> None: - - self.references = references - self.hypotheses = hypotheses - self.punctuation_marks = punctuation_marks - self.punctuation_mask = punctuation_mask - - self.oper_obj = OccurancePunctuationErrorRate( - punctuation_marks=self.punctuation_marks, punctuation_mask=self.punctuation_mask - ) - - self.operation_amounts = [] - self.substitution_amounts = [] - self.rates = [] - - self.operation_rates = None - self.substitution_rates = None - self.correct_rate = None - self.deletions_rate = None - self.insertions_rate = None - self.substitutions_rate = None - self.punct_er = None - - def compute(self): - def sum_amounts(amounts_dicts: list[dict]): - amounts = {key: {_key: 0 for _key in amounts_dicts[0][key]} for key in amounts_dicts[0].keys()} - - for amounts_dict in amounts_dicts: - for outer_key, inner_dict in amounts_dict.items(): - for inner_key, value in inner_dict.items(): - amounts[outer_key][inner_key] += value - return amounts - - logging.info("Computing Punctuation Error Rate") - - for reference, hypothesis in tqdm(zip(self.references, self.hypotheses), total=len(self.references)): - operation_amounts, substitution_amounts, punctuation_rates = self.oper_obj.compute(reference, hypothesis) - self.operation_amounts.append(operation_amounts) - self.substitution_amounts.append(substitution_amounts) - self.rates.append(punctuation_rates) - - overall_operation_amounts = sum_amounts(self.operation_amounts) - overall_substitution_amounts = sum_amounts(self.substitution_amounts) - overall_rates = self.oper_obj.compute_rates( - operation_amounts=overall_operation_amounts, substitution_amounts=overall_substitution_amounts - ) - - self.operation_rates = overall_rates.operation_rates - self.substitution_rates = overall_rates.substitution_rates - self.correct_rate = overall_rates.correct_rate - self.deletions_rate = overall_rates.deletions_rate - self.insertions_rate = overall_rates.insertions_rate - self.substitutions_rate = overall_rates.substitutions_rate - self.punct_er = overall_rates.punct_er - - def reset(self): - self.operation_amounts = [] - self.substitution_amounts = [] - self.rates = [] - - self.operation_rates = None - self.substitution_rates = None - self.correct_rate = None - self.deletions_rate = None - self.insertions_rate = None - self.substitutions_rate = None - self.punct_er = None - - def print(self): - logging.info(f'Dataset PER ' + str(round(100 * self.punct_er, 2)) + '%') - - if HAVE_TABLUATE_AND_PANDAS: - rates_by_pm_df = pd.DataFrame(self.operation_rates) * 100 - substitution_rates_by_pm_df = pd.DataFrame(self.substitution_rates) * 100 - - logging.info( - "Rates of punctuation correctness and errors (%):\n" - + tabulate(rates_by_pm_df, headers='keys', tablefmt='psql') - ) - logging.info( - "Substitution rates between punctuation marks (%):\n" - + tabulate(substitution_rates_by_pm_df, headers='keys', tablefmt='psql') - ) - else: - logging.warning("Some of the modules (pandas or tabulate) can't be imported") - logging.info(f"Rates of punctuation correctness and errors (in range [0, 1]):\n{self.operation_rates}\n") - logging.info( - f"Substitution rates between punctuation marks (in range [0, 1]):\n{self.substitution_rates}\n" - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/__init__.py deleted file mode 100644 index 
f1997e0c5dd5dcd0f25c73e02a2d0667e62e371a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.common.parts.adapter_modules import LinearAdapter, LinearAdapterConfig -from nemo.collections.common.parts.mlm_scorer import MLMScorer -from nemo.collections.common.parts.multi_layer_perceptron import MultiLayerPerceptron -from nemo.collections.common.parts.transformer_utils import * -from nemo.collections.common.parts.utils import * diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 29e204a4750935180eddb937601c6243fabbe324..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/adapter_modules.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/adapter_modules.cpython-310.pyc deleted file mode 100644 index 4919234cb258653d63527bf4de8985294bfb6015..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/adapter_modules.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/mlm_scorer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/mlm_scorer.cpython-310.pyc deleted file mode 100644 index 875cd5090d401a681fc6bb0b4877fbc61d241017..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/mlm_scorer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/multi_layer_perceptron.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/multi_layer_perceptron.cpython-310.pyc deleted file mode 100644 index 068416f46712ed6bd6758e4e8f92ed10041e7511..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/multi_layer_perceptron.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/rnn.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/rnn.cpython-310.pyc deleted file mode 100644 index 4285df1b8bd5b9060fd4d12d53d06b809e752e89..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/rnn.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/transformer_utils.cpython-310.pyc 
b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/transformer_utils.cpython-310.pyc deleted file mode 100644 index 2984bbe75819b5c466b436f66473e735a903dc1d..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/transformer_utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/utils.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/utils.cpython-310.pyc deleted file mode 100644 index c0f07991cbb557ca372f4d650d688e964e50efde..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/__pycache__/utils.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/adapter_modules.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/adapter_modules.py deleted file mode 100644 index 2084147f9cbcb912fd41f16db8f3897c77cc67d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/adapter_modules.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field, is_dataclass -from typing import Any, Optional - -from hydra.utils import instantiate -from omegaconf import OmegaConf -from torch import nn as nn - -from nemo.collections.common.parts.utils import activation_registry -from nemo.core.classes.mixins import access_mixins, adapter_mixin_strategies - - -class AdapterModuleUtil(access_mixins.AccessMixin): - """ - Base class of Adapter Modules, providing common functionality to all Adapter Modules. - """ - - def setup_adapter_strategy(self, adapter_strategy: Optional[adapter_mixin_strategies.AbstractAdapterStrategy]): - """ - Setup adapter strategy of this class, enabling dynamic change in the way the adapter output is - merged with the input. - - When called successfully, will assign the variable `adapter_strategy` to the module. - - Args: - adapter_strategy: Can be a None or an implementation of AbstractAdapterStrategy. - """ - # set default adapter strategy - if adapter_strategy is None: - adapter_strategy = self.get_default_strategy_config() - - if is_dataclass(adapter_strategy): - adapter_strategy = OmegaConf.structured(adapter_strategy) - OmegaConf.set_struct(adapter_strategy, False) - - # The config must have the `_target_` field pointing to the actual adapter strategy class - # which will load that strategy dynamically to this module. 
- if isinstance(adapter_strategy, dict) or OmegaConf.is_config(adapter_strategy): - self.adapter_strategy = instantiate(adapter_strategy) - elif isinstance(adapter_strategy, adapter_mixin_strategies.AbstractAdapterStrategy): - self.adapter_strategy = adapter_strategy - else: - raise AttributeError(f'`adapter_strategy` provided is invalid : {adapter_strategy}') - - def get_default_strategy_config(self) -> 'dataclass': - """ - Returns a default adapter module strategy. - """ - return adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() - - def adapter_unfreeze(self,): - """ - Sets the requires grad for all parameters in the adapter to True. - This method should be overridden for any custom unfreeze behavior that is required. - For example, if not all params of the adapter should be unfrozen. - """ - for param in self.parameters(): - param.requires_grad_(True) - - -class LinearAdapter(nn.Module, AdapterModuleUtil): - - """ - Simple Linear Feedforward Adapter module with LayerNorm and singe hidden layer with activation function. - Note: The adapter explicitly initializes its final layer with all zeros in order to avoid affecting the - original model when all adapters are disabled. - - Args: - in_features: Input dimension of the module. Note that for adapters, input_dim == output_dim. - dim: Hidden dimension of the feed forward network. - activation: Str name for an activation function. - norm_position: Str, can be `pre` or `post`. Defaults to `pre`. Determines whether the normalization - will occur in the first layer or the last layer. Certain architectures may prefer one over the other. - dropout: float value, whether to perform dropout on the output of the last layer of the adapter. - adapter_strategy: By default, ResidualAddAdapterStrategyConfig. An adapter composition function object. - """ - - def __init__( - self, - in_features: int, - dim: int, - activation: str = 'swish', - norm_position: str = 'pre', - dropout: float = 0.0, - adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None, - ): - super().__init__() - - activation = activation_registry[activation]() - # If the activation can be executed in place, do so. 
- if hasattr(activation, 'inplace'): - activation.inplace = True - - assert norm_position in ['pre', 'post'] - self.norm_position = norm_position - - if norm_position == 'pre': - self.module = nn.Sequential( - nn.LayerNorm(in_features), - nn.Linear(in_features, dim, bias=False), - activation, - nn.Linear(dim, in_features, bias=False), - ) - - elif norm_position == 'post': - self.module = nn.Sequential( - nn.Linear(in_features, dim, bias=False), - activation, - nn.Linear(dim, in_features, bias=False), - nn.LayerNorm(in_features), - ) - - if dropout > 0.0: - self.dropout = nn.Dropout(dropout) - else: - self.dropout = None - - # Setup adapter strategy - self.setup_adapter_strategy(adapter_strategy) - - # reset parameters - self.reset_parameters() - - def reset_parameters(self): - # Final layer initializations must be 0 - if self.norm_position == 'pre': - self.module[-1].weight.data *= 0 - - elif self.norm_position == 'post': - self.module[-1].weight.data *= 0 - self.module[-1].bias.data *= 0 - - def forward(self, x): - x = self.module(x) - - # Add dropout if available - if self.dropout is not None: - x = self.dropout(x) - - return x - - -@dataclass -class LinearAdapterConfig: - in_features: int - dim: int - activation: str = 'swish' - norm_position: str = 'pre' - dropout: float = 0.0 - adapter_strategy: Optional[Any] = field( - default_factory=lambda: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() - ) - _target_: str = "{0}.{1}".format(LinearAdapter.__module__, LinearAdapter.__name__) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/mlm_scorer.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/mlm_scorer.py deleted file mode 100644 index c38e4b25ed7214ce9c965714cf40a0c0ea15340c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/mlm_scorer.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2020 AWSLABS, AMAZON. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import numpy as np -import torch -from torch.nn.functional import softmax -from transformers import AutoModelForMaskedLM, AutoTokenizer - -__all__ = ['MLMScorer'] - - -class MLMScorer: - def __init__(self, model_name: str, device: str = 'cpu'): - """ - Creates MLM scorer from https://arxiv.org/abs/1910.14659. - Args: - model_name: HuggingFace pretrained model name - device: either 'cpu' or 'cuda' - """ - self.model = AutoModelForMaskedLM.from_pretrained(model_name).to(device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) - self.device = device - self.MASK_LABEL = self.tokenizer.mask_token - - def score_sentences(self, sentences: List[str]): - """ - returns list of MLM scores for each sentence in list. - """ - return [self.score_sentence(sentence) for sentence in sentences] - - def score_sentence(self, sentence: str): - """ - returns MLM score for sentence. 
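To illustrate how `LinearAdapterConfig` and the zero-initialized final projection interact, here is a small sketch. It assumes `LinearAdapter` and `LinearAdapterConfig` are importable from this module; it is not an official example, just a demonstration of the `_target_`-based construction pattern used above.

```python
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Build the adapter from its dataclass config via the `_target_` field,
# the same pattern setup_adapter_strategy() uses for strategy configs.
cfg = OmegaConf.structured(LinearAdapterConfig(in_features=256, dim=32))
adapter = instantiate(cfg)

x = torch.randn(4, 10, 256)
with torch.no_grad():
    y = adapter(x)

# Because reset_parameters() zeroes the last projection, a freshly initialized
# adapter emits zeros; with the default residual-add strategy the host model's
# output is therefore unchanged until the adapter is trained.
assert torch.allclose(y, torch.zeros_like(y))
```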
- """ - assert type(sentence) == str - - tokens = self.tokenizer.tokenize(sentence) - mask_idx = [] - token_type = [] - attn_mask = [] - ids = [] - for m_idx, _ in enumerate(tokens): - masked = self.__mask_text__(m_idx, tokens) - mask_idx.append(m_idx) - ids.append(self.tokenizer.encode(masked)) - id_len = len(ids[-1]) - token_type.append([0] * id_len) - attn_mask.append([1] * id_len) - - data = { - 'input_ids': torch.tensor(ids, device=self.device), - 'attention_mask': torch.tensor(attn_mask, device=self.device), - 'token_type_ids': torch.tensor(token_type, device=self.device), - } - - with torch.no_grad(): - outputs = self.model(**data) - logits = outputs.logits - - scores = [] - scores_log_prob = 0.0 - - for i, m_idx in enumerate(mask_idx): - preds = logits[i].squeeze(0) - probs = softmax(preds, dim=1) - token_id = self.tokenizer.convert_tokens_to_ids([tokens[m_idx]])[0] - log_prob = np.log(probs[m_idx + 1, token_id].cpu().numpy()).item() - scores.append(log_prob) - scores_log_prob += log_prob - - return scores_log_prob - - def __mask_text__(self, idx: int, tokens: List[str]): - """ - replaces string at index idx in list `tokens` with a masked token and returns the modified list. - """ - masked = tokens.copy() - masked[idx] = self.MASK_LABEL - return masked diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/multi_layer_perceptron.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/multi_layer_perceptron.py deleted file mode 100644 index 76c06bf23ea647148d4eaa0a9686e631c169a436..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/multi_layer_perceptron.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -class MultiLayerPerceptron(torch.nn.Module): - """ - A simple MLP that can either be used independently or put on top - of pretrained models (such as BERT) and act as a classifier. 
- Args: - hidden_size (int): the size of each layer - num_classes (int): number of output classes - num_layers (int): number of layers - activation (str): type of activations for layers in between - log_softmax (bool): whether to add a log_softmax layer before output - """ - - def __init__( - self, - hidden_size: int, - num_classes: int, - num_layers: int = 2, - activation: str = 'relu', - log_softmax: bool = True, - ): - super().__init__() - self.layers = 0 - for _ in range(num_layers - 1): - layer = torch.nn.Linear(hidden_size, hidden_size) - setattr(self, f'layer{self.layers}', layer) - setattr(self, f'layer{self.layers + 1}', getattr(torch, activation)) - self.layers += 2 - layer = torch.nn.Linear(hidden_size, num_classes) - setattr(self, f'layer{self.layers}', layer) - self.layers += 1 - self.log_softmax = log_softmax - - @property - def last_linear_layer(self): - return getattr(self, f'layer{self.layers - 1}') - - def forward(self, hidden_states): - output_states = hidden_states[:] - for i in range(self.layers): - output_states = getattr(self, f'layer{i}')(output_states) - - if self.log_softmax: - output_states = torch.log_softmax(output_states, dim=-1) - return output_states diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/patch_utils.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/patch_utils.py deleted file mode 100644 index eb67d17eece826faf7af5f5d18099589bd8cc33a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/patch_utils.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from packaging import version - -# Library version globals -TORCH_VERSION = None -TORCH_VERSION_MIN = version.Version('1.7') diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__init__.py deleted file mode 100644 index bc443be41c4c3fcf0764eb9fd3e1828d63f8438c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
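A minimal sketch of `MultiLayerPerceptron` used as a classification head on top of an encoder. Shapes are illustrative, and the class is assumed to be importable from this module.

```python
import torch

# Two-layer MLP head: Linear -> relu -> Linear, followed by log_softmax.
mlp = MultiLayerPerceptron(hidden_size=768, num_classes=9, num_layers=2,
                           activation='relu', log_softmax=True)

hidden_states = torch.randn(8, 128, 768)  # e.g. BERT output: [batch, seq_len, hidden]
log_probs = mlp(hidden_states)            # [batch, seq_len, num_classes] log-probabilities

print(log_probs.shape)
print(mlp.last_linear_layer)              # the final nn.Linear(hidden_size, num_classes)
```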
diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8e666f88a3ec5fa9165133a1a763c74f359185ca..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/cleaners.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/cleaners.cpython-310.pyc deleted file mode 100644 index c4adf39d46e482167cee65ef57a986ff6fe3e650..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/cleaners.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/collections.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/collections.cpython-310.pyc deleted file mode 100644 index a2707adabb3e5ed97c105138d6dd1a8b9f548639..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/collections.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/manifest.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/manifest.cpython-310.pyc deleted file mode 100644 index 656ade59a296c8ec36bf9305db1b6fe3d82bca3d..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/manifest.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/parsers.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/parsers.cpython-310.pyc deleted file mode 100644 index 11908b3ca94ec05dbf574ccba158a7796db325f7..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/__pycache__/parsers.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/cleaners.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/cleaners.py deleted file mode 100644 index 40c80115786a414eab3fc3edcd46f121453ca5d0..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/cleaners.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re - -import inflect -from text_unidecode import unidecode - -from nemo.utils import logging - -NUM_CHECK = re.compile(r'([$]?)(^|\s)(\S*[0-9]\S*)(?=(\s|$)((\S*)(\s|$))?)') - -TIME_CHECK = re.compile(r'([0-9]{1,2}):([0-9]{2})(am|pm)?') -CURRENCY_CHECK = re.compile(r'\$') -ORD_CHECK = re.compile(r'([0-9]+)(st|nd|rd|th)') -THREE_CHECK = re.compile(r'([0-9]{3})([.,][0-9]{1,2})?([!.?])?$') -DECIMAL_CHECK = re.compile(r'([.,][0-9]{1,2})$') - -ABBREVIATIONS_COMMON = [ - (re.compile('\\b%s\\.' % x[0]), x[1]) - for x in [ - ("ms", "miss"), - ("mrs", "misess"), - ("mr", "mister"), - ("messrs", "messeurs"), - ("dr", "doctor"), - ("drs", "doctors"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("sr", "senior"), - ("rev", "reverend"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("maj", "major"), - ("col", "colonel"), - ("lt", "lieutenant"), - ("gen", "general"), - ("prof", "professor"), - ("lb", "pounds"), - ("rep", "representative"), - ("st", "street"), - ("ave", "avenue"), - ("etc", "et cetera"), - ("jan", "january"), - ("feb", "february"), - ("mar", "march"), - ("apr", "april"), - ("jun", "june"), - ("jul", "july"), - ("aug", "august"), - ("sep", "september"), - ("oct", "october"), - ("nov", "november"), - ("dec", "december"), - ] -] - -ABBREVIATIONS_EXPANDED = [ - (re.compile('\\b%s\\.' % x[0]), x[1]) - for x in [ - ("ltd", "limited"), - ("fig", "figure"), - ("figs", "figures"), - ("gent", "gentlemen"), - ("ft", "fort"), - ("esq", "esquire"), - ("prep", "preperation"), - ("bros", "brothers"), - ("ind", "independent"), - ("mme", "madame"), - ("pro", "professional"), - ("vs", "versus"), - ("inc", "include"), - ] -] - -ABBREVIATIONS_TTS_FASTPITCH = [ - (re.compile('\\b%s\\.' % x[0]), x[1]) - for x in [ - ("ms", "miss"), - ("mrs", "misess"), - ("mr", "mister"), - ("dr", "doctor"), - ("drs", "doctors"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("sr", "senior"), - ("rev", "reverend"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("maj", "major"), - ("col", "colonel"), - ("lt", "lieutenant"), - ("gen", "general"), - ("prof", "professor"), - ("lb", "pounds"), - ("rep", "representative"), - ("st", "street"), - ("ave", "avenue"), - ("jan", "january"), - ("feb", "february"), - ("mar", "march"), - ("apr", "april"), - ("jun", "june"), - ("jul", "july"), - ("aug", "august"), - ("sep", "september"), - ("oct", "october"), - ("nov", "november"), - ("dec", "december"), - ("ltd", "limited"), - ("fig", "figure"), - ("figs", "figures"), - ("gent", "gentlemen"), - ("ft", "fort"), - ("esq", "esquire"), - ("prep", "preperation"), - ("bros", "brothers"), - ("ind", "independent"), - ("mme", "madame"), - ("pro", "professional"), - ("vs", "versus"), - ] -] - - -inflect = inflect.engine() - - -def clean_text(string, table, punctuation_to_replace, abbreviation_version=None): - warn_common_chars(string) - string = unidecode(string) - string = string.lower() - string = re.sub(r'\s+', " ", string) - string = clean_numbers(string) - string = clean_abbreviations(string, version=abbreviation_version) - string = clean_punctuations(string, table, punctuation_to_replace) - string = re.sub(r'\s+', " ", string).strip() - return string - - -def warn_common_chars(string): - if re.search(r'[£€]', string): - logging.warning("Your transcript contains one of '£' or '€' which we do not currently handle") - - -def clean_numbers(string): - cleaner = NumberCleaner() - string = NUM_CHECK.sub(cleaner.clean, string) - return string - - -def 
clean_abbreviations(string, version=None): - abbbreviations = ABBREVIATIONS_COMMON - if version == "fastpitch": - abbbreviations = ABBREVIATIONS_TTS_FASTPITCH - elif version == "expanded": - abbbreviations.extend = ABBREVIATIONS_EXPANDED - for regex, replacement in abbbreviations: - string = re.sub(regex, replacement, string) - return string - - -def clean_punctuations(string, table, punctuation_to_replace): - for punc, replacement in punctuation_to_replace.items(): - string = re.sub('\\{}'.format(punc), " {} ".format(replacement), string) - if table: - string = string.translate(table) - return string - - -class NumberCleaner: - def __init__(self): - super().__init__() - self.reset() - - def reset(self): - self.curr_num = [] - self.currency = None - - def format_final_number(self, whole_num, decimal): - if self.currency: - return_string = inflect.number_to_words(whole_num) - return_string += " dollar" if whole_num == 1 else " dollars" - if decimal: - return_string += " and " + inflect.number_to_words(decimal) - return_string += " cent" if whole_num == decimal else " cents" - self.reset() - return return_string - - self.reset() - if decimal: - whole_num += "." + decimal - return inflect.number_to_words(whole_num) - else: - # Check if there are non-numbers - def convert_to_word(match): - return " " + inflect.number_to_words(match.group(0)) + " " - - return re.sub(r'[0-9,]+', convert_to_word, whole_num) - - def clean(self, match): - ws = match.group(2) - number = match.group(3) - _proceeding_symbol = match.group(7) - - time_match = TIME_CHECK.match(number) - if time_match: - string = ws + inflect.number_to_words(time_match.group(1)) + "{}{}" - mins = int(time_match.group(2)) - min_string = "" - if mins != 0: - min_string = " " + inflect.number_to_words(time_match.group(2)) - ampm_string = "" - if time_match.group(3): - ampm_string = " " + time_match.group(3) - return string.format(min_string, ampm_string) - - ord_match = ORD_CHECK.match(number) - if ORD_CHECK.match(number): - return ws + inflect.number_to_words(ord_match.group(0)) - - if self.currency is None: - # Check if it is a currency - self.currency = match.group(1) or CURRENCY_CHECK.match(number) - - # Check to see if next symbol is a number - # If it is a number and it has 3 digits, then it is probably a - # continuation - three_match = THREE_CHECK.match(match.group(6)) - if three_match: - self.curr_num.append(number) - return " " - # Else we can output - else: - # Check for decimals - whole_num = "".join(self.curr_num) + number - decimal = None - decimal_match = DECIMAL_CHECK.search(whole_num) - if decimal_match: - decimal = decimal_match.group(1)[1:] - whole_num = whole_num[: -len(decimal) - 1] - whole_num = re.sub(r'\.', '', whole_num) - return ws + self.format_final_number(whole_num, decimal) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/collections.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/collections.py deleted file mode 100644 index 66def034400fb9d6f3cfbde7ba04b0e04b5bf14f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/collections.py +++ /dev/null @@ -1,1420 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
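For reference, the number normalization implemented by `NumberCleaner`/`clean_numbers` above can be exercised on its own. A rough sketch, assuming the functions are importable from this module; outputs are described approximately, since surrounding whitespace is only collapsed later by `clean_text`.

```python
# Hypothetical standalone use of the number cleaner defined above.
print(clean_numbers("the meeting is at 10:30am on the 3rd"))
# -> roughly "the meeting is at ten thirty am on the third"

print(clean_numbers("i have 12 cats"))
# -> "12" is expanded to "twelve"; extra spaces are cleaned up by clean_text()
```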
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import json -import os -from itertools import combinations -from typing import Any, Dict, Iterable, List, Optional, Union - -import pandas as pd - -from nemo.collections.common.parts.preprocessing import manifest, parsers -from nemo.utils import logging - - -class _Collection(collections.UserList): - """List of parsed and preprocessed data.""" - - OUTPUT_TYPE = None # Single element output type. - - -class Text(_Collection): - """Simple list of preprocessed text entries, result in list of tokens.""" - - OUTPUT_TYPE = collections.namedtuple('TextEntity', 'tokens') - - def __init__(self, texts: List[str], parser: parsers.CharParser): - """Instantiates text manifest and do the preprocessing step. - - Args: - texts: List of raw texts strings. - parser: Instance of `CharParser` to convert string to tokens. - """ - - data, output_type = [], self.OUTPUT_TYPE - for text in texts: - tokens = parser(text) - - if tokens is None: - logging.warning("Fail to parse '%s' text line.", text) - continue - - data.append(output_type(tokens)) - - super().__init__(data) - - -class FromFileText(Text): - """Another form of texts manifest with reading from file.""" - - def __init__(self, file: str, parser: parsers.CharParser): - """Instantiates text manifest and do the preprocessing step. - - Args: - file: File path to read from. - parser: Instance of `CharParser` to convert string to tokens. - """ - - texts = self.__parse_texts(file) - - super().__init__(texts, parser) - - @staticmethod - def __parse_texts(file: str) -> List[str]: - if not os.path.exists(file): - raise ValueError('Provided texts file does not exists!') - - _, ext = os.path.splitext(file) - if ext == '.csv': - texts = pd.read_csv(file)['transcript'].tolist() - elif ext == '.json': # Not really a correct json. - texts = list(item['text'] for item in manifest.item_iter(file)) - else: - with open(file, 'r') as f: - texts = f.readlines() - - return texts - - -class AudioText(_Collection): - """List of audio-transcript text correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple( - typename='AudioTextEntity', - field_names='id audio_file duration text_tokens offset text_raw speaker orig_sr lang', - ) - - def __init__( - self, - ids: List[int], - audio_files: List[str], - durations: List[float], - texts: List[str], - offsets: List[str], - speakers: List[Optional[int]], - orig_sampling_rates: List[Optional[int]], - token_labels: List[Optional[int]], - langs: List[Optional[str]], - parser: parsers.CharParser, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates audio-text manifest with filters and preprocessing. - - Args: - ids: List of examples positions. - audio_files: List of audio files. - durations: List of float durations. - texts: List of raw text transcripts. - offsets: List of duration offsets or None. - speakers: List of optional speakers ids. - orig_sampling_rates: List of original sampling rates of audio files. 
- langs: List of language ids, one for eadh sample, or None. - parser: Instance of `CharParser` to convert string to tokens. - min_duration: Minimum duration to keep entry with (default: None). - max_duration: Maximum duration to keep entry with (default: None). - max_number: Maximum number of samples to collect. - do_sort_by_duration: True if sort samples list by duration. Not compatible with index_by_file_id. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. - """ - - output_type = self.OUTPUT_TYPE - data, duration_filtered, num_filtered, total_duration = [], 0.0, 0, 0.0 - if index_by_file_id: - self.mapping = {} - - for id_, audio_file, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( - ids, audio_files, durations, offsets, texts, speakers, orig_sampling_rates, token_labels, langs - ): - # Duration filters. - if min_duration is not None and duration < min_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if token_labels is not None: - text_tokens = token_labels - else: - if text != '': - if hasattr(parser, "is_aggregate") and parser.is_aggregate and isinstance(text, str): - if lang is not None: - text_tokens = parser(text, lang) - # for future use if want to add language bypass to audio_to_text classes - # elif hasattr(parser, "lang") and parser.lang is not None: - # text_tokens = parser(text, parser.lang) - else: - raise ValueError("lang required in manifest when using aggregate tokenizers") - else: - text_tokens = parser(text) - else: - text_tokens = [] - - if text_tokens is None: - duration_filtered += duration - num_filtered += 1 - continue - - total_duration += duration - - data.append(output_type(id_, audio_file, duration, text_tokens, offset, text, speaker, orig_sr, lang)) - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(audio_file)) - if file_id not in self.mapping: - self.mapping[file_id] = [] - self.mapping[file_id].append(len(data) - 1) - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) - - super().__init__(data) - - -class VideoText(_Collection): - """List of video-transcript text correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple( - typename='AudioTextEntity', - field_names='id video_file duration text_tokens offset text_raw speaker orig_sr lang', - ) - - def __init__( - self, - ids: List[int], - video_files: List[str], - durations: List[float], - texts: List[str], - offsets: List[str], - speakers: List[Optional[int]], - orig_sampling_rates: List[Optional[int]], - token_labels: List[Optional[int]], - langs: List[Optional[str]], - parser: parsers.CharParser, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates video-text manifest with filters and preprocessing. - - Args: - ids: List of examples positions. - video_files: List of video files. 
- durations: List of float durations. - texts: List of raw text transcripts. - offsets: List of duration offsets or None. - speakers: List of optional speakers ids. - orig_sampling_rates: List of original sampling rates of audio files. - langs: List of language ids, one for eadh sample, or None. - parser: Instance of `CharParser` to convert string to tokens. - min_duration: Minimum duration to keep entry with (default: None). - max_duration: Maximum duration to keep entry with (default: None). - max_number: Maximum number of samples to collect. - do_sort_by_duration: True if sort samples list by duration. Not compatible with index_by_file_id. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. - """ - - output_type = self.OUTPUT_TYPE - data, duration_filtered, num_filtered, total_duration = [], 0.0, 0, 0.0 - if index_by_file_id: - self.mapping = {} - - for id_, video_file, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( - ids, video_files, durations, offsets, texts, speakers, orig_sampling_rates, token_labels, langs - ): - # Duration filters. - if min_duration is not None and duration < min_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if token_labels is not None: - text_tokens = token_labels - else: - if text != '': - if hasattr(parser, "is_aggregate") and parser.is_aggregate and isinstance(text, str): - if lang is not None: - text_tokens = parser(text, lang) - else: - raise ValueError("lang required in manifest when using aggregate tokenizers") - else: - text_tokens = parser(text) - else: - text_tokens = [] - - if text_tokens is None: - duration_filtered += duration - num_filtered += 1 - continue - - total_duration += duration - - data.append(output_type(id_, video_file, duration, text_tokens, offset, text, speaker, orig_sr, lang)) - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(video_file)) - if file_id not in self.mapping: - self.mapping[file_id] = [] - self.mapping[file_id].append(len(data) - 1) - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) - - super().__init__(data) - - -class ASRAudioText(AudioText): - """`AudioText` collector from asr structured json files.""" - - def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): - """Parse lists of audio files, durations and transcripts texts. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - *args: Args to pass to `AudioText` constructor. - **kwargs: Kwargs to pass to `AudioText` constructor. 
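For context, `ASRAudioText` consumes NeMo-style manifests: one JSON object per line. A minimal sketch of writing such a manifest and handing it to the collection; the file names are placeholders, and the commented constructor call assumes a `CharParser` instance is available.

```python
import json

# Each line describes one utterance; audio_filepath, duration and text are the
# required fields here, while optional fields (offset, speaker, lang, ...) are
# treated as missing/None during parsing.
entries = [
    {"audio_filepath": "clips/utt_0001.wav", "duration": 3.2, "text": "hello world"},
    {"audio_filepath": "clips/utt_0002.wav", "duration": 5.7, "text": "nice to see you"},
]
with open("train_manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")

# Hypothetical usage (parser construction omitted):
# collection = ASRAudioText("train_manifest.json", parser=parser,
#                           min_duration=0.5, max_duration=20.0)
```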
- """ - - ids, audio_files, durations, texts, offsets, = ( - [], - [], - [], - [], - [], - ) - speakers, orig_srs, token_labels, langs = [], [], [], [] - for item in manifest.item_iter(manifests_files): - ids.append(item['id']) - audio_files.append(item['audio_file']) - durations.append(item['duration']) - texts.append(item['text']) - offsets.append(item['offset']) - speakers.append(item['speaker']) - orig_srs.append(item['orig_sr']) - token_labels.append(item['token_labels']) - langs.append(item['lang']) - super().__init__( - ids, audio_files, durations, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs - ) - - -class ASRVideoText(VideoText): - """`VideoText` collector from cv structured json files.""" - - def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): - """Parse lists of video files, durations and transcripts texts. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - *args: Args to pass to `VideoText` constructor. - **kwargs: Kwargs to pass to `VideoText` constructor. - """ - - ids, video_files, durations, texts, offsets, = ( - [], - [], - [], - [], - [], - ) - speakers, orig_srs, token_labels, langs = [], [], [], [] - for item in manifest.item_iter(manifests_files): - ids.append(item['id']) - video_files.append(item['video_file']) - durations.append(item['duration']) - texts.append(item['text']) - offsets.append(item['offset']) - speakers.append(item['speaker']) - orig_srs.append(item['orig_sr']) - token_labels.append(item['token_labels']) - langs.append(item['lang']) - super().__init__( - ids, video_files, durations, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs - ) - - -class SpeechLabel(_Collection): - """List of audio-label correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple(typename='SpeechLabelEntity', field_names='audio_file duration label offset',) - - def __init__( - self, - audio_files: List[str], - durations: List[float], - labels: List[Union[int, str]], - offsets: List[Optional[float]], - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates audio-label manifest with filters and preprocessing. - - Args: - audio_files: List of audio files. - durations: List of float durations. - labels: List of labels. - offsets: List of offsets or None. - min_duration: Minimum duration to keep entry with (default: None). - max_duration: Maximum duration to keep entry with (default: None). - max_number: Maximum number of samples to collect. - do_sort_by_duration: True if sort samples list by duration. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. - """ - - if index_by_file_id: - self.mapping = {} - output_type = self.OUTPUT_TYPE - data, duration_filtered = [], 0.0 - total_duration = 0.0 - for audio_file, duration, command, offset in zip(audio_files, durations, labels, offsets): - # Duration filters. 
- if min_duration is not None and duration < min_duration: - duration_filtered += duration - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - continue - - data.append(output_type(audio_file, duration, command, offset)) - total_duration += duration - - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(audio_file)) - self.mapping[file_id] = len(data) - 1 - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info(f"Filtered duration for loading collection is {duration_filtered / 3600: .2f} hours.") - logging.info(f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600: .2f} hours.") - self.uniq_labels = sorted(set(map(lambda x: x.label, data))) - logging.info("# {} files loaded accounting to # {} labels".format(len(data), len(self.uniq_labels))) - - super().__init__(data) - - -class ASRSpeechLabel(SpeechLabel): - """`SpeechLabel` collector from structured json files.""" - - def __init__( - self, - manifests_files: Union[str, List[str]], - is_regression_task=False, - cal_labels_occurrence=False, - delimiter=None, - *args, - **kwargs, - ): - """Parse lists of audio files, durations and transcripts texts. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - is_regression_task: It's a regression task. - cal_labels_occurrence: whether to calculate occurence of labels. - delimiter: separator for labels strings. - *args: Args to pass to `SpeechLabel` constructor. - **kwargs: Kwargs to pass to `SpeechLabel` constructor. - """ - audio_files, durations, labels, offsets = [], [], [], [] - all_labels = [] - for item in manifest.item_iter(manifests_files, parse_func=self.__parse_item): - audio_files.append(item['audio_file']) - durations.append(item['duration']) - if not is_regression_task: - label = item['label'] - label_list = label.split() if not delimiter else label.split(delimiter) - else: - label = float(item['label']) - label_list = [label] - - labels.append(label) - offsets.append(item['offset']) - all_labels.extend(label_list) - if cal_labels_occurrence: - self.labels_occurrence = collections.Counter(all_labels) - - super().__init__(audio_files, durations, labels, offsets, *args, **kwargs) - - def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: - item = json.loads(line) - - # Audio file - if 'audio_filename' in item: - item['audio_file'] = item.pop('audio_filename') - elif 'audio_filepath' in item: - item['audio_file'] = item.pop('audio_filepath') - else: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper audio file key.") - item['audio_file'] = manifest.get_full_path(audio_file=item['audio_file'], manifest_file=manifest_file) - - # Duration. - if 'duration' not in item: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper duration key.") - - # Label. 
- if 'command' in item: - item['label'] = item.pop('command') - elif 'target' in item: - item['label'] = item.pop('target') - elif 'label' in item: - pass - else: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper label key.") - - item = dict( - audio_file=item['audio_file'], - duration=item['duration'], - label=item['label'], - offset=item.get('offset', None), - ) - - return item - - -class FeatureSequenceLabel(_Collection): - """List of feature sequence of label correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple(typename='FeatureSequenceLabelEntity', field_names='feature_file seq_label',) - - def __init__( - self, - feature_files: List[str], - seq_labels: List[str], - max_number: Optional[int] = None, - index_by_file_id: bool = False, - ): - """Instantiates feature-SequenceLabel manifest with filters and preprocessing. - - Args: - feature_files: List of feature files. - seq_labels: List of sequences of labels. - max_number: Maximum number of samples to collect. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. - """ - - output_type = self.OUTPUT_TYPE - data, num_filtered = ( - [], - 0.0, - ) - self.uniq_labels = set() - - if index_by_file_id: - self.mapping = {} - - for feature_file, seq_label in zip(feature_files, seq_labels): - - label_tokens, uniq_labels_in_seq = self.relative_speaker_parser(seq_label) - - data.append(output_type(feature_file, label_tokens)) - self.uniq_labels |= uniq_labels_in_seq - - if label_tokens is None: - num_filtered += 1 - continue - - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(feature_file)) - self.mapping[feature_file] = len(data) - 1 - - # Max number of entities filter. - if len(data) == max_number: - break - - logging.info("# {} files loaded including # {} unique labels".format(len(data), len(self.uniq_labels))) - super().__init__(data) - - def relative_speaker_parser(self, seq_label): - """Convert sequence of speaker labels to relative labels. - Convert sequence of absolute speaker to sequence of relative speaker [E A C A E E C] -> [0 1 2 1 0 0 2] - In this seq of label , if label do not appear before, assign new relative labels len(pos); else reuse previous assigned relative labels. - Args: - seq_label (str): A string of a sequence of labels. - - Return: - relative_seq_label (List) : A list of relative sequence of labels - unique_labels_in_seq (Set): A set of unique labels in the sequence - """ - seq = seq_label.split() - conversion_dict = dict() - relative_seq_label = [] - - for seg in seq: - if seg in conversion_dict: - converted = conversion_dict[seg] - else: - converted = len(conversion_dict) - conversion_dict[seg] = converted - - relative_seq_label.append(converted) - - unique_labels_in_seq = set(conversion_dict.keys()) - return relative_seq_label, unique_labels_in_seq - - -class ASRFeatureSequenceLabel(FeatureSequenceLabel): - """`FeatureSequenceLabel` collector from asr structured json files.""" - - def __init__( - self, manifests_files: Union[str, List[str]], max_number: Optional[int] = None, index_by_file_id: bool = False, - ): - - """Parse lists of feature files and sequences of labels. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - max_number: Maximum number of samples to collect; pass to `FeatureSequenceLabel` constructor. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data; pass to `FeatureSequenceLabel` constructor. 
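The relative-label conversion performed by `relative_speaker_parser` above can be summarized in a few lines. A standalone sketch of the same idea (not the class method itself), using the docstring's example sequence:

```python
def to_relative_labels(seq_label: str):
    # 'E A C A E E C' -> [0, 1, 2, 1, 0, 0, 2]: order of first occurrence defines the relative id.
    mapping = {}
    relative = []
    for spk in seq_label.split():
        if spk not in mapping:
            mapping[spk] = len(mapping)
        relative.append(mapping[spk])
    return relative, set(mapping)

print(to_relative_labels("E A C A E E C"))
# -> ([0, 1, 2, 1, 0, 0, 2], {'E', 'A', 'C'})  (set ordering may vary)
```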
- """ - - feature_files, seq_labels = [], [] - for item in manifest.item_iter(manifests_files, parse_func=self._parse_item): - feature_files.append(item['feature_file']) - seq_labels.append(item['seq_label']) - - super().__init__(feature_files, seq_labels, max_number, index_by_file_id) - - def _parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: - item = json.loads(line) - - # Feature file - if 'feature_filename' in item: - item['feature_file'] = item.pop('feature_filename') - elif 'feature_filepath' in item: - item['feature_file'] = item.pop('feature_filepath') - else: - raise ValueError( - f"Manifest file has invalid json line " f"structure: {line} without proper feature file key." - ) - item['feature_file'] = os.path.expanduser(item['feature_file']) - - # Seq of Label. - if 'seq_label' in item: - item['seq_label'] = item.pop('seq_label') - else: - raise ValueError( - f"Manifest file has invalid json line " f"structure: {line} without proper seq_label key." - ) - - item = dict(feature_file=item['feature_file'], seq_label=item['seq_label'],) - - return item - - -class DiarizationLabel(_Collection): - """List of diarization audio-label correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple( - typename='DiarizationLabelEntity', - field_names='audio_file duration rttm_file offset target_spks sess_spk_dict clus_spk_digits rttm_spk_digits', - ) - - def __init__( - self, - audio_files: List[str], - durations: List[float], - rttm_files: List[str], - offsets: List[float], - target_spks_list: List[tuple], - sess_spk_dicts: List[Dict], - clus_spk_list: List[tuple], - rttm_spk_list: List[tuple], - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates audio-label manifest with filters and preprocessing. - - Args: - audio_files: - List of audio file paths. - durations: - List of float durations. - rttm_files: - List of RTTM files (Groundtruth diarization annotation file). - offsets: - List of offsets or None. - target_spks (tuple): - List of tuples containing the two indices of targeted speakers for evaluation. - Example: [[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)], [(0, 1), (1, 2), (0, 2)], ...] - sess_spk_dict (Dict): - List of Mapping dictionaries between RTTM speakers and speaker labels in the clustering result. - clus_spk_digits (tuple): - List of Tuple containing all the speaker indices from the clustering result. - Example: [(0, 1, 2, 3), (0, 1, 2), ...] - rttm_spkr_digits (tuple): - List of tuple containing all the speaker indices in the RTTM file. - Example: (0, 1, 2), (0, 1), ...] - max_number: Maximum number of samples to collect - do_sort_by_duration: True if sort samples list by duration - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. 
- """ - - if index_by_file_id: - self.mapping = {} - output_type = self.OUTPUT_TYPE - data, duration_filtered = [], 0.0 - - zipped_items = zip( - audio_files, durations, rttm_files, offsets, target_spks_list, sess_spk_dicts, clus_spk_list, rttm_spk_list - ) - for ( - audio_file, - duration, - rttm_file, - offset, - target_spks, - sess_spk_dict, - clus_spk_digits, - rttm_spk_digits, - ) in zipped_items: - - if duration is None: - duration = 0 - - data.append( - output_type( - audio_file, - duration, - rttm_file, - offset, - target_spks, - sess_spk_dict, - clus_spk_digits, - rttm_spk_digits, - ) - ) - - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(audio_file)) - self.mapping[file_id] = len(data) - 1 - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info( - "Filtered duration for loading collection is %f.", duration_filtered, - ) - logging.info(f"Total {len(data)} session files loaded accounting to # {len(audio_files)} audio clips") - - super().__init__(data) - - -class DiarizationSpeechLabel(DiarizationLabel): - """`DiarizationLabel` diarization data sample collector from structured json files.""" - - def __init__( - self, - manifests_files: Union[str, List[str]], - emb_dict: Dict, - clus_label_dict: Dict, - round_digit=2, - seq_eval_mode=False, - pairwise_infer=False, - *args, - **kwargs, - ): - """ - Parse lists of audio files, durations, RTTM (Diarization annotation) files. Since diarization model infers only - two speakers, speaker pairs are generated from the total number of speakers in the session. - - Args: - manifest_filepath (str): - Path to input manifest json files. - emb_dict (Dict): - Dictionary containing cluster-average embeddings and speaker mapping information. - clus_label_dict (Dict): - Segment-level speaker labels from clustering results. - round_digit (int): - Number of digits to be rounded. - seq_eval_mode (bool): - If True, F1 score will be calculated for each speaker pair during inference mode. - pairwise_infer (bool): - If True, this dataset class operates in inference mode. In inference mode, a set of speakers in the input audio - is split into multiple pairs of speakers and speaker tuples (e.g. 3 speakers: [(0,1), (1,2), (0,2)]) and then - fed into the diarization system to merge the individual results. - *args: Args to pass to `SpeechLabel` constructor. - **kwargs: Kwargs to pass to `SpeechLabel` constructor. 
- """ - self.round_digit = round_digit - self.emb_dict = emb_dict - self.clus_label_dict = clus_label_dict - self.seq_eval_mode = seq_eval_mode - self.pairwise_infer = pairwise_infer - audio_files, durations, rttm_files, offsets, target_spks_list, sess_spk_dicts, clus_spk_list, rttm_spk_list = ( - [], - [], - [], - [], - [], - [], - [], - [], - ) - - for item in manifest.item_iter(manifests_files, parse_func=self.__parse_item_rttm): - # Inference mode - if self.pairwise_infer: - clus_speaker_digits = sorted(list(set([x[2] for x in clus_label_dict[item['uniq_id']]]))) - if item['rttm_file']: - base_scale_index = max(self.emb_dict.keys()) - _sess_spk_dict = self.emb_dict[base_scale_index][item['uniq_id']]['mapping'] - sess_spk_dict = {int(v.split('_')[-1]): k for k, v in _sess_spk_dict.items()} - rttm_speaker_digits = [int(v.split('_')[1]) for k, v in _sess_spk_dict.items()] - if self.seq_eval_mode: - clus_speaker_digits = rttm_speaker_digits - else: - sess_spk_dict = None - rttm_speaker_digits = None - - # Training mode - else: - rttm_labels = [] - with open(item['rttm_file'], 'r') as f: - for line in f.readlines(): - start, end, speaker = self.split_rttm_line(line, decimals=3) - rttm_labels.append('{} {} {}'.format(start, end, speaker)) - speaker_set = set() - for rttm_line in rttm_labels: - spk_str = rttm_line.split()[-1] - speaker_set.add(spk_str) - speaker_list = sorted(list(speaker_set)) - sess_spk_dict = {key: val for key, val in enumerate(speaker_list)} - target_spks = tuple(sess_spk_dict.keys()) - clus_speaker_digits = target_spks - rttm_speaker_digits = target_spks - - if len(clus_speaker_digits) <= 2: - spk_comb_list = [(0, 1)] - else: - spk_comb_list = [x for x in combinations(clus_speaker_digits, 2)] - - for target_spks in spk_comb_list: - audio_files.append(item['audio_file']) - durations.append(item['duration']) - rttm_files.append(item['rttm_file']) - offsets.append(item['offset']) - target_spks_list.append(target_spks) - sess_spk_dicts.append(sess_spk_dict) - clus_spk_list.append(clus_speaker_digits) - rttm_spk_list.append(rttm_speaker_digits) - - super().__init__( - audio_files, - durations, - rttm_files, - offsets, - target_spks_list, - sess_spk_dicts, - clus_spk_list, - rttm_spk_list, - *args, - **kwargs, - ) - - def split_rttm_line(self, rttm_line: str, decimals: int = 3): - """ - Convert a line in RTTM file to speaker label, start and end timestamps. - - An example line of `rttm_line`: - SPEAKER abc_dev_0123 1 146.903 1.860 speaker543 - - The above example RTTM line contains the following information: - session name: abc_dev_0123 - segment start time: 146.903 - segment duration: 1.860 - speaker label: speaker543 - - Args: - rttm_line (str): - A line in RTTM formatted file containing offset and duration of each segment. - decimals (int): - Number of digits to be rounded. - - Returns: - start (float): - Start timestamp in floating point number. - end (float): - End timestamp in floating point number. - speaker (str): - speaker string in RTTM lines. 
- """ - rttm = rttm_line.strip().split() - start = round(float(rttm[3]), decimals) - end = round(float(rttm[4]), decimals) + round(float(rttm[3]), decimals) - speaker = rttm[7] - return start, end, speaker - - def __parse_item_rttm(self, line: str, manifest_file: str) -> Dict[str, Any]: - """Parse each rttm file and save it to in Dict format""" - item = json.loads(line) - if 'audio_filename' in item: - item['audio_file'] = item.pop('audio_filename') - elif 'audio_filepath' in item: - item['audio_file'] = item.pop('audio_filepath') - else: - raise ValueError( - f"Manifest file has invalid json line " f"structure: {line} without proper audio file key." - ) - item['audio_file'] = os.path.expanduser(item['audio_file']) - item['uniq_id'] = os.path.splitext(os.path.basename(item['audio_file']))[0] - if 'duration' not in item: - raise ValueError(f"Manifest file has invalid json line " f"structure: {line} without proper duration key.") - item = dict( - audio_file=item['audio_file'], - uniq_id=item['uniq_id'], - duration=item['duration'], - rttm_file=item['rttm_filepath'], - offset=item.get('offset', None), - ) - return item - - -class Audio(_Collection): - """Prepare a list of all audio items, filtered by duration. - """ - - OUTPUT_TYPE = collections.namedtuple(typename='Audio', field_names='audio_files duration offset text') - - def __init__( - self, - audio_files_list: List[Dict[str, str]], - duration_list: List[float], - offset_list: List[float], - text_list: List[str], - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - ): - """Instantiantes an list of audio files. - - Args: - audio_files_list: list of dictionaries with mapping from audio_key to audio_filepath - duration_list: list of durations of input files - offset_list: list of offsets - text_list: list of texts - min_duration: Minimum duration to keep entry with (default: None). - max_duration: Maximum duration to keep entry with (default: None). - max_number: Maximum number of samples to collect. - do_sort_by_duration: True if sort samples list by duration. - """ - - output_type = self.OUTPUT_TYPE - data, total_duration = [], 0.0 - num_filtered, duration_filtered = 0, 0.0 - - for audio_files, duration, offset, text in zip(audio_files_list, duration_list, offset_list, text_list): - # Duration filters - if min_duration is not None and duration < min_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - num_filtered += 1 - continue - - total_duration += duration - data.append(output_type(audio_files, duration, offset, text)) - - # Max number of entities filter - if len(data) == max_number: - break - - if do_sort_by_duration: - data.sort(key=lambda entity: entity.duration) - - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) - - super().__init__(data) - - -class AudioCollection(Audio): - """List of audio files from a manifest file. - """ - - def __init__( - self, manifest_files: Union[str, List[str]], audio_to_manifest_key: Dict[str, str], *args, **kwargs, - ): - """Instantiates a list of audio files loaded from a manifest file. 
- - Args: - manifest_files: path to a single manifest file or a list of paths - audio_to_manifest_key: dictionary mapping audio signals to keys of the manifest - """ - # Support for comma-separated manifests - if type(manifest_files) == str: - manifest_files = manifest_files.split(',') - - for audio_key, manifest_key in audio_to_manifest_key.items(): - # Support for comma-separated keys - if type(manifest_key) == str and ',' in manifest_key: - audio_to_manifest_key[audio_key] = manifest_key.split(',') - - # Keys from manifest which contain audio - self.audio_to_manifest_key = audio_to_manifest_key - - # Initialize data - audio_files_list, duration_list, offset_list, text_list = [], [], [], [] - - # Parse manifest files - for item in manifest.item_iter(manifest_files, parse_func=self.__parse_item): - audio_files_list.append(item['audio_files']) - duration_list.append(item['duration']) - offset_list.append(item['offset']) - text_list.append(item['text']) - - super().__init__(audio_files_list, duration_list, offset_list, text_list, *args, **kwargs) - - def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: - """Parse a single line from a manifest file. - - Args: - line: a string representing a line from a manifest file in JSON format - manifest_file: path to the manifest file. Used to resolve relative paths. - - Returns: - Dictionary with audio_files, duration, and offset. - """ - # Local utility function - def get_audio_file(item: Dict, manifest_key: Union[str, List[str]]): - """Get item[key] if key is string, or a list - of strings by combining item[key[0]], item[key[1]], etc. - """ - # Prepare audio file(s) - if manifest_key is None: - # Support for inference, when a target key is None - audio_file = None - elif isinstance(manifest_key, str): - # Load files from a single manifest key - audio_file = item[manifest_key] - elif isinstance(manifest_key, Iterable): - # Load files from multiple manifest keys - audio_file = [] - for key in manifest_key: - item_key = item[key] - if isinstance(item_key, str): - audio_file.append(item_key) - elif isinstance(item_key, list): - audio_file += item_key - else: - raise ValueError(f'Unexpected type {type(item_key)} of item for key {key}: {item_key}') - else: - raise ValueError(f'Unexpected type {type(manifest_key)} of manifest_key: {manifest_key}') - - return audio_file - - # Convert JSON line to a dictionary - item = json.loads(line) - - # Handle all audio files - audio_files = {} - for audio_key, manifest_key in self.audio_to_manifest_key.items(): - - audio_file = get_audio_file(item, manifest_key) - - # Get full path to audio file(s) - if isinstance(audio_file, str): - # This dictionary entry points to a single file - audio_files[audio_key] = manifest.get_full_path(audio_file, manifest_file) - elif isinstance(audio_file, Iterable): - # This dictionary entry points to multiple files - # Get the files and keep the list structure for this key - audio_files[audio_key] = [manifest.get_full_path(f, manifest_file) for f in audio_file] - elif audio_file is None and audio_key.startswith('target'): - # For inference, we don't need the target - audio_files[audio_key] = None - else: - raise ValueError(f'Unexpected type {type(audio_file)} of audio_file: {audio_file}') - item['audio_files'] = audio_files - - # Handle duration - if 'duration' not in item: - raise ValueError(f'Duration not available in line: {line}. 
Manifest file: {manifest_file}') - - # Handle offset - if 'offset' not in item: - item['offset'] = 0.0 - - # Handle text - if 'text' not in item: - item['text'] = None - - return dict( - audio_files=item['audio_files'], duration=item['duration'], offset=item['offset'], text=item['text'] - ) - - -class FeatureLabel(_Collection): - """List of feature sequence and their label correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple(typename='FeatureLabelEntity', field_names='feature_file label duration',) - - def __init__( - self, - feature_files: List[str], - labels: List[str], - durations: List[float], - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates feature-SequenceLabel manifest with filters and preprocessing. - - Args: - feature_files: List of feature files. - labels: List of labels. - max_number: Maximum number of samples to collect. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. - """ - - output_type = self.OUTPUT_TYPE - data = [] - duration_filtered = 0.0 - total_duration = 0.0 - self.uniq_labels = set() - - if index_by_file_id: - self.mapping = {} - - for feature_file, label, duration in zip(feature_files, labels, durations): - # Duration filters. - if min_duration is not None and duration < min_duration: - duration_filtered += duration - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - continue - - data.append(output_type(feature_file, label, duration)) - self.uniq_labels |= set(label) - total_duration += duration - - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(feature_file)) - self.mapping[file_id] = len(data) - 1 - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info(f"Filtered duration for loading collection is {duration_filtered / 2600:.2f} hours.") - logging.info(f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600: .2f} hours.") - logging.info("# {} files loaded including # {} unique labels".format(len(data), len(self.uniq_labels))) - super().__init__(data) - - -class ASRFeatureLabel(FeatureLabel): - """`FeatureLabel` collector from asr structured json files.""" - - def __init__( - self, - manifests_files: Union[str, List[str]], - is_regression_task: bool = False, - cal_labels_occurrence: bool = False, - delimiter: Optional[str] = None, - *args, - **kwargs, - ): - - """Parse lists of feature files and sequences of labels. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - max_number: Maximum number of samples to collect; pass to `FeatureSequenceLabel` constructor. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data; pass to `FeatureSequenceLabel` constructor. 
- """ - - feature_files, labels, durations = [], [], [] - all_labels = [] - for item in manifest.item_iter(manifests_files, parse_func=self._parse_item): - feature_files.append(item['feature_file']) - durations.append(item['duration']) - - if not is_regression_task: - label = item['label'] - label_list = label.split() if not delimiter else label.split(delimiter) - else: - label = float(item['label']) - label_list = [label] - - labels.append(label) - all_labels.extend(label_list) - if cal_labels_occurrence: - self.labels_occurrence = collections.Counter(all_labels) - - super().__init__(feature_files, labels, durations, *args, **kwargs) - - def _parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: - item = json.loads(line) - - # Feature file - if 'feature_filename' in item: - item['feature_file'] = item.pop('feature_filename') - elif 'feature_filepath' in item: - item['feature_file'] = item.pop('feature_filepath') - elif 'feature_file' not in item: - raise ValueError( - f"Manifest file has invalid json line " f"structure: {line} without proper 'feature_file' key." - ) - item['feature_file'] = manifest.get_full_path(audio_file=item['feature_file'], manifest_file=manifest_file) - - # Label. - if 'label' in item: - item['label'] = item.pop('label') - else: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper 'label' key.") - - item = dict(feature_file=item['feature_file'], label=item['label'], duration=item['duration']) - - return item - - -class FeatureText(_Collection): - """List of audio-transcript text correspondence with preprocessing.""" - - OUTPUT_TYPE = collections.namedtuple( - typename='FeatureTextEntity', - field_names='id feature_file rttm_file duration text_tokens offset text_raw speaker orig_sr lang', - ) - - def __init__( - self, - ids: List[int], - feature_files: List[str], - rttm_files: List[str], - durations: List[float], - texts: List[str], - offsets: List[str], - speakers: List[Optional[int]], - orig_sampling_rates: List[Optional[int]], - token_labels: List[Optional[int]], - langs: List[Optional[str]], - parser: parsers.CharParser, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - max_number: Optional[int] = None, - do_sort_by_duration: bool = False, - index_by_file_id: bool = False, - ): - """Instantiates feature-text manifest with filters and preprocessing. - - Args: - ids: List of examples positions. - feature_files: List of audio feature files. - rttm_files: List of audio rttm files. - durations: List of float durations. - texts: List of raw text transcripts. - offsets: List of duration offsets or None. - speakers: List of optional speakers ids. - orig_sampling_rates: List of original sampling rates of audio files. - langs: List of language ids, one for eadh sample, or None. - parser: Instance of `CharParser` to convert string to tokens. - min_duration: Minimum duration to keep entry with (default: None). - max_duration: Maximum duration to keep entry with (default: None). - max_number: Maximum number of samples to collect. - do_sort_by_duration: True if sort samples list by duration. Not compatible with index_by_file_id. - index_by_file_id: If True, saves a mapping from filename base (ID) to index in data. 
- """ - - output_type = self.OUTPUT_TYPE - data, duration_filtered, num_filtered, total_duration = [], 0.0, 0, 0.0 - if index_by_file_id: - self.mapping = {} - - for id_, feat_file, rttm_file, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( - ids, - feature_files, - rttm_files, - durations, - offsets, - texts, - speakers, - orig_sampling_rates, - token_labels, - langs, - ): - # Duration filters. - if min_duration is not None and duration < min_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if max_duration is not None and duration > max_duration: - duration_filtered += duration - num_filtered += 1 - continue - - if token_labels is not None: - text_tokens = token_labels - else: - if text != '': - if hasattr(parser, "is_aggregate") and parser.is_aggregate and isinstance(text, str): - if lang is not None: - text_tokens = parser(text, lang) - else: - raise ValueError("lang required in manifest when using aggregate tokenizers") - else: - text_tokens = parser(text) - else: - text_tokens = [] - - if text_tokens is None: - duration_filtered += duration - num_filtered += 1 - continue - - total_duration += duration - - data.append( - output_type(id_, feat_file, rttm_file, duration, text_tokens, offset, text, speaker, orig_sr, lang) - ) - if index_by_file_id: - file_id, _ = os.path.splitext(os.path.basename(feat_file)) - if file_id not in self.mapping: - self.mapping[file_id] = [] - self.mapping[file_id].append(len(data) - 1) - - # Max number of entities filter. - if len(data) == max_number: - break - - if do_sort_by_duration: - if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") - else: - data.sort(key=lambda entity: entity.duration) - - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) - - super().__init__(data) - - -class ASRFeatureText(FeatureText): - """`FeatureText` collector from asr structured json files.""" - - def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): - """Parse lists of audio files, durations and transcripts texts. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - *args: Args to pass to `AudioText` constructor. - **kwargs: Kwargs to pass to `AudioText` constructor. 
- """ - - ids, feature_files, rttm_files, durations, texts, offsets, = ( - [], - [], - [], - [], - [], - [], - ) - speakers, orig_srs, token_labels, langs = [], [], [], [] - for item in manifest.item_iter(manifests_files): - ids.append(item['id']) - feature_files.append(item['feature_file']) - rttm_files.append(item['rttm_file']) - durations.append(item['duration']) - texts.append(item['text']) - offsets.append(item['offset']) - speakers.append(item['speaker']) - orig_srs.append(item['orig_sr']) - token_labels.append(item['token_labels']) - langs.append(item['lang']) - - super().__init__( - ids, - feature_files, - rttm_files, - durations, - texts, - offsets, - speakers, - orig_srs, - token_labels, - langs, - *args, - **kwargs, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/manifest.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/manifest.py deleted file mode 100644 index 8828955707119deccf3d97b8daa46d56c3008b81..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/manifest.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from os.path import expanduser -from typing import Any, Callable, Dict, Iterator, List, Optional, Union - -from nemo.utils import logging -from nemo.utils.data_utils import DataStoreObject, datastore_path_to_local_path, is_datastore_path - - -class ManifestBase: - def __init__(self, *args, **kwargs): - raise ValueError( - "This class is deprecated, look at https://github.com/NVIDIA/NeMo/pull/284 for correct behaviour." - ) - - -class ManifestEN: - def __init__(self, *args, **kwargs): - raise ValueError( - "This class is deprecated, look at https://github.com/NVIDIA/NeMo/pull/284 for correct behaviour." - ) - - -def item_iter( - manifests_files: Union[str, List[str]], parse_func: Callable[[str, Optional[str]], Dict[str, Any]] = None -) -> Iterator[Dict[str, Any]]: - """Iterate through json lines of provided manifests. - - NeMo ASR pipelines often assume certain manifest files structure. In - particular, each manifest file should consist of line-per-sample files with - each line being correct json dict. Each such json dict should have a field - for audio file string, a field for duration float and a field for text - string. Offset also could be additional field and is set to None by - default. - - Args: - manifests_files: Either single string file or list of such - - manifests to yield items from. - - parse_func: A callable function which accepts as input a single line - of a manifest and optionally the manifest file itself, - and parses it, returning a dictionary mapping from str -> Any. - - Yields: - Parsed key to value item dicts. - - Raises: - ValueError: If met invalid json line structure. 
- """ - - if isinstance(manifests_files, str): - manifests_files = [manifests_files] - - if parse_func is None: - parse_func = __parse_item - - k = -1 - logging.debug('Manifest files: %s', str(manifests_files)) - for manifest_file in manifests_files: - logging.debug('Using manifest file: %s', str(manifest_file)) - cached_manifest_file = DataStoreObject(manifest_file).get() - logging.debug('Cached at: %s', str(cached_manifest_file)) - with open(expanduser(cached_manifest_file), 'r') as f: - for line in f: - k += 1 - item = parse_func(line, manifest_file) - item['id'] = k - - yield item - - -def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: - item = json.loads(line) - - # Audio file - if 'audio_filename' in item: - item['audio_file'] = item.pop('audio_filename') - elif 'audio_filepath' in item: - item['audio_file'] = item.pop('audio_filepath') - - # Video File - if 'video_filename' in item: - item['video_file'] = item.pop('video_filename') - elif 'video_filepath' in item: - item['video_file'] = item.pop('video_filepath') - - if 'video_file' not in item and 'audio_file' not in item: - raise ValueError( - f"Manifest file {manifest_file} has invalid json line structure: {line} without proper audio/video file key." - ) - - # If the audio/video path is a relative path and does not exist, - # try to attach the parent directory of manifest to the audio path. - # Revert to the original path if the new path still doesn't exist. - # Assume that the audio path is like "wavs/xxxxxx.wav". - if 'audio_file' in item: - item['audio_file'] = get_full_path(audio_file=item['audio_file'], manifest_file=manifest_file) - if 'video_file' in item: - item['video_file'] = get_full_path(audio_file=item['video_file'], manifest_file=manifest_file) - - # Duration. - if 'duration' not in item: - raise ValueError( - f"Manifest file {manifest_file} has invalid json line structure: {line} without proper duration key." - ) - - # Text. 
- if 'text' in item: - pass - elif 'text_filepath' in item: - with open(item.pop('text_filepath'), 'r') as f: - item['text'] = f.read().replace('\n', '') - elif 'normalized_text' in item: - item['text'] = item['normalized_text'] - else: - item['text'] = "" - - # Optional RTTM file - if 'rttm_file' in item: - pass - elif 'rttm_filename' in item: - item['rttm_file'] = item.pop('rttm_filename') - elif 'rttm_filepath' in item: - item['rttm_file'] = item.pop('rttm_filepath') - else: - item['rttm_file'] = None - if item['rttm_file'] is not None: - item['rttm_file'] = get_full_path(audio_file=item['rttm_file'], manifest_file=manifest_file) - - # Optional audio feature file - if 'feature_file' in item: - pass - elif 'feature_filename' in item: - item['feature_file'] = item.pop('feature_filename') - elif 'feature_filepath' in item: - item['feature_file'] = item.pop('feature_filepath') - else: - item['feature_file'] = None - if item['feature_file'] is not None: - item['feature_file'] = get_full_path(audio_file=item['feature_file'], manifest_file=manifest_file) - - item = dict( - audio_file=item.get('audio_file', None), - video_file=item.get('video_file', None), - duration=item['duration'], - text=item['text'], - rttm_file=item['rttm_file'], - feature_file=item['feature_file'], - offset=item.get('offset', None), - speaker=item.get('speaker', None), - orig_sr=item.get('orig_sample_rate', None), - token_labels=item.get('token_labels', None), - lang=item.get('lang', None), - ) - return item - - -def get_full_path( - audio_file: Union[str, List[str]], - manifest_file: Optional[str] = None, - data_dir: Optional[str] = None, - audio_file_len_limit: int = 255, -) -> Union[str, List[str]]: - """Get full path to audio_file. - - If the audio_file is a relative path and does not exist, - try to attach the parent directory of manifest to the audio path. - Revert to the original path if the new path still doesn't exist. - Assume that the audio path is like "wavs/xxxxxx.wav". - - Args: - audio_file: path to an audio file, either absolute or assumed relative - to the manifest directory or data directory. - Alternatively, a list of paths may be provided. - manifest_file: path to a manifest file - data_dir: path to a directory containing data, use only if a manifest file is not provided - audio_file_len_limit: limit for length of audio_file when using relative paths - - Returns: - Full path to audio_file or a list of paths. - """ - if isinstance(audio_file, list): - # If input is a list, return a list of full paths - return [ - get_full_path( - audio_file=a_file, - manifest_file=manifest_file, - data_dir=data_dir, - audio_file_len_limit=audio_file_len_limit, - ) - for a_file in audio_file - ] - elif isinstance(audio_file, str): - # If input is a string, get the corresponding full path - if ( - (len(audio_file) < audio_file_len_limit) - and not os.path.isabs(audio_file) - and not os.path.isfile(audio_file) - ): - # If audio_file is not available and the path is not absolute, the full path is assumed - # to be relative to the manifest file parent directory or data directory. - if manifest_file is None and data_dir is None: - raise ValueError(f'Use either manifest_file or data_dir to specify the data directory.') - elif manifest_file is not None and data_dir is not None: - raise ValueError( - f'Parameters manifest_file and data_dir cannot be used simultaneously. Currently manifest_file is {manifest_file} and data_dir is {data_dir}.' 
- ) - - # resolve the data directory - if data_dir is None: - data_dir = os.path.dirname(manifest_file) - - # assume audio_file path is relative to data_dir - audio_file_path = os.path.join(data_dir, audio_file) - - if is_datastore_path(audio_file_path): - # If audio was originally on an object store, use locally-cached path - audio_file_path = datastore_path_to_local_path(audio_file_path) - - if os.path.isfile(audio_file_path): - audio_file = os.path.abspath(audio_file_path) - else: - audio_file = expanduser(audio_file) - else: - audio_file = expanduser(audio_file) - return audio_file - else: - raise ValueError(f'Unexpected audio_file type {type(audio_file)}, audio_file {audio_file}.') diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/parsers.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/parsers.py deleted file mode 100644 index 10a3522ef241418cfa15fc9c592b2518666b7919..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/preprocessing/parsers.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A collection of simple character based parsers. These parser handle cleaning and tokenization by default. -We currently support English. -""" - -import string -from typing import List, Optional - -from nemo.collections.common.parts.preprocessing import cleaners - - -class CharParser: - """Functor for parsing raw strings into list of int tokens. - - Examples: - >>> parser = CharParser(['a', 'b', 'c']) - >>> parser('abc') - [0, 1, 2] - """ - - def __init__( - self, - labels: List[str], - *, - unk_id: int = -1, - blank_id: int = -1, - do_normalize: bool = True, - do_lowercase: bool = True, - do_tokenize: bool = True, - ): - """Creates simple mapping char parser. - - Args: - labels: List of labels to allocate indexes for. Essentially, - this is a id to str mapping. - unk_id: Index to choose for OOV words (default: -1). - blank_id: Index to filter out from final list of tokens - (default: -1). - do_normalize: True if apply normalization step before tokenizing - (default: True). - do_lowercase: True if apply lowercasing at normalizing step - (default: True). 
- """ - - self._labels = labels - self._unk_id = unk_id - self._blank_id = blank_id - self._do_normalize = do_normalize - self._do_lowercase = do_lowercase - self._do_tokenize = do_tokenize - - self._labels_map = {label: index for index, label in enumerate(labels)} - self._special_labels = set([label for label in labels if len(label) > 1]) - - def __call__(self, text: str) -> Optional[List[int]]: - if self._do_normalize: - text = self._normalize(text) - if text is None: - return None - - if not self._do_tokenize: - return text - - text_tokens = self._tokenize(text) - return text_tokens - - def _normalize(self, text: str) -> Optional[str]: - text = text.strip() - - if self._do_lowercase: - text = text.lower() - - return text - - def _tokenize(self, text: str) -> List[int]: - tokens = [] - # Split by word for find special labels. - for word_id, word in enumerate(text.split(' ')): - if word_id != 0: # Not first word - so we insert space before. - tokens.append(self._labels_map.get(' ', self._unk_id)) - - if word in self._special_labels: - tokens.append(self._labels_map[word]) - continue - - for char in word: - tokens.append(self._labels_map.get(char, self._unk_id)) - - # If unk_id == blank_id, OOV tokens are removed. - tokens = [token for token in tokens if token != self._blank_id] - - return tokens - - def decode(self, str_input): - r_map = {} - for k, v in self._labels_map.items(): - r_map[v] = k - r_map[len(self._labels_map)] = "" - r_map[len(self._labels_map) + 1] = "" - r_map[len(self._labels_map) + 2] = "
" - - out = [] - for i in str_input: - # Skip OOV - if i not in r_map: - continue - out.append(r_map[i.item()]) - - return "".join(out) - - -class ENCharParser(CharParser): - """Incorporates english-specific parsing logic.""" - - PUNCTUATION_TO_REPLACE = {'+': 'plus', '&': 'and', '%': 'percent'} - - def __init__(self, abbreviation_version=None, make_table=True, *args, **kwargs): - """Creates english-specific mapping char parser. - - This class overrides normalizing implementation. - - Args: - *args: Positional args to pass to `CharParser` constructor. - **kwargs: Key-value args to pass to `CharParser` constructor. - """ - - super().__init__(*args, **kwargs) - - self._table = None - if make_table: - self._table = self.__make_trans_table() - self.abbreviation_version = abbreviation_version - - def __make_trans_table(self): - punctuation = string.punctuation - - for char in self.PUNCTUATION_TO_REPLACE: - punctuation = punctuation.replace(char, '') - - for label in self._labels: - punctuation = punctuation.replace(label, '') - - table = str.maketrans(punctuation, ' ' * len(punctuation)) - - return table - - def _normalize(self, text: str) -> Optional[str]: - # noinspection PyBroadException - try: - text = cleaners.clean_text( - string=text, - table=self._table, - punctuation_to_replace=self.PUNCTUATION_TO_REPLACE, - abbreviation_version=self.abbreviation_version, - ) - except Exception: - return None - - return text - - -class RUCharParser(CharParser): - """Incorporates russian-specific parsing logic.""" - - PUNCTUATION_TO_REPLACE = {'+': 'плюс', 'ё': 'е'} - - def __init__(self, *args, **kwargs): - """Creates cyrillic-specific mapping char parser. - This class overrides normalizing implementation. - Args: - *args: Positional args to pass to `CharParser` constructor. - **kwargs: Key-value args to pass to `CharParser` constructor. - """ - - super().__init__(*args, **kwargs) - - self._table = self.__make_trans_table() - - def __make_trans_table(self): - punctuation = string.punctuation - - for char in self.PUNCTUATION_TO_REPLACE: - punctuation = punctuation.replace(char, '') - - for label in self._labels: - punctuation = punctuation.replace(label, '') - - table = str.maketrans(punctuation, ' ' * len(punctuation)) - - return table - - def _normalize(self, text: str) -> Optional[str]: - # noinspection PyBroadException - try: - text = cleaners.clean_text( - string=text, table=self._table, punctuation_to_replace=self.PUNCTUATION_TO_REPLACE, - ) - except Exception: - return None - - return text - - -NAME_TO_PARSER = {'base': CharParser, 'en': ENCharParser, 'ru': RUCharParser} - - -def make_parser(labels: Optional[List[str]] = None, name: str = 'base', **kwargs,) -> CharParser: - """Creates parser from labels, set of arguments and concise parser name. - - Args: - labels: List of labels to allocate indexes for. If set to - None then labels would be ascii table list. Essentially, this is an - id to str mapping (default: None). - name: Concise name of parser to create (default: 'base'). - (default: -1). - **kwargs: Other set of kwargs to pass to parser constructor. - - Returns: - Instance of `CharParser`. - - Raises: - ValueError: For invalid parser name. 
- - Examples: - >>> type(make_parser(['a', 'b', 'c'], 'en')) - ENCharParser - """ - - if name not in NAME_TO_PARSER: - raise ValueError('Invalid parser name.') - - if labels is None: - labels = list(string.printable) - - parser_type = NAME_TO_PARSER[name] - parser = parser_type(labels=labels, **kwargs) - - return parser diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/ptl_overrides.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/ptl_overrides.py deleted file mode 100644 index 0225ecd50feec7301a89845e1e1de9d3494d24ad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/ptl_overrides.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin - - -class NeMoMixedPrecisionPlugin(MixedPrecisionPlugin): - def __init__(self, init_scale: float = 2 ** 32, growth_interval: int = 1000) -> None: - super().__init__(precision=16) - - self.scaler = torch.cuda.amp.GradScaler(init_scale=init_scale, growth_interval=growth_interval) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/rnn.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/rnn.py deleted file mode 100644 index 0b5435acc7646b9ec26af50e40ea6857922d1fd2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/rnn.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright (c) 2019, Myrtle Software Limited. All rights reserved. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch - -from nemo.utils import logging - - -def rnn( - input_size: int, - hidden_size: int, - num_layers: int, - norm: Optional[str] = None, - forget_gate_bias: Optional[float] = 1.0, - dropout: Optional[float] = 0.0, - norm_first_rnn: Optional[bool] = None, - t_max: Optional[int] = None, - weights_init_scale: float = 1.0, - hidden_hidden_bias_scale: float = 0.0, - proj_size: int = 0, -) -> torch.nn.Module: - """ - Utility function to provide unified interface to common LSTM RNN modules. - - Args: - input_size: Input dimension. - - hidden_size: Hidden dimension of the RNN. - - num_layers: Number of RNN layers. - - norm: Optional string representing type of normalization to apply to the RNN. - Supported values are None, batch and layer. 
- - forget_gate_bias: float, set by default to 1.0, which constructs a forget gate - initialized to 1.0. - Reference: - [An Empirical Exploration of Recurrent Network Architectures](http://proceedings.mlr.press/v37/jozefowicz15.pdf) - - dropout: Optional dropout to apply to end of multi-layered RNN. - - norm_first_rnn: Whether to normalize the first RNN layer. - - t_max: int value, set to None by default. If an int is specified, performs Chrono Initialization - of the LSTM network, based on the maximum number of timesteps `t_max` expected during the course - of training. - Reference: - [Can recurrent neural networks warp time?](https://openreview.net/forum?id=SJcKhk-Ab) - - weights_init_scale: Float scale of the weights after initialization. Setting to lower than one - sometimes helps reduce variance between runs. - - hidden_hidden_bias_scale: Float scale for the hidden-to-hidden bias scale. Set to 0.0 for - the default behaviour. - - Returns: - A RNN module - """ - if norm not in [None, "batch", "layer"]: - raise ValueError(f"unknown norm={norm}") - - if norm is None: - return LSTMDropout( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - proj_size=proj_size, - ) - - if norm == "batch": - return BNRNNSum( - input_size=input_size, - hidden_size=hidden_size, - rnn_layers=num_layers, - batch_norm=True, - dropout=dropout, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - norm_first_rnn=norm_first_rnn, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - proj_size=proj_size, - ) - - if norm == "layer": - return torch.jit.script( - ln_lstm( - input_size=input_size, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=dropout, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - ) - ) - - -class OverLastDim(torch.nn.Module): - """Collapses a tensor to 2D, applies a module, and (re-)expands the tensor. - An n-dimensional tensor of shape (s_1, s_2, ..., s_n) is first collapsed to - a tensor with shape (s_1*s_2*...*s_n-1, s_n). The module is called with - this as input producing (s_1*s_2*...*s_n-1, s_n') --- note that the final - dimension can change. This is expanded to (s_1, s_2, ..., s_n-1, s_n') and - returned. - Args: - module (torch.nn.Module): Module to apply. Must accept a 2D tensor as - input and produce a 2D tensor as output, optionally changing the - size of the last dimension. - """ - - def __init__(self, module: torch.nn.Module): - super().__init__() - self.module = module - - def forward(self, x: torch.Tensor) -> torch.Tensor: - *dims, _ = x.size() - - reduced_dims = 1 - for dim in dims: - reduced_dims *= dim - - x = x.view(reduced_dims, -1) - x = self.module(x) - x = x.view(*dims, -1) - return x - - -class LSTMDropout(torch.nn.Module): - def __init__( - self, - input_size: int, - hidden_size: int, - num_layers: int, - dropout: Optional[float], - forget_gate_bias: Optional[float], - t_max: Optional[int] = None, - weights_init_scale: float = 1.0, - hidden_hidden_bias_scale: float = 0.0, - proj_size: int = 0, - ): - """Returns an LSTM with forget gate bias init to `forget_gate_bias`. - Args: - input_size: See `torch.nn.LSTM`. - hidden_size: See `torch.nn.LSTM`. - num_layers: See `torch.nn.LSTM`. - dropout: See `torch.nn.LSTM`. 
- - forget_gate_bias: float, set by default to 1.0, which constructs a forget gate - initialized to 1.0. - Reference: - [An Empirical Exploration of Recurrent Network Architectures](http://proceedings.mlr.press/v37/jozefowicz15.pdf) - - t_max: int value, set to None by default. If an int is specified, performs Chrono Initialization - of the LSTM network, based on the maximum number of timesteps `t_max` expected during the course - of training. - Reference: - [Can recurrent neural networks warp time?](https://openreview.net/forum?id=SJcKhk-Ab) - - weights_init_scale: Float scale of the weights after initialization. Setting to lower than one - sometimes helps reduce variance between runs. - - hidden_hidden_bias_scale: Float scale for the hidden-to-hidden bias scale. Set to 0.0 for - the default behaviour. - - Returns: - A `torch.nn.LSTM`. - """ - super(LSTMDropout, self).__init__() - - self.lstm = torch.nn.LSTM( - input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, proj_size=proj_size - ) - - if t_max is not None: - # apply chrono init - for name, v in self.lstm.named_parameters(): - if 'bias' in name: - p = getattr(self.lstm, name) - n = p.nelement() - hidden_size = n // 4 - p.data.fill_(0) - p.data[hidden_size : 2 * hidden_size] = torch.log( - torch.nn.init.uniform_(p.data[0:hidden_size], 1, t_max - 1) - ) - # forget gate biases = log(uniform(1, Tmax-1)) - p.data[0:hidden_size] = -p.data[hidden_size : 2 * hidden_size] - # input gate biases = -(forget gate biases) - - elif forget_gate_bias is not None: - for name, v in self.lstm.named_parameters(): - if "bias_ih" in name: - bias = getattr(self.lstm, name) - bias.data[hidden_size : 2 * hidden_size].fill_(forget_gate_bias) - if "bias_hh" in name: - bias = getattr(self.lstm, name) - bias.data[hidden_size : 2 * hidden_size] *= float(hidden_hidden_bias_scale) - - self.dropout = torch.nn.Dropout(dropout) if dropout else None - - for name, v in self.named_parameters(): - if 'weight' in name or 'bias' in name: - v.data *= float(weights_init_scale) - - def forward( - self, x: torch.Tensor, h: Optional[Tuple[torch.Tensor, torch.Tensor]] = None - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - x, h = self.lstm(x, h) - - if self.dropout: - x = self.dropout(x) - - return x, h - - -class RNNLayer(torch.nn.Module): - """A single RNNLayer with optional batch norm.""" - - def __init__( - self, - input_size: int, - hidden_size: int, - rnn_type: torch.nn.Module = torch.nn.LSTM, - batch_norm: bool = True, - forget_gate_bias: Optional[float] = 1.0, - t_max: Optional[int] = None, - weights_init_scale: float = 1.0, - hidden_hidden_bias_scale: float = 0.0, - proj_size: int = 0, - ): - super().__init__() - - if batch_norm: - self.bn = OverLastDim(torch.nn.BatchNorm1d(input_size)) - - if isinstance(rnn_type, torch.nn.LSTM) and not batch_norm: - # batch_norm will apply bias, no need to add a second to LSTM - self.rnn = LSTMDropout( - input_size=input_size, - hidden_size=hidden_size, - num_layers=1, - dropout=0.0, - forget_gate_bias=forget_gate_bias, - t_max=t_max, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - proj_size=proj_size, - ) - else: - self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, bias=not batch_norm) - - def forward( - self, x: torch.Tensor, hx: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if hasattr(self, 'bn'): - x = x.contiguous() - x = self.bn(x) - x, h = 
self.rnn(x, hx=hx) - return x, h - - def _flatten_parameters(self): - self.rnn.flatten_parameters() - - -class BNRNNSum(torch.nn.Module): - """RNN wrapper with optional batch norm. - Instantiates an RNN. If it is an LSTM it initialises the forget gate - bias =`lstm_gate_bias`. Optionally applies a batch normalisation layer to - the input with the statistics computed over all time steps. If dropout > 0 - then it is applied to all layer outputs except the last. - """ - - def __init__( - self, - input_size: int, - hidden_size: int, - rnn_type: torch.nn.Module = torch.nn.LSTM, - rnn_layers: int = 1, - batch_norm: bool = True, - dropout: Optional[float] = 0.0, - forget_gate_bias: Optional[float] = 1.0, - norm_first_rnn: bool = False, - t_max: Optional[int] = None, - weights_init_scale: float = 1.0, - hidden_hidden_bias_scale: float = 0.0, - proj_size: int = 0, - ): - super().__init__() - self.rnn_layers = rnn_layers - - self.layers = torch.nn.ModuleList() - for i in range(rnn_layers): - final_layer = (rnn_layers - 1) == i - - self.layers.append( - RNNLayer( - input_size, - hidden_size, - rnn_type=rnn_type, - batch_norm=batch_norm and (norm_first_rnn or i > 0), - forget_gate_bias=forget_gate_bias, - t_max=t_max, - weights_init_scale=weights_init_scale, - hidden_hidden_bias_scale=hidden_hidden_bias_scale, - proj_size=proj_size, - ) - ) - - if dropout is not None and dropout > 0.0 and not final_layer: - self.layers.append(torch.nn.Dropout(dropout)) - - input_size = hidden_size - - def forward( - self, x: torch.Tensor, hx: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - hx = self._parse_hidden_state(hx) - - hs = [] - cs = [] - rnn_idx = 0 - for layer in self.layers: - if isinstance(layer, torch.nn.Dropout): - x = layer(x) - else: - x, h_out = layer(x, hx=hx[rnn_idx]) - hs.append(h_out[0]) - cs.append(h_out[1]) - rnn_idx += 1 - del h_out - - h_0 = torch.stack(hs, dim=0) - c_0 = torch.stack(cs, dim=0) - return x, (h_0, c_0) - - def _parse_hidden_state( - self, hx: Optional[Tuple[torch.Tensor, torch.Tensor]] - ) -> Union[List[None], List[Tuple[torch.Tensor, torch.Tensor]]]: - """ - Dealing w. hidden state: - Typically in pytorch: (h_0, c_0) - h_0 = ``[num_layers * num_directions, batch, hidden_size]`` - c_0 = ``[num_layers * num_directions, batch, hidden_size]`` - """ - if hx is None: - return [None] * self.rnn_layers - else: - h_0, c_0 = hx - - if h_0.shape[0] != self.rnn_layers: - raise ValueError( - 'Provided initial state value `h_0` must be of shape : ' - '[num_layers * num_directions, batch, hidden_size]' - ) - - return [(h_0[i], c_0[i]) for i in range(h_0.shape[0])] - - def _flatten_parameters(self): - for layer in self.layers: - if isinstance(layer, (torch.nn.LSTM, torch.nn.GRU, torch.nn.RNN)): - layer._flatten_parameters() - - -class StackTime(torch.nn.Module): - """ - Stacks time within the feature dim, so as to behave as a downsampling operation. 
- """ - - def __init__(self, factor: int): - super().__init__() - self.factor = int(factor) - - def forward(self, x: List[Tuple[torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]: - # T, B, U - x, x_lens = x - seq = [x] - for i in range(1, self.factor): - tmp = torch.zeros_like(x) - tmp[:-i, :, :] = x[i:, :, :] - seq.append(tmp) - x_lens = torch.ceil(x_lens.float() / self.factor).int() - return torch.cat(seq, dim=2)[:: self.factor, :, :], x_lens - - -def ln_lstm( - input_size: int, - hidden_size: int, - num_layers: int, - dropout: Optional[float], - forget_gate_bias: Optional[float], - t_max: Optional[int], - weights_init_scale: Optional[float] = None, # ignored - hidden_hidden_bias_scale: Optional[float] = None, # ignored -) -> torch.nn.Module: - """Returns a ScriptModule that mimics a PyTorch native LSTM.""" - # The following are not implemented. - if dropout is not None and dropout != 0.0: - raise ValueError('`dropout` not supported with LayerNormLSTM') - - if t_max is not None: - logging.warning("LayerNormLSTM does not support chrono init via `t_max`") - - if weights_init_scale is not None: - logging.warning("`weights_init_scale` is ignored for LayerNormLSTM") - - if hidden_hidden_bias_scale is not None: - logging.warning("`hidden_hidden_bias_scale` is ignored for LayerNormLSTM") - - return StackedLSTM( - num_layers, - LSTMLayer, - first_layer_args=[LayerNormLSTMCell, input_size, hidden_size, forget_gate_bias], - other_layer_args=[LayerNormLSTMCell, hidden_size, hidden_size, forget_gate_bias], - ) - - -class LSTMLayer(torch.nn.Module): - def __init__(self, cell, *cell_args): - super(LSTMLayer, self).__init__() - self.cell = cell(*cell_args) - - def forward( - self, input: torch.Tensor, state: Tuple[torch.Tensor, torch.Tensor] - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - inputs = input.unbind(0) - outputs = [] - for i in range(len(inputs)): - out, state = self.cell(inputs[i], state) - outputs += [out] - return torch.stack(outputs), state - - -class LayerNormLSTMCell(torch.nn.Module): - def __init__(self, input_size, hidden_size, forget_gate_bias): - super().__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.weight_ih = torch.nn.Parameter(torch.randn(4 * hidden_size, input_size)) - self.weight_hh = torch.nn.Parameter(torch.randn(4 * hidden_size, hidden_size)) - - # LayerNorm provide learnable biases - self.layernorm_i = torch.nn.LayerNorm(4 * hidden_size) - self.layernorm_h = torch.nn.LayerNorm(4 * hidden_size) - self.layernorm_c = torch.nn.LayerNorm(hidden_size) - - self.reset_parameters() - - self.layernorm_i.bias.data[hidden_size : 2 * hidden_size].fill_(0.0) - self.layernorm_h.bias.data[hidden_size : 2 * hidden_size].fill_(forget_gate_bias) - - def reset_parameters(self): - stdv = 1.0 / math.sqrt(self.hidden_size) - for weight in self.parameters(): - torch.nn.init.uniform_(weight, -stdv, stdv) - - def forward( - self, input: torch.Tensor, state: Tuple[torch.Tensor, torch.Tensor] - ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - hx, cx = state - igates = self.layernorm_i(torch.mm(input, self.weight_ih.t())) - hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t())) - gates = igates + hgates - ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) - - ingate = torch.sigmoid(ingate) - forgetgate = torch.sigmoid(forgetgate) - cellgate = torch.tanh(cellgate) - outgate = torch.sigmoid(outgate) - - cy = self.layernorm_c((forgetgate * cx) + (ingate * cellgate)) - hy = outgate * torch.tanh(cy) - - return hy, (hy, cy) - 
- -def init_stacked_lstm( - num_layers: int, layer: torch.nn.Module, first_layer_args: List, other_layer_args: List -) -> torch.nn.ModuleList: - layers = [layer(*first_layer_args)] + [layer(*other_layer_args) for _ in range(num_layers - 1)] - return torch.nn.ModuleList(layers) - - -class StackedLSTM(torch.nn.Module): - def __init__(self, num_layers: int, layer: torch.nn.Module, first_layer_args: List, other_layer_args: List): - super(StackedLSTM, self).__init__() - self.layers: torch.nn.ModuleList = init_stacked_lstm(num_layers, layer, first_layer_args, other_layer_args) - - def forward( - self, input: torch.Tensor, states: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] - ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: - if states is None: - temp_states: List[Tuple[torch.Tensor, torch.Tensor]] = [] - batch = input.size(1) - for layer in self.layers: - temp_states.append( - ( - torch.zeros(batch, layer.cell.hidden_size, dtype=input.dtype, device=input.device), - torch.zeros(batch, layer.cell.hidden_size, dtype=input.dtype, device=input.device), - ) - ) - - states = temp_states - - output_states: List[Tuple[torch.Tensor, torch.Tensor]] = [] - output = input - for i, rnn_layer in enumerate(self.layers): - state = states[i] - output, out_state = rnn_layer(output, state) - output_states.append(out_state) - i += 1 - return output, output_states - - -def label_collate(labels, device=None): - """Collates the label inputs for the rnn-t prediction network. - If `labels` is already in torch.Tensor form this is a no-op. - - Args: - labels: A torch.Tensor List of label indexes or a torch.Tensor. - device: Optional torch device to place the label on. - - Returns: - A padded torch.Tensor of shape (batch, max_seq_len). - """ - - if isinstance(labels, torch.Tensor): - return labels.type(torch.int64) - if not isinstance(labels, (list, tuple)): - raise ValueError(f"`labels` should be a list or tensor not {type(labels)}") - - batch_size = len(labels) - max_len = max(len(label) for label in labels) - - cat_labels = np.full((batch_size, max_len), fill_value=0.0, dtype=np.int32) - for e, l in enumerate(labels): - cat_labels[e, : len(l)] = l - labels = torch.tensor(cat_labels, dtype=torch.int64, device=device) - - return labels diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/transformer_utils.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/transformer_utils.py deleted file mode 100644 index a40e7eebc9bb1e8fc6da8a6c50704a4ed2ed6ffd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/transformer_utils.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
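The `label_collate` helper above right-pads variable-length label sequences with zeros and returns an int64 tensor of shape (batch, max_seq_len). A brief sketch with hypothetical label values, assuming the NeMo module above is importable:

```python
import torch

from nemo.collections.common.parts.rnn import label_collate

# Two label sequences of different lengths (illustrative values).
labels = [[1, 2, 3], [4, 5]]

collated = label_collate(labels)

# Shorter sequences are right-padded with zeros -> shape (batch, max_seq_len).
assert collated.dtype == torch.int64
assert collated.tolist() == [[1, 2, 3], [4, 5, 0]]
```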
- -import torch -import torch.nn as nn - -__all__ = ['NEG_INF', 'form_attention_mask', 'transformer_weights_init', 'mask_padded_tokens'] - -NEG_INF = -10000.0 - - -def form_attention_mask(input_mask, diagonal=None): - """ - Build attention mask with optional masking of future tokens we forbid - to attend to (e.g. as it is in Transformer decoder). - - Args: - input_mask: binary mask of size B x L with 1s corresponding to valid - tokens and 0s corresponding to padding tokens - diagonal: diagonal where triangular future mask starts - None -- do not mask anything - 0 -- regular translation or language modeling future masking - 1 -- query stream masking as in XLNet architecture - Returns: - attention_mask: mask of size B x 1 x L x L with 0s corresponding to - tokens we plan to attend to and -10000 otherwise - """ - - if input_mask is None: - return None - attn_shape = (1, input_mask.shape[1], input_mask.shape[1]) - attn_mask = input_mask.to(dtype=bool).unsqueeze(1) - if diagonal is not None: - future_mask = torch.tril(torch.ones(attn_shape, dtype=torch.bool, device=input_mask.device), diagonal) - attn_mask = attn_mask & future_mask - attention_mask = (1 - attn_mask.to(torch.float)) * NEG_INF - return attention_mask.unsqueeze(1) - - -def transformer_weights_init(module, std_init_range=0.02, xavier=True): - """ - Initialize different weights in Transformer model. - - Args: - module: torch.nn.Module to be initialized - std_init_range: standard deviation of normal initializer - xavier: if True, xavier initializer will be used in Linear layers - as was proposed in AIAYN paper, otherwise normal initializer - will be used (like in BERT paper) - """ - - if isinstance(module, nn.Linear): - if xavier: - nn.init.xavier_uniform_(module.weight) - else: - nn.init.normal_(module.weight, mean=0.0, std=std_init_range) - if module.bias is not None: - nn.init.constant_(module.bias, 0.0) - elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, mean=0.0, std=std_init_range) - elif isinstance(module, nn.LayerNorm): - nn.init.constant_(module.weight, 1.0) - nn.init.constant_(module.bias, 0.0) - - -def mask_padded_tokens(tokens, pad_id): - mask = tokens != pad_id - return mask diff --git a/SoundScribe/SpeakerID/nemo/collections/common/parts/utils.py b/SoundScribe/SpeakerID/nemo/collections/common/parts/utils.py deleted file mode 100644 index 0bfcfaf9cdfb26ed542e18db04b3c3b91635d7a1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/parts/utils.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
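To make `form_attention_mask` above concrete: given a B x L padding mask it returns a B x 1 x L x L tensor that is 0 where attention is allowed and -10000 where it is not, and `diagonal=0` additionally masks future positions for causal decoding. A minimal sketch, assuming the NeMo module above is importable:

```python
import torch

from nemo.collections.common.parts.transformer_utils import NEG_INF, form_attention_mask

# Batch of 1, length 3, last token is padding (illustrative values).
input_mask = torch.tensor([[1, 1, 0]])

# Causal masking for language modeling: also forbid attending to future tokens.
attn_mask = form_attention_mask(input_mask, diagonal=0)

assert attn_mask.shape == (1, 1, 3, 3)
# Position 0 may attend to itself, but not to position 1 (future)
# and never to the padded position 2.
assert attn_mask[0, 0, 0, 0] == 0.0
assert attn_mask[0, 0, 0, 1] == NEG_INF
assert attn_mask[0, 0, 1, 2] == NEG_INF
```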
- -import math -import os -from typing import Iterable, List - -import torch.nn as nn - -__all__ = ['if_exist', '_compute_softmax', 'flatten'] - -activation_registry = { - "identity": nn.Identity, - "hardtanh": nn.Hardtanh, - "relu": nn.ReLU, - "selu": nn.SELU, - "swish": nn.SiLU, - "silu": nn.SiLU, - "gelu": nn.GELU, -} - - -def if_exist(outfold: str, files: List[str]): - """ - Returns true if all given files exist in the given folder - Args: - outfold: folder path - files: list of file names relative to outfold - """ - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f'{outfold}/{file}'): - return False - return True - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def flatten_iterable(iter: Iterable) -> Iterable: - """Flatten an iterable which contains values or - iterables with values. - - Args: - iter: iterable containing values at the deepest level. - - Returns: - A flat iterable containing values. - """ - for it in iter: - if isinstance(it, str) or not isinstance(it, Iterable): - yield it - else: - yield from flatten_iterable(it) - - -def flatten(list_in: List) -> List: - """Flatten a list of (nested lists of) values into a flat list. - - Args: - list_in: list of values, possibly nested - - Returns: - A flat list of values. - """ - return list(flatten_iterable(list_in)) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__init__.py deleted file mode 100644 index f46e3b150acc7bb1c3f2db39dc0af7ea145e2b58..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
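The softmax helper above subtracts the maximum score before exponentiating; the result is mathematically unchanged, but it keeps `math.exp` from overflowing on large logits. A compact standalone restatement for illustration:

```python
# Numerically stable softmax: shifting all scores by the maximum leaves the
# ratios of exponentials unchanged while keeping each exponent non-positive.
import math
from typing import List


def stable_softmax(scores: List[float]) -> List[float]:
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]


if __name__ == "__main__":
    # A naive exp(1000.0) would overflow; the shifted version is well behaved.
    print(stable_softmax([1000.0, 1001.0]))  # approximately [0.269, 0.731]
```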
- -from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer -from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer -from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer -from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer -from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer -from nemo.collections.common.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 844f517d12c98114559b228caea42ee4c5345b96..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/aggregate_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/aggregate_tokenizer.cpython-310.pyc deleted file mode 100644 index 804f411adfa07b223aa7acef855bf7843e86e6f0..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/aggregate_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/bytelevel_tokenizers.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/bytelevel_tokenizers.cpython-310.pyc deleted file mode 100644 index 016351b50d46ba5d8c7ccc1d8984ed8d1d1687fa..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/bytelevel_tokenizers.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/char_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/char_tokenizer.cpython-310.pyc deleted file mode 100644 index d7168993084b1aab870cde885c0a9be858be5b92..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/char_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/regex_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/regex_tokenizer.cpython-310.pyc deleted file mode 100644 index b52e7ea83b10b2d6c6695cde164079f2a9375a8c..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/regex_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/sentencepiece_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/sentencepiece_tokenizer.cpython-310.pyc deleted file mode 100644 index 22c67585dfc5e931c4bc4dd5b15ee22cf1e8665e..0000000000000000000000000000000000000000 Binary files 
a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/sentencepiece_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/tokenizer_spec.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/tokenizer_spec.cpython-310.pyc deleted file mode 100644 index 22c4f22e56ac28916428f2342edac5d65a35827a..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/tokenizer_spec.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/word_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/word_tokenizer.cpython-310.pyc deleted file mode 100644 index 4e84a6dd11db4f91697ac14ad6ce2d1b37d00226..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/word_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/youtokentome_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/youtokentome_tokenizer.cpython-310.pyc deleted file mode 100644 index 157327634c8512d044091791ddddebe3a2dca97b..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/__pycache__/youtokentome_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/aggregate_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/aggregate_tokenizer.py deleted file mode 100644 index 9c003c37525ab4475859a6bd96c3834c0f838a8e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/aggregate_tokenizer.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List, Union - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - -__all__ = ['AggregateTokenizer'] - - -class DummyTokenizer: - def __init__(self, vocab): - self.vocab = vocab - self.vocab_size = len(vocab) - - # minimum compatibility - # since all the monolingual tokenizers have a vocab - # additional methods could be added here - def get_vocab(self): - return self.vocab - - -class AggregateTokenizer(TokenizerSpec): - ''' - AggregateTokenizer, allowing one to combine multiple regular monolongual tokenizers into one tokenizer. - The intuition is that we can use existing tokenizers "as is", without retraining, and associate each tokenizer with a language id - during text processing (language id will be used to route the incoming text sample to the right tokenizer) - as well as a token id range for detokenization (e.g. 
[0..127] for tokenizer A, [128..255] for tokenizer B) so - that the orignal text could be reconstructed. Note that we assume that the incoming dict of langs / tokenizers - is ordered, e.g. the first tokenizer will be assigned a lower interval of token ids - Args: - tokenizers: dict of tokenizers, keys are lang ids, values are actual tokenizers - ''' - - def __init__(self, tokenizers: Dict): - - self.tokenizers_dict = tokenizers - self.vocabulary = [] - - # the tokenizers should produce non-overlapping, ordered token ids - # keys are language ids - self.token_id_offset = {} - - # keys are tokenizer numbers - self.token_id_offset_by_tokenizer_num = {} - offset = 0 - i = 0 - for lang, tokenizer in self.tokenizers_dict.items(): - self.token_id_offset[lang] = offset - self.token_id_offset_by_tokenizer_num[i] = offset - offset += len(tokenizer.vocab) - i += 1 - - for tokenizer in self.tokenizers_dict.values(): - self.vocabulary.extend(tokenizer.vocab) - - self.vocab_size = len(self.vocabulary) - logging.info(f'Aggregate vocab size: {self.vocab_size}') - - # for compatibility purposes only -- right now only the get_vocab method - # is supported, returning the joint vocab across all tokenizers - self.tokenizer = DummyTokenizer(self.vocabulary) - - # lookup tables to speed up token to text operations - # if there are two tokenizers, [0,1], ['en', 'es'], each with 128 tokens, the aggregate tokenizer - # token range will be [0,255]. The below method provides three look up tables: - # one, to convert the incoming token id -- e.g. 200 into its real id (200-127 = 73) - # second, to compute the tokenizer id that should process that token (1) - # third, the compute the lang id for that token ('es') - offset_token_ids_by_token_id, tokenizers_by_token_id, langs_by_token_id = self._calculate_offsets() - - self.offset_token_ids_by_token_id = offset_token_ids_by_token_id - self.tokenizers_by_token_id = tokenizers_by_token_id - self.langs_by_token_id = langs_by_token_id - - def _calculate_offsets(self): - offsets = {} - tokenizers = {} - langs = {} - cur_num = 0 - tot = len(self.tokenizers_dict) - for id in range(len(self.vocabulary)): - off_id = id - list(self.token_id_offset.values())[cur_num] - if cur_num + 1 < tot: - if id >= list(self.token_id_offset.values())[cur_num + 1]: - cur_num += 1 - off_id = id - list(self.token_id_offset.values())[cur_num] - offsets[id] = off_id - tokenizers[id] = list(self.tokenizers_dict.values())[cur_num] - langs[id] = list(self.tokenizers_dict.keys())[cur_num] - - return offsets, tokenizers, langs - - def text_to_tokens(self, text, lang_id): - tokenizer = self.tokenizers_dict[lang_id] - return tokenizer.text_to_tokens(text) - - def text_to_ids(self, text, lang_id): - tokenizer = self.tokenizers_dict[lang_id] - token_ids = tokenizer.text_to_ids(text) - token_ids[:] = [t + self.token_id_offset[lang_id] for t in token_ids] - - return token_ids - - def tokens_to_text(self, tokens, lang_id): - if isinstance(tokens, np.ndarray): - tokens = tokens.tolist() - - tokenizer = self.tokenizers_dict[lang_id] - return tokenizer.decode_pieces(tokens) - - def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): - ids = ids.tolist() - - tokens = [] - for id in ids: - offset_id = self.offset_token_ids_by_token_id[id] - tokenizer = self.tokenizers_by_token_id[id] - tokens.extend(tokenizer.ids_to_tokens([offset_id])) - text = ''.join(tokens).replace('▁', ' ') - - return text - - def token_to_id(self, token, lang_id): - tokenizer = self.tokenizers_dict[lang_id] - return 
tokenizer.token_to_id(token) + self.token_id_offset[lang_id] - - def ids_to_tokens(self, ids): - tokens = [] - - for id in ids: - offset_id = self.offset_token_ids_by_token_id[id] - tokenizer = self.tokenizers_by_token_id[id] - token = tokenizer.ids_to_tokens([offset_id])[0] - tokens.append(token) - - return tokens - - def ids_to_text_and_langs(self, ids): - text_and_langs = [] - - for id in ids: - offset_id = self.offset_token_ids_by_token_id[id] - tokenizer = self.tokenizers_by_token_id[id] - token = tokenizer.ids_to_tokens([offset_id])[0] - text = token.replace('▁', ' ') - text = text.strip() # strip for display purposes - lang = self.langs_by_token_id[id] - text_and_langs.append({'char': text, 'lang': lang}) - - return text_and_langs - - def ids_to_words_and_langs(self, ids): - words_and_langs = [] - - word_ids = [] # tokens belonging to the current word - for id in ids: - offset_id = self.offset_token_ids_by_token_id[id] - tokenizer = self.tokenizers_by_token_id[id] - token = tokenizer.ids_to_tokens([offset_id])[0] - if token.startswith('▁'): - if len(word_ids) > 0: # if this isn't the first word - word = self.ids_to_text(word_ids) - word = word.strip() # strip for display purposes - lang = self.ids_to_lang(word_ids) - wl = {'word': word, 'lang': lang} - words_and_langs.append(wl) - word_ids = [] - word_ids.append(id) - - if len(word_ids) > 0: # the last tokens - word = self.ids_to_text(word_ids) - word = word.strip() # strip for display purposes - lang = self.ids_to_lang(word_ids) - wl = {'word': word, 'lang': lang} - words_and_langs.append(wl) - - return words_and_langs - - def ids_to_lang(self, ids): - lang_cnts = {} - - for id in ids: - lang = self.langs_by_token_id[id] - lang_cnt = lang_cnts.get(lang) - if lang_cnt is not None: - lang_cnts[lang] = lang_cnt + 1 - else: - lang_cnts[lang] = 1 - - max_lang = '' - max_lang_cnt = -1 - for lang, lang_cnt in lang_cnts.items(): - if lang_cnt > max_lang_cnt: - max_lang = lang - max_lang_cnt = lang_cnt - - return max_lang - - def tokens_to_ids(self, tokens: Union[str, List[str]], langs: Union[str, List[str]]) -> Union[int, List[int]]: - if isinstance(tokens, str): - tokens = [tokens] - if isinstance(langs, str): - langs = [langs] - - ids = [] - for i, token in enumerate(tokens): - lang_id = langs[i] - ids.append(self.token_to_id(token, lang_id)) - return ids - - @property - def vocab(self): - return self.vocabulary - - @property - def langs(self): - return list(self.tokenizers_dict.keys()) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/bytelevel_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/bytelevel_tokenizers.py deleted file mode 100644 index eb965b0828159e3cae2ea4b21c966792403a147c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/bytelevel_tokenizers.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
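To make the id-offset bookkeeping above concrete, here is a toy sketch with two hypothetical vocabularies (not the deleted NeMo classes): each language owns a contiguous block of aggregate ids, and a global id is routed back to its tokenizer by subtracting that block's offset.

```python
# Toy illustration of the aggregate-tokenizer idea: contiguous id blocks per
# language, with global ids mapped back to (language, local id) via offsets.
from typing import Dict, List, Tuple

vocabs: Dict[str, List[str]] = {
    "en": ["▁the", "▁cat", "s"],   # aggregate ids 0..2
    "es": ["▁el", "▁gato", "s"],   # aggregate ids 3..5
}

offsets: Dict[str, int] = {}
running = 0
for lang, vocab in vocabs.items():  # insertion order defines the id layout
    offsets[lang] = running
    running += len(vocab)


def to_global(lang: str, local_id: int) -> int:
    return offsets[lang] + local_id


def from_global(global_id: int) -> Tuple[str, int]:
    # walk the offsets from the highest block down to find the owning language
    for lang in reversed(list(offsets)):
        if global_id >= offsets[lang]:
            return lang, global_id - offsets[lang]
    raise ValueError(f"id {global_id} is outside the aggregate vocabulary")


if __name__ == "__main__":
    gid = to_global("es", 1)
    print(gid)               # 4
    print(from_global(gid))  # ('es', 1) -> '▁gato'
```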
- -from typing import Dict, List, Optional, Union - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ['ByteLevelProcessor', 'ByteLevelTokenizer'] - - -class ByteLevelProcessor: - """ - A very basic tokenization and detokenization class for use with byte-level - tokenization. - """ - - def detokenize(self, tokens: List[str]) -> str: - return ' '.join(tokens) - - def tokenize(self, text) -> str: - return text - - def normalize(self, text) -> str: - return text - - -class ByteLevelTokenizer(TokenizerSpec): - def __init__(self, special_tokens: Optional[Union[Dict[str, str], List[str]]] = None): - self.vocab_size = 259 - self.special_start = 256 - self.special_token_to_id = { - self.pad_id: self.pad_id, - self.bos_id: self.bos_id, - self.eos_id: self.eos_id, - } - special_tokens = {} if special_tokens is None else special_tokens - for tok in special_tokens: - self.special_start -= 1 - self.special_token_to_id[tok] = self.special_start - - self.id_to_special_token = {v: k for k, v in self.special_token_to_id.items()} - - # no distinction between tokens and ids. - def text_to_tokens(self, text): - return self.text_to_ids(text) - - def tokens_to_text(self, tokens): - return self.ids_to_text(tokens) - - def text_to_ids(self, text): - return list(text.encode('utf-8')) - - def ids_to_text(self, ids): - # remove special tokens. - ids = [x for x in ids if x < self.special_start] - return bytes(ids).decode('utf-8', errors='ignore').rstrip() - - def tokens_to_ids(self, tokens): - if isinstance(tokens, str): - tokens = [tokens] - ids = [] - for token in tokens: - ids.append(self.token_to_id(token)) - return ids - - def ids_to_tokens(self, ids): - if isinstance(ids, int): - ids = [ids] - tokens = [] - for id in ids: - tokens.append(self.id_to_token(id)) - return tokens - - def token_to_id(self, token): - if token in self.special_token_to_id: - return self.special_token_to_id[token] - else: - return token - - def id_to_token(self, id): - if id < self.special_start: - return id - else: - return self.id_to_special_token[id] - - @property - def pad_id(self): - return 256 - - @property - def bos_id(self): - return 257 - - @property - def eos_id(self): - return 258 - - @property - def unk_id(self): - return 259 # unused diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/char_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/char_tokenizer.py deleted file mode 100644 index 27674256f315838bdfa8f6f3f60444306281ff0d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/char_tokenizer.py +++ /dev/null @@ -1,521 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
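The byte-level scheme above needs no learned vocabulary: text maps directly to its UTF-8 bytes (ids 0 to 255) and the special tokens sit just above that range, so decoding only has to drop the special ids. A small standalone round-trip sketch:

```python
# Byte-level round trip: encode to UTF-8 byte values, decode by dropping any
# special-token ids (>= 256) and interpreting the rest as UTF-8 bytes.
SPECIAL_START = 256  # pad=256, bos=257, eos=258 in the scheme above


def encode(text: str) -> list:
    return list(text.encode("utf-8"))


def decode(ids: list) -> str:
    payload = bytes([i for i in ids if i < SPECIAL_START])
    return payload.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    ids = [257] + encode("héllo") + [258]  # bos + bytes + eos
    print(ids)                             # [257, 104, 195, 169, 108, 108, 111, 258]
    print(decode(ids))                     # 'héllo'
```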
-import json -import os -import warnings -from collections import Counter -from enum import Enum -from pathlib import Path -from typing import Dict, List, NewType, Optional, Union - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ['CharTokenizer'] - - -NUMBER_OF_CHARACTERS_READ_BUFFER_SIZE = 10 ** 7 - - -class SpecialTokenString(Enum): - MASK = 'mask' - BOS = 'bos' - EOS = 'eos' - PAD = 'pad' - SEP = 'sep' - CLS = 'cls' - UNK = 'unk' - - @classmethod - def has_value(cls, value): - return value in cls._value2member_map_ - - -SpecialTokenStringType = NewType('SpecialTokenString', SpecialTokenString) - - -class CharTokenizer(TokenizerSpec): - rf""" - Each character is a token. - Args: - vocab_file: path to file with vocabulary for a tokenizer. The file consists of valid Python string literals - separated by the new line character. Such literals must contain 1 character. Examples of valid Python - literals: ``'a'``, ``'\n'``, ``"'"``, ``'ж'``, ``'\u8976'``. Optionally the first line in the file can be a - JSON dictionary of special tokens. The keys of the special tokens dictionary are ``'mask_token'``, - ``'bos_token'`` and so on. Some special tokens names can be omitted in the special tokens dictionary line. - A file ``vocab_file`` has to be in ``'utf-8'`` encoding. - mask_token: mask token. The following is applicable to all special tokens. Parameter ``mask_token`` is used - for adding mask token to vocabulary or for modification of mask token present in special tokens dictionary - in the first line of file ``vocab_file``. Parameter ``mask_token`` can be either of type ``bool`` or a - ``str`` of length 1. - - If ``mask_token`` is ``bool`` it has to be ``False``. If ``mask_token`` is ``True`` an exception is raised. - If ``mask_token`` is ``False`` and ``mask_token`` is present in special tokens dictionary in vocabulary - file ``vocab_file``, then ``mask_token`` is remove from special tokens dictionary. - - If the parameter ``mask_token`` is a string, then such strings in the input sequence are interpreted as - mask tokens. - bos_token: the beginning of sequence token. See more in ``mask_token`` parameter description. - eos_token: the end of sequence token. Usually equal to sep_token. See more in ``mask_token`` parameter - description. - pad_token: token to use for padding. See more in ``mask_token`` parameter description. - sep_token: token used for separating sequences. See more in ``mask_token`` parameter description. - cls_token: class token. Usually equal to bos_token. See more in ``mask_token`` parameter description. - unk_token: token to use for unknown tokens. If the parameter ``unk_token`` is set and there is a character - in the input of ``text_to_ids`` of ``text_to_tokens`` methods which is not in the vocabulary, then - such an unknown character is tokenized into ``unk_token``. If the parameter ``unk_token`` is ``False``, - then unknown tokens are discarded. See more in ``mask_token`` parameter description. - special_token_to_prepend: special token to prepend to the output of ``text_to_ids`` of ``text_to_tokens`` - methods. This option can be used if you decide to add EOS and BOS tokens to the input on the stage of - tokenization. Possible options are: {[None] + [e.value for e in SpecialTokenString]}. - special_token_to_append: special token to append to the output of ``text_to_ids`` of ``text_to_tokens`` - methods. See more in the description of ``special_token_to_prepend`` parameter. 
- special_tokens_to_remove_while_decoding: which special tokens are remove before detokenization. If this - parameter equals ``'all'``, then all special tokens are removed. The parameter - ``special_tokens_to_remove_while_decoding`` can also be a list of values from this set - {set(e.value for e in SpecialTokenString)}. - """ - - def __init__( - self, - vocab_file: str, - mask_token: Optional[Union[str, bool]] = None, - bos_token: Optional[Union[str, bool]] = None, - eos_token: Optional[Union[str, bool]] = None, - pad_token: Optional[Union[str, bool]] = None, - sep_token: Optional[Union[str, bool]] = None, - cls_token: Optional[Union[str, bool]] = None, - unk_token: Optional[Union[str, bool]] = None, - special_token_to_prepend: Optional[SpecialTokenStringType] = None, - special_token_to_append: Optional[SpecialTokenStringType] = None, - special_tokens_to_remove_while_decoding: Union[List[SpecialTokenStringType], str] = 'all', - ): - vocab_file = Path(vocab_file).expanduser() - with vocab_file.open(encoding='utf-8') as f: - first_line = f.readline() - if first_line[0] == '{': - special_tokens_dict = json.loads(first_line) - self.check_special_tokens_dict_from_file(special_tokens_dict, vocab_file) - vocab_list = f.readlines() - else: - special_tokens_dict = {} - vocab_list = [first_line] + f.readlines() - special_tokens_dict = self.update_special_tokens_dict( - special_tokens_dict, mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token - ) - for e in SpecialTokenString: - name = e.value + '_token' - setattr(self, name, special_tokens_dict[name] if name in special_tokens_dict else None) - for k, v in special_tokens_dict.items(): - setattr(self, k, v) - for value, name in [ - (special_token_to_prepend, 'special_token_to_prepend'), - (special_token_to_append, 'special_token_to_append'), - ]: - self.check_special_token_name(name, value, special_tokens_dict) - setattr(self, name, value + '_token' if isinstance(value, str) else value) - self.vocab = {} - count = 0 - for v in special_tokens_dict.values(): - self.vocab[v] = count - count += 1 - for i, token in enumerate(vocab_list): - token = eval(token.strip()) - self.check_token_from_file(token, vocab_file, i) - if token not in self.vocab: - self.vocab[token] = count - count += 1 - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.vocab_size = len(self.vocab) - self.check_special_tokens_to_remove_while_decoding( - special_tokens_to_remove_while_decoding, special_tokens_dict - ) - self.special_token_ids_to_remove_while_decoding = ( - self.tokens_to_ids([v for v in special_tokens_dict.values()]) - if special_tokens_to_remove_while_decoding == 'all' - else [getattr(self, e + '_id') for e in special_tokens_to_remove_while_decoding] - ) - - @classmethod - def check_special_tokens_dict_from_file(cls, special_tokens_dict, vocab_file): - for k, v in special_tokens_dict.items(): - if k[-6:] != '_token' or not SpecialTokenString.has_value(k[:-6]): - raise ValueError( - f"Unsupported key {repr(k)} in special tokens dictionary in vocabulary file {vocab_file} " - f"(first line). Supported keys are {[e.value + '_token' for e in SpecialTokenString]}." - ) - if not isinstance(v, str): - raise ValueError( - f"Values of special tokens dictionary in vocabulary file {vocab_file} (first line) has to belong " - f"to type `str`, whereas type of item '{k}' value {repr(v)} is `{type(v)}`." 
- ) - elif len(v) == 0: - raise ValueError( - f"Values of special tokens dictionary in vocabulary file {vocab_file} (first line) has to not " - f"empty strings, whereas value of item '{k}' is an empty string." - ) - cls.check_special_tokens_dict_for_duplicate_values( - special_tokens_dict, f"Loaded from vocabulary file {vocab_file}" - ) - - @staticmethod - def check_special_tokens_dict_for_duplicate_values(special_tokens_dict, err_msg_prefix): - if len(special_tokens_dict) != len(set(special_tokens_dict.values())): - tokens_with_equal_values = [] - duplicate_values = [] - for k, v in list(reversed(list(special_tokens_dict.items())))[:-1]: - tokens = [k] - for kk, vv in special_tokens_dict.items(): - if kk == k: - break - if v == vv: - tokens.append(kk) - if len(tokens) > 1: - duplicate_values.append(v) - tokens_with_equal_values.append(tokens) - if duplicate_values: - dup_values_msg = '. '.join( - [f"Tokens {t} have value '{v}'" for t, v in zip(tokens_with_equal_values, duplicate_values)] - ) - raise ValueError( - err_msg_prefix + f" special tokens dictionary has duplicate values. " + dup_values_msg - ) - - @classmethod - def update_special_tokens_dict( - cls, - init_special_tokens_dict: Dict[str, str], - mask_token: Optional[Union[str, bool]] = None, - bos_token: Optional[Union[str, bool]] = None, - eos_token: Optional[Union[str, bool]] = None, - pad_token: Optional[Union[str, bool]] = None, - sep_token: Optional[Union[str, bool]] = None, - cls_token: Optional[Union[str, bool]] = None, - unk_token: Optional[Union[str, bool]] = None, - ): - special_tokens_dict = init_special_tokens_dict.copy() - for value, name in zip( - [pad_token, unk_token, bos_token, eos_token, sep_token, mask_token, cls_token], - ['pad_token', 'unk_token', 'bos_token', 'eos_token', 'sep_token', 'mask_token', 'cls_token'], - ): - if value is not None: - if isinstance(value, bool): - if value: - raise ValueError( - f"If `CharTokenizer` constructor parameter `{name}` is `bool` it has to be `False`" - ) - else: - if name in special_tokens_dict: - del special_tokens_dict[name] - else: - warnings.warn( - f"Cannot remove special token `{name}` since it is not in special tokens dictionary " - f"{special_tokens_dict}." - ) - elif not isinstance(value, str): - raise ValueError( - f"`CharTokenizer` constructor parameter `{name}` has to be either `False` or belong to type " - f"`str`, whereas type of `{name}` is `{type(value)}`." - ) - else: - special_tokens_dict[name] = value - cls.check_special_tokens_dict_for_duplicate_values( - special_tokens_dict, - "After updating special tokens dictionary with tokens passed in `CharTokenizer` constructor parameters", - ) - return special_tokens_dict - - @staticmethod - def check_token_from_file(token, vocab_file, line_i): - if not isinstance(token, str) or isinstance(token, str) and len(token) != 1: - raise ValueError( - f"Each line in vocabulary have to be a Python string literal containing 1 character. " - f"Encountered {repr(token)} on line {line_i} in file {vocab_file}." - ) - - @staticmethod - def check_special_token_name(parameter_name, value, special_tokens_dict): - if value is not None: - if not SpecialTokenString.has_value(value): - raise ValueError( - f"Value {repr(value)} of parameter `{parameter_name}` is wrong. Supported values are " - f"{[e.value for e in SpecialTokenString]}." 
- ) - elif value + '_token' not in special_tokens_dict: - raise ValueError( - f"You should provide `{value + '_token'}` parameter to `CharTokenizer` constructor if " - f"you wish to pass token {repr(value)} in parameter `{parameter_name}`." - ) - - @staticmethod - def check_special_tokens_to_remove_while_decoding(special_tokens_to_remove_while_decoding, special_tokens_dict): - if isinstance(special_tokens_to_remove_while_decoding, list): - for i, value in enumerate(special_tokens_to_remove_while_decoding): - if not SpecialTokenString.has_value(value): - raise ValueError( - f'Wrong element with value {repr(value)} in position {i} of parameter ' - f'`special_tokens_to_remove_while_decoding` of `CharTokenizer` constructor. Supported values ' - f'are {[e.value for e in SpecialTokenString]}.' - ) - elif value + '_token' not in special_tokens_dict: - raise ValueError( - f"You should provide `{value + '_token'}` parameter to `CharTokenizer` constructor if " - f"you wish to pass token {repr(value)} in parameter `special_tokens_to_remove_while_decoding`. " - f"`{value + '_token'}` was detected in position {i} in " - f"`special_tokens_to_remove_while_decoding`." - ) - elif ( - isinstance(special_tokens_to_remove_while_decoding, str) - and special_tokens_to_remove_while_decoding != 'all' - or not isinstance(special_tokens_to_remove_while_decoding, str) - ): - raise ValueError( - f"Parameter `special_tokens_to_remove_while_decoding` of `CharTokenizer` constructor has to be " - f"equal to a string 'all' or be a list of values from set {set(e.value for e in SpecialTokenString)} " - f"whereas `special_tokens_to_remove_while_decoding={repr(special_tokens_to_remove_while_decoding)}`" - ) - - def text_to_tokens(self, text: str) -> List[str]: - token_candidates = [char for char in text] - tokens = [] - if self.special_token_to_prepend is not None: - tokens.append(getattr(self, self.special_token_to_prepend)) - for i, token in enumerate(token_candidates): - if token in self.vocab: - tokens.append(token) - elif self.unk_token is not None: - tokens.append(self.unk_token) - else: - warnings.warn( - f"Character {repr(token)} in position {i} is not present in vocabulary and no `` token was " - f"set. Character {repr(token)} is discarded." - ) - if self.special_token_to_append is not None: - tokens.append(getattr(self, self.special_token_to_append)) - return tokens - - def tokens_to_text(self, tokens: List[str]) -> str: - return self.ids_to_text(self.tokens_to_ids(tokens)) - - def text_to_ids(self, text: str) -> List[int]: - ids = [self.vocab[token] for token in self.text_to_tokens(text)] - return ids - - def ids_to_text(self, ids: List[int]) -> str: - ids_ = [id_ for id_ in ids if id_ not in self.special_token_ids_to_remove_while_decoding] - return "".join(self.ids_to_tokens(ids_)) - - def tokens_to_ids(self, tokens: List[str]) -> List[int]: - return [self.vocab[token] for token in tokens] - - def token_to_id(self, token: str) -> int: - return self.vocab[token] - - def ids_to_tokens(self, ids: List[int]) -> List[str]: - return [self.inv_vocab[id] for id in ids] - - @staticmethod - def check_special_token_id_getting(special_token, id_name): - if special_token is None: - token_param = id_name[:-3] + '_token' - raise ValueError( - f"Cannot return `{id_name}` since `{token_param}` is not set. To obtain `{id_name}` you need to pass " - f"parameter `{token_param}` to `CharTokenizer` constructor." 
- ) - - @property - def pad_id(self): - self.check_special_token_id_getting(self.pad_token, 'pad_id') - return self.vocab[self.pad_token] - - @property - def bos_id(self): - self.check_special_token_id_getting(self.bos_token, 'bos_id') - return self.vocab[self.bos_token] - - @property - def eos_id(self): - self.check_special_token_id_getting(self.eos_token, 'eos_id') - return self.vocab[self.eos_token] - - @property - def unk_id(self): - self.check_special_token_id_getting(self.unk_token, 'unk_id') - return self.vocab[self.unk_token] - - @property - def mask_id(self): - self.check_special_token_id_getting(self.mask_token, 'mask_id') - return self.vocab[self.mask_token] - - @property - def sep_id(self): - self.check_special_token_id_getting(self.sep_token, 'sep_id') - return self.vocab[self.sep_token] - - @property - def cls_id(self): - self.check_special_token_id_getting(self.cls_token, 'cls_id') - return self.vocab[self.cls_token] - - @staticmethod - def create_special_tokens_dict( - mask_token: Optional[str] = None, - bos_token: Optional[str] = None, - eos_token: Optional[str] = None, - pad_token: Optional[str] = None, - sep_token: Optional[str] = None, - cls_token: Optional[str] = None, - unk_token: Optional[str] = None, - ): - special_tokens_dict = {} - for value, name in zip( - [pad_token, unk_token, bos_token, eos_token, sep_token, mask_token, cls_token], - ['pad_token', 'unk_token', 'bos_token', 'eos_token', 'sep_token', 'mask_token', 'cls_token'], - ): - if value is not None: - if not isinstance(value, str): - raise ValueError( - f"The type of parameter `{name}` has to be `None` or `str`, found `{type(value)}`" - ) - elif len(value) == 0: - raise ValueError(f"If the parameter `{name}` is `str`, then its length has to be nonzero.") - elif value in special_tokens_dict.values(): - other_name = None - for k, v in special_tokens_dict.items(): - if v == value: - other_name = k - raise ValueError( - f"The value {repr(value)} of special token `{name}` is the same as the value of special token " - f"`{other_name}`." - ) - special_tokens_dict[name] = value - return special_tokens_dict - - @staticmethod - def check_characters_to_exclude_from_vocabulary(characters_to_exclude_from_vocabulary): - for i, char in enumerate(characters_to_exclude_from_vocabulary): - if not isinstance(char, str): - raise ValueError( - f"Character to exclude from vocabulary has to `str`, whereas an element in position {i} is of " - f"type `{type(char)}`." - ) - elif len(char) != 1: - raise ValueError( - f"A length of an element of `characters_to_exclude_from_vocabulary` parameter has to be 1. " - f"The length of an element in position {i} is {len(char)}." - ) - - @staticmethod - def check_text_and_text_file_name(text, text_file_name): - if text is None and text_file_name is None: - raise ValueError( - f'Exactly one of parameters `text` and `text_file_name` should be provided whereas both parameters ' - f'are `None`.' - ) - if text is not None and text_file_name is not None: - raise ValueError( - f"Exactly one of parameters `text` and `text_file_name` has to be provided, whereas both parameters " - f"are not `None`." - ) - if text is not None: - if not isinstance(text, str): - raise ValueError( - f"Parameter `text` has to be of type `str`, whereas it belongs to type `{type(text)}`." 
- ) - - @classmethod - def build_vocab( - cls, - save_path: Union[str, bytes, os.PathLike], - text: Optional[str] = None, - text_file_name: Optional[Union[str, bytes, os.PathLike]] = None, - characters_to_exclude: Optional[List[str]] = None, - vocab_size: int = None, - mask_token: Optional[str] = None, - bos_token: Optional[str] = None, - eos_token: Optional[str] = None, - pad_token: Optional[str] = None, - sep_token: Optional[str] = None, - cls_token: Optional[str] = None, - unk_token: Optional[str] = None, - ): - """ - Creates character vocabulary and saves it to file ``save_path``. You should provide one of parameters ``text`` - and ``text_file_name``. The format of created character vocabulary file is following: - ``` - {['mask_token': "ANY NON EMPTY STRING", ]['bos_token': "ANY NON EMPTY STRING", ] and so on} - ' ' - 'e' - ... - ``` - The first line is a JSON which contains special tokens. This special token are set using parameters - ``mas_token``, ``bos_token``, ``eos_token``, ``pad_token``, ``sep_token``, ``cls_token``, ``unk_token``. - Other lines in created vocabulary file are Python string literals containing one character each. - - Args: - save_path: path to the output text file. If ``save_path`` parent directory does not exist it will be created - text: string which characters are used for vocabulary creation. - text_file_name: path to a file which characters are used for vocabulary creation. Use this parameter if - the text in file is too large to be loaded in memory. - characters_to_exclude: a list of characters which will not be added to vocabulary. - vocab_size: vocabulary size. If this parameter is set only most frequent ``vocab_size`` characters are added - to vocabulary. - mask_token: mask token - bos_token: the beginning of sequence token - eos_token: the end of sequence token. Usually equal to sep_token. - pad_token: token to use for padding. - sep_token: token used for separating sequences. - cls_token: class token. Usually equal to bos_token. - unk_token: token to use for unknown tokens. If the parameter ``unk_token`` is set and there is a character - in the input of ``text_to_ids`` of ``text_to_tokens`` methods which is not in the vocabulary, then - such an unknown character is tokenized into ``unk_token``. If the parameter ``unk_token`` is ``False``, - then unknown tokens are discarded. 
- """ - special_tokens_dict = cls.create_special_tokens_dict( - mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token - ) - if characters_to_exclude is None: - characters_to_exclude = [] - else: - cls.check_characters_to_exclude_from_vocabulary(characters_to_exclude) - cls.check_text_and_text_file_name(text, text_file_name) - if text is not None: - counter = Counter(text) - else: - assert text_file_name is not None - text_file_name = Path(text_file_name).expanduser() - counter = Counter() - with text_file_name.open(encoding='utf-8') as f: - while True: - segment = f.read(NUMBER_OF_CHARACTERS_READ_BUFFER_SIZE) - if not segment: - break - counter.update(segment) - for char in characters_to_exclude: - if char in counter: - del counter[char] - save_path = Path(save_path).expanduser() - save_path.parent.mkdir(exist_ok=True, parents=True) - with save_path.open('w', encoding='utf-8') as f: - f.write(json.dumps(special_tokens_dict) + '\n') - if vocab_size is None: - for c, _ in sorted(counter.items(), key=lambda x: -x[1]): - f.write(repr(c) + '\n') - else: - vocab_size -= len(special_tokens_dict) - for i, (c, _) in enumerate(sorted(counter.items(), key=lambda x: -x[1])): - if i < vocab_size: - f.write(repr(c) + '\n') - else: - break diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/chinese_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/chinese_tokenizers.py deleted file mode 100644 index a9923f0a4fd220ffba0ae475a67e4427d568d0cd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/chinese_tokenizers.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The MIT License (MIT) -# Copyright (c) 2016 The-Orizon -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - -# The detokenize function is based on : https://github.com/The-Orizon/nlputils/blob/master/detokenizer.py - -import re -from typing import List - -import jieba -import opencc -from pangu import spacing - - -class ChineseProcessor: - """ - Tokenizer, Detokenizer and Normalizer utilities for Chinese. - """ - - def __init__(self): - self.normalizer = opencc.OpenCC('t2s.json') - - def normalize(self, text: str) -> str: - return self.normalizer.convert(text) - - def detokenize(self, text: List[str]) -> str: - RE_WS_IN_FW = re.compile( - r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])' - ) - - detokenize = lambda s: spacing(RE_WS_IN_FW.sub(r'\1', s)).strip() - return detokenize(' '.join(text)) - - def tokenize(self, text: str) -> str: - text = jieba.cut(text) - return ' '.join(text) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/column_coder.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/column_coder.py deleted file mode 100644 index bf1ab2f7accb8e5b53576fe4c76ca1454e2def2a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/column_coder.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Dict, List, Tuple - -import numpy as np -from numpy import ndarray -from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler - -from nemo.utils import logging - -__all__ = ["IntCode", "FloatCode", "CategoryCode", "ColumnCodes"] - - -class Code(object): - def compute_code(self, data_series: ndarray): - """ - @params: - data_series: an array of input data used to calculate mapping - """ - raise NotImplementedError() - - def __init__(self, col_name: str, code_len: int, start_id: int, fillall: bool = True, hasnan: bool = True): - """ - @params: - col_name: name of the column - code_len: number of tokens used to code the column. - start_id: offset for token_id. - fillall: if True, reserve space for digit number even the digit number is - not present in the data_series. Otherwise, only reserve space for the numbers - in the data_series. 
- hasnan: if True, reserve space for nan - """ - self.name = col_name - self.code_len = code_len - self.start_id = start_id - self.end_id = start_id - self.fillall = fillall - self.hasnan = hasnan - - def encode(self, item: str) -> List[int]: - raise NotImplementedError() - - def decode(self, ids: List[int]) -> str: - raise NotImplementedError() - - @property - def code_range(self) -> List[Tuple[int, int]]: - """ - get the vocab id range for each of the encoded tokens - @returns [(min, max), (min, max), ...] - """ - return [(self.start_id, self.end_id)] - - -class IntCode(Code): - def __init__( - self, col_name: str, code_len: int, start_id: int, fillall: bool = True, base: int = 100, hasnan: bool = True - ): - super().__init__(col_name, code_len, start_id, fillall, hasnan) - self.base = base - self.int_min: int = None - - def compute_code(self, data_series: ndarray): - significant_val = self.array_convert_to_int(data_series) - - digits_id_to_item = [{} for _ in range(self.code_len)] - digits_item_to_id = [{} for _ in range(self.code_len)] - for i in range(self.code_len): - id_to_item = digits_id_to_item[i] - item_to_id = digits_item_to_id[i] - v = (significant_val // self.base ** i) % self.base - if self.fillall: - uniq_items = range(0, self.base) - else: - uniq_items = sorted(np.unique(v).tolist()) - for k in range(len(uniq_items)): - item = str(uniq_items[k]) - item_to_id[item] = self.end_id - id_to_item[self.end_id] = item - self.end_id += 1 - self.digits_id_to_item = digits_id_to_item - self.digits_item_to_id = digits_item_to_id - self.NA_token = 'nan' - if self.hasnan: - self.end_id += 1 # add the N/A token - codes = [] - ranges = self.code_range - for i in ranges: - codes.append(i[1] - 1) - self.NA_token_id = codes - - def array_convert_to_int(self, val: ndarray): - val = val.astype(int) - self.int_min = val.min() - return val - self.int_min - - def convert_to_int(self, val: float) -> int: - return int(val) - self.int_min - - def reverse_convert_to_int(self, val: int) -> int: - return val + self.int_min - - @property - def code_range(self) -> List[Tuple[int, int]]: - """ - get the vocab id range for each of the encoded tokens - @returns [(min, max), (min, max), ...] 
- """ - # first largest digits - outputs = [] - c = 0 - for i in reversed(range(self.code_len)): - ids = self.digits_id_to_item[i].keys() - if c == 0: - if self.hasnan: - outputs.append((min(ids), max(ids) + 2)) # the first token contains the N/A - else: - outputs.append((min(ids), max(ids) + 1)) # non N/A - else: - outputs.append((min(ids), max(ids) + 1)) - c += 1 - return outputs - - def encode(self, item: str) -> List[int]: - if self.hasnan and item == self.NA_token: - return self.NA_token_id - elif not self.hasnan and item == self.NA_token: - raise ValueError(f"colum {self.name} cannot handle nan, please set hasnan=True") - val = float(item) - val_int = self.convert_to_int(val) - digits = [] - for i in range(self.code_len): - digit = (val_int // self.base ** i) % self.base - digits.append(str(digit)) - if (val_int // self.base ** self.code_len) != 0: - raise ValueError("not right length") - codes = [] - for i in reversed(range(self.code_len)): - digit_str = digits[i] - if digit_str in self.digits_item_to_id[i]: - codes.append(self.digits_item_to_id[i][digit_str]) - else: - # find the nearest encode id - allowed_digits = np.array([int(d) for d in self.digits_item_to_id[i].keys()]) - near_id = np.argmin(np.abs(allowed_digits - int(digit_str))) - digit_str = str(allowed_digits[near_id]) - codes.append(self.digits_item_to_id[i][digit_str]) - logging.warning('out of domain num is encounterd, use nearest code') - return codes - - def decode(self, ids: List[int]) -> str: - if self.hasnan and ids[0] == self.NA_token_id[0]: - return self.NA_token - v = 0 - for i in reversed(range(self.code_len)): - digit = int(self.digits_id_to_item[i][ids[self.code_len - i - 1]]) - v += digit * self.base ** i - v = self.reverse_convert_to_int(v) - return str(v) - - -class FloatCode(IntCode): - def __init__( - self, - col_name: str, - code_len: int, - start_id: int, - fillall: bool = True, - base: int = 100, - hasnan: bool = True, - transform: str = 'quantile', - ): - super().__init__(col_name, code_len, start_id, fillall, base, hasnan) - if transform == 'yeo-johnson': - self.scaler = PowerTransformer(standardize=True) - elif transform == 'quantile': - self.scaler = QuantileTransformer(output_distribution='uniform', n_quantiles=100) - elif transform == 'robust': - self.scaler = RobustScaler() - else: - raise ValueError('Supported data transformations are "yeo-johnson", "quantile", and "robust"') - - def convert_to_int(self, val: float) -> int: - val = np.expand_dims(np.array(val), axis=0) - values = self.scaler.transform(val[:, None])[:, 0] - self.mval - values = (values * self.base ** self.extra_digits).astype(int) - output = values[0] - return output - - def array_convert_to_int(self, val: ndarray): - values = self.scaler.fit_transform(val[:, None])[:, 0] - self.mval = values.min() - values = values - self.mval - digits = int(math.log(values.max(), self.base)) + 1 - # extra digits used for 'float' part of the number - extra_digits = self.code_len - digits - if extra_digits < 0: - raise ValueError("need large length to code the nummber") - self.extra_digits = extra_digits - values = (values * self.base ** self.extra_digits).astype(int) - return values - - def reverse_convert_to_int(self, val: int) -> float: - val = val / self.base ** self.extra_digits - val = np.expand_dims(np.array(val), axis=0) - v = self.scaler.inverse_transform(val[:, None] + self.mval)[0, 0] - return v - - def decode(self, ids: List[int]) -> str: - if self.hasnan and ids[0] == self.NA_token_id[0]: - return self.NA_token - v = 0 - for i in 
reversed(range(self.code_len)): - digit = int(self.digits_id_to_item[i][ids[self.code_len - i - 1]]) - v += digit * self.base ** i - v = self.reverse_convert_to_int(v) - accuracy = max(int(abs(np.log10(0.1 / self.base ** self.extra_digits))), 1) - return f"{v:.{accuracy}f}" - - -class CategoryCode(Code): - def __init__(self, col_name: str, start_id: int): - super().__init__(col_name, 1, start_id, True, False) - - def compute_code(self, data_series: ndarray): - uniq_items = np.unique(data_series).tolist() - id_to_item = {} - item_to_id = {} - for i in range(len(uniq_items)): - item = str(uniq_items[i]) - item_to_id[item] = self.end_id - id_to_item[self.end_id] = item - self.end_id += 1 - self.id_to_item = id_to_item - self.item_to_id = item_to_id - - def encode(self, item) -> List[int]: - return [self.item_to_id[item]] - - def decode(self, ids: List[int]) -> str: - return self.id_to_item[ids[0]] - - -column_map = {"int": IntCode, "float": FloatCode, "category": CategoryCode} - - -class ColumnCodes(object): - def __init__(self): - self.column_codes: Dict[str, Code] = {} - self.columns = [] - self.sizes = [] - - @property - def vocab_size(self): - return self.column_codes[self.columns[-1]].end_id - - def register(self, name: str, ccode: Code): - self.columns.append(name) - self.column_codes[name] = ccode - self.sizes.append(ccode.code_len) - - def encode(self, col: str, item: str) -> List[int]: - if col in self.column_codes: - return self.column_codes[col].encode(item) - else: - raise ValueError(f"cannot encode {col} {item}") - - def decode(self, col: str, ids: List[int]) -> str: - if col in self.column_codes: - return self.column_codes[col].decode(ids) - else: - raise ValueError("cannot decode") - - def get_range(self, column_id: int) -> List[Tuple[int, int]]: - return self.column_codes[self.columns[column_id]].code_range - - @classmethod - def get_column_codes(cls, column_configs, example_arrays): - column_codes = cls() - beg = 0 - cc = None - for config in column_configs: - col_name = config['name'] - coder = column_map[config['code_type']] - args = config.get('args', {}) - start_id = beg if cc is None else cc.end_id - args['start_id'] = start_id - args['col_name'] = col_name - cc = coder(**args) - cc.compute_code(example_arrays[col_name]) - column_codes.register(col_name, cc) - return column_codes diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/en_ja_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/en_ja_tokenizers.py deleted file mode 100644 index cf58130834e9f0f05028892821ca243347df2afe..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/en_ja_tokenizers.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
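The column coders above turn one numeric value into a fixed number of base-100 digit tokens, each digit position drawing on its own disjoint id block. A toy sketch with assumed `BASE`, `CODE_LEN`, `INT_MIN`, and `START_ID` values (the deleted classes derive these from the training data):

```python
# Toy positional coding of an integer column value: shift it to be non-negative,
# split it into base-100 digits, and give each digit position its own id block.
BASE = 100
CODE_LEN = 2
INT_MIN = -50    # assumed column minimum; derived from the data in the real coder
START_ID = 1000  # assumed vocabulary offset for this column


def encode_int(value: int) -> list:
    shifted = value - INT_MIN
    digits = [(shifted // BASE ** i) % BASE for i in range(CODE_LEN)]  # least-significant first
    # emit most-significant digit first; offset each position into its own block
    return [START_ID + pos * BASE + digits[CODE_LEN - 1 - pos] for pos in range(CODE_LEN)]


def decode_int(token_ids: list) -> int:
    shifted = 0
    for pos, tid in enumerate(token_ids):
        digit = (tid - START_ID) - pos * BASE
        shifted += digit * BASE ** (CODE_LEN - 1 - pos)
    return shifted + INT_MIN


if __name__ == "__main__":
    ids = encode_int(173)
    print(ids)              # [1002, 1123]: digits (2, 23) of 173 - (-50) = 223
    print(decode_int(ids))  # 173
```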
-import re -from typing import List - -from pangu import spacing -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - -try: - import ipadic - import MeCab - - HAVE_MECAB = True - HAVE_IPADIC = True -except (ImportError, ModuleNotFoundError): - HAVE_MECAB = False - HAVE_IPADIC = False - - -class EnJaProcessor: - """ - Tokenizer, Detokenizer and Normalizer utilities for Japanese & English - Args: - lang_id: One of ['en', 'ja']. - """ - - def __init__(self, lang_id: str): - self.lang_id = lang_id - self.moses_tokenizer = MosesTokenizer(lang=lang_id) - self.moses_detokenizer = MosesDetokenizer(lang=lang_id) - self.normalizer = MosesPunctNormalizer( - lang=lang_id, pre_replace_unicode_punct=True, post_remove_control_chars=True - ) - - def detokenize(self, tokens: List[str]) -> str: - """ - Detokenizes a list of tokens - Args: - tokens: list of strings as tokens - Returns: - detokenized Japanese or English string - """ - return self.moses_detokenizer.detokenize(tokens) - - def tokenize(self, text) -> str: - """ - Tokenizes text using Moses. Returns a string of tokens. - """ - tokens = self.moses_tokenizer.tokenize(text) - return ' '.join(tokens) - - def normalize(self, text) -> str: - # Normalization doesn't handle Japanese periods correctly; - # '。'becomes '.'. - if self.lang_id == 'en': - return self.normalizer.normalize(text) - else: - return text - - -class JaMecabProcessor: - """ - Tokenizer, Detokenizer and Normalizer utilities for Japanese MeCab & English - """ - - def __init__(self): - if not HAVE_MECAB or not HAVE_IPADIC: - raise ImportError("Please ensure that you have installed `MeCab` and `ipadic` to use JaMecabProcessor") - - self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati") - - def detokenize(self, text: List[str]) -> str: - RE_WS_IN_FW = re.compile( - r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])' - ) - - detokenize = lambda s: spacing(RE_WS_IN_FW.sub(r'\1', s)).strip() - return detokenize(' '.join(text)) - - def tokenize(self, text) -> str: - """ - Tokenizes text using Moses. Returns a string of tokens. - """ - return self.mecab_tokenizer.parse(text).strip() - - def normalize(self, text) -> str: - return text diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/fairseq_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/fairseq_tokenizer.py deleted file mode 100644 index aa30bacd8f812556f95b0f5d3f5fd35f44bcd0fe..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/fairseq_tokenizer.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. 
-# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. - -""" Code from -https://github.com/NVIDIA/DeepLearningExamples/blob/ -master/PyTorch/Translation/Transformer/fairseq/tokenizer.py -""" - -import re -import sys -import unicodedata -from collections import defaultdict - -__all__ = ['get_unicode_categories', 'tokenize_en'] - - -def get_unicode_categories(): - cats = defaultdict(list) - for c in map(chr, range(sys.maxunicode + 1)): - cats[unicodedata.category(c)].append(c) - return cats - - -NUMERICS = ''.join(get_unicode_categories()['No']) - - -def tokenize_en(line): - line = line.strip() - line = ' ' + line + ' ' - # remove ASCII junk - line = re.sub(r'\s+', ' ', line) - line = re.sub(r'[\x00-\x1F]', '', line) - # fix whitespaces - line = re.sub(r'\ +', ' ', line) - line = re.sub('^ ', '', line) - line = re.sub(' $', '', line) - # separate other special characters - line = re.sub(r'([^\s\.\'\`\,\-\w]|[_' + NUMERICS + '])', r' \g<1> ', line) - line = re.sub(r'(\w)\-(?=\w)', r'\g<1> @-@ ', line) - - # multidots stay together - line = re.sub(r'\.([\.]+)', r' DOTMULTI\g<1>', line) - while re.search(r'DOTMULTI\.', line): - line = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \g<1>', line) - line = re.sub(r'DOTMULTI\.', r'DOTDOTMULTI', line) - - # separate out "," except if within numbers (5,300) - line = re.sub(r'([\D])[,]', r'\g<1> , ', line) - line = re.sub(r'[,]([\D])', r' , \g<1>', line) - - # separate "," after a number if it's the end of sentence - line = re.sub(r'(\d)[,]$', r'\g<1> ,', line) - - # split contractions right - line = re.sub(r'([\W\d])[\']([\W\d])', r'\g<1> \' \g<2>', line) - line = re.sub(r'(\W)[\']([\w\D])', r'\g<1> \' \g<2>', line) - line = re.sub(r'([\w\D])[\']([\W\d])', r'\g<1> \' \g<2>', line) - line = re.sub(r'([\w\D])[\']([\w\D])', r'\g<1> \'\g<2>', line) - # special case for "1990's" - line = re.sub(r'([\W\d])[\']([s])', r'\g<1> \'\g<2>', line) - - # apply nonbreaking prefixes - words = line.split() - line = '' - for i in range(len(words)): - word = words[i] - match = re.search(r'^(\S+)\.$', word) - if match: - pre = match.group(1) - if i == len(words) - 1: - """split last words independently as they are unlikely - to be non-breaking prefixes""" - word = pre + ' .' - else: - word = pre + ' .' - - word += ' ' - line += word - - # clean up extraneous spaces - line = re.sub(' +', ' ', line) - line = re.sub('^ ', '', line) - line = re.sub(' $', '', line) - - # .' at end of sentence is missed - line = re.sub(r'\.\' ?$', ' . 
\' ', line) - - # restore multi-dots - while re.search('DOTDOTMULTI', line): - line = re.sub('DOTDOTMULTI', 'DOTMULTI.', line) - - line = re.sub('DOTMULTI', '.', line) - - # escape special characters - line = re.sub(r'\&', r'&', line) - line = re.sub(r'\|', r'|', line) - line = re.sub(r'\<', r'<', line) - line = re.sub(r'\>', r'>', line) - line = re.sub(r'\'', r''', line) - line = re.sub(r'\"', r'"', line) - line = re.sub(r'\[', r'[', line) - line = re.sub(r'\]', r']', line) - - # ensure final line breaks - # if line[-1] is not '\n': - # line += '\n' - - return line diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__init__.py deleted file mode 100644 index 7231c7949a8169bb5e1e1c3d639d000b874a3c99..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/__init__.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8b51c666754ddbfd17aa5ff2116f6584bbb18307..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/auto_tokenizer.cpython-310.pyc b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/auto_tokenizer.cpython-310.pyc deleted file mode 100644 index 120ddf6f4a943ba2563d5e0d184ac611c24c75cc..0000000000000000000000000000000000000000 Binary files a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/__pycache__/auto_tokenizer.cpython-310.pyc and /dev/null differ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py deleted file mode 100644 index e6e5840d93b9b83dde82e0303f5cf1d18df1e31a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
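A small sketch (not in the original diff) of the `tokenize_en` helper completed above. Note that the "escape special characters" substitutions near the end of the function appear to have lost their HTML-entity replacement strings (e.g. `&amp;`, `&lt;`) in this copy, so as written they are effectively no-ops; the example below relies only on the splitting behaviour.

```
from nemo.collections.common.tokenizers.fairseq_tokenizer import tokenize_en

line = tokenize_en("It's a state-of-the-art system, isn't it?")
print(line)
# Roughly: apostrophes are split off, hyphens inside words become " @-@ ",
# and punctuation is separated by spaces.
```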
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from transformers import AutoTokenizer as AUTOTOKENIZER - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - -__all__ = [ - 'AutoTokenizer', -] - - -class AutoTokenizer(TokenizerSpec): - ''' - Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer. - ''' - - def __init__( - self, - pretrained_model_name: str, - vocab_file: Optional[str] = None, - merges_file: Optional[str] = None, - mask_token: Optional[str] = None, - bos_token: Optional[str] = None, - eos_token: Optional[str] = None, - pad_token: Optional[str] = None, - sep_token: Optional[str] = None, - cls_token: Optional[str] = None, - unk_token: Optional[str] = None, - use_fast: Optional[bool] = False, - ): - - """ - Args: - pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument. - For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained. - The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP - vocab_file: path to file with vocabulary which consists - of characters separated by '\n'. - mask_token: mask token - bos_token: the beginning of sequence token - eos_token: the end of sequence token. Usually equal to sep_token - pad_token: token to use for padding - sep_token: token used for separating sequences - cls_token: class token. Usually equal to bos_token - unk_token: token to use for unknown tokens - use_fast: whether to use fast HuggingFace tokenizer - """ - try: - # this logic deals with different huggingface tokenizers having different positional args - if vocab_file is None: - self.tokenizer = AUTOTOKENIZER.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name, use_fast=use_fast, - ) - elif merges_file is None: - self.tokenizer = AUTOTOKENIZER.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name, vocab_file=vocab_file, use_fast=use_fast, - ) - else: - self.tokenizer = AUTOTOKENIZER.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name, - vocab_file=vocab_file, - merges_file=merges_file, - use_fast=use_fast, - ) - except Exception as e: - raise ValueError( - f'Unable to instantiate HuggingFace AUTOTOKENIZER for {pretrained_model_name}. 
Exception: {e}' - ) - - self.original_vocab_size = len(self.tokenizer) - special_tokens_dict = {} - - # # setting special tokens, by default the default model's special tokens will be preserved - # # unless passes new values to the special tokens - if unk_token is not None: - special_tokens_dict["unk_token"] = unk_token - if mask_token is not None: - special_tokens_dict["mask_token"] = mask_token - if pad_token is not None: - special_tokens_dict["pad_token"] = pad_token - - # if the model does not have eos_token but has sep_token, - # set eos_token = sep_token, and vice versa - if sep_token is not None: - special_tokens_dict["sep_token"] = sep_token - elif self.tokenizer.sep_token is None and self.tokenizer.eos_token: - special_tokens_dict["sep_token"] = self.tokenizer.eos_token - if eos_token is not None: - special_tokens_dict["eos_token"] = eos_token - elif self.tokenizer.eos_token is None and self.tokenizer.sep_token: - special_tokens_dict["eos_token"] = self.tokenizer.sep_token - - # if the model does not have bos_token but has cls_token, - # set bos_token = cls_token, and vice versa - if bos_token is not None: - special_tokens_dict["bos_token"] = bos_token - elif self.tokenizer.bos_token is None and self.tokenizer.cls_token: - special_tokens_dict["bos_token"] = self.tokenizer.cls_token - if cls_token is not None: - special_tokens_dict["cls_token"] = cls_token - elif self.tokenizer.cls_token is None and self.tokenizer.bos_token: - special_tokens_dict["cls_token"] = self.tokenizer.bos_token - - new_tokens_in_vocab = [] - for token in [mask_token, bos_token, eos_token, pad_token, sep_token, cls_token, unk_token]: - if token is not None and token not in self.tokenizer.get_vocab(): - new_tokens_in_vocab.append(token) - - if len(new_tokens_in_vocab) > 0: - """ - Special tokens that were not previously included in the tokenizer's vocabulary file will be added to - the vocabulary and, as a result, the model should be resized, for example: - - # define your model - pretrained_model_name = 'roberta-base' - model = nemo_nlp.modules.get_lm_model(pretrained_model_name=pretrained_model_name) - - # define pretrained tokenizer - tokenizer_default = nemo_nlp.modules.get_tokenizer(tokenizer_name=pretrained_model_name) - - special_tokens = {'bos_token': '', - 'cls_token': '', - 'additional_special_tokens': ['', '']} - tokenizer_default.add_special_tokens(special_tokens_dict=special_tokens) - - # resize your model so that the embeddings for newly added tokens are updated during training/finetuning - model.resize_token_embeddings(tokenizer_default.vocab_size) - - See NLP_Tokenizers.ipynb for more details. - """ - logging.warning( - f'{new_tokens_in_vocab} \n will be added to the vocabulary.\n' - f'Please resize your model accordingly, ' - f'see NLP_Tokenizers.ipynb for more details.' - ) - self.add_special_tokens(special_tokens_dict) - self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y') - - @property - def vocab_size(self): - return len(self.tokenizer) - - def add_special_tokens(self, special_tokens_dict: dict) -> int: - """ - Adds a dictionary of special tokens (eos, pad, cls...). If special tokens are NOT in the vocabulary, they are added - to it (indexed starting from the last index of the current vocabulary). - Args: - special_tokens_dict: dict of string. 
Keys should be in the list of predefined special attributes: - [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, - ``additional_special_tokens``]. - Tokens are only added if they are not already in the vocabulary. - Returns: - Number of tokens added to the vocabulary. - """ - num_tokens_added = self.tokenizer.add_special_tokens(special_tokens_dict) - - if num_tokens_added > 0: - logging.info(f'{num_tokens_added} special tokens added, resize your model accordingly.') - for k in self.tokenizer.SPECIAL_TOKENS_ATTRIBUTES: - setattr(self, k, getattr(self.tokenizer, k, None)) - return num_tokens_added - - @property - def additional_special_tokens_ids(self): - """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5.""" - return [self.token_to_id(token) for token in self.additional_special_tokens] - - def text_to_tokens(self, text): - tokens = self.tokenizer.tokenize(text) - return tokens - - def tokens_to_text(self, tokens): - text = self.tokenizer.convert_tokens_to_string(tokens) - return text - - def token_to_id(self, token): - return self.tokens_to_ids([token])[0] - - def tokens_to_ids(self, tokens): - ids = self.tokenizer.convert_tokens_to_ids(tokens) - return ids - - def ids_to_tokens(self, ids): - tokens = self.tokenizer.convert_ids_to_tokens(ids) - return tokens - - def text_to_ids(self, text): - tokens = self.text_to_tokens(text) - ids = self.tokens_to_ids(tokens) - return ids - - def ids_to_text(self, ids): - tokens = self.ids_to_tokens(ids) - tokens_clean = [t for t in tokens if t not in self.tokenizer.all_special_tokens] - text = self.tokens_to_text(tokens_clean) - return text - - @property - def vocab(self): - id2vocab = {v: k for k, v in self.tokenizer.vocab.items()} - return [id2vocab[i] for i in range(len(id2vocab))] - - @property - def pad_id(self): - return self.tokens_to_ids([getattr(self, 'pad_token')])[0] - - @property - def bos_id(self): - return self.tokens_to_ids([getattr(self, 'bos_token')])[0] - - @property - def eos_id(self): - return self.tokens_to_ids([getattr(self, 'eos_token')])[0] - - @property - def sep_id(self): - return self.tokens_to_ids([getattr(self, 'sep_token')])[0] - - @property - def cls_id(self): - return self.tokens_to_ids([getattr(self, 'cls_token')])[0] - - @property - def unk_id(self): - return self.tokens_to_ids([getattr(self, 'unk_token')])[0] - - @property - def mask_id(self): - return self.tokens_to_ids([getattr(self, 'mask_token')])[0] - - @property - def name(self): - return type(self.tokenizer).__name__ - - def save_vocabulary(self, save_directory: str, filename_prefix: str = None): - """Saves tokenizer's vocabulary and other artifacts to the specified directory""" - return self.tokenizer.save_vocabulary(save_directory=save_directory, filename_prefix=filename_prefix) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/indic_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/indic_tokenizers.py deleted file mode 100644 index 3b9192c8885b8cc6ac07cc68f18d324ac4503412..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/indic_tokenizers.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
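A hedged usage sketch (not in the original diff) for the `AutoTokenizer` wrapper above. It assumes the `transformers` package is available and uses `bert-base-uncased` purely as an illustrative checkpoint name.

```
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer

tok = AutoTokenizer(pretrained_model_name="bert-base-uncased")

ids = tok.text_to_ids("speaker identification")
print(tok.ids_to_text(ids))                # special tokens are stripped on decode
print(tok.pad_id, tok.cls_id, tok.sep_id)  # resolved via the wrapped tokenizer's special tokens
print(tok.vocab_size)                      # len(tokenizer), including any added specials
```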
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - - -class IndicProcessor: - """ - Tokenizer, Detokenizer and Normalizer utilities in Indic Languages. - Currently supports: 'hi' - """ - - def __init__(self, lang_id: str): - if lang_id != 'hi': - raise NotImplementedError - self.moses_tokenizer = MosesTokenizer(lang=lang_id) - self.moses_detokenizer = MosesDetokenizer(lang=lang_id) - self.normalizer = MosesPunctNormalizer(lang=lang_id) - - def detokenize(self, tokens: List[str]) -> str: - """ - Detokenizes a list of tokens - Args: - tokens: list of strings as tokens - Returns: - detokenized string - """ - return self.moses_detokenizer.detokenize(tokens) - - def tokenize(self, text: str): - return text - - def normalize(self, text: str): - return text diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/moses_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/moses_tokenizers.py deleted file mode 100644 index 27e91e6c5262490842365f7b07728f61ea5bb8e6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/moses_tokenizers.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer - - -class MosesProcessor: - """ - Tokenizer, Detokenizer and Normalizer utilities in Moses - """ - - def __init__(self, lang_id: str): - self.moses_tokenizer = MosesTokenizer(lang=lang_id) - self.moses_detokenizer = MosesDetokenizer(lang=lang_id) - self.normalizer = MosesPunctNormalizer(lang=lang_id) - - def detokenize(self, tokens: List[str]) -> str: - """ - Detokenizes a list of tokens - Args: - tokens: list of strings as tokens - Returns: - detokenized string - """ - return self.moses_detokenizer.detokenize(tokens) - - def tokenize(self, text: str): - """ - Tokenizes text using Moses -> Sentencepiece. 
- """ - return self.moses_tokenizer.tokenize(text, escape=False, return_str=True) - - def normalize(self, text: str): - return self.normalizer.normalize(text) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/regex_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/regex_tokenizer.py deleted file mode 100644 index 07e278026e03c90d7483bba1d1482ae497ef63e1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/regex_tokenizer.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import re -from typing import Optional - -import pandas as pd - -from nemo.collections.common.tokenizers.char_tokenizer import TokenizerSpec -from nemo.utils import logging - -__all__ = ['RegExTokenizer'] - -DEFAULT_MASK_TOKEN = '' -DEFAULT_BOS_TOKEN = '^' -DEFAULT_EOS_TOKEN = '&' -DEFAULT_PAD_TOKEN = '' -DEFAULT_SEP_TOKEN = '' -DEFAULT_UNK_TOKEN = '?' - - -class RegExTokenizer(TokenizerSpec): - """ - A regular expression-based tokenizer at word boundary. - This tokenizer default to support MegaMolBART. - - """ - - def __init__( - self, - regex: Optional[str] = "", - mask_token: Optional[str] = DEFAULT_MASK_TOKEN, - bos_token: Optional[str] = DEFAULT_BOS_TOKEN, - eos_token: Optional[str] = DEFAULT_EOS_TOKEN, - pad_token: Optional[str] = DEFAULT_PAD_TOKEN, - sep_token: Optional[str] = DEFAULT_SEP_TOKEN, - unk_token: Optional[str] = DEFAULT_UNK_TOKEN, - ): - """ - Args: - regex: regular expression that defined tokenization rules - mask_token: mask token - bos_token: the beginning of sequence token - eos_token: the end of sequence token. Usually equal to sep_token - pad_token: token to use for padding - sep_token: token used for separating sequences - cls_token: class token. 
Usually equal to bos_token - unk_token: token to use for unknown tokens - """ - self.regex = regex - self.mask_token = mask_token - self.bos_token = bos_token - self.eos_token = eos_token - self.pad_token = pad_token - self.sep_token = sep_token - self.unk_token = unk_token - - # holds names of .model/.vocab files - self.regex_file = None - self.vocab_file = None - - # initialize with default vocab - self.vocab = { - self.pad_token: 0, # pad_token - self.unk_token: 1, # unk_token - self.bos_token: 2, # begin_token - self.eos_token: 3, # end_token - self.mask_token: 4, # mask_token - self.sep_token: 5, # sep_token - } - self._update_cache() - - # Computed attributes - self._compile_regex() - - def _update_cache(self): - # Cache data/attributes required for tokenization - self._unk_id = self.vocab.get(self.unk_token, DEFAULT_UNK_TOKEN) - self._decode_vocab = {i: t for t, i in self.vocab.items()} - - def _compile_regex(self): - regex_string = r"(" - regex_string += self.regex + r"|" - regex_string += r".)" - self._compiled_regex = re.compile(regex_string) - - @property - def vocab_size(self): - return len(self.vocab) - - def text_to_tokens(self, text): - tokens = self._compiled_regex.findall(text) - - return tokens - - def tokens_to_text(self, tokens): - tokens_list = [] - for token in tokens: - if token[0] == self.bos_token: - token = token[1:] - - # Remove end token and the following values - if self.eos_token in token: - eos_idx = token.index(self.eos_token) - token = token[:eos_idx] - - tokens_list.append(token) - - text = ["".join(tokens) for tokens in tokens_list] - return text - - def token_to_ids(self, tokens): - ids_list = [] - for token in tokens: - ids_list.append(self.vocab.get(token, self._unk_id)) - return ids_list - - def tokens_to_ids(self, token_data): - if isinstance(token_data, str): - token_data = [token_data] - - ids_list = [] - for tokens in token_data: - ids = self.token_to_ids(tokens) - ids_list.append(ids) - return ids_list - - def ids_to_tokens(self, ids_list): - if len(ids_list) and not isinstance(ids_list[0], list): - ids_list = [ids_list] - added_list = True - else: - added_list = False - - tokens_list = [] - for ids in ids_list: - tokens = [] - for token_id in ids: - token = self._decode_vocab.get(token_id) - if token is None: - raise ValueError(f"Token id {token_id} is not recognised") - tokens.append(token) - - tokens_list.append(tokens) - - if added_list: - return tokens_list[0] - else: - return tokens_list - - def text_to_ids(self, text): - tokens = self.text_to_tokens(text) - tokens = [tokens] - return self.tokens_to_ids(tokens)[0] - - def ids_to_text(self, ids): - tokens = self.ids_to_tokens(ids) - return self.tokens_to_text(tokens) - - @property - def pad_id(self): - return 0 - - @property - def unk_id(self): - return 1 - - @property - def bos_id(self): - return 2 - - @property - def eos_id(self): - return 3 - - @property - def mask_id(self): - return 4 - - @property - def sep_id(self): - return 5 - - def _get_regex_vocab_files(self, regex_file=None, vocab_file=None): - """ - Infers files or update if given. 
- """ - regex_file = regex_file or self.regex_file - if not regex_file: - raise ValueError(f"regex_file must be specified") - - vocab_file = vocab_file or self.vocab_file - # try to infer vocab_file from regex_file - if not vocab_file: - vocab_file = os.path.splitext(regex_file)[0] + '.vocab' - - self.regex_file = regex_file - self.vocab_file = vocab_file - - return regex_file, vocab_file - - def save_tokenizer(self, regex_file=None, vocab_file=None): - """ - Saves tokenizer's regex and vocab files - """ - regex_file, vocab_file = self._get_regex_vocab_files(regex_file=regex_file, vocab_file=vocab_file) - - logging.info(f"Saving vocabulary to file = {vocab_file}") - with open(vocab_file, 'w') as fp: - for token in self.vocab: - fp.write(f"{token[0]}\n") - - logging.info(f"Saving regex to file = {regex_file}") - with open(regex_file, 'w') as f: - f.write(self.regex) - - def load_tokenizer(self, regex_file=None, vocab_file=None): - """ - Loads tokenizer's regex and vocab files - """ - regex_file, vocab_file = self._get_regex_vocab_files(regex_file=regex_file, vocab_file=vocab_file) - - # load vocab file - # vocab_file: path to file with vocabulary which consists - # of characters separated by \n (None/"" for empty vocab) - - logging.info(f"Loading vocabulary from file = {vocab_file}") - if os.path.exists(vocab_file): - vocab = {} - with open(vocab_file, "r") as f: - for line in f: - line = line.strip() - if line: - vocab[line] = len(vocab) - self.vocab = vocab - else: - raise RuntimeError(f"Missing vocab_file = {vocab_file}") - - # load regex from a file - if os.path.exists(regex_file): - logging.info(f"Loading regex from file = {regex_file}") - self.regex = open(regex_file, encoding="utf-8").read().strip() - else: - raise RuntimeError(f"Missing regex_file = {regex_file}") - - self._update_cache() - self._compile_regex() - - return self - - def build_vocab_from_csv(self, data_csv_file, col="smiles"): - """ - Learns vocabulary from a CSV file. Can be called multiple times to update vocabulary. - """ - logging.debug(f"Building vocabulary from CSV col = {col} file = {data_csv_file}") - - # NOTE this has to be run on each CSV file - if not os.path.exists(data_csv_file): - raise ValueError(f"Data file: {data_csv_file} is missing") - - df = pd.read_csv(data_csv_file) - - vocab = self.vocab - for d in df[col]: - tokens = self.text_to_tokens(d) - logging.debug(f"Text: {d}, Tokens: {tokens}") - for token in tokens: - if token not in vocab: - vocab[token] = len(vocab) - - sorted_vocab = sorted(vocab.items(), key=lambda k_v: k_v[1]) - logging.debug(f"Vocab: {sorted_vocab}") - - self.vocab = vocab - self._update_cache() - - def build_vocab_from_text(self, data_text_file): - """ - Learns vocabulary from a text file. Can be called multiple times to update vocabulary. 
- """ - logging.debug(f"Building vocabulary from TEXT file = {data_text_file}") - - # NOTE this has to be run on each text file - if not os.path.exists(data_text_file): - raise ValueError(f"Data file: {data_text_file} is missing") - - vocab = self.vocab - with open(data_text_file, encoding="utf-8") as f: - for d in f.readlines(): - d = d.rstrip() - tokens = self.text_to_tokens(d) - logging.debug(f"Text: {d}, Tokens: {d}") - for token in tokens: - if token not in vocab: - vocab[token] = len(vocab) - - sorted_vocab = sorted(vocab.items(), key=lambda k_v: k_v[1]) - logging.debug(f"Vocab: {sorted_vocab}") - - self.vocab = vocab - self._update_cache() diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py deleted file mode 100644 index 3ef87437a8b244c9fa9454f1b000ff9eca1174d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Dict, List, Optional, Union - -import numpy as np -import sentencepiece - -from nemo.collections.common.parts.utils import if_exist -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.utils import logging - -__all__ = ['SentencePieceTokenizer', 'create_spt_model'] - - -class SentencePieceTokenizer(TokenizerSpec): - """ - Sentencepiecetokenizer https://github.com/google/sentencepiece. - - Args: - model_path: path to sentence piece tokenizer model. To create the model use create_spt_model() - special_tokens: either list of special tokens or dictionary of token name to token value - legacy: when set to True, the previous behavior of the SentecePiece wrapper will be restored, - including the possibility to add special tokens inside wrapper. - """ - - def __init__( - self, model_path: str, special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, legacy: bool = False - ): - if not model_path or not os.path.exists(model_path): - raise ValueError(f"model_path: {model_path} is invalid") - self.tokenizer = sentencepiece.SentencePieceProcessor() - self.tokenizer.Load(model_path) - - self.original_vocab_size = self.tokenizer.get_piece_size() - self.vocab_size = self.tokenizer.get_piece_size() - self.legacy = legacy - self.special_token_to_id = {} - self.id_to_special_token = {} - if special_tokens: - if not self.legacy: - raise ValueError( - "Special tokens must be None when legacy is set to False. Provide special tokens at train time." 
- ) - self.add_special_tokens(special_tokens) - self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y') - - def text_to_tokens(self, text): - if self.legacy: - tokens = [] - idx = 0 - last_idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx])) - tokens.append(next_token) - idx = next_idx + len(next_token) - - tokens.extend(self.tokenizer.encode_as_pieces(text[idx:])) - return tokens - - return self.tokenizer.encode_as_pieces(text) - - def text_to_ids(self, text): - if self.legacy: - ids = [] - idx = 0 - last_idx = 0 - - while 1: - indices = {} - - for token in self.special_token_to_id: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) - ids.append(self.special_token_to_id[next_token]) - idx = next_idx + len(next_token) - - ids.extend(self.tokenizer.encode_as_ids(text[idx:])) - return ids - - return self.tokenizer.encode_as_ids(text) - - def tokens_to_text(self, tokens): - if isinstance(tokens, np.ndarray): - tokens = tokens.tolist() - - return self.tokenizer.decode_pieces(tokens) - - def ids_to_text(self, ids): - if isinstance(ids, np.ndarray): - ids = ids.tolist() - - if self.legacy: - text = "" - last_i = 0 - - for i, id in enumerate(ids): - if id in self.id_to_special_token: - text += self.tokenizer.decode_ids(ids[last_i:i]) + " " - text += self.id_to_special_token[id] + " " - last_i = i + 1 - - text += self.tokenizer.decode_ids(ids[last_i:]) - return text.strip() - - return self.tokenizer.decode_ids(ids) - - def token_to_id(self, token): - if self.legacy and token in self.special_token_to_id: - return self.special_token_to_id[token] - - return self.tokenizer.piece_to_id(token) - - def ids_to_tokens(self, ids): - tokens = [] - for id in ids: - if id >= self.original_vocab_size: - tokens.append(self.id_to_special_token[id]) - else: - tokens.append(self.tokenizer.id_to_piece(id)) - return tokens - - def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: - if isinstance(tokens, str): - tokens = [tokens] - ids = [] - for token in tokens: - ids.append(self.token_to_id(token)) - return ids - - def add_special_tokens(self, special_tokens): - if not self.legacy: - raise AttributeError("Special Token addition does not work when legacy is set to False.") - - if isinstance(special_tokens, list): - for token in special_tokens: - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - elif isinstance(special_tokens, dict): - for token_name, token in special_tokens.items(): - setattr(self, token_name, token) - if ( - self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() - and token not in self.special_token_to_id - ): - self.special_token_to_id[token] = self.vocab_size - self.id_to_special_token[self.vocab_size] = token - self.vocab_size += 1 - - @property - def pad_id(self): - if self.legacy: - pad_id = 
self.tokens_to_ids([self.pad_token])[0] - else: - pad_id = self.tokenizer.pad_id() - return pad_id - - @property - def bos_id(self): - if self.legacy: - bos_id = self.tokens_to_ids([self.bos_token])[0] - else: - bos_id = self.tokenizer.bos_id() - return bos_id - - @property - def eos_id(self): - if self.legacy: - eos_id = self.tokens_to_ids([self.eos_token])[0] - else: - eos_id = self.tokenizer.eos_id() - return eos_id - - @property - def sep_id(self): - if self.legacy: - return self.tokens_to_ids([self.sep_token])[0] - else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") - - @property - def cls_id(self): - if self.legacy: - return self.tokens_to_ids([self.cls_token])[0] - else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") - - @property - def mask_id(self): - if self.legacy: - return self.tokens_to_ids([self.mask_token])[0] - else: - raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.") - - @property - def unk_id(self): - return self.tokenizer.unk_id() - - @property - def additional_special_tokens_ids(self): - """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5.""" - special_tokens = set( - [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token] - ) - return [v for k, v in self.special_token_to_id.items() if k not in special_tokens] - - @property - def vocab(self): - main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())] - special_tokens = [ - self.id_to_special_token[self.original_vocab_size + i] - for i in range(self.vocab_size - self.original_vocab_size) - ] - return main_vocab + special_tokens - - -def create_spt_model( - data_file: str, - vocab_size: int, - sample_size: int, - do_lower_case: bool, - tokenizer_type: str = 'unigram', - output_dir: Optional[str] = None, - character_coverage: float = 1.0, - train_extremely_large_corpus: bool = False, - max_sentencepiece_length: int = -1, - bos: bool = False, - eos: bool = False, - pad: bool = False, - control_symbols: List[str] = None, - user_defined_symbols: List[str] = None, - byte_fallback: bool = False, - split_digits: bool = False, - split_by_whitespace: bool = True, - split_by_unicode_script: bool = True, -): - """ - Creates sentence piece tokenizer model from data file. - Args: - data_file: data file - vocab_size: vocabulary size - sample_size: maximum size of sentences the trainer loads - do_lower_case: if text should be lower cased before tokenizer model is created - character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset, - can be < 1.0, but for all other languages, it should be set as 1.0 - output_dir: folder to save created tokenizer model. If not specified will store model at data_file/../spt folder - train_extremely_large_corpus: If training on huge datasets, pass this flag to allow SentencePiece - to build the tokenizer. - max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed. - By default, no limit is placed. - bos: when True, bos token "" is added to the vocabulary. - eos: when True, eos token "" is added to the vocabulary. - pad: when True, pad token "" is added to the vocabulary. - control_symbols: control symbols to add to tokenizer, as defined by sentencepiece. 
- These tokens get removed at decode time and are not encoded from the text - can only be added to the input programatically. - user_defined_symbols: user symbols to add to tokenizer, as defined by sentencepiece. - These tokens remain in the decoded text and are encoded automatically when present in the input text. - byte_fallback: If , fallback to a byte sequence of the character. - split_digits: If true, digits are split into individual tokens. - split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace. - split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) - """ - - if not data_file or not os.path.exists(data_file): - raise ValueError(f"data_file must be valid file path, but got {data_file}") - data_dir = os.path.dirname(data_file) - vocab = [] - special_tokens = ["", "", "", ""] - if not output_dir: - output_dir = f'{data_dir}/spt' - if if_exist(output_dir, ['tokenizer.model']): - logging.info(f"tokenizer model {output_dir}/tokenizer.model already exists") - return f'{output_dir}/tokenizer.model', f'{output_dir}/vocab.txt' - logging.info(f'Processing {data_file} and store at {output_dir}') - os.makedirs(output_dir, exist_ok=True) - - cmd = ( - f"--input={data_file} --model_prefix={output_dir}/tokenizer " - f"--vocab_size={vocab_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--model_type={tokenizer_type} " - f"--character_coverage={character_coverage}" - ) - - pad_id = 3 - if not bos: - pad_id -= 1 - cmd += " --bos_id=-1" - - if not eos: - pad_id -= 1 - cmd += " --eos_id=-1" - - if pad: - cmd += f" --pad_id={pad_id}" - - if control_symbols: - control_string = (",").join(control_symbols) - cmd += f" --control_symbols={control_string}" - special_tokens += control_symbols - - if user_defined_symbols: - user_string = (",").join(user_defined_symbols) - cmd += f" --user_defined_symbols={user_string}" - special_tokens += user_defined_symbols - - if do_lower_case: - cmd += " --normalization_rule_name=nmt_nfkc_cf" - - if sample_size > 0: - cmd += f" --input_sentence_size={sample_size}" - - if train_extremely_large_corpus: - cmd += " --train_extremely_large_corpus=true" - - if max_sentencepiece_length >= 0: - cmd += f" --max_sentencepiece_length={max_sentencepiece_length}" - - if byte_fallback: - cmd += " --byte_fallback=true" - - if split_digits: - cmd += " --split_digits=true" - - if not split_by_whitespace: - cmd += " --split_by_whitespace=false" - - if not split_by_unicode_script: - cmd += " --split_by_unicode_script=false" - - sentencepiece.SentencePieceTrainer.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{output_dir}/tokenizer.vocab", "r") as f: - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - if piece in special_tokens: - # skip special tokens - continue - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - - if len(token) > 0: - tokens.append(token) - else: - tokens.append(piece[0]) - - vocab.extend(tokens) - - # Save vocabulary to output file - vocab_file = f'{output_dir}/vocab.txt' - with open(vocab_file, "w") as f: - for token in vocab: - f.write(f"{token}\n") - return f'{output_dir}/tokenizer.model', vocab_file diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tabular_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tabular_tokenizer.py deleted file mode 100644 index 
5fa36832959c23a9eefc7898ef036fdd7d7890b7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tabular_tokenizer.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pickle -from typing import List - -import numpy - -from nemo.collections.common.tokenizers.column_coder import ColumnCodes -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ['TabularTokenizer'] - -END_OF_TEXT = '<|endoftext|>' -NEW_LINE = '\n' - - -def find_index_of(list_input, item): - output = -1 - try: - output = list_input.index(item) - except ValueError: - pass - return output - - -class TabularTokenizer(TokenizerSpec): - def __init__(self, coder, special_tokens=[END_OF_TEXT, NEW_LINE], delimiter=','): - if isinstance(coder, ColumnCodes): - self.code_column: ColumnCodes = coder - else: - with open(coder, 'rb') as handle: - self.code_column: ColumnCodes = pickle.load(handle) - self.num_columns = len(self.code_column.columns) - self.special_tokens = {} - self.special_tokens_decoder = {} - self.add_special_tokens(special_tokens) - self.delimiter = delimiter - self.eod_id = self.special_tokens[END_OF_TEXT] - self.eos_id = self.eod_id - self.bos_id = self.eos_id - - def __len__(self): - return self.vocab_size - - @property - def vocab_size(self): - return max(self.special_tokens_decoder.keys()) + 1 - - def text_to_ids(self, text): - return self.encode(text) - - def ids_to_text(self, token_ids): - return self.decode(token_ids) - - @property - def eod(self): - return self.eod_id - - @property - def eor(self): - return self.special_tokens[NEW_LINE] - - def add_special_tokens(self, special_tokens): - """ Add a list of additional tokens to the encoder. - The additional tokens are indexed starting from the last - index of the - current vocabulary in the order of the `special_tokens` list. - """ - if not special_tokens: - self.special_tokens = {} - self.special_tokens_decoder = {} - return - new = dict( - (tok, self.code_column.vocab_size + i) - for i, tok in enumerate(special_tokens) - if tok not in self.special_tokens - ) - self.special_tokens.update(new) - self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} - - def text_to_tokens(self, text): - """ Tokenize a string. 
""" - tokens = [] - rows = text.split(NEW_LINE) - num_rows = len(rows) - for row_id in range(num_rows): - row = rows[row_id] - if row == '': - continue - fields = row.split(self.delimiter) - for f in fields: - splits = f.split(END_OF_TEXT) - if len(splits) == 1: - tokens.append(f.strip()) - elif len(splits) == 2: - if splits[0] != '': - tokens.append(splits[0].strip()) - tokens.append(END_OF_TEXT) - if splits[1] != '': - tokens.append(splits[1].strip()) - else: - raise ValueError("delimiter error") - if row_id != num_rows - 1: - tokens.append(NEW_LINE) - return tokens - - def tokens_to_ids(self, tokens: List[str]): - """ Converts a sequence of tokens into ids using the vocab. """ - ids = [] - cindex = 0 - if NEW_LINE in tokens: - idd = tokens.index(NEW_LINE) - cindex = (self.num_columns - idd) % self.num_columns - for token in tokens: - - if token in self.special_tokens: - ids.append(self.special_tokens[token]) - else: - index = cindex % self.num_columns - column = self.code_column.columns[index] - ids.extend(self.code_column.encode(column, token)) - cindex += 1 - return ids - - def ids_to_tokens(self, ids, skip_special_tokens=False): - """Converts a sequence of ids in Tabular tokens using the vocab.""" - tokens = [] - sizes = self.code_column.sizes - ids_size = sum(sizes) - cindex = 0 - eor_pos = find_index_of(ids, self.eor) - eod_pos = find_index_of(ids, self.eod) - if eor_pos >= 0 and eod_pos >= 0: - idd = min(eor_pos, eod_pos) - cindex = (ids_size - idd) % ids_size - elif eor_pos >= 0 and eod_pos < 0: - idd = eor_pos - cindex = (ids_size - idd) % ids_size - elif eod_pos >= 0 and eor_pos < 0: - idd = eod_pos - cindex = (ids_size - idd) % ids_size - cum_sizes = numpy.cumsum(sizes) - old_column_index = -1 - token_ids = [] - for i in ids: - if i in self.special_tokens_decoder: - if not skip_special_tokens: - tokens.append(self.special_tokens_decoder[i]) - else: - index = cindex % ids_size - column_index = numpy.where(index < cum_sizes)[0][0] - column = self.code_column.columns[column_index] - if old_column_index != column_index: - token_ids = [i] - old_column_index = column_index - else: - token_ids.append(i) - if len(token_ids) == sizes[column_index]: - tokens.append(self.code_column.decode(column, token_ids)) - cindex += 1 - return tokens - - def encode(self, text): - return self.tokens_to_ids(self.text_to_tokens(text)) - - def decode(self, token_ids): - tokens = self.ids_to_tokens(token_ids, skip_special_tokens=False) - return self.tokens_to_text(tokens) - - def tokens_to_text(self, tokens): - all_lines = [] - line = [] - for token in tokens: - if token == END_OF_TEXT or token == NEW_LINE: - if len(line) != 0: - line_text = self.delimiter.join(line) - all_lines.append(line_text) - all_lines.append(token) - line = [] - else: - line.append(token) - if len(line) != 0: - # remaining items - line_text = self.delimiter.join(line) - all_lines.append(line_text) - text = "".join(all_lines) - return text diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/__init__.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/__init__.py deleted file mode 100644 index 2db92b2574167fb3436c079fd7941450ae0c316b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py deleted file mode 100644 index f4081735eb7142b72cf3c4eae732e5710bd2eb0c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# fmt: off - -SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"] - -DEFAULT_PUNCTUATION = ( - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', -) - -VITS_PUNCTUATION = ( - ',', '.', '!', '?', '-', - ':', ';', '"', '«', '»', - '“', '”', '¡', '¿', '—', - '…', -) - -GRAPHEME_CHARACTER_SETS = { - "en-US": ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z' - ), - "es-ES": ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'Á', 'É', 'Í', 'Ñ', - 'Ó', 'Ú', 'Ü' - ), - # ref: https://en.wikipedia.org/wiki/German_orthography#Alphabet - "de-DE": ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ', - ), - "fr-FR": ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'Â', 'Ä', 'Æ', - 'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ï', 'Ñ', 'Ô', - 'Ö', 'Ù', 'Û', 'Ü', 'Ō', 'Œ', - ), - "it-IT": ( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', - 'U', 'V', 'W', 'X', 'Y', 'Z', 'À', 'È', 'É', 'Ì', - 'Ò', 'Ù' - ), -} - -IPA_CHARACTER_SETS = { - "en-US": ( - 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', - 'x', 'z', 'æ', 'ð', 'ŋ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɚ', - 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɬ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʌ', - 'ʒ', 'ʔ', 'ʲ', '̃', '̩', 'θ', 'ᵻ' - ), - "es-ES": ( - 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'x', - 'ð', 'ŋ', 'ɛ', 'ɡ', 'ɣ', 'ɪ', 'ɲ', 'ɾ', 'ʃ', 'ʊ', - 'ʎ', 'ʒ', 'ʝ', 'β', 'θ' - ), - "de-DE": ( - '1', 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', - 'l', 'm', 'n', 
'o', 'p', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', 'ç', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', - 'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɹ', 'ɾ', 'ʃ', - 'ʊ', 'ʌ', 'ʒ', '̃', 'θ' - ), - "fr-FR": ( - 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', - 'y', 'z', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ', - 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɪ', 'ɲ', 'ɹ', 'ʁ', 'ʃ', 'ʊ', - 'ʌ', 'ʒ', 'θ', 'ː', '̃' - ), - "it-IT": ( - 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', - 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', - 'x', 'z', 'æ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɚ', - 'ɜ', 'ɬ', 'ɹ', 'ʌ', 'ʔ', 'ʲ', '̃', '̩', 'ᵻ', - 'ð', 'ŋ', 'ɛ', 'ɡ', 'ɣ', 'ɪ', 'ɲ', 'ɾ', 'ʃ', - 'ʊ', 'ʎ', 'ʒ', 'ʝ', 'β', 'θ', 'd͡', 't͡', 'ø', 'ɒ', - 'ɕ', 'ɓ', 'ç', 'ɖ', 'ɘ', 'ɝ', 'ɞ', 'ɟ','ʄ','ɡ','ɠ', - 'ɢ','ʛ','ɦ','ɧ','ħ','ɥ','ʜ','ɨ','ɬ','ɫ','ɮ','ʟ', - 'ɱ','ɯ','ɰ','ɳ','ɵ','ɸ','œ','ɶ','ʘ','ɺ','ɻ','ʀ','ʁ', - 'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ', - 'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː', - ), -} - -GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"] - -# fmt: on - - -def validate_locale(locale): - if locale not in SUPPORTED_LOCALES: - raise ValueError(f"Unsupported locale '{locale}'. " f"Supported locales {SUPPORTED_LOCALES}") - - -def get_grapheme_character_set(locale: str, case: str = "upper") -> str: - if locale not in GRAPHEME_CHARACTER_SETS: - raise ValueError( - f"Grapheme character set not found for locale '{locale}'. " - f"Supported locales {GRAPHEME_CHARACTER_SETS.keys()}" - ) - - charset_str_origin = ''.join(GRAPHEME_CHARACTER_SETS[locale]) - if case == "upper": - # Directly call .upper() will convert 'ß' into 'SS' according to https://bugs.python.org/issue30810. - charset_str = charset_str_origin.replace('ß', 'ẞ').upper() - elif case == "lower": - charset_str = charset_str_origin.lower() - elif case == "mixed": - charset_str = charset_str_origin.replace('ß', 'ẞ').upper() + charset_str_origin.lower() - else: - raise ValueError( - f"Grapheme character case not found: '{case}'. Supported cases are {GRAPHEME_CHARACTER_CASES}" - ) - - return charset_str - - -def get_ipa_character_set(locale): - if locale not in IPA_CHARACTER_SETS: - raise ValueError( - f"IPA character set not found for locale '{locale}'. 
" f"Supported locales {IPA_CHARACTER_SETS.keys()}" - ) - char_set = set(IPA_CHARACTER_SETS[locale]) - return char_set - - -def get_ipa_punctuation_list(locale): - if locale is None: - return sorted(list(DEFAULT_PUNCTUATION)) - - validate_locale(locale) - - punct_set = set(DEFAULT_PUNCTUATION) - # TODO @xueyang: verify potential mismatches with locale-specific punctuation sets used - # in nemo_text_processing.text_normalization.en.taggers.punctuation.py - if locale in ["de-DE", "es-ES", "it-IT", "fr-FR"]: - # ref: https://en.wikipedia.org/wiki/Guillemet#Uses - punct_set.update(['«', '»', '‹', '›']) - if locale == "de-DE": - # ref: https://en.wikipedia.org/wiki/German_orthography#Punctuation - punct_set.update( - [ - '„', # double low-9 quotation mark, U+201E, decimal 8222 - '“', # left double quotation mark, U+201C, decimal 8220 - '‚', # single low-9 quotation mark, U+201A, decimal 8218 - '‘', # left single quotation mark, U+2018, decimal 8216 - '‒', # figure dash, U+2012, decimal 8210 - '–', # en dash, U+2013, decimal 8211 - '—', # em dash, U+2014, decimal 8212 - ] - ) - if locale == "it-IT": - # ref: https://en.wikipedia.org/wiki/German_orthography#Punctuation - punct_set.update( - [ - '„', # double low-9 quotation mark, U+201E, decimal 8222 - '“', # left double quotation mark, U+201C, decimal 8220 - '‚', # single low-9 quotation mark, U+201A, decimal 8218 - '‘', # left single quotation mark, U+2018, decimal 8216 - '‒', # figure dash, U+2012, decimal 8210 - '–', # en dash, U+2013, decimal 8211 - '—', # em dash, U+2014, decimal 8212 - 'ʴ', - 'ʰ', - 'ʱ', - 'ʲ', - 'ʷ', - 'ˠ', - 'ˤ', - '˞↓', - '↑', - '→', - '↗', - '↘', - '”', - '’', - '-', - ] - ) - elif locale == "es-ES": - # ref: https://en.wikipedia.org/wiki/Spanish_orthography#Punctuation - punct_set.update(['¿', '¡']) - elif locale == "fr-FR": - punct_set.update( - [ - '–', # en dash, U+2013, decimal 8211 - '“', # left double quotation mark, U+201C, decimal 8220 - '”', # right double quotation mark, U+201D, decimal 8221 - '…', # horizontal ellipsis, U+2026, decimal 8230 - '̀', # combining grave accent, U+0300, decimal 768 - '́', # combining acute accent, U+0301, decimal 769 - '̂', # combining circumflex accent, U+0302, decimal 770 - '̈', # combining diaeresis, U+0308, decimal 776 - '̧', # combining cedilla, U+0327, decimal 807 - ] - ) - - punct_list = sorted(list(punct_set)) - return punct_list diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py deleted file mode 100644 index 542b18186846f1b5dd88f899eeac5cbef9e7cdb2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import re -import unicodedata -from builtins import str as unicode -from typing import List, Tuple - -__all__ = [ - "french_text_preprocessing", - "chinese_text_preprocessing", - "english_text_preprocessing", - "any_locale_text_preprocessing", - "spanish_text_preprocessing", - "italian_text_preprocessing", - "any_locale_word_tokenize", - "english_word_tokenize", - "LATIN_CHARS_ALL", - "normalize_unicode_text", -] - -# Derived from LJSpeech -_synoglyphs = { - "'": ['’'], - '"': ['”', '“'], -} -SYNOGLYPH2ASCII = {g: asc for asc, glyphs in _synoglyphs.items() for g in glyphs} - -# Example of parsing by groups via _WORDS_RE_EN. -# Regular expression pattern groups: -# 1st group -- valid english words, -# 2nd group -- any substring starts from | to | (mustn't be nested), useful when you want to leave sequence unchanged, -# 3rd group -- punctuation marks or whitespaces. -# Text (first line) and mask of groups for every char (second line). -# config file must contain |EY1 EY1|, B, C, D, E, F, and G. - -# define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters -LATIN_ALPHABET_BASIC = "A-Za-z" -ACCENTED_CHARS = "À-ÖØ-öø-ÿ" -LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}" -_WORDS_RE_EN = re.compile( - fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)" -) -_WORDS_RE_ANY_LOCALE = re.compile( - fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)" -) - - -def english_text_preprocessing(text, lower=True): - text = unicode(text) - text = ''.join(char for char in unicodedata.normalize('NFD', text) if unicodedata.category(char) != 'Mn') - text = ''.join(char if char not in SYNOGLYPH2ASCII else SYNOGLYPH2ASCII[char] for char in text) - - if lower: - text = text.lower() - - return text - - -def any_locale_text_preprocessing(text: str) -> str: - """ - Normalize unicode text with "NFC", and convert right single quotation mark (U+2019, decimal 8217) as an apostrophe. - - Args: - text (str): the original input sentence. - - Returns: normalized text (str). - """ - res = [] - for c in normalize_unicode_text(text): - if c in ['’']: # right single quotation mark (U+2019, decimal 8217) as an apostrophe - res.append("'") - else: - res.append(c) - - return ''.join(res) - - -def normalize_unicode_text(text: str) -> str: - """ - TODO @xueyang: Apply NFC form may be too aggressive since it would ignore some accented characters that do not exist - in predefined German alphabet (nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon.IPA_CHARACTER_SETS), - such as 'é'. This is not expected. A better solution is to add an extra normalization with NFD to discard the - diacritics and consider 'é' and 'e' produce similar pronunciations. - - Note that the tokenizer needs to run `unicodedata.normalize("NFC", x)` before calling `encode` function, - especially for the characters that have diacritics, such as 'ö' in the German alphabet. 'ö' can be encoded as - b'\xc3\xb6' (one char) as well as b'o\xcc\x88' (two chars). Without the normalization of composing two chars - together and without a complete predefined set of diacritics, when the tokenizer reads the input sentence - char-by-char, it would skip the combining diaeresis b'\xcc\x88', resulting in indistinguishable pronunciations - for 'ö' and 'o'. - - Args: - text (str): the original input sentence. - - Returns: - NFC normalized sentence (str). 
- """ - # normalize word with NFC form - if not unicodedata.is_normalized("NFC", text): - text = unicodedata.normalize("NFC", text) - - return text - - -def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]: - """ - Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation - can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including - whitespaces. This function will split unchanged strings by whitespaces and return them as `List[str]`. For example, - - .. code-block:: python - [ - ('Hello', '', ''), # valid word - ('', '', ' '), # punctuation mark - ('World', '', ''), # valid word - ('', '', ' '), # punctuation mark - ('', '|NVIDIA unchanged|', ''), # unchangeable word - ('', '', '!') # punctuation mark - ] - - will be converted into, - - .. code-block:: python - [ - (["Hello"], False), - ([" "], False), - (["World"], False), - ([" "], False), - (["NVIDIA", "unchanged"], True), - (["!"], False) - ] - - Args: - words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element - corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`. - is_lower (bool): a flag to trigger lowercase all words. By default, it is False. - - Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`. - - """ - result = [] - for word in words: - maybe_word, maybe_without_changes, maybe_punct = word - - without_changes = False - if maybe_word != '': - if is_lower: - token = [maybe_word.lower()] - else: - token = [maybe_word] - elif maybe_punct != '': - token = [maybe_punct] - elif maybe_without_changes != '': - without_changes = True - token = maybe_without_changes[1:-1].split(" ") - else: - raise ValueError( - f"This is not expected. Found empty string: <{word}>. " - f"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'." - ) - - result.append((token, without_changes)) - - return result - - -def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]: - words = _WORDS_RE_EN.findall(text) - return _word_tokenize(words, is_lower=True) - - -def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]: - words = _WORDS_RE_ANY_LOCALE.findall(text) - return _word_tokenize(words) - - -def spanish_text_preprocessing(text: str) -> str: - return text.lower() - - -def italian_text_preprocessing(text: str) -> str: - return text.lower() - - -def chinese_text_preprocessing(text: str) -> str: - return text - - -def french_text_preprocessing(text: str) -> str: - return text.lower() diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_wrapper.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_wrapper.py deleted file mode 100644 index e2d06f83141f5bc1b4e53f87ffba4d0f8154c1cd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tokenizer_wrapper.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.common.tokenizers import TokenizerSpec -from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import EnglishPhonemesTokenizer -from nemo.collections.tts.g2p.models.en_us_arpabet import EnglishG2p - -__all__ = ['TextToSpeechTokenizer'] - - -class TextToSpeechTokenizer(TokenizerSpec): - def __init__(self, phoneme_dict, heteronyms): - self.g2p = EnglishG2p(phoneme_dict=phoneme_dict, heteronyms=heteronyms) - self.tokenizer = EnglishPhonemesTokenizer( - self.g2p, stresses=True, chars=True, pad_with_space=True, add_blank_at=True - ) - self.vocab_size = len(self.tokenizer.tokens) - - def text_to_ids(self, text): - return self.tokenizer.encode(text) - - def text_to_tokens(self, text): - return self.g2p(text) - - def tokens_to_text(self, tokens): - pass - - def tokens_to_ids(self, tokens): - pass - - def ids_to_tokens(self, ids): - pass - - def ids_to_text(self, ids): - pass - - @property - def pad_id(self): - return self.tokenizer.pad - - @property - def bos_id(self): - return self.tokenizer.pad - - @property - def eos_id(self): - return self.tokenizer.pad diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py deleted file mode 100644 index 1aefc6f1b4bba8a502b2033ac84e8ef7966a1cad..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ /dev/null @@ -1,918 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -import string -from abc import ABC, abstractmethod -from contextlib import contextmanager -from typing import List, Optional - -from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import ( - get_grapheme_character_set, - get_ipa_punctuation_list, - validate_locale, -) -from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import ( - any_locale_text_preprocessing, - chinese_text_preprocessing, - english_text_preprocessing, - french_text_preprocessing, - italian_text_preprocessing, - spanish_text_preprocessing, -) -from nemo.utils import logging -from nemo.utils.decorators import experimental - - -class BaseTokenizer(ABC): - PAD, BLANK, OOV = '', '', '' - - def __init__(self, tokens, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at=None): - """Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens. - Args: - tokens: List of tokens. 
- pad: Pad token as string. - blank: Blank token as string. - oov: OOV token as string. - sep: Separation token as string. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - """ - super().__init__() - - tokens = list(tokens) - # TODO @xueyang: in general, IDs of pad, sil, blank, and oov are preserved ahead instead of dynamically - # assigned according to the number of tokens. The downside of using dynamical assignment leads to different IDs - # for each. - self.pad, tokens = len(tokens), tokens + [pad] # Padding - - if add_blank_at is not None: - self.blank, tokens = len(tokens), tokens + [blank] # Reserved for blank from asr-model - else: - # use add_blank_at=None only for ASR where blank is added automatically, disable blank here - self.blank = None - - self.oov, tokens = len(tokens), tokens + [oov] # Out Of Vocabulary - - if add_blank_at == "last": - tokens[-1], tokens[-2] = tokens[-2], tokens[-1] - self.oov, self.blank = self.blank, self.oov - - self.tokens = tokens - self.sep = sep - - self._util_ids = {self.pad, self.blank, self.oov} - self._token2id = {l: i for i, l in enumerate(tokens)} - self._id2token = tokens - - def __call__(self, text: str) -> List[int]: - return self.encode(text) - - @abstractmethod - def encode(self, text: str) -> List[int]: - """Turns str text into int tokens.""" - pass - - def decode(self, tokens: List[int]) -> str: - """Turns ints tokens into str text.""" - return self.sep.join(self._id2token[t] for t in tokens if t not in self._util_ids) - - -class BaseCharsTokenizer(BaseTokenizer): - # fmt: off - # TODO @xueyang: unify definition of the default PUNCT_LIST and import from ipa_lexicon.py - PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', - ) - # fmt: on - - def __init__( - self, - chars, - punct=True, - apostrophe=True, - add_blank_at=None, - pad_with_space=False, - non_default_punct_list=None, - text_preprocessing_func=lambda x: x, - ): - """Base class for char-based tokenizer. - Args: - chars: string that represents all possible characters. - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - """ - - tokens = [] - self.space, tokens = len(tokens), tokens + [' '] # Space - tokens.extend(chars) - if apostrophe: - tokens.append("'") # Apostrophe for saving "don't" and "Joe's" - - if punct: - if non_default_punct_list is not None: - self.PUNCT_LIST = non_default_punct_list - tokens.extend(self.PUNCT_LIST) - - super().__init__(tokens, add_blank_at=add_blank_at) - - self.punct = punct - self.pad_with_space = pad_with_space - - self.text_preprocessing_func = text_preprocessing_func - - def encode(self, text): - """See base class.""" - cs, space, tokens = [], self.tokens[self.space], set(self.tokens) - - text = self.text_preprocessing_func(text) - for c in text: - # Add a whitespace if the current char is a whitespace while the previous char is not a whitespace. 
- if c == space and len(cs) > 0 and cs[-1] != space: - cs.append(c) - # Add the current char that is an alphanumeric or an apostrophe. - elif (c.isalnum() or c == "'") and c in tokens: - cs.append(c) - # Add a punctuation that has a single char. - elif (c in self.PUNCT_LIST) and self.punct: - cs.append(c) - # Warn about unknown char - elif c != space: - logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.") - - # Remove trailing spaces - if cs: - while cs[-1] == space: - cs.pop() - - if self.pad_with_space: - cs = [space] + cs + [space] - - return [self._token2id[p] for p in cs] - - -class EnglishCharsTokenizer(BaseCharsTokenizer): - def __init__( - self, - punct=True, - apostrophe=True, - add_blank_at=None, - pad_with_space=False, - non_default_punct_list=None, - text_preprocessing_func=english_text_preprocessing, - ): - """English char-based tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - Basically, it replaces all non-unicode characters with unicode ones and apply lower() function. - """ - super().__init__( - chars=string.ascii_lowercase, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=text_preprocessing_func, - ) - - -class GermanCharsTokenizer(BaseCharsTokenizer): - - _LOCALE = "de-DE" - _PUNCT_LIST = get_ipa_punctuation_list(_LOCALE) - _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed") - - def __init__( - self, - chars=_CHARSET_STR, - punct=True, - apostrophe=True, - add_blank_at=None, - pad_with_space=False, - non_default_punct_list=_PUNCT_LIST, - text_preprocessing_func=any_locale_text_preprocessing, - ): - """German grapheme-based tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. By default, it - would keep any word unchanged. - """ - super().__init__( - chars=chars, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=text_preprocessing_func, - ) - - -class SpanishCharsTokenizer(BaseCharsTokenizer): - - PUNCT_LIST = get_ipa_punctuation_list("es-ES") - - def __init__( - self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None, - ): - """Spanish grapheme tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. 
- add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - """ - - es_alphabet = "abcdefghijklmnopqrstuvwxyzáéíñóúü" - super().__init__( - chars=es_alphabet, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=spanish_text_preprocessing, - ) - - -class FrenchCharsTokenizer(BaseCharsTokenizer): - - PUNCT_LIST = get_ipa_punctuation_list("fr-FR") - - def __init__( - self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None, - ): - """French grapheme tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - """ - - fr_alphabet = get_grapheme_character_set(locale="fr-FR", case="lower") - super().__init__( - chars=fr_alphabet, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=french_text_preprocessing, - ) - - -class ItalianCharsTokenizer(BaseCharsTokenizer): - PUNCT_LIST = get_ipa_punctuation_list("it-IT") - - def __init__( - self, punct=True, apostrophe=True, add_blank_at=None, pad_with_space=False, non_default_punct_list=None - ): - """Italian grapheme tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - """ - - it_alphabet = "abcdefghijklmnopqrstuvwxyzàèéìòùó" - super().__init__( - chars=it_alphabet, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=italian_text_preprocessing, - ) - - -class GermanPhonemesTokenizer(BaseCharsTokenizer): - # fmt: off - PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', - ) - # fmt: on - - def __init__( - self, - punct=True, - apostrophe=True, - add_blank_at=None, - pad_with_space=False, - non_default_punct_list=None, - text_preprocessing_func=any_locale_text_preprocessing, - ): - """Deutsch phoneme-based tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. 
- non_default_punct_list: List of punctuation marks which will be used instead default. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - Currently, it only applies lower() function. - """ - - de_ipa = "abdefhijklmnoprstuvwxyzçðøŋœɐɑɒɔəɛɜɡɪɹɾʃʊʌʒː̃" - de_suprasegmentals = "12" - super().__init__( - chars=de_ipa + de_suprasegmentals, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=text_preprocessing_func, - ) - - def encode(self, text): - """See base class.""" - cs, space, tokens = [], self.tokens[self.space], set(self.tokens) - - text = self.text_preprocessing_func(text) - for c in text: - # Add space if last one isn't one - if c == space and len(cs) > 0 and cs[-1] != space: - cs.append(c) - # Add next char - elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens: - cs.append(c) - # Add punct - elif (c in self.PUNCT_LIST) and self.punct: - cs.append(c) - # Warn about unknown char - elif c != space: - logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.") - - # Remove trailing spaces - while cs[-1] == space: - cs.pop() - - if self.pad_with_space: - cs = [space] + cs + [space] - - return [self._token2id[p] for p in cs] - - -class ItalianPhonemesTokenizer(BaseCharsTokenizer): - # fmt: off - PUNCT_LIST = ( - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', - '„', '“', '”', '‘', '’', '‒', '—', '«', '»', '‹', '›', '_', - ) - # fmt: on - - def __init__( - self, - punct=True, - apostrophe=True, - add_blank_at=None, - pad_with_space=False, - non_default_punct_list=None, - text_preprocessing_func=italian_text_preprocessing, - ): - """Italian phoneme-based tokenizer. - Args: - punct: Whether to reserve grapheme for basic punctuation or not. - apostrophe: Whether to use apostrophe or not. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - Currently, it only applies lower() function. - """ - - it_ipa = "abcdefghijklmnopqrstuvwxyzàèéìòùóæɐɑɔəɚɜɬɹʌʔᵻðŋɛɡɣɪɲɾʃʊʎʒʝβθd͡'t͡'øɒɕɓçɖɘɝɞɟʄɡɠɢʛɦɧħɥʜɨɬɫɮʟɱɯɰɳɵɸœɶʘɺɻʀʁɽʂʈʧʉʋⱱɤʍχʏʑʐʔʡʕʢǀǁǂᵻʃ'ː" - super().__init__( - chars=it_ipa, - punct=punct, - apostrophe=apostrophe, - add_blank_at=add_blank_at, - pad_with_space=pad_with_space, - non_default_punct_list=non_default_punct_list, - text_preprocessing_func=text_preprocessing_func, - ) - - def encode(self, text): - """See base class.""" - cs, space, tokens = [], self.tokens[self.space], set(self.tokens) - - text = self.text_preprocessing_func(text) - for c in text: - # Add space if last one isn't one - if c == space and len(cs) > 0 and cs[-1] != space: - cs.append(c) - # Add next char - elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens: - cs.append(c) - # Add punct - elif (c in self.PUNCT_LIST) and self.punct: - cs.append(c) - # Warn about unknown char - elif c != space: - logging.warning(f"Text: [{text}] contains unknown char: [{c}]. 
Symbol will be skipped.") - - # Remove trailing spaces - while cs[-1] == space: - cs.pop() - - if self.pad_with_space: - cs = [space] + cs + [space] - - return [self._token2id[p] for p in cs] - - -class EnglishPhonemesTokenizer(BaseTokenizer): - # fmt: off - PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', - ) - VOWELS = ( - 'AA', 'AE', 'AH', 'AO', 'AW', - 'AY', 'EH', 'ER', 'EY', 'IH', - 'IY', 'OW', 'OY', 'UH', 'UW', - ) - CONSONANTS = ( - 'B', 'CH', 'D', 'DH', 'F', 'G', - 'HH', 'JH', 'K', 'L', 'M', 'N', - 'NG', 'P', 'R', 'S', 'SH', 'T', - 'TH', 'V', 'W', 'Y', 'Z', 'ZH', - ) - # fmt: on - - def __init__( - self, - g2p, - punct=True, - non_default_punct_list=None, - stresses=False, - chars=False, - *, - space=' ', - silence=None, - apostrophe=True, - oov=BaseTokenizer.OOV, - sep='|', # To be able to distinguish between 2/3 letters codes. - add_blank_at=None, - pad_with_space=False, - text_preprocessing_func=lambda text: english_text_preprocessing(text, lower=False), - ): - """English phoneme-based tokenizer. - Args: - g2p: Grapheme to phoneme module. - punct: Whether to reserve grapheme for basic punctuation or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - stresses: Whether to use phonemes codes with stresses (0-2) or not. - chars: Whether to additionally use chars together with phonemes. It is useful if g2p module can return chars too. - space: Space token as string. - silence: Silence token as string (will be disabled if it is None). - apostrophe: Whether to use apostrophe or not. - oov: OOV token as string. - sep: Separation token as string. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - Basically, it replaces all non-unicode characters with unicode ones. - Note that lower() function shouldn't be applied here, in case the text contains phonemes (it will be handled by g2p). - """ - - self.phoneme_probability = None - if hasattr(g2p, "phoneme_probability"): - self.phoneme_probability = g2p.phoneme_probability - tokens = [] - self.space, tokens = len(tokens), tokens + [space] # Space - - if silence is not None: - self.silence, tokens = len(tokens), tokens + [silence] # Silence - - tokens.extend(self.CONSONANTS) - vowels = list(self.VOWELS) - - if stresses: - vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))] - tokens.extend(vowels) - - if chars or self.phoneme_probability is not None: - if not chars: - logging.warning( - "phoneme_probability was not None, characters will be enabled even though " - "chars was set to False." 
- ) - tokens.extend(string.ascii_lowercase) - - if apostrophe: - tokens.append("'") # Apostrophe - - if punct: - if non_default_punct_list is not None: - self.PUNCT_LIST = non_default_punct_list - tokens.extend(self.PUNCT_LIST) - - super().__init__(tokens, oov=oov, sep=sep, add_blank_at=add_blank_at) - - self.chars = chars if self.phoneme_probability is None else True - self.punct = punct - self.stresses = stresses - self.pad_with_space = pad_with_space - - self.text_preprocessing_func = text_preprocessing_func - self.g2p = g2p - - def encode(self, text): - """See base class for more information.""" - - text = self.text_preprocessing_func(text) - g2p_text = self.g2p(text) # TODO: handle infer - return self.encode_from_g2p(g2p_text, text) - - def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): - """ - Encodes text that has already been run through G2P. - Called for encoding to tokens after text preprocessing and G2P. - - Args: - g2p_text: G2P's output, could be a mixture of phonemes and graphemes, - e.g. "see OOV" -> ['S', 'IY1', ' ', 'O', 'O', 'V'] - raw_text: original raw input - """ - ps, space, tokens = [], self.tokens[self.space], set(self.tokens) - for p in g2p_text: # noqa - # Remove stress - if p.isalnum() and len(p) == 3 and not self.stresses: - p = p[:2] - - # Add space if last one isn't one - if p == space and len(ps) > 0 and ps[-1] != space: - ps.append(p) - # Add next phoneme or char (if chars=True) - elif (p.isalnum() or p == "'") and p in tokens: - ps.append(p) - # Add punct - elif (p in self.PUNCT_LIST) and self.punct: - ps.append(p) - # Warn about unknown char/phoneme - elif p != space: - message = f"Text: [{''.join(g2p_text)}] contains unknown char/phoneme: [{p}]." - if raw_text is not None: - message += f"Original text: [{raw_text}]. Symbol will be skipped." - logging.warning(message) - - # Remove trailing spaces - if ps: - while ps[-1] == space: - ps.pop() - - if self.pad_with_space: - ps = [space] + ps + [space] - - return [self._token2id[p] for p in ps] - - @contextmanager - def set_phone_prob(self, prob): - if hasattr(self.g2p, "phoneme_probability"): - self.g2p.phoneme_probability = prob - try: - yield - finally: - if hasattr(self.g2p, "phoneme_probability"): - self.g2p.phoneme_probability = self.phoneme_probability - - -@experimental -class IPATokenizer(BaseTokenizer): - def __init__( - self, - g2p, - locale="en-US", - punct=True, - non_default_punct_list=None, - fixed_vocab=None, - *, - space=' ', - silence=None, - apostrophe=False, - oov=BaseTokenizer.OOV, - sep='|', # To be able to distinguish between symbols - add_blank_at=None, - pad_with_space=False, - ): - """General-purpose IPA-based tokenizer. - Args: - g2p: Grapheme to phoneme module, should be IpaG2p or some subclass thereof. - locale: Locale used to determine default text processing logic and punctuation. - Supports ["en-US", "de-DE", "es-ES", "fr-FR"]. Defaults to "en-US". - Specify None if implementing custom logic for a new locale. - punct: Whether to reserve grapheme for basic punctuation or not. - non_default_punct_list: List of punctuation marks which will be used instead default, if any. - fixed_vocab: List of valid grapheme/phoneme tokens for the model. - Set only if overriding the default vocab generation process (reading from G2P dict). - If set, any dataset entries that have unincluded graphemes will be filtered out, and any words whose - pronunciations have unincluded phonemes will be treated as OOV. 
- Please make sure that the grapheme prefixes and cases are consistent with the G2P module's settings. - Defaults to None, which means default vocab generation is used. - space: Space token as string. - silence: Silence token as string (will be disabled if it is None). - apostrophe: Whether to use apostrophe or not. - oov: OOV token as string. - sep: Separation token as string. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - """ - if not hasattr(g2p, "symbols"): - logging.error( - f"Please make sure the G2P module passed into the IPATokenizer has a `symbols` attribute. " - f"This is required in order to build the tokenizer vocabulary.\n" - f"Expected e.g. IpaG2p, found {type(g2p)}" - ) - raise ValueError("G2P modules passed into the IPATokenizer must have `symbols` defined.") - - if locale is not None: - validate_locale(locale) - - self.phoneme_probability = None - if hasattr(g2p, "phoneme_probability"): - self.phoneme_probability = g2p.phoneme_probability - - if locale == "en-US": - self.text_preprocessing_func = lambda text: english_text_preprocessing(text, lower=False) - else: - self.text_preprocessing_func = any_locale_text_preprocessing - - # Build tokens list if fixed_vocab isn't set - if fixed_vocab: - tokens = {self.text_preprocessing_func(c) for c in fixed_vocab} - self.set_fixed_vocab = True # Used to check whether dataset entries need filtering - - if g2p.symbols == tokens: - logging.info( - "Did not replace G2P valid symbol set since the given set is equivalent to the existing one." - ) - self.set_fixed_vocab = False - else: - g2p.replace_symbols(tokens) - else: - tokens = set(g2p.symbols) - self.set_fixed_vocab = False - - if apostrophe: - tokens.add("'") - - if punct: - if non_default_punct_list is not None: - self.punct_list = non_default_punct_list - else: - self.punct_list = get_ipa_punctuation_list(locale) - - tokens.update(self.punct_list) - - # Sort to ensure that vocab is in the same order every time - tokens = sorted(list(tokens)) - - if space in g2p.symbols: - self.space = tokens.index(space) - else: - self.space, tokens = len(tokens), tokens + [space] - - if silence is not None: - self.silence, tokens = len(tokens), tokens + [silence] - - super().__init__(tokens, oov=oov, sep=sep, add_blank_at=add_blank_at) - - self.tokens_set = set(self.tokens) # To save some repeated work when filtering entries - - self.punct = punct - self.pad_with_space = pad_with_space - - self.g2p = g2p - - def encode(self, text: str) -> List[int]: - """See base class for more information.""" - # normalize the input text with "NFC" form. - text = self.text_preprocessing_func(text) - - # transliterate the text into phoneme sequences and/or grapheme sequences. - g2p_text = self.g2p(text) - - return self.encode_from_g2p(g2p_text, text) - - def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None) -> List[int]: - """ - Tokenize the `g2p_text` that has been already run through G2P. Each item in the `g2p_text` would be encoded as - one of the integer IDs predefined in `self._token2id`. Note that this function should be called after - `self.text_preprocessing_func` and `self.g2p` functions - - Args: - g2p_text (List[str]): a sequence of tokens from G2P's output. It could be a sequence of phonemes, a sequence - of graphemes, or a mixture of both. 
For example, `['ˈ', 's', 'i', ' ', '#O', '#O', '#V']`, which is the - G2P's output of the text "see OOV", where '#' is prepended to each grapheme in order to distinguish - graphemes from phonemes if there are overlaps in between. The prefix '#' can be customized in - `nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p.grapheme_prefix`. - raw_text (str): the original text after calling `self.text_preprocessing_func`. It is optional. It is only - used to deliver a warning message that some graphemes from the original text are skipped. - - Returns: a list of integer IDs that tokenize the `g2p_text`. - """ - ps, space, tokens = [], self.tokens[self.space], set(self.tokens) - for p in g2p_text: - if p == space and len(ps) > 0 and ps[-1] != space: - # Add space if last token isn't one - ps.append(p) - elif p in tokens: - # Add next phoneme or char (if chars=True) - ps.append(p) - elif (p in self.punct_list) and self.punct: - # Add punct - ps.append(p) - elif p != space: - message = f"Text: [{''.join(g2p_text)}] contains unknown char/phoneme: [{p}]." - if raw_text is not None: - message += f"Original text: [{raw_text}]. Symbol will be skipped." - logging.warning(message) - - # Remove trailing spaces - if ps: - while ps[-1] == space: - ps.pop() - - if self.pad_with_space: - ps = [space] + ps + [space] - - # Token index lookups - return [self._token2id[p] for p in ps] - - @contextmanager - def set_phone_prob(self, prob): - if hasattr(self.g2p, "phoneme_probability"): - self.g2p.phoneme_probability = prob - try: - yield - finally: - if hasattr(self.g2p, "phoneme_probability"): - self.g2p.phoneme_probability = self.phoneme_probability - - -class ChinesePhonemesTokenizer(BaseTokenizer): - # fmt: off - PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally - ',', '.', '!', '?', '-', - ':', ';', '/', '"', '(', - ')', '[', ']', '{', '}', - ) - ZH_PUNCT_LIST = list(",。?!;:、‘’“”()【】「」《》") + list(PUNCT_LIST) - - def __init__( - self, - g2p, - punct=True, - non_default_punct_list=None, - *, - space=' ', - silence=None, - apostrophe=True, - sep='|', # To be able to distinguish between 2/3 letters codes. - add_blank_at=None, - pad_with_space=False, - text_preprocessing_func=chinese_text_preprocessing, - ): - """Chinese phoneme-based tokenizer. - Note: This tokenizer for now covers Chinese phonemes/tones and English letters because our dataset contains - both Chinese and English graphemes. - Args: - g2p: Grapheme to phoneme module. - punct: Whether to reserve grapheme for basic punctuation or not. - non_default_punct_list: List of punctuation marks which will be used instead default. - space: Space token as string. - silence: Silence token as string (will be disabled if it is None). - apostrophe: Whether to use apostrophe or not. - sep: Separation token as string. - add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), - if None then no blank in labels. - pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. - text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. - Basically, it replaces all non-unicode characters with unicode ones. - Note that lower() function shouldn't be applied here, in case the text contains phonemes (it will be handled by g2p). 
- """ - tokens = [] - self.space, tokens = len(tokens), tokens + [space] # Space - - if silence is not None: - self.silence, tokens = len(tokens), tokens + [silence] # Silence - - self.phoneme_list = g2p.phoneme_list - self.tone_list = g2p.tone_list - self.ascii_letter_list = g2p.ascii_letter_list - - tokens.extend(self.phoneme_list) - tokens.extend(self.tone_list) - tokens.extend(self.ascii_letter_list) - - self.text_preprocessing_func = text_preprocessing_func - - if apostrophe: - tokens.append("'") # Apostrophe - - if punct: - if non_default_punct_list is not None: - self.PUNCT_LIST = non_default_punct_list - else: - self.PUNCT_LIST = list(self.ZH_PUNCT_LIST) - tokens.extend(self.PUNCT_LIST) - - super().__init__(tokens, sep=sep, add_blank_at=add_blank_at) - - self.punct = punct - self.pad_with_space = pad_with_space - self.g2p = g2p - - def encode(self, text: str) -> List[int]: - """See base class for more information.""" - text = self.text_preprocessing_func(text) - g2p_text = self.g2p(text) - return self.encode_from_g2p(g2p_text, text) - - def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None): - """ - Encodes text that has already been run through G2Pr. - Called for encoding to tokens after text preprocessing and G2P. - - Args: - g2p_text: G2P's output, could be a mixture of Chinese phonemes and English letters. - raw_text: original raw input - """ - ps, space, tokens = [], self.tokens[self.space], set(self.tokens) - for p in g2p_text: # noqa - # Add space if last one isn't one - if p == space and len(ps) > 0 and ps[-1] != space: - ps.append(p) - # Add next phoneme or tone or ascii letter or apostrophe. - elif (p.isalnum() or p == "'" or p in self.phoneme_list + self.tone_list + self.ascii_letter_list) and p in tokens: - ps.append(p) - # Add punctuation - elif (p in self.PUNCT_LIST) and self.punct: - ps.append(p) - # Warn about unknown char/phoneme - elif p != space: - message = f"Text: [{' '.join(g2p_text)}] contains unknown char/phoneme: [{p}]." - if raw_text is not None: - message += f"Original text: [{raw_text}]. Symbol will be skipped." - logging.warning(message) - - # Remove trailing spaces - if ps: - while ps[-1] == space: - ps.pop() - - if self.pad_with_space: - ps = [space] + ps + [space] - - return [self._token2id[p] for p in ps] diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tokenizer_spec.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tokenizer_spec.py deleted file mode 100644 index 252571d76ef205e8f29283c9143eccde11a3d6f8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/tokenizer_spec.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod -from typing import List - -__all__ = ['TokenizerSpec'] - - -class TokenizerSpec(ABC): - """ - Inherit this class to implement a new tokenizer. 
- """ - - @abstractmethod - def text_to_tokens(self, text): - pass - - @abstractmethod - def tokens_to_text(self, tokens): - pass - - @abstractmethod - def tokens_to_ids(self, tokens): - pass - - @abstractmethod - def ids_to_tokens(self, ids): - pass - - @abstractmethod - def text_to_ids(self, text): - pass - - @abstractmethod - def ids_to_text(self, ids): - pass - - def add_special_tokens(self, special_tokens: List[str]): - raise NotImplementedError("To be implemented") - - @property - def name(self): - return type(self).__name__ diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/word_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/word_tokenizer.py deleted file mode 100644 index f3431af9d734707f63c8d68f38a090cf9f76478b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/word_tokenizer.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer - -__all__ = ['WordTokenizer'] - - -class WordTokenizer(CharTokenizer): - "Tokenizes at word boundary" - - def __init__( - self, - vocab_file: str, - mask_token: Optional[str] = None, - bos_token: Optional[str] = None, - eos_token: Optional[str] = None, - pad_token: Optional[str] = None, - sep_token: Optional[str] = None, - cls_token: Optional[str] = None, - unk_token: Optional[str] = None, - ): - """ - Args: - vocab_file: path to file with vocabulary which consists - of characters separated by \n - mask_token: mask token - bos_token: the beginning of sequence token - eos_token: the end of sequence token. Usually equal to sep_token - pad_token: token to use for padding - sep_token: token used for separating sequences - cls_token: class token. Usually equal to bos_token - unk_token: token to use for unknown tokens - """ - - super().__init__( - vocab_file=vocab_file, - mask_token=mask_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - ) - - def text_to_tokens(self, text): - token_candidates = text.strip().split() - tokens = [] - for token in token_candidates: - if token in self.vocab: - tokens.append(token) - else: - tokens.append(self.unk_token) - return tokens - - def ids_to_text(self, ids): - ids_ = [id_ for id_ in ids if id_ not in self.special_tokens] - return " ".join(self.ids_to_tokens(ids_)) diff --git a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/youtokentome_tokenizer.py b/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/youtokentome_tokenizer.py deleted file mode 100644 index 77375d38558a159f377dc69ad97a47e89136aa75..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/common/tokenizers/youtokentome_tokenizer.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path - -import youtokentome as yttm - -from nemo.collections.common.tokenizers import TokenizerSpec - -__all__ = ['YouTokenToMeTokenizer'] - - -class YouTokenToMeTokenizer(TokenizerSpec): - def __init__(self, model_path, bpe_dropout=0.0, legacy=False, r2l=False): - model_path = Path(model_path).expanduser() - self.tokenizer = yttm.BPE(model=str(model_path)) - self.vocab_size = len(self.tokenizer.vocab()) - self.special_tokens = self.tokens_to_ids(["", "", "", ""]) - self.bpe_dropout = bpe_dropout - self.legacy = legacy - self.r2l = r2l - - def text_to_tokens(self, text): - return self.tokenizer.encode( - text, output_type=yttm.OutputType.SUBWORD, dropout_prob=self.bpe_dropout, reverse=self.r2l - ) - - def tokens_to_text(self, tokens): - return self.ids_to_text(self.tokens_to_ids(tokens)) - - def text_to_ids(self, text): - return self.tokenizer.encode( - text, output_type=yttm.OutputType.ID, dropout_prob=self.bpe_dropout, reverse=self.r2l - ) - - def ids_to_text(self, ids): - ids_ = [id_ for id_ in ids if id_ not in self.special_tokens] - if self.r2l: - ids_ = ids_[::-1] - return self.tokenizer.decode([ids_])[0] - - def tokens_to_ids(self, tokens): - return [self.tokenizer.subword_to_id(token) for token in tokens] - - def ids_to_tokens(self, ids): - if self.legacy: - ids_ = [id_ for id_ in ids if id_ not in self.special_tokens] - else: - ids_ = ids - return [self.tokenizer.id_to_subword(id_) for id_ in ids_] - - @property - def pad_id(self): - return self.tokenizer.subword_to_id("") - - @property - def bos_id(self): - return self.tokenizer.subword_to_id("") - - @property - def eos_id(self): - return self.tokenizer.subword_to_id("") - - @property - def unk_id(self): - return self.tokenizer.subword_to_id("") diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/__init__.py deleted file mode 100644 index 4fc50543f1d247a021674eaea55ba5f4c224c56e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
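TokenizerSpec, deleted above, is the abstract contract that WordTokenizer and YouTokenToMeTokenizer implement. A self-contained toy subclass, written here purely as an illustrative sketch with a made-up whitespace vocabulary (not part of the deleted files), only has to provide the six abstract methods:

from nemo.collections.common.tokenizers import TokenizerSpec


class ToyWhitespaceTokenizer(TokenizerSpec):
    """Hypothetical whitespace tokenizer over a fixed, invented vocabulary."""

    def __init__(self, vocab):
        self.vocab = list(vocab)
        self._tok2id = {t: i for i, t in enumerate(self.vocab)}

    def text_to_tokens(self, text):
        return text.split()

    def tokens_to_text(self, tokens):
        return " ".join(tokens)

    def tokens_to_ids(self, tokens):
        return [self._tok2id[t] for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.vocab[i] for i in ids]

    def text_to_ids(self, text):
        return self.tokens_to_ids(self.text_to_tokens(text))

    def ids_to_text(self, ids):
        return self.tokens_to_text(self.ids_to_tokens(ids))


tok = ToyWhitespaceTokenizer(["hello", "world"])
print(tok.name, tok.text_to_ids("hello world"))  # ToyWhitespaceTokenizer [0, 1]

Note that add_special_tokens is not abstract but raises NotImplementedError by default, so a subclass that needs special tokens must override it as well.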
diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/__init__.py deleted file mode 100644 index e13e4812457b0be14c1f049c6849e78955f0da87..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.multimodal.speech_cv import data, models, modules -from nemo.package_info import __version__ - -# Set collection version equal to NeMo version. -__version = __version__ - -# Authorship. -__author__ = "NVIDIA Corporation" - -# Set collection name. -__description__ = "Speech Computer Vision collection" diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/__init__.py deleted file mode 100644 index 9e3250071955216f6abc505e6181fb59931baa8d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text.py deleted file mode 100644 index d0b903a5895b6cfcb39812471943542274316b06..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text.py +++ /dev/null @@ -1,870 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union - -import braceexpand -import torch -import webdataset as wd - -from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.common import tokenizers -from nemo.collections.common.parts.preprocessing import collections, parsers -from nemo.collections.multimodal.speech_cv.parts.preprocessing.features import VideoFeaturizer -from nemo.core.classes import Dataset, IterableDataset -from nemo.core.neural_types import * -from nemo.utils import logging -from nemo.utils.data_utils import datastore_path_to_webdataset_url, is_datastore_path - - -def _video_speech_collate_fn(batch, pad_id): - """collate batch of video sig, video len, tokens, tokens len - Args: - batch (Optional[FloatTensor], Optional[LongTensor], LongTensor, - LongTensor): A tuple of tuples of signal, signal lengths, - encoded tokens, and encoded tokens length. This collate func - assumes the signals are 4d torch tensors (Time, Height, Width, Channels). - """ - packed_batch = list(zip(*batch)) - - if len(packed_batch) == 5: - _, video_lengths, _, tokens_lengths, sample_ids = packed_batch - elif len(packed_batch) == 4: - sample_ids = None - _, video_lengths, _, tokens_lengths = packed_batch - else: - raise ValueError("Expects 4 or 5 tensors in the batch!") - - # Max Video Len - max_video_len = 0 - has_video = video_lengths[0] is not None - if has_video: - max_video_len = max(video_lengths).item() - - # Max Token Len - max_tokens_len = max(tokens_lengths).item() - - video_signal, tokens = [], [] - for b in batch: - - if len(b) == 5: - video_sig, video_sig_len, tokens_i, tokens_i_len, _ = b - else: - video_sig, video_sig_len, tokens_i, tokens_i_len = b - - # Pad and Append Video - if has_video: - video_sig_len = video_sig_len.item() - if video_sig_len < max_video_len: - pad = (0, 0, 0, 0, 0, 0, 0, max_video_len - video_sig_len) - video_sig = torch.nn.functional.pad(video_sig, pad) - video_signal.append(video_sig) - - # Pad and Append Token - tokens_i_len = tokens_i_len.item() - if tokens_i_len < max_tokens_len: - pad = (0, max_tokens_len - tokens_i_len) - tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id) - tokens.append(tokens_i) - - # Stack Video - if has_video: - video_signal = torch.stack(video_signal) - video_lengths = torch.stack(video_lengths) - else: - video_signal, video_lengths = None, None - - # Stack Text - tokens = torch.stack(tokens) - tokens_lengths = torch.stack(tokens_lengths) - - # Return - if sample_ids is None: - return video_signal, video_lengths, tokens, tokens_lengths - else: - sample_ids = torch.tensor(sample_ids, dtype=torch.int32) - return video_signal, video_lengths, tokens, tokens_lengths, sample_ids - - -class _VideoTextDataset(Dataset): - """ - Dataset that loads tensors via a json file containing paths to video files, transcripts, and durations (in seconds). - Each new line is a different sample. Example below: - {"video_filepath": "/path/to/video.mp4", "text_filepath": "/path/to/video.txt", "duration": 23.147} - ... - {"video_filepath": "/path/to/video.mp4", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - Args: - manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. - parser: Str for a language specific preprocessor or a callable. 
- int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - max_duration: If video exceeds this length, do not include in dataset - min_duration: If video is less than this length, do not include in dataset - max_utts: Limit number of utterances - trim: whether or not to trim silence. Defaults to False - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - pad_id: Id of pad symbol. Defaults to 0 - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'video_signal': NeuralType(('B', 'C', 'T', 'H', 'W'), VideoSignal()), - 'video_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - int_values: bool = False, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if type(manifest_filepath) == str: - manifest_filepath = manifest_filepath.split(",") - - # If necessary, cache manifests and audio from object store - cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True) - - self.manifest_processor = VSRManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - ) - self.video_featurizer = VideoFeaturizer() - self.trim = trim - self.return_sample_id = return_sample_id - self.channel_selector = channel_selector - - def get_manifest_sample(self, sample_id): - return self.manifest_processor.collection[sample_id] - - def __getitem__(self, index): - - # Select Sample - sample = self.manifest_processor.collection[index] - - # Offset - offset = sample.offset - if offset is None: - offset = 0 - - # Load Video - video_features = self.video_featurizer.process(sample.video_file, offset=offset, duration=sample.duration) - vf, vfl = video_features, torch.tensor(video_features.shape[0]).long() - - # Load Tokens - t, tl = self.manifest_processor.process_text_by_sample(sample=sample) - - if self.return_sample_id: - output = vf, vfl, torch.tensor(t).long(), torch.tensor(tl).long(), index - else: - output = vf, vfl, torch.tensor(t).long(), torch.tensor(tl).long() - - return output - - def __len__(self): - return len(self.manifest_processor.collection) - - def _collate_fn(self, batch): - return _video_speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id) - - -class VSRManifestProcessor: - """ - Class that processes a manifest json file containing paths to video files, transcripts, and durations (in seconds). - Each new line is a different sample. 
Example below: - {"video_filepath": "/path/to/video.mp4", "text_filepath": "/path/to/video.txt", "duration": 23.147} - ... - {"video_filepath": "/path/to/video.mp4", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - Args: - manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. - parser: Str for a language specific preprocessor or a callable. - max_duration: If video exceeds this length, do not include in dataset. - min_duration: If video is less than this length, do not include in dataset. - max_utts: Limit number of utterances. - bos_id: Id of beginning of sequence symbol to append if not None. - eos_id: Id of end of sequence symbol to append if not None. - pad_id: Id of pad symbol. Defaults to 0. - """ - - def __init__( - self, - manifest_filepath: str, - parser: Union[str, Callable], - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: int = 0, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - index_by_file_id: bool = False, - ): - self.parser = parser - - self.collection = collections.ASRVideoText( - manifests_files=manifest_filepath, - parser=parser, - min_duration=min_duration, - max_duration=max_duration, - max_number=max_utts, - index_by_file_id=index_by_file_id, - ) - - self.eos_id = eos_id - self.bos_id = bos_id - self.pad_id = pad_id - - def process_text_by_id(self, index: int) -> Tuple[List[int], int]: - sample = self.collection[index] - return self.process_text_by_sample(sample) - - def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]: - manifest_idx = self.collection.mapping[file_id][0] - sample = self.collection[manifest_idx] - return self.process_text_by_sample(sample) - - def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) -> Tuple[List[int], int]: - t, tl = sample.text_tokens, len(sample.text_tokens) - - if self.bos_id is not None: - t = [self.bos_id] + t - tl += 1 - if self.eos_id is not None: - t = t + [self.eos_id] - tl += 1 - - return t, tl - - -class VideoToBPEDataset(_VideoTextDataset): - """ - Dataset that loads tensors via a json file containing paths to video - files, transcripts, and durations (in seconds). Each new line is a - different sample. Example below: - {"video_filepath": "/path/to/video.mp4", "text_filepath": - "/path/to/video.txt", "duration": 23.147} - ... - {"video_filepath": "/path/to/video.mp4", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - In practice, the dataset and manifest used for character encoding and byte pair encoding - are exactly the same. The only difference lies in how the dataset tokenizes the text in - the manifest. - - Args: - manifest_filepath: Path to manifest json as described above. Can - be comma-separated paths. - tokenizer: A subclass of the Tokenizer wrapper found in the common collection, - nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of - all available tokenizers. - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. 
- max_duration: If video exceeds this length, do not include in dataset - min_duration: If video is less than this length, do not include - in dataset - max_utts: Limit number of utterances - trim: Whether to trim silence segments - use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] - tokens to beginning and ending of speech respectively. - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'video_signal': NeuralType(('B', 'C', 'T', 'H', 'W'), VideoSignal()), - 'video_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - int_values: bool = False, - max_duration: Optional[int] = None, - min_duration: Optional[int] = None, - max_utts: int = 0, - trim: bool = False, - use_start_end_token: bool = True, - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): - self.is_aggregate = True - else: - self.is_aggregate = False - self._tokenizer = tokenizer - - def __call__(self, *args): - if isinstance(args[0], List) and self.is_aggregate: - t = [] - for span in args[0]: - t.extend(self._tokenizer.text_to_ids(span['str'], span['lang'])) - return t - - t = self._tokenizer.text_to_ids(*args) - return t - - super().__init__( - manifest_filepath=manifest_filepath, - parser=TokenizerWrapper(tokenizer), - int_values=int_values, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - trim=trim, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) - - -class VideoToCharDataset(_VideoTextDataset): - """ - Dataset that loads tensors via a json file containing paths to video - files, transcripts, and durations (in seconds). Each new line is a - different sample. Example below: - {"video_filepath": "/path/to/video.mp4", "text_filepath": - "/path/to/video.txt", "duration": 23.147} - ... - {"video_filepath": "/path/to/video.mp4", "text": "the - transcription", "offset": 301.75, "duration": 0.82, "utt": - "utterance_id", "ctm_utt": "en_4156", "side": "A"} - - Args: - manifest_filepath: Path to manifest json as described above. Can - be comma-separated paths. - labels: String containing all the possible characters to map to - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. 
- max_duration: If video exceeds this length, do not include in dataset - min_duration: If video is less than this length, do not include - in dataset - max_utts: Limit number of utterances - blank_index: blank character index, default = -1 - unk_index: unk_character index, default = -1 - normalize: whether to normalize transcript text (default): True - bos_id: Id of beginning of sequence symbol to append if not None - eos_id: Id of end of sequence symbol to append if not None - return_sample_id (bool): whether to return the sample_id as a part of each sample - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'video_signal': NeuralType(('B', 'C', 'T', 'H', 'W'), VideoSignal()), - 'video_sig_length': NeuralType(tuple('B'), LengthsType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } - - def __init__( - self, - manifest_filepath: str, - labels: Union[str, List[str]], - int_values: bool = False, - max_duration: Optional[float] = None, - min_duration: Optional[float] = None, - max_utts: int = 0, - blank_index: int = -1, - unk_index: int = -1, - normalize: bool = True, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - parser: Union[str, Callable] = 'en', - return_sample_id: bool = False, - channel_selector: Optional[ChannelSelectorType] = None, - ): - self.labels = labels - - parser = parsers.make_parser( - labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize - ) - - super().__init__( - manifest_filepath=manifest_filepath, - parser=parser, - int_values=int_values, - max_duration=max_duration, - min_duration=min_duration, - max_utts=max_utts, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - return_sample_id=return_sample_id, - channel_selector=channel_selector, - ) - - -class _TarredVideoToTextDataset(IterableDataset): - """ - A similar Dataset to the VideoToCharDataset/VideoToBPEDataset, but which loads tarred video files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the VideoToCharDataset/VideoToBPEDataset), - as well as the path(s) to the tarball(s) containing the mp4 files. Each line of the manifest should - contain the information for one video file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference. - This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements. - Supported opening braces - { <=> (, [, < and the special tag _OP_. - Supported closing braces - } <=> ), ], > and the special tag _CL_. - For SLURM based tasks, we suggest the use of the special tags for ease of use. 
- - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple workers the number of shards should be divisible by world_size to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! - - Notice that a few arguments are different from the AudioToCharDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - parser (callable): A callable which is used to pre-process the text output. - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - blank_index (int): Blank character index, defaults to -1. - unk_index (int): Unknown character index, defaults to -1. - normalize (bool): Dataset parameter. - Whether to use automatic text cleaning. - It is highly recommended to manually clean text for best results. - Defaults to True. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - bos_id (id): Dataset parameter. - Beginning of string symbol id used for seq2seq models. - Defaults to None. - eos_id (id): Dataset parameter. - End of string symbol id used for seq2seq models. - Defaults to None. - pad_id (id): Token used to pad when collating samples in batches. - If this is None, pads using 0s. - Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. 
Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - return_sample_id (bool): whether to return the sample_id as a part of each sample - """ - - def __init__( - self, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: str, - parser: Callable, - int_values: bool = False, - shuffle_n: int = 0, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - trim: bool = False, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: int = 0, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - return_sample_id: bool = False, - ): - # If necessary, cache manifests from object store - cache_datastore_manifests(manifest_filepaths=manifest_filepath) - - self.manifest_processor = VSRManifestProcessor( - manifest_filepath=manifest_filepath, - parser=parser, - max_duration=max_duration, - min_duration=min_duration, - max_utts=0, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID - ) - - self.video_featurizer = VideoFeaturizer() - self.trim = trim - self.eos_id = eos_id - self.bos_id = bos_id - self.pad_id = pad_id - self.return_sample_id = return_sample_id - - audio_tar_filepaths = expand_sharded_filepaths( - audio_tar_filepaths=audio_tar_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - - # Put together WebDataset - self._dataset = wd.WebDataset(urls=audio_tar_filepaths, nodesplitter=None) - - if shuffle_n > 0: - self._dataset = self._dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - self._dataset = ( - self._dataset.map(wd.autodecode.Decoder([wd.torch_video])) - .rename(video="mp4", key='__key__') - .to_tuple('video', 'key') - .pipe(self._filter) - .pipe(self._loop_offsets) - .map(f=self._build_sample) - ) - - def _filter(self, iterator): - """This function is used to remove samples that have been filtered out by ASRVideoText already. - Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample - that was filtered out (e.g. for duration). - Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard, - which may make your code hang as one process will finish before the other. - """ - - class TarredAudioFilter: - def __init__(self, collection): - self.iterator = iterator - self.collection = collection - - def __iter__(self): - return self - - def __next__(self): - while True: - try: - video_bytes, audio_filename = next(self.iterator) - except: - print("except") - continue - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - if file_id in self.collection.mapping: - return video_bytes, audio_filename - - return TarredAudioFilter(self.manifest_processor.collection) - - def _loop_offsets(self, iterator): - """This function is used to iterate through utterances with different offsets for each file. 
- """ - - class TarredAudioLoopOffsets: - def __init__(self, collection): - self.iterator = iterator - self.collection = collection - self.current_fn = None - self.current_video_bytes = None - self.offset_id = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.current_fn is None: - self.current_video_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - offset_list = self.collection.mapping[self.current_fn] - if len(offset_list) == self.offset_id + 1: - self.current_video_bytes, self.current_fn = next(self.iterator) - self.offset_id = 0 - else: - self.offset_id += 1 - - return self.current_video_bytes, self.current_fn, self.offset_id - - return TarredAudioLoopOffsets(self.manifest_processor.collection) - - def _collate_fn(self, batch): - return _video_speech_collate_fn(batch, self.pad_id) - - def _build_sample(self, tup): - """Builds the training sample by combining the data from the WebDataset with the manifest info. - """ - video_tuple, audio_filename, offset_id = tup - - # Grab manifest entry from self.manifest_preprocessor.collection - file_id, _ = os.path.splitext(os.path.basename(audio_filename)) - manifest_idx = self.manifest_processor.collection.mapping[file_id][offset_id] - manifest_entry = self.manifest_processor.collection[manifest_idx] - - offset = manifest_entry.offset - if offset is None: - offset = 0 - - # Load Video - video_features = video_tuple[0] - - # Signal length - vf, vfl = video_features, torch.tensor(video_features.shape[0]).long() - - # Load Tokens - t, tl = manifest_entry.text_tokens, len(manifest_entry.text_tokens) - - self.manifest_processor.process_text_by_sample(sample=manifest_entry) - - if self.bos_id is not None: - t = [self.bos_id] + t - tl += 1 - if self.eos_id is not None: - t = t + [self.eos_id] - tl += 1 - - if self.return_sample_id: - return vf, vfl, torch.tensor(t).long(), torch.tensor(tl).long(), manifest_idx - else: - return vf, vfl, torch.tensor(t).long(), torch.tensor(tl).long() - - def get_manifest_sample(self, sample_id): - return self.manifest_processor.collection[sample_id] - - def __iter__(self): - return self._dataset.__iter__() - - def __len__(self): - return len(self.manifest_processor.collection) - - -class TarredVideoToBPEDataset(_TarredVideoToTextDataset): - """ - A similar Dataset to the VideoToBPEDataset, but which loads tarred audio files. - - Accepts a single comma-separated JSON manifest file (in the same style as for the VideoToBPEDataset), - as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should - contain the information for one audio file, including at least the transcript and name of the audio - file within the tarball. - - Valid formats for the audio_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...]. - - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple workers the number of shards should be divisible by world_size to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering - is applied. We currently do not check for this, but your program may hang if the shards are uneven! 
- - Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been - replaced by shuffle_n (int). - - Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest - after filtering. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - audio_tar_filepaths: Either a list of audio tarball filepaths, or a - string (can be brace-expandable). - manifest_filepath (str): Path to the manifest. - tokenizer (TokenizerSpec): Either a Word Piece Encoding tokenizer (BERT), - or a Sentence Piece Encoding tokenizer (BPE). The CTC blank - symbol is automatically added later for models using ctc. - int_values (bool): If true, load samples as 32-bit integers. Defauts to False. - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - min_duration (float): Dataset parameter. - All training files which have a duration less than min_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to 0.1. - max_duration (float): Dataset parameter. - All training files which have a duration more than max_duration - are dropped. Note: Duration is read from the manifest JSON. - Defaults to None. - trim (bool): Whether to use trim silence from beginning and end - of audio signal using librosa.effects.trim(). - Defaults to False. - use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] - tokens to beginning and ending of speech respectively. - pad_id (id): Token used to pad when collating samples in batches. - If this is None, pads using 0s. - Defaults to None. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. - world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
- return_sample_id (bool): whether to return the sample_id as a part of each sample - """ - - def __init__( - self, - audio_tar_filepaths: Union[str, List[str]], - manifest_filepath: str, - tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', - int_values: bool = False, - shuffle_n: int = 0, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, - trim: bool = False, - use_start_end_token: bool = True, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - return_sample_id: bool = False, - ): - if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: - bos_id = tokenizer.bos_id - else: - bos_id = None - - if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: - eos_id = tokenizer.eos_id - else: - eos_id = None - - if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: - pad_id = tokenizer.pad_id - else: - pad_id = 0 - - class TokenizerWrapper: - def __init__(self, tokenizer): - if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): - self.is_aggregate = True - else: - self.is_aggregate = False - self._tokenizer = tokenizer - - def __call__(self, *args): - if isinstance(args[0], Iterable) and self.is_aggregate: - t = [] - for span in args[0]: - t.extend(self._tokenizer.text_to_ids(span['str'], span['lang'])) - return t - - t = self._tokenizer.text_to_ids(*args) - return t - - super().__init__( - audio_tar_filepaths=audio_tar_filepaths, - manifest_filepath=manifest_filepath, - parser=TokenizerWrapper(tokenizer), - int_values=int_values, - shuffle_n=shuffle_n, - min_duration=min_duration, - max_duration=max_duration, - trim=trim, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - shard_strategy=shard_strategy, - global_rank=global_rank, - world_size=world_size, - return_sample_id=return_sample_id, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text_dataset.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text_dataset.py deleted file mode 100644 index cf34cc14974e120af5cba98fd5a5bdf014b46a1d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/data/video_to_text_dataset.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
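Illustrative usage (not part of the deleted sources): a minimal sketch of how the datasets documented above might be constructed. The manifest path, video path, shard paths, and the `tokenizer` object are placeholders; the tokenizer is assumed to be a pre-built `TokenizerSpec` (e.g. a SentencePiece BPE model), and the tarred variant assumes WebDataset shards that match the manifest.

```
import json

from nemo.collections.multimodal.speech_cv.data.video_to_text import (
    TarredVideoToBPEDataset,
    VideoToBPEDataset,
)

# Hypothetical manifest: one JSON object per line, using the fields described above.
manifest_path = "/tmp/train_manifest.json"
with open(manifest_path, "w", encoding="utf-8") as fp:
    fp.write(
        json.dumps({"video_filepath": "/data/clip_0001.mp4", "text": "the transcription", "duration": 2.31}) + "\n"
    )

tokenizer = ...  # placeholder: any nemo TokenizerSpec (e.g. SentencePiece BPE) built elsewhere

# Non-tarred dataset: reads video files directly from the paths in the manifest.
dataset = VideoToBPEDataset(
    manifest_filepath=manifest_path,
    tokenizer=tokenizer,
    max_duration=20.0,          # drop samples longer than 20 s
    use_start_end_token=True,   # prepend BOS / append EOS when the tokenizer defines them
)

# Tarred dataset: brace-expandable shard paths; `_OP_`/`_CL_` may replace `{`/`}` inside SLURM scripts.
tarred_dataset = TarredVideoToBPEDataset(
    audio_tar_filepaths="/data/shards/video__OP_0..127_CL_.tar",
    manifest_filepath=manifest_path,
    tokenizer=tokenizer,
    shuffle_n=2048,             # look-ahead shuffle buffer within each shard
    shard_strategy="scatter",   # each rank gets a disjoint subset of shards
    global_rank=0,
    world_size=1,
)
```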
- -import random -from math import isclose -from typing import Optional - -import torch -from omegaconf import DictConfig -from omegaconf.listconfig import ListConfig -from torch.utils.data import ChainDataset - -from nemo.collections.asr.data.audio_to_text_dataset import convert_to_config_list, get_chain_dataset -from nemo.collections.multimodal.speech_cv.data import video_to_text -from nemo.utils import logging - - -def get_video_to_text_bpe_dataset_from_config( - config, - local_rank: int, - global_rank: int, - world_size: int, - tokenizer, - preprocessor_cfg: Optional[DictConfig] = None, -): - """ - Construct Video-To-Text BPE dataset from a config. - Args: - config: BPE dataset config - local_rank: model local rank - global_rank: model global rand - world_size: world size - tokenizer: BPE tokenizer - preprocessor_cfg: preprocessor config, for DALI BPE dataset - - Returns: - constructed dataset or None if dataset config is invalid or nothing to load - """ - - is_concat = config.get('is_concat', False) - if is_concat: - if 'concat_sampling' in config and config['concat_sampling'] is None: - logging.warning(f"Concat dataset requires `concat_sampling` but it was not provided. Config: {config}") - return None - - if not 'concat_probabilities' in config: - logging.warning( - f"Concat dataset requires `concat_probabilities` list but it was not provided. Config: {config}" - ) - return None - else: - if not isclose(sum(config['concat_probabilities']), 1, abs_tol=1e-6): - logging.warning(f"`concat_probabilities` need to sum to 1. Config: {config}") - return None - - shuffle = config['shuffle'] - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - if is_concat: - raise NotImplementedError("get_concat_tarred_dataset method not implemented") - else: - dataset = get_tarred_dataset( - config=config, tokenizer=tokenizer, shuffle_n=shuffle_n, global_rank=global_rank, world_size=world_size - ) - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if is_concat: - raise NotImplementedError("get_concat_bpe_dataset method not implemented") - else: - dataset = get_bpe_dataset(config=config, tokenizer=tokenizer) - return dataset - - -def get_video_to_text_char_dataset_from_config( - config, local_rank: int, global_rank: int, world_size: int, preprocessor_cfg: Optional[DictConfig] = None -): - """ - Construct Video-To-Text Char dataset from a config. - Args: - config: dataset config - local_rank: model local rank - global_rank: model global rand - world_size: world size - preprocessor_cfg: preprocessor config, for DALI dataset - - Returns: - constructed dataset or None if dataset config is invalid or nothing to load - """ - - is_concat = config.get('is_concat', False) - if is_concat: - if 'concat_sampling' in config and config['concat_sampling'] is None: - logging.warning(f"Concat dataset requires `concat_sampling` but it was not provided. 
Config: {config}") - return None - - if not 'concat_probabilities' in config: - logging.warning( - f"Concat dataset requires `concat_probabilities` list but it was not provided. Config: {config}" - ) - return None - else: - if not isclose(sum(config['concat_probabilities']), 1, abs_tol=1e-6): - logging.warning(f"`concat_probabilities` need to sum to 1. Config: {config}") - return None - - shuffle = config['shuffle'] - - # Instantiate tarred dataset loader or normal dataset loader - if config.get('is_tarred', False): - if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or ( - 'manifest_filepath' in config and config['manifest_filepath'] is None - ): - logging.warning( - "Could not load dataset as `manifest_filepath` was None or " - f"`tarred_audio_filepaths` is None. Provided config : {config}" - ) - return None - - shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0 - if is_concat: - raise Exception("get_concat_tarred_dataset method not implemented") - else: - dataset = get_tarred_dataset( - config=config, shuffle_n=shuffle_n, global_rank=global_rank, world_size=world_size, - ) - else: - if 'manifest_filepath' in config and config['manifest_filepath'] is None: - logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") - return None - if is_concat: - raise Exception("get_concat_char_dataset method not implemented") - else: - dataset = get_char_dataset(config=config) - return dataset - - -def get_bpe_dataset(config: dict, tokenizer: 'TokenizerSpec') -> video_to_text.VideoToBPEDataset: - """ - Instantiates a Byte Pair Encoding / Word Piece Encoding based VideoToBPEDataset. - - Args: - config: Config of the VideoToBPEDataset. - tokenizer: An instance of a TokenizerSpec object. - - Returns: - An instance of VideoToBPEDataset. - """ - dataset = video_to_text.VideoToBPEDataset( - manifest_filepath=config['manifest_filepath'], - tokenizer=tokenizer, - int_values=config.get('int_values', False), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - trim=config.get('trim_silence', False), - use_start_end_token=config.get('use_start_end_token', True), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset - - -def get_char_dataset(config: dict) -> video_to_text.VideoToCharDataset: - """ - Instantiates a Character Encoding based VideoToCharDataset. - - Args: - config: Config of the VideoToCharDataset. - - Returns: - An instance of VideoToCharDataset. 
- """ - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - dataset = video_to_text.VideoToCharDataset( - manifest_filepath=config['manifest_filepath'], - labels=config.get('labels', None), - int_values=config.get('int_values', False), - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - max_utts=config.get('max_utts', 0), - blank_index=config.get('blank_index', -1), - unk_index=config.get('unk_index', -1), - normalize=config.get('normalize_transcripts', False), - trim=config.get('trim_silence', False), - parser=config.get('parser', 'en'), - return_sample_id=config.get('return_sample_id', False), - channel_selector=config.get('channel_selector', None), - ) - return dataset - - -def get_tarred_dataset( - config: dict, shuffle_n: int, global_rank: int, world_size: int, tokenizer: Optional['TokenizerSpec'] = None, -) -> video_to_text.TarredVideoToBPEDataset: - """ - Instantiates a Word Piece/BPE Encoding based TarredVideoToBPEDataset or a char based TarredVideoToCharDataset. - - Args: - config: Config of the TarredVideoToBPEDataset or TarredVideoToCharDataset. - shuffle_n: How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - tokenizer: An instance of a TokenizerSpec object if BPE dataset is needed. - global_rank: Global rank of this device. - world_size: Global world size in the training method. - Passsing None would return a char-based dataset. - - Returns: - An instance of TarredVideoToBPEDataset or TarredVideoToCharDataset. - """ - tarred_audio_filepaths = config['tarred_audio_filepaths'] - manifest_filepaths = config['manifest_filepath'] - datasets = [] - tarred_audio_filepaths = convert_to_config_list(tarred_audio_filepaths) - manifest_filepaths = convert_to_config_list(manifest_filepaths) - - bucketing_weights = config.get('bucketing_weights', None) # For upsampling buckets - if bucketing_weights: - for idx, weight in enumerate(bucketing_weights): - if not isinstance(weight, int) or weight <= 0: - raise ValueError(f"bucket weights must be positive integers") - - if len(manifest_filepaths) != len(tarred_audio_filepaths): - raise ValueError( - f"manifest_filepaths (length={len(manifest_filepaths)}) and tarred_audio_filepaths (length={len(tarred_audio_filepaths)}) need to have the same number of buckets." 
- ) - - if 'labels' not in config: - logging.warning(f"dataset does not have explicitly defined labels") - - if 'max_utts' in config: - raise ValueError('"max_utts" parameter is not supported for tarred datasets') - - for dataset_idx, (tarred_audio_filepath, manifest_filepath) in enumerate( - zip(tarred_audio_filepaths, manifest_filepaths) - ): - if len(tarred_audio_filepath) == 1: - tarred_audio_filepath = tarred_audio_filepath[0] - if tokenizer is None: - raise Exception("video_to_text.TarredVideoToCharDataset class not Implemented") - else: - dataset = video_to_text.TarredVideoToBPEDataset( - audio_tar_filepaths=tarred_audio_filepath, - manifest_filepath=manifest_filepath, - tokenizer=tokenizer, - int_values=config.get('int_values', False), - shuffle_n=shuffle_n, - max_duration=config.get('max_duration', None), - min_duration=config.get('min_duration', None), - trim=config.get('trim_silence', False), - use_start_end_token=config.get('use_start_end_token', True), - shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - global_rank=global_rank, - world_size=world_size, - return_sample_id=config.get('return_sample_id', False), - ) - if bucketing_weights: - [datasets.append(dataset) for _ in range(bucketing_weights[dataset_idx])] - else: - datasets.append(dataset) - - return get_chain_dataset(datasets=datasets, ds_config=config) diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/__init__.py deleted file mode 100644 index c34b4c174ac18965115f466e18c0e91321d8c17f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
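Illustrative usage (not part of the deleted sources): the factory functions above consume a plain dict or `DictConfig`; the keys below mirror the `config[...]` / `config.get(...)` lookups performed by `get_bpe_dataset` and `get_tarred_dataset`. The manifest path and the tokenizer are placeholders.

```
from omegaconf import OmegaConf

from nemo.collections.multimodal.speech_cv.data.video_to_text_dataset import (
    get_video_to_text_bpe_dataset_from_config,
)

tokenizer = ...  # placeholder: pre-built nemo TokenizerSpec

ds_cfg = OmegaConf.create(
    {
        "manifest_filepath": "/data/train_manifest.json",  # placeholder path
        "batch_size": 8,
        "shuffle": True,
        "min_duration": 0.5,
        "max_duration": 20.0,
        "use_start_end_token": True,
        "is_tarred": False,  # set True and add `tarred_audio_filepaths` to use WebDataset shards
    }
)

# Returns a VideoToBPEDataset, or None if the config is invalid / there is nothing to load.
dataset = get_video_to_text_bpe_dataset_from_config(
    config=ds_cfg,
    local_rank=0,
    global_rank=0,
    world_size=1,
    tokenizer=tokenizer,
)
```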
- -# CTC -from nemo.collections.multimodal.speech_cv.models.visual_ctc_bpe_models import VisualEncDecCTCModelBPE -from nemo.collections.multimodal.speech_cv.models.visual_ctc_models import VisualEncDecCTCModel -from nemo.collections.multimodal.speech_cv.models.visual_hybrid_rnnt_ctc_bpe_models import ( - VisualEncDecHybridRNNTCTCBPEModel, -) - -# Hybrid CTC/RNN-T -from nemo.collections.multimodal.speech_cv.models.visual_hybrid_rnnt_ctc_models import VisualEncDecHybridRNNTCTCModel -from nemo.collections.multimodal.speech_cv.models.visual_rnnt_bpe_models import VisualEncDecRNNTBPEModel - -# RNN-T -from nemo.collections.multimodal.speech_cv.models.visual_rnnt_models import VisualEncDecRNNTModel diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_bpe_models.py deleted file mode 100644 index 529acb4cc86f0081ac851c5aaedc131c0b34dabd..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_bpe_models.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict - -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset -from nemo.collections.multimodal.speech_cv.models.visual_ctc_models import VisualEncDecCTCModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - -__all__ = ['VisualEncDecCTCModelBPE'] - - -class VisualEncDecCTCModelBPE(VisualEncDecCTCModel, ASRBPEMixin): - """Encoder decoder CTC-based models with Byte Pair Encoding.""" - - def __init__(self, cfg: DictConfig, trainer=None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - cfg.decoder.vocabulary = ListConfig(vocabulary) - else: - cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys())) - - # Override number of classes if placeholder provided - num_classes = cfg.decoder["num_classes"] - - if num_classes < 1: - logging.info( - "\nReplacing placeholder number of classes ({}) with 
actual number of classes - {}".format( - num_classes, len(vocabulary) - ) - ) - cfg.decoder["num_classes"] = len(vocabulary) - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding objects - decoding_cfg = self.cfg.get('decoding', None) - - # In case decoding config not found, use default config - if decoding_cfg is None: - decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) - with open_dict(self.cfg): - self.cfg.decoding = decoding_cfg - - self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) - - # Setup metric with decoding strategy - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = video_to_text_dataset.get_video_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - prefetch_factor=config.get('prefetch_factor', 2), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided video file. - - Args: - config: A python dictionary which contains the following keys: - paths2video_files: (a list) of paths to video files. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the video manifest is temporarily - stored. - num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given video file(s). - """ - - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2video_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary of the tokenizer used during CTC decoding process. 
- Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers, - whereas `wpe` is used for `BertTokenizer`. - new_tokenizer_cfg: A config for the new tokenizer. if provided, pre-empts the dir and type - - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}' - f"New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}" - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - decoder_config = copy.deepcopy(self.decoder.to_config_dict()) - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - decoder_config.vocabulary = ListConfig(vocabulary) - else: - decoder_config.vocabulary = ListConfig(list(vocabulary.keys())) - - decoder_num_classes = decoder_config['num_classes'] - - # Override number of classes if placeholder provided - logging.info( - "\nReplacing old number of classes ({}) with new number of classes - {}".format( - decoder_num_classes, len(vocabulary) - ) - ) - - decoder_config['num_classes'] = len(vocabulary) - - del self.decoder - self.decoder = VisualEncDecCTCModelBPE.from_config_dict(decoder_config) - del self.loss - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) - - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get("log_prediction", False), - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.decoder): - self._cfg.decoder = decoder_config - - with open_dict(self.cfg.decoding): - self._cfg.decoding = decoding_cfg - - logging.info(f"Changed tokenizer to {self.decoder.vocabulary} 
vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during CTC decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) - - self._wer = WERBPE( - decoding=self.decoding, - use_cer=self._wer.use_cer, - log_prediction=self._wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py deleted file mode 100644 index a2eeba03ee8fd60dadb243313a53f493676c5383..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_ctc_models.py +++ /dev/null @@ -1,692 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import os -import tempfile -from math import ceil -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.mixins import ASRModuleMixin, InterCTCMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import LabelsType, LengthsType, LogprobsType, NeuralType, VideoSignal -from nemo.utils import logging - -__all__ = ['VisualEncDecCTCModel'] - - -class VisualEncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, InterCTCMixin): - """Base class for encoder decoder CTC-based models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - # Init - super().__init__(cfg=cfg, trainer=trainer) - - # Preprocessor, video transforms - self.video_preprocessor = VisualEncDecCTCModel.from_config_dict(self._cfg.video_preprocessor) - - # Augmentation, video augmentations - self.video_augmentation = VisualEncDecCTCModel.from_config_dict(self._cfg.video_augment) - - # Front-end Network, learned module that transform videos to temporal sequence - self.video_front_end = VisualEncDecCTCModel.from_config_dict(self._cfg.video_front_end) - - # Encoder Network - self.encoder = VisualEncDecCTCModel.from_config_dict(self._cfg.encoder) - - with open_dict(self._cfg): - if "feat_in" not in self._cfg.decoder or ( - not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out') - ): - self._cfg.decoder.feat_in = self.encoder._feat_out - if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in: - raise ValueError("param feat_in of the decoder's config is not set!") - - if self.cfg.decoder.num_classes < 1 and self.cfg.decoder.vocabulary is not None: - logging.info( - "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( - self.cfg.decoder.num_classes, len(self.cfg.decoder.vocabulary) - ) - ) - cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) - - # Decoder - self.decoder = VisualEncDecCTCModel.from_config_dict(self._cfg.decoder) - - # CTC Loss - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - # Setup decoding objects - decoding_cfg = self.cfg.get('decoding', None) - - # In case decoding config not found, use default config - if decoding_cfg is None: - decoding_cfg = OmegaConf.structured(CTCDecodingConfig) - with open_dict(self.cfg): - self.cfg.decoding = decoding_cfg - - # Decoding - self.decoding = CTCDecoding(self.cfg.decoding, vocabulary=self.decoder.vocabulary) - - # Setup metric with decoding strategy - self._wer = WER( - 
decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - # Setup optional Optimization flags - self.setup_optimization_flags() - - # setting up interCTC loss (from InterCTCMixin) - self.setup_interctc(decoder_name='decoder', loss_name='loss', wer_name='_wer') - - # Adapter modules setup (from ASRAdapterModelMixin) - self.setup_adapters() - - @torch.no_grad() - def transcribe( - self, - paths2video_files: List[str], - batch_size: int = 4, - logprobs: bool = False, - return_hypotheses: bool = False, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - ) -> List[str]: - """ - If modify this function, please remember update transcribe_partial_audio() in - nemo/collections/asr/parts/utils/trancribe_utils.py - - Uses greedy decoding to transcribe video files. Use this method for debugging and prototyping. - - Args: - paths2video_files: (a list) of paths to video files. - batch_size: (int) batch size to use during inference. - Bigger will result in better throughput performance but would use more memory. - logprobs: (bool) pass True to get log probabilities instead of transcripts. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. - augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. - Returns: - A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2video_files - """ - if paths2video_files is None or len(paths2video_files) == 0: - return {} - - if return_hypotheses and logprobs: - raise ValueError( - "Either `return_hypotheses` or `logprobs` can be True at any given time." - "Returned hypotheses will contain the logprobs." 
- ) - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - try: - # Switch model to evaluation mode - self.eval() - # Freeze the visual front-end, encoder and decoder modules - self.video_front_end.freeze() - self.encoder.freeze() - self.decoder.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for video_file in paths2video_files: - entry = {'video_filepath': video_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2video_files': paths2video_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - logits, logits_len, greedy_predictions = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - if logprobs: - # dump log probs per file - for idx in range(logits.shape[0]): - lg = logits[idx][: logits_len[idx]] - hypotheses.append(lg.cpu().numpy()) - else: - current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor( - logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, - ) - - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] - if current_hypotheses[idx].alignments is None: - current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence - - if all_hyp is None: - hypotheses += current_hypotheses - else: - hypotheses += all_hyp - - del greedy_predictions - del logits - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - if mode is True: - self.video_front_end.unfreeze() - self.encoder.unfreeze() - self.decoder.unfreeze() - logging.set_verbosity(logging_level) - - return hypotheses - - def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): - """ - Changes vocabulary used during CTC decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - If new_vocabulary == self.decoder.vocabulary then nothing will be changed. - - Args: - - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - - Returns: None - - """ - if self.decoder.vocabulary == new_vocabulary: - logging.warning(f"Old {self.decoder.vocabulary} and new {new_vocabulary} match. Not changing anything.") - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. 
But I got: {new_vocabulary}') - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config['vocabulary'] = new_vocabulary - new_decoder_config['num_classes'] = len(new_vocabulary) - - del self.decoder - self.decoder = VisualEncDecCTCModel.from_config_dict(new_decoder_config) - del self.loss - self.loss = CTCLoss( - num_classes=self.decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self._cfg.get("ctc_reduction", "mean_batch"), - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCDecoding(decoding_cfg=decoding_cfg, vocabulary=self.decoder.vocabulary) - - self._wer = WER( - decoding=self.decoding, - use_cer=self._cfg.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self._cfg.get("log_prediction", False), - ) - - # Update config - with open_dict(self.cfg.decoder): - self._cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary) - - logging.info(f"Changed decoder to output to {self.decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during CTC decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = CTCDecoding(decoding_cfg=decoding_cfg, vocabulary=self.decoder.vocabulary) - - self._wer = WER( - decoding=self.decoding, - use_cer=self._wer.use_cer, - log_prediction=self._wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - # Automatically inject args from model config to dataloader config - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels') - dataset = video_to_text_dataset.get_video_to_text_char_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). 
- if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "input_video_signal": NeuralType(('B', 'C', 'T', 'H', 'W'), VideoSignal(), optional=True), - "input_video_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "outputs": NeuralType(('B', 'T', 'D'), LogprobsType()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - "greedy_predictions": NeuralType(('B', 'T'), LabelsType()), - } - - @typecheck() - def forward(self, input_video_signal=None, input_video_signal_length=None): - """ - Forward pass of the model. - - Args: - input_video_signal: Tensor that represents a batch of video signals, - of shape [B, T, H, W, C]. T here represents timesteps, H height, W width and C channels - input_video_signal_length: Vector of length B, that contains the individual lengths of the video - sequences. - - Returns: - A tuple of 3 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- 3) The greedy token predictions of the model of shape [B, T] (via argmax) - """ - - # Preprocessing - processed_video_signal, processed_video_signal_length = self.video_preprocessor( - input_signal=input_video_signal, length=input_video_signal_length - ) - - # Augmentation - processed_video_signal = self.video_augmentation( - input_signal=processed_video_signal, length=processed_video_signal_length - ) - - # Front-end Networks - processed_video_signal, processed_video_signal_length = self.video_front_end( - input_signal=processed_video_signal, length=processed_video_signal_length - ) - - # Back-end Networks - encoded, encoded_len = self.encoder(audio_signal=processed_video_signal, length=processed_video_signal_length) - - log_probs = self.decoder(encoder_output=encoded) - greedy_predictions = log_probs.argmax(dim=-1, keepdim=False) - - return ( - log_probs, - encoded_len, - greedy_predictions, - ) - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - video_signal, video_signal_len, transcript, transcript_len = batch - log_probs, encoded_len, predictions = self.forward( - input_video_signal=video_signal, input_video_signal_length=video_signal_len - ) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - else: - log_every_n_steps = 1 - - loss_value = self.loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - # only computing WER when requested in the logs (same as done for final-layer WER below) - loss_value, tensorboard_logs = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0) - ) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs.update( - { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - ) - - if (batch_nb + 1) % log_every_n_steps == 0: - self._wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - wer, _, _ = self._wer.compute() - self._wer.reset() - tensorboard_logs.update({'training_batch_wer': wer}) - - return {'loss': loss_value, 'log': tensorboard_logs} - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - video_signal, video_signal_len, transcript, transcript_len, sample_id = batch - log_probs, encoded_len, predictions = self.forward( - input_video_signal=video_signal, input_video_signal_length=video_signal_len - ) - - transcribed_texts, _ = self._wer.decoding.ctc_decoder_predictions_tensor( - decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, transcribed_texts)) - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - video_signal, video_signal_len, transcript, transcript_len = batch - log_probs, encoded_len, predictions = self.forward( - input_video_signal=video_signal, 
input_video_signal_length=video_signal_len - ) - - loss_value = self.loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - loss_value, metrics = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_", - ) - - self._wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len - ) - wer, wer_num, wer_denom = self._wer.compute() - self._wer.reset() - metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer}) - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - return metrics - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - metrics = super().multi_validation_epoch_end(outputs, dataloader_idx) - self.finalize_interctc_metrics(metrics, outputs, prefix="val_") - return metrics - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - metrics = super().multi_test_epoch_end(outputs, dataloader_idx) - self.finalize_interctc_metrics(metrics, outputs, prefix="test_") - return metrics - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_step(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - def test_dataloader(self): - if self._test_dl is not None: - return self._test_dl - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided video file. - - Args: - config: A python dictionary which contains the following keys: - paths2video_files: (a list) of paths to video files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the video manifest is temporarily - stored. - num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given video file(s). 
- """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2video_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'labels': self.decoder.vocabulary, - 'batch_size': batch_size, - 'trim_silence': False, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - } - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py deleted file mode 100644 index 882d1570059352c8d02f8c58b912a386b9d7f655..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_bpe_models.py +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import os -from typing import Dict, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEWER, RNNTBPEDecoding, RNNTBPEDecodingConfig -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset -from nemo.collections.multimodal.speech_cv.models.visual_hybrid_rnnt_ctc_models import VisualEncDecHybridRNNTCTCModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - - -class VisualEncDecHybridRNNTCTCBPEModel(VisualEncDecHybridRNNTCTCModel, ASRBPEMixin): - """Base class for encoder decoder RNNT-based models with auxiliary CTC decoder/loss and subword tokenization.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - # Tokenizer is necessary for this model - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - if not isinstance(cfg, DictConfig): - cfg = OmegaConf.create(cfg) - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - cfg.labels = ListConfig(list(vocabulary)) - - with open_dict(cfg.decoder): - cfg.decoder.vocab_size = len(vocabulary) - - with open_dict(cfg.joint): - cfg.joint.num_classes = len(vocabulary) - cfg.joint.vocabulary = ListConfig(list(vocabulary)) - cfg.joint.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden - cfg.joint.jointnet.pred_hidden = cfg.model_defaults.pred_hidden - - # setup auxiliary CTC decoder - if 'aux_ctc' not in cfg: - raise ValueError( - "The config need to have a section for the CTC decoder named as aux_ctc for Hybrid models." 
- ) - - with open_dict(cfg): - if self.tokenizer_type == "agg": - cfg.aux_ctc.decoder.vocabulary = ListConfig(vocabulary) - else: - cfg.aux_ctc.decoder.vocabulary = ListConfig(list(vocabulary.keys())) - - if cfg.aux_ctc.decoder["num_classes"] < 1: - logging.info( - "\nReplacing placholder number of classes ({}) with actual number of classes - {}".format( - cfg.aux_ctc.decoder["num_classes"], len(vocabulary) - ) - ) - cfg.aux_ctc.decoder["num_classes"] = len(vocabulary) - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding object - self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - # Setup wer object - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self.cfg.get('use_cer', False), - log_prediction=self.cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer: - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Setup CTC decoding - ctc_decoding_cfg = self.cfg.aux_ctc.get('decoding', None) - if ctc_decoding_cfg is None: - ctc_decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - self.ctc_decoding = CTCBPEDecoding(self.cfg.aux_ctc.decoding, tokenizer=self.tokenizer) - - # Setup CTC WER - self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self.cfg.get("log_prediction", False), - ) - - # setting the RNNT decoder as the default one - self.use_rnnt_decoder = True - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = video_to_text_dataset.get_video_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided video file. - - Args: - config: A python dictionary which contains the following keys: - paths2video_files: (a list) of paths to video files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the video manifest is temporarily - stored. - num_workers: (int) number of workers. Depends of the batch_size and machine. \ - 0 - only the main process will load batches, 1 - one worker (not main process) - - Returns: - A pytorch DataLoader for the given video file(s). 
- """ - - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2video_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ctc_decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - ctc_decoding_cfg: A config for auxiliary CTC decoding, which is optional and can be used to change the decoding type. - - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. 
But I got: {new_tokenizer_dir}' - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - if self.tokenizer_type == "agg": - new_joint_config["vocabulary"] = ListConfig(vocabulary) - else: - new_joint_config["vocabulary"] = ListConfig(list(vocabulary.keys())) - - new_joint_config['num_classes'] = len(vocabulary) - del self.joint - self.joint = VisualEncDecHybridRNNTCTCBPEModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(vocabulary) - del self.decoder - self.decoder = VisualEncDecHybridRNNTCTCBPEModel.from_config_dict(new_decoder_config) - - del self.loss - self.loss = RNNTLoss(num_classes=self.joint.num_classes_with_blank - 1) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed tokenizer of the RNNT decoder to {self.joint.vocabulary} vocabulary.") - - # set up the new tokenizer for the CTC decoder - if hasattr(self, 'ctc_decoder'): - ctc_decoder_config = copy.deepcopy(self.ctc_decoder.to_config_dict()) - # sidestepping the potential overlapping tokens issue in aggregate tokenizers - if self.tokenizer_type == "agg": - ctc_decoder_config.vocabulary = ListConfig(vocabulary) - else: - ctc_decoder_config.vocabulary = ListConfig(list(vocabulary.keys())) - - decoder_num_classes = ctc_decoder_config['num_classes'] - # Override number of classes if placeholder provided - logging.info( - "\nReplacing old number of classes ({}) with new number of classes - {}".format( - decoder_num_classes, len(vocabulary) - ) - ) - ctc_decoder_config['num_classes'] = len(vocabulary) - - del self.ctc_decoder - self.ctc_decoder = VisualEncDecHybridRNNTCTCBPEModel.from_config_dict(ctc_decoder_config) - del self.ctc_loss - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - if ctc_decoding_cfg is None: - # Assume same decoding config as 
before - ctc_decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - ctc_decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) - ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=ctc_decoding_cfg, tokenizer=self.tokenizer) - - self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - log_prediction=self.cfg.get("log_prediction", False), - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoder = ctc_decoder_config - - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - logging.info(f"Changed tokenizer of the CTC decoder to {self.ctc_decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig, decoder_type: str = None): - """ - Changes decoding strategy used during RNNT decoding process. - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - decoder_type: (str) Can be set to 'rnnt' or 'ctc' to switch between appropriate decoder in a - model having both RNN-T and CTC decoders. Defaults to None, in which case RNN-T decoder is - used. If set to 'ctc', it raises error if 'ctc_decoder' is not an attribute of the model. - """ - if decoder_type is None or decoder_type == 'rnnt': - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy of the RNNT decoder to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - elif decoder_type == 'ctc': - if not hasattr(self, 'ctc_decoding'): - raise ValueError("The model does not have the ctc_decoding module and does not support ctc decoding.") - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.ctc_decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) - - 
self.ctc_wer = WERBPE( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc.decoding): - self.cfg.aux_ctc.decoding = decoding_cfg - - self.use_rnnt_decoder = False - logging.info( - f"Changed decoding strategy of the CTC decoder to \n{OmegaConf.to_yaml(self.cfg.aux_ctc.decoding)}" - ) - else: - raise ValueError(f"decoder_type={decoder_type} is not supported. Supported values: [ctc,rnnt]") - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py deleted file mode 100644 index 7c658f2a18c62adaadb4a596dd523078cffc1e98..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_hybrid_rnnt_ctc_models.py +++ /dev/null @@ -1,644 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import json -import os -import tempfile -from typing import List, Optional - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer import WER, CTCDecoding, CTCDecodingConfig -from nemo.collections.asr.parts.mixins import ASRBPEMixin, InterCTCMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.multimodal.speech_cv.models.visual_rnnt_models import VisualEncDecRNNTModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.core.classes.mixins import AccessMixin -from nemo.utils import logging, model_utils - - -class VisualEncDecHybridRNNTCTCModel(VisualEncDecRNNTModel, ASRBPEMixin, InterCTCMixin): - """Base class for hybrid RNNT/CTC models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - super().__init__(cfg=cfg, trainer=trainer) - - if 'aux_ctc' not in self.cfg: - raise ValueError( - "The config need to have a section for the CTC decoder named as aux_ctc for Hybrid models." 
- ) - with open_dict(self.cfg.aux_ctc): - if "feat_in" not in self.cfg.aux_ctc.decoder or ( - not self.cfg.aux_ctc.decoder.feat_in and hasattr(self.encoder, '_feat_out') - ): - self.cfg.aux_ctc.decoder.feat_in = self.encoder._feat_out - if "feat_in" not in self.cfg.aux_ctc.decoder or not self.cfg.aux_ctc.decoder.feat_in: - raise ValueError("param feat_in of the decoder's config is not set!") - - if self.cfg.aux_ctc.decoder.num_classes < 1 and self.cfg.aux_ctc.decoder.vocabulary is not None: - logging.info( - "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( - self.cfg.aux_ctc.decoder.num_classes, len(self.cfg.aux_ctc.decoder.vocabulary) - ) - ) - self.cfg.aux_ctc.decoder["num_classes"] = len(self.cfg.aux_ctc.decoder.vocabulary) - - self.ctc_decoder = VisualEncDecHybridRNNTCTCModel.from_config_dict(self.cfg.aux_ctc.decoder) - self.ctc_loss_weight = self.cfg.aux_ctc.get("ctc_loss_weight", 0.5) - - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - ctc_decoding_cfg = self.cfg.aux_ctc.get('decoding', None) - if ctc_decoding_cfg is None: - ctc_decoding_cfg = OmegaConf.structured(CTCDecodingConfig) - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - self.ctc_decoding = CTCDecoding(self.cfg.aux_ctc.decoding, vocabulary=self.ctc_decoder.vocabulary) - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.cfg.aux_ctc.get('use_cer', False), - dist_sync_on_step=True, - log_prediction=self.cfg.get("log_prediction", False), - ) - - # setting the RNNT decoder as the default one - self.use_rnnt_decoder = True - - # setting up interCTC loss (from InterCTCMixin) - self.setup_interctc(decoder_name='decoder', loss_name='loss', wer_name='_wer') - - @torch.no_grad() - def transcribe( - self, - paths2video_files: List[str], - batch_size: int = 4, - return_hypotheses: bool = False, - partial_hypothesis: Optional[List['Hypothesis']] = None, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - ) -> (List[str], Optional[List['Hypothesis']]): - """ - Uses greedy decoding to transcribe video files. Use this method for debugging and prototyping. - - Args: - - paths2video_files: (a list) of paths to video files. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - - Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. 
- """ - if self.use_rnnt_decoder: - return super().transcribe( - paths2video_files=paths2video_files, - batch_size=batch_size, - return_hypotheses=return_hypotheses, - partial_hypothesis=partial_hypothesis, - num_workers=num_workers, - channel_selector=channel_selector, - ) - - if paths2video_files is None or len(paths2video_files) == 0: - return {} - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - try: - - # Switch model to evaluation mode - self.eval() - # Freeze the visual front-end, encoder and decoder modules - self.video_front_end.freeze() - self.encoder.freeze() - self.decoder.freeze() - self.joint.freeze() - if hasattr(self, 'ctc_decoder'): - self.ctc_decoder.freeze() - - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for video_file in paths2video_files: - entry = {'video_filepath': video_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2video_files': paths2video_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - - logits = self.ctc_decoder(encoder_output=encoded) - best_hyp, all_hyp = self.ctc_decoding.ctc_decoder_predictions_tensor( - logits, encoded_len, return_hypotheses=return_hypotheses, - ) - if return_hypotheses: - # dump log probs per file - for idx in range(logits.shape[0]): - best_hyp[idx].y_sequence = logits[idx][: encoded_len[idx]] - if best_hyp[idx].alignments is None: - best_hyp[idx].alignments = best_hyp[idx].y_sequence - del logits - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - del encoded - del test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - - logging.set_verbosity(logging_level) - if mode is True: - self.video_front_end.unfreeze() - self.encoder.unfreeze() - self.decoder.unfreeze() - self.joint.unfreeze() - if hasattr(self, 'ctc_decoder'): - self.ctc_decoder.unfreeze() - return hypotheses, all_hypotheses - - def change_vocabulary( - self, - new_vocabulary: List[str], - decoding_cfg: Optional[DictConfig] = None, - ctc_decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - decoding_cfg: A config for the decoder, which is optional. 
If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - ctc_decoding_cfg: A config for CTC decoding, which is optional and can be used to change decoding type. - - Returns: None - - """ - super().change_vocabulary(new_vocabulary=new_vocabulary, decoding_cfg=decoding_cfg) - - # set up the new tokenizer for the CTC decoder - if hasattr(self, 'ctc_decoder'): - if self.ctc_decoder.vocabulary == new_vocabulary: - logging.warning( - f"Old {self.ctc_decoder.vocabulary} and new {new_vocabulary} match. Not changing anything." - ) - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. But I got: {new_vocabulary}') - decoder_config = self.ctc_decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config['vocabulary'] = new_vocabulary - new_decoder_config['num_classes'] = len(new_vocabulary) - - del self.ctc_decoder - self.ctc_decoder = VisualEncDecHybridRNNTCTCModel.from_config_dict(new_decoder_config) - del self.ctc_loss - self.ctc_loss = CTCLoss( - num_classes=self.ctc_decoder.num_classes_with_blank - 1, - zero_infinity=True, - reduction=self.cfg.aux_ctc.get("ctc_reduction", "mean_batch"), - ) - - if ctc_decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `ctc_decoding_cfg` passed when changing decoding strategy, using internal config") - ctc_decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - ctc_decoding_cls = OmegaConf.structured(CTCDecodingConfig) - ctc_decoding_cls = OmegaConf.create(OmegaConf.to_container(ctc_decoding_cls)) - ctc_decoding_cfg = OmegaConf.merge(ctc_decoding_cls, ctc_decoding_cfg) - - self.ctc_decoding = CTCDecoding(decoding_cfg=ctc_decoding_cfg, vocabulary=self.ctc_decoder.vocabulary) - - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = ctc_decoding_cfg - - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoder = new_decoder_config - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary) - - logging.info(f"Changed the tokenizer of the CTC decoder to {self.ctc_decoder.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig, decoder_type: str = None): - """ - Changes decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - decoder_type: (str) Can be set to 'rnnt' or 'ctc' to switch between appropriate decoder in a - model having RNN-T and CTC decoders. Defaults to None, in which case RNN-T decoder is - used. If set to 'ctc', it raises error if 'ctc_decoder' is not an attribute of the model. 
- """ - if decoder_type is None or decoder_type == 'rnnt': - self.use_rnnt_decoder = True - return super().change_decoding_strategy(decoding_cfg=decoding_cfg) - - assert decoder_type == 'ctc' and hasattr(self, 'ctc_decoder') - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.aux_ctc.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(CTCDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.ctc_decoding = CTCDecoding(decoding_cfg=decoding_cfg, vocabulary=self.ctc_decoder.vocabulary) - - self.ctc_wer = WER( - decoding=self.ctc_decoding, - use_cer=self.ctc_wer.use_cer, - log_prediction=self.ctc_wer.log_prediction, - dist_sync_on_step=True, - ) - - # Update config - with open_dict(self.cfg.aux_ctc): - self.cfg.aux_ctc.decoding = decoding_cfg - - self.use_rnnt_decoder = False - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.aux_ctc.decoding)}") - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - # During training, loss must be computed, so decoder forward is necessary - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - sample_id = self._trainer.global_step - else: - log_every_n_steps = 1 - sample_id = batch_nb - - # If fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - # if AccessMixin.is_access_enabled(): - # AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if (sample_id + 1) % log_every_n_steps == 0: - self.wer.update(encoded, encoded_len, transcript, transcript_len) - _, scores, words = self.wer.compute() - self.wer.reset() - tensorboard_logs.update({'training_batch_wer': scores.float() / words}) - - else: - # If fused Joint-Loss-WER is used - if (sample_id + 1) % log_every_n_steps == 0: - compute_wer = True - else: - compute_wer = False - - # Fused joint step - loss_value, wer, _, _ = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoder, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=transcript_len, - compute_wer=compute_wer, - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - # if AccessMixin.is_access_enabled(): - # 
AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if compute_wer: - tensorboard_logs.update({'training_batch_wer': wer}) - - if self.ctc_loss_weight > 0: - log_probs = self.ctc_decoder(encoder_output=encoded) - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - - # Add Interctc Losses - ctc_loss, interctc_tensorboard_logs = self.add_interctc_losses( - ctc_loss, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0) - ) - tensorboard_logs.update(interctc_tensorboard_logs) - - tensorboard_logs['train_rnnt_loss'] = loss_value - tensorboard_logs['train_ctc_loss'] = ctc_loss - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - tensorboard_logs['train_loss'] = loss_value - if (sample_id + 1) % log_every_n_steps == 0: - self.ctc_wer.update( - predictions=log_probs, - targets=transcript, - target_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - ctc_wer, _, _ = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs.update({'training_batch_wer_ctc': ctc_wer}) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - # Log items - self.log_dict(tensorboard_logs) - - # Preserve batch acoustic model T and language model U parameters if normalizing - if self._optim_normalize_joint_txu: - self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] - - return {'loss': loss_value} - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - # TODO: add support for CTC decoding - signal, signal_len, transcript, transcript_len, sample_id = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, best_hyp_text)) - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - if self.is_interctc_enabled(): - AccessMixin.set_access_enabled(access_enabled=True) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - tensorboard_logs = {} - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - if self.compute_eval_loss: - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - tensorboard_logs['val_loss'] = loss_value - - self.wer.update(encoded, encoded_len, transcript, transcript_len) - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - else: - # If experimental fused Joint-Loss-WER is used - compute_wer = True - - if self.compute_eval_loss: - decoded, target_len, states = 
self.decoder(targets=transcript, target_length=transcript_len) - else: - decoded = None - target_len = transcript_len - - # Fused joint step - loss_value, wer, wer_num, wer_denom = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoded, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=target_len, - compute_wer=compute_wer, - ) - - if loss_value is not None: - tensorboard_logs['val_loss'] = loss_value - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - log_probs = self.ctc_decoder(encoder_output=encoded) - if self.compute_eval_loss: - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - - # Add interCTC losses - ctc_loss, interctc_tensorboard_logs = self.add_interctc_losses( - ctc_loss, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_", - ) - tensorboard_logs.update(interctc_tensorboard_logs) - - tensorboard_logs['val_ctc_loss'] = ctc_loss - tensorboard_logs['val_rnnt_loss'] = loss_value - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - tensorboard_logs['val_loss'] = loss_value - self.ctc_wer.update( - predictions=log_probs, targets=transcript, target_lengths=transcript_len, predictions_lengths=encoded_len, - ) - ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs['val_wer_num_ctc'] = ctc_wer_num - tensorboard_logs['val_wer_denom_ctc'] = ctc_wer_denom - tensorboard_logs['val_wer_ctc'] = ctc_wer - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - return tensorboard_logs - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_step(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - self.test_step_outputs[dataloader_idx].append(test_logs) - else: - self.test_step_outputs.append(test_logs) - return test_logs - - """ - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_step(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = { - 'test_wer_num': logs['val_wer_num'], - 'test_wer_denom': logs['val_wer_denom'], - # 'test_wer': logs['val_wer'], - } - if 'val_loss' in logs: - test_logs['test_loss'] = logs['val_loss'] - - if self.ctc_loss_weight > 0: - test_logs['test_wer_num_ctc'] = logs['val_wer_num_ctc'] - test_logs['test_wer_denom_ctc'] = logs['val_wer_denom_ctc'] - if 'val_ctc_loss' in logs: - test_logs['test_ctc_loss'] = logs['val_ctc_loss'] - if 'val_rnnt_loss' in logs: - test_logs['test_rnnt_loss'] = logs['val_rnnt_loss'] - - return test_logs - """ - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_loss_log = {'val_loss': val_loss_mean} - else: - val_loss_log = {} - wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**val_loss_log, 'val_wer': wer_num.float() / wer_denom} - if self.ctc_loss_weight > 0: - ctc_wer_num = torch.stack([x['val_wer_num_ctc'] for 
x in outputs]).sum() - ctc_wer_denom = torch.stack([x['val_wer_denom_ctc'] for x in outputs]).sum() - tensorboard_logs['val_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - - metrics = {**val_loss_log, 'log': tensorboard_logs} - self.finalize_interctc_metrics(metrics, outputs, prefix="val_") - return metrics - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - test_loss_log = {'test_loss': test_loss_mean} - else: - test_loss_log = {} - wer_num = torch.stack([x['test_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**test_loss_log, 'test_wer': wer_num.float() / wer_denom} - - if self.ctc_loss_weight > 0: - ctc_wer_num = torch.stack([x['test_wer_num_ctc'] for x in outputs]).sum() - ctc_wer_denom = torch.stack([x['test_wer_denom_ctc'] for x in outputs]).sum() - tensorboard_logs['test_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - - metrics = {**test_loss_log, 'log': tensorboard_logs} - self.finalize_interctc_metrics(metrics, outputs, prefix="test_") - return metrics - - @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py deleted file mode 100644 index 5c5263b1ce76b4b7dc61c4a5b2ab0df499121f8f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_bpe_models.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
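[Editor's note] The hybrid model removed in the preceding file trains against a blend of the two objectives, `loss = (1 - ctc_loss_weight) * rnnt_loss + ctc_loss_weight * ctc_loss`, and at inference time can be routed through either the RNN-T decoder (the default) or the auxiliary CTC decoder via `change_decoding_strategy(..., decoder_type=...)`. A minimal sketch of that switch is shown below; the checkpoint name and video path are placeholders, and the import path is assumed from this diff's layout.
```
# Hypothetical use of the deleted VisualEncDecHybridRNNTCTCModel: decoder switching.
# "hybrid.nemo" and "clip.mp4" are placeholders, not repository files.
from nemo.collections.multimodal.speech_cv.models.visual_hybrid_rnnt_ctc_models import (
    VisualEncDecHybridRNNTCTCModel,
)

model = VisualEncDecHybridRNNTCTCModel.restore_from("hybrid.nemo")

# Default path: RNN-T decoding (use_rnnt_decoder is True after construction).
rnnt_hyps, _ = model.transcribe(paths2video_files=["clip.mp4"], batch_size=1)

# Route transcription through the auxiliary CTC decoder instead;
# passing decoding_cfg=None falls back to the config stored under cfg.aux_ctc.decoding.
model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc")
ctc_hyps, ctc_all_hyps = model.transcribe(paths2video_files=["clip.mp4"], batch_size=1)
```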
- -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer - -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEWER, RNNTBPEDecoding, RNNTBPEDecodingConfig -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset -from nemo.collections.multimodal.speech_cv.models.visual_rnnt_models import VisualEncDecRNNTModel -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - - -class VisualEncDecRNNTBPEModel(VisualEncDecRNNTModel, ASRBPEMixin): - """Base class for encoder decoder RNNT-based models with subword tokenization.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Convert to Hydra 1.0 compatible DictConfig - cfg = model_utils.convert_model_config_to_dict_config(cfg) - cfg = model_utils.maybe_update_config_version(cfg) - - # Tokenizer is necessary for this model - if 'tokenizer' not in cfg: - raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") - - if not isinstance(cfg, DictConfig): - cfg = OmegaConf.create(cfg) - - # Setup the tokenizer - self._setup_tokenizer(cfg.tokenizer) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - # Set the new vocabulary - with open_dict(cfg): - cfg.labels = ListConfig(list(vocabulary)) - - with open_dict(cfg.decoder): - cfg.decoder.vocab_size = len(vocabulary) - - with open_dict(cfg.joint): - cfg.joint.num_classes = len(vocabulary) - cfg.joint.vocabulary = ListConfig(list(vocabulary)) - cfg.joint.jointnet.encoder_hidden = cfg.model_defaults.enc_hidden - cfg.joint.jointnet.pred_hidden = cfg.model_defaults.pred_hidden - - super().__init__(cfg=cfg, trainer=trainer) - - # Setup decoding object - self.decoding = RNNTBPEDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - # Setup wer object - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer: - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - def change_vocabulary( - self, - new_tokenizer_dir: Union[str, DictConfig], - new_tokenizer_type: str, - decoding_cfg: Optional[DictConfig] = None, - ): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning on from pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`) - new_tokenizer_type: Type of tokenizer. Can be either `agg`, `bpe` or `wpe`. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
- - Returns: None - - """ - if isinstance(new_tokenizer_dir, DictConfig): - if new_tokenizer_type == 'agg': - new_tokenizer_cfg = new_tokenizer_dir - else: - raise ValueError( - f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}' - ) - else: - new_tokenizer_cfg = None - - if new_tokenizer_cfg is not None: - tokenizer_cfg = new_tokenizer_cfg - else: - if not os.path.isdir(new_tokenizer_dir): - raise NotADirectoryError( - f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}' - ) - - if new_tokenizer_type.lower() not in ('bpe', 'wpe'): - raise ValueError(f'New tokenizer type must be either `bpe` or `wpe`') - - tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type}) - - # Setup the tokenizer - self._setup_tokenizer(tokenizer_cfg) - - # Initialize a dummy vocabulary - vocabulary = self.tokenizer.tokenizer.get_vocab() - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - if self.tokenizer_type == "agg": - new_joint_config["vocabulary"] = ListConfig(vocabulary) - else: - new_joint_config["vocabulary"] = ListConfig(list(vocabulary.keys())) - - new_joint_config['num_classes'] = len(vocabulary) - del self.joint - self.joint = VisualEncDecRNNTBPEModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(vocabulary) - del self.decoder - self.decoder = VisualEncDecRNNTBPEModel.from_config_dict(new_decoder_config) - - del self.loss - self.loss = RNNTLoss(num_classes=self.joint.num_classes_with_blank - 1) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoder to output to {self.joint.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. 
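For orientation, here is a hedged sketch of how `change_vocabulary` might be invoked when fine-tuning such a model on a new subword vocabulary; the model handle and tokenizer directory are placeholders, not paths from this repository:

```
# `model` is assumed to be a restored VisualEncDecRNNTBPEModel instance.
model.change_vocabulary(
    new_tokenizer_dir="/path/to/new_tokenizer",  # directory with the tokenizer files for `bpe`/`wpe`
    new_tokenizer_type="bpe",                    # pass a DictConfig instead when the type is "agg"
)
```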
- """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTBPEDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTBPEDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, tokenizer=self.tokenizer, - ) - - self.wer = RNNTBPEWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - dataset = video_to_text_dataset.get_video_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided video file. - - Args: - config: A python dictionary which contains the following keys: - paths2video_files: (a list) of paths to video files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the video manifest is temporarily - stored. - - Returns: - A pytorch DataLoader for the given video file(s). 
- """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2video_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'batch_size': batch_size, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - 'channel_selector': config.get('channel_selector', None), - 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. - """ - results = [] - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py deleted file mode 100644 index 0bd3e2fd1563bae2e1f66fd79a3f0a4d2347f33c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/models/visual_rnnt_models.py +++ /dev/null @@ -1,920 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import os -import tempfile -from math import ceil, isclose -from typing import Dict, List, Optional, Tuple, Union - -import torch -from omegaconf import DictConfig, OmegaConf, open_dict -from pytorch_lightning import Trainer -from tqdm.auto import tqdm - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.losses.rnnt import RNNTLoss, resolve_rnnt_default_loss_name -from nemo.collections.asr.metrics.rnnt_wer import RNNTWER, RNNTDecoding, RNNTDecodingConfig -from nemo.collections.asr.models.asr_model import ASRModel -from nemo.collections.asr.modules.rnnt import RNNTDecoderJoint -from nemo.collections.asr.parts.mixins import ASRModuleMixin -from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType -from nemo.collections.multimodal.speech_cv.data import video_to_text_dataset -from nemo.core.classes import Exportable -from nemo.core.classes.common import PretrainedModelInfo, typecheck -from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import AcousticEncodedRepresentation, LengthsType, NeuralType, VideoSignal -from nemo.utils import logging - - -class VisualEncDecRNNTModel(ASRModel, ASRModuleMixin, Exportable): - """Base class for encoder decoder RNNT-based models.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable - # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 - self.world_size = 1 - if trainer is not None: - self.world_size = trainer.world_size - - super().__init__(cfg=cfg, trainer=trainer) - - # Preprocessors - self.video_preprocessor = VisualEncDecRNNTModel.from_config_dict(self._cfg.video_preprocessor) - - # Augmentations - self.video_augmentation = VisualEncDecRNNTModel.from_config_dict(self._cfg.video_augment) - - # Front-end Networks - self.video_front_end = VisualEncDecRNNTModel.from_config_dict(self._cfg.video_front_end) - - # Back-end Networks - self.encoder = VisualEncDecRNNTModel.from_config_dict(self._cfg.encoder) - - # Update config values required by components dynamically - with open_dict(self.cfg.decoder): - self.cfg.decoder.vocab_size = len(self.cfg.labels) - - with open_dict(self.cfg.joint): - self.cfg.joint.num_classes = len(self.cfg.labels) - self.cfg.joint.vocabulary = self.cfg.labels - self.cfg.joint.jointnet.encoder_hidden = self.cfg.model_defaults.enc_hidden - self.cfg.joint.jointnet.pred_hidden = self.cfg.model_defaults.pred_hidden - - self.decoder = VisualEncDecRNNTModel.from_config_dict(self.cfg.decoder) - self.joint = VisualEncDecRNNTModel.from_config_dict(self.cfg.joint) - - # Setup RNNT Loss - loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get("loss", None)) - - self.loss = RNNTLoss( - num_classes=self.joint.num_classes_with_blank - 1, - loss_name=loss_name, - loss_kwargs=loss_kwargs, - reduction=self.cfg.get("rnnt_reduction", "mean_batch"), - ) - - # Setup decoding objects - self.decoding = RNNTDecoding( - decoding_cfg=self.cfg.decoding, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - # Setup WER calculation - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=0, - use_cer=self._cfg.get('use_cer', False), - log_prediction=self._cfg.get('log_prediction', True), - dist_sync_on_step=True, - ) - - # Whether to compute loss during evaluation - if 'compute_eval_loss' in self.cfg: - self.compute_eval_loss = self.cfg.compute_eval_loss - else: - 
self.compute_eval_loss = True - - # Setup fused Joint step if flag is set - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Setup optimization normalization (if provided in config) - self.setup_optim_normalization() - - # Setup optional Optimization flags - self.setup_optimization_flags() - - # Setup encoder adapters (from ASRAdapterModelMixin) - self.setup_adapters() - - def setup_optim_normalization(self): - """ - Helper method to setup normalization of certain parts of the model prior to the optimization step. - - Supported pre-optimization normalizations are as follows: - - .. code-block:: yaml - - # Variation Noise injection - model: - variational_noise: - std: 0.0 - start_step: 0 - - # Joint - Length normalization - model: - normalize_joint_txu: false - - # Encoder Network - gradient normalization - model: - normalize_encoder_norm: false - - # Decoder / Prediction Network - gradient normalization - model: - normalize_decoder_norm: false - - # Joint - gradient normalization - model: - normalize_joint_norm: false - """ - # setting up the variational noise for the decoder - if hasattr(self.cfg, 'variational_noise'): - self._optim_variational_noise_std = self.cfg['variational_noise'].get('std', 0) - self._optim_variational_noise_start = self.cfg['variational_noise'].get('start_step', 0) - else: - self._optim_variational_noise_std = 0 - self._optim_variational_noise_start = 0 - - # Setup normalized gradients for model joint by T x U scaling factor (joint length normalization) - self._optim_normalize_joint_txu = self.cfg.get('normalize_joint_txu', False) - self._optim_normalize_txu = None - - # Setup normalized encoder norm for model - self._optim_normalize_encoder_norm = self.cfg.get('normalize_encoder_norm', False) - - # Setup normalized decoder norm for model - self._optim_normalize_decoder_norm = self.cfg.get('normalize_decoder_norm', False) - - # Setup normalized joint norm for model - self._optim_normalize_joint_norm = self.cfg.get('normalize_joint_norm', False) - - def extract_rnnt_loss_cfg(self, cfg: Optional[DictConfig]): - """ - Helper method to extract the rnnt loss name, and potentially its kwargs - to be passed. - - Args: - cfg: Should contain `loss_name` as a string which is resolved to a RNNT loss name. - If the default should be used, then `default` can be used. - Optionally, one can pass additional kwargs to the loss function. The subdict - should have a keyname as follows : `{loss_name}_kwargs`. - - Note that whichever loss_name is selected, that corresponding kwargs will be - selected. For the "default" case, the "{resolved_default}_kwargs" will be used. - - Examples: - .. code-block:: yaml - - loss_name: "default" - warprnnt_numba_kwargs: - kwargs2: some_other_val - - Returns: - A tuple, the resolved loss name as well as its kwargs (if found). 
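The kwargs lookup described in this docstring can be illustrated with a small stand-alone sketch; the resolver below is a simplification (the real default is resolved per environment, `warprnnt_numba` is only an assumed outcome), and the kwargs mirror the dummy values from the YAML example above:

```
from omegaconf import DictConfig

def resolve_loss_cfg(cfg: DictConfig):
    # Simplified version of the lookup rule: pick `loss_name`, then read
    # `{loss_name}_kwargs` from the same config if it is present.
    loss_name = cfg.get("loss_name", "default")
    if loss_name == "default":
        loss_name = "warprnnt_numba"  # assumed resolution of the "default" alias
    return loss_name, cfg.get(f"{loss_name}_kwargs", None)

cfg = DictConfig({"loss_name": "default", "warprnnt_numba_kwargs": {"kwargs2": "some_other_val"}})
print(resolve_loss_cfg(cfg))  # ('warprnnt_numba', {'kwargs2': 'some_other_val'})
```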
- """ - if cfg is None: - cfg = DictConfig({}) - - loss_name = cfg.get("loss_name", "default") - - if loss_name == "default": - loss_name = resolve_rnnt_default_loss_name() - - loss_kwargs = cfg.get(f"{loss_name}_kwargs", None) - - logging.info(f"Using RNNT Loss : {loss_name}\n" f"Loss {loss_name}_kwargs: {loss_kwargs}") - - return loss_name, loss_kwargs - - @torch.no_grad() - def transcribe( - self, - paths2video_files: List[str], - batch_size: int = 4, - return_hypotheses: bool = False, - partial_hypothesis: Optional[List['Hypothesis']] = None, - num_workers: int = 0, - channel_selector: Optional[ChannelSelectorType] = None, - augmentor: DictConfig = None, - ) -> Tuple[List[str], Optional[List['Hypothesis']]]: - """ - Uses greedy decoding to transcribe video files. Use this method for debugging and prototyping. - - Args: - - paths2video_files: (a list) of paths to video files. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - return_hypotheses: (bool) Either return hypotheses or text - With hypotheses can do some postprocessing like getting timestamp or rescoring - num_workers: (int) number of workers for DataLoader - channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. - augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. - Returns: - Returns a tuple of 2 items - - * A list of greedy transcript texts / Hypothesis - * An optional list of beam search transcript texts / Hypothesis / NBestHypothesis. - """ - if paths2video_files is None or len(paths2video_files) == 0: - return {} - # We will store transcriptions here - hypotheses = [] - all_hypotheses = [] - # Model's mode and device - mode = self.training - device = next(self.parameters()).device - - if num_workers is None: - num_workers = min(batch_size, os.cpu_count() - 1) - - try: - - # Switch model to evaluation mode - self.eval() - # Freeze the visual front-end, encoder and decoder modules - self.video_front_end.freeze() - self.encoder.freeze() - self.decoder.freeze() - self.joint.freeze() - logging_level = logging.get_verbosity() - logging.set_verbosity(logging.WARNING) - # Work in tmp directory - will store manifest file there - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for video_file in paths2video_files: - entry = {'video_filepath': video_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - - config = { - 'paths2video_files': paths2video_files, - 'batch_size': batch_size, - 'temp_dir': tmpdir, - 'num_workers': num_workers, - 'channel_selector': channel_selector, - } - - if augmentor: - config['augmentor'] = augmentor - - temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - encoded, encoded_len = self.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor( - encoded, - encoded_len, - return_hypotheses=return_hypotheses, - partial_hypotheses=partial_hypothesis, - ) - - hypotheses += best_hyp - if all_hyp is not None: - all_hypotheses += all_hyp - else: - all_hypotheses += best_hyp - - del encoded - del 
test_batch - finally: - # set mode back to its original value - self.train(mode=mode) - - logging.set_verbosity(logging_level) - if mode is True: - self.video_front_end.unfreeze() - self.encoder.unfreeze() - self.decoder.unfreeze() - self.joint.unfreeze() - return hypotheses, all_hypotheses - - def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None): - """ - Changes vocabulary used during RNNT decoding process. Use this method when fine-tuning a pre-trained model. - This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would - use it if you want to use pretrained encoder when fine-tuning on data in another language, or when you'd need - model to learn capitalization, punctuation and/or special characters. - - Args: - new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \ - this is target alphabet. - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - - Returns: None - - """ - if self.joint.vocabulary == new_vocabulary: - logging.warning(f"Old {self.joint.vocabulary} and new {new_vocabulary} match. Not changing anything.") - else: - if new_vocabulary is None or len(new_vocabulary) == 0: - raise ValueError(f'New vocabulary must be non-empty list of chars. But I got: {new_vocabulary}') - - joint_config = self.joint.to_config_dict() - new_joint_config = copy.deepcopy(joint_config) - new_joint_config['vocabulary'] = new_vocabulary - new_joint_config['num_classes'] = len(new_vocabulary) - del self.joint - self.joint = VisualEncDecRNNTModel.from_config_dict(new_joint_config) - - decoder_config = self.decoder.to_config_dict() - new_decoder_config = copy.deepcopy(decoder_config) - new_decoder_config.vocab_size = len(new_vocabulary) - del self.decoder - self.decoder = VisualEncDecRNNTModel.from_config_dict(new_decoder_config) - - del self.loss - loss_name, loss_kwargs = self.extract_rnnt_loss_cfg(self.cfg.get('loss', None)) - self.loss = RNNTLoss( - num_classes=self.joint.num_classes_with_blank - 1, loss_name=loss_name, loss_kwargs=loss_kwargs - ) - - if decoding_cfg is None: - # Assume same decoding config as before - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.joint): - self.cfg.joint = new_joint_config - - with open_dict(self.cfg.decoder): - self.cfg.decoder = new_decoder_config - - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - ds_keys = ['train_ds', 'validation_ds', 'test_ds'] - for key in ds_keys: - if key in self.cfg: - with open_dict(self.cfg[key]): - self.cfg[key]['labels'] = 
OmegaConf.create(new_vocabulary) - - logging.info(f"Changed decoder to output to {self.joint.vocabulary} vocabulary.") - - def change_decoding_strategy(self, decoding_cfg: DictConfig): - """ - Changes decoding strategy used during RNNT decoding process. - - Args: - decoding_cfg: A config for the decoder, which is optional. If the decoding type - needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. - """ - if decoding_cfg is None: - # Assume same decoding config as before - logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") - decoding_cfg = self.cfg.decoding - - # Assert the decoding config with all hyper parameters - decoding_cls = OmegaConf.structured(RNNTDecodingConfig) - decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) - decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) - - self.decoding = RNNTDecoding( - decoding_cfg=decoding_cfg, decoder=self.decoder, joint=self.joint, vocabulary=self.joint.vocabulary, - ) - - self.wer = RNNTWER( - decoding=self.decoding, - batch_dim_index=self.wer.batch_dim_index, - use_cer=self.wer.use_cer, - log_prediction=self.wer.log_prediction, - dist_sync_on_step=True, - ) - - # Setup fused Joint step - if self.joint.fuse_loss_wer or ( - self.decoding.joint_fused_batch_size is not None and self.decoding.joint_fused_batch_size > 0 - ): - self.joint.set_loss(self.loss) - self.joint.set_wer(self.wer) - - # Update config - with open_dict(self.cfg.decoding): - self.cfg.decoding = decoding_cfg - - logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - # Automatically inject args from model config to dataloader config - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate') - audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels') - dataset = video_to_text_dataset.get_video_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - preprocessor_cfg=self._cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - shuffle = config['shuffle'] - if config.get('is_tarred', False): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - else: - collate_fn = dataset.datasets[0].collate_fn - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the training data loader via a Dict-like object. - - Args: - train_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in train_data_config: - train_data_config['shuffle'] = True - - # preserve config - self._update_dataset_config(dataset_name='train', config=train_data_config) - - self._train_dl = self._setup_dataloader_from_config(config=train_data_config) - - # Need to set this because if using an IterableDataset, the length of the dataloader is the total number - # of samples rather than the number of batches, and this messes up the tqdm progress bar. - # So we set the number of steps manually (to the correct number) to fix this. - if 'is_tarred' in train_data_config and train_data_config['is_tarred']: - # We also need to check if limit_train_batches is already set. - # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, - # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0). - if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): - self._trainer.limit_train_batches = int( - self._trainer.limit_train_batches - * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) - ) - elif self._trainer is None: - logging.warning( - "Model Trainer was not set before constructing the dataset, incorrect number of " - "training batches will be used. Please set the trainer and rebuild the dataset." - ) - - def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the validation data loader via a Dict-like object. - - Args: - val_data_config: A config that contains the information regarding construction - of an ASR Training dataset. - - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in val_data_config: - val_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='validation', config=val_data_config) - - self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) - - def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): - """ - Sets up the test data loader via a Dict-like object. - - Args: - test_data_config: A config that contains the information regarding construction - of an ASR Training dataset. 
- - Supported Datasets: - - :class:`~nemo.collections.multimodal.speech_cv.data.video_to_text.VideoToCharDataset` - - :class:`~nemo.collections.asr.data.video_to_text.VideoToBPEDataset` - - :class:`~nemo.collections.asr.data.video_to_text.TarredVideoToBPEDataset` - """ - if 'shuffle' not in test_data_config: - test_data_config['shuffle'] = False - - # preserve config - self._update_dataset_config(dataset_name='test', config=test_data_config) - - self._test_dl = self._setup_dataloader_from_config(config=test_data_config) - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - - return { - "input_signal": NeuralType(('B', 'C', 'T', 'H', 'W'), VideoSignal(), optional=True), - "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()), - "encoded_lengths": NeuralType(tuple('B'), LengthsType()), - } - - @typecheck() - def forward(self, input_signal=None, input_signal_length=None): - """ - Forward pass of the model. Note that for RNNT Models, the forward pass of the model is a 3 step process, - and this method only performs the first step - forward of the acoustic/visual model. - - Please refer to the `training_step` in order to see the full `forward` step for training - which - performs the forward of the acoustic model, the prediction network and then the joint network. - Finally, it computes the loss and possibly compute the detokenized text via the `decoding` step. - - Please refer to the `validation_step` in order to see the full `forward` step for inference - which - performs the forward of the acoustic model, the prediction network and then the joint network. - Finally, it computes the decoded tokens via the `decoding` step and possibly compute the batch metrics. - - Args: - input_signal: Tensor that represents a batch of video signals, - of shape [B, T, H, W, C]. T here represents timesteps, H height, W width and C channels - input_signal_length: Vector of length B, that contains the individual lengths of the video - sequences. - - Returns: - A tuple of 2 elements - - 1) The log probabilities tensor of shape [B, T, D]. - 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 
- """ - - # Preprocessing - processed_video_signal, processed_video_signal_length = self.video_preprocessor( - input_signal=input_signal, length=input_signal_length - ) - - # Augmentation - processed_video_signal = self.video_augmentation( - input_signal=processed_video_signal, length=processed_video_signal_length - ) - - # Front-end Networks - processed_video_signal, processed_video_signal_length = self.video_front_end( - input_signal=processed_video_signal, length=processed_video_signal_length - ) - - # Back-end Networks - encoded, encoded_len = self.encoder(audio_signal=processed_video_signal, length=processed_video_signal_length) - - return encoded, encoded_len - - # PTL-specific methods - def training_step(self, batch, batch_nb): - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - # During training, loss must be computed, so decoder forward is necessary - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - sample_id = self._trainer.global_step - else: - log_every_n_steps = 1 - sample_id = batch_nb - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if (sample_id + 1) % log_every_n_steps == 0: - self.wer.update(encoded, encoded_len, transcript, transcript_len) - _, scores, words = self.wer.compute() - self.wer.reset() - tensorboard_logs.update({'training_batch_wer': scores.float() / words}) - - else: - # If experimental fused Joint-Loss-WER is used - if (sample_id + 1) % log_every_n_steps == 0: - compute_wer = True - else: - compute_wer = False - - # Fused joint step - loss_value, wer, _, _ = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoder, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=transcript_len, - compute_wer=compute_wer, - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - # Reset access registry - if AccessMixin.is_access_enabled(): - AccessMixin.reset_registry(self) - - tensorboard_logs = { - 'train_loss': loss_value, - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - if compute_wer: - tensorboard_logs.update({'training_batch_wer': wer}) - - # Log items - self.log_dict(tensorboard_logs) - - # Preserve batch acoustic model T and language model U parameters if normalizing - if self._optim_normalize_joint_txu: - self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] - - return {'loss': loss_value} - - def 
predict_step(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len, sample_id = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False - ) - - sample_id = sample_id.cpu().detach().numpy() - return list(zip(sample_id, best_hyp_text)) - - def validation_step(self, batch, batch_idx, dataloader_idx=0): - signal, signal_len, transcript, transcript_len = batch - - # forward() only performs encoder forward - encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - tensorboard_logs = {} - - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - if self.compute_eval_loss: - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - tensorboard_logs['val_loss'] = loss_value - - self.wer.update(encoded, encoded_len, transcript, transcript_len) - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - else: - # If experimental fused Joint-Loss-WER is used - compute_wer = True - - if self.compute_eval_loss: - decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) - else: - decoded = None - target_len = transcript_len - - # Fused joint step - loss_value, wer, wer_num, wer_denom = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoded, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=target_len, - compute_wer=compute_wer, - ) - - if loss_value is not None: - tensorboard_logs['val_loss'] = loss_value - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - return tensorboard_logs - - def test_step(self, batch, batch_idx, dataloader_idx=0): - logs = self.validation_step(batch, batch_idx, dataloader_idx=dataloader_idx) - test_logs = { - 'test_wer_num': logs['val_wer_num'], - 'test_wer_denom': logs['val_wer_denom'], - # 'test_wer': logs['val_wer'], - } - if 'val_loss' in logs: - test_logs['test_loss'] = logs['val_loss'] - return test_logs - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - val_loss_log = {'val_loss': val_loss_mean} - else: - val_loss_log = {} - wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**val_loss_log, 'val_wer': wer_num.float() / wer_denom} - return {**val_loss_log, 'log': tensorboard_logs} - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - if self.compute_eval_loss: - test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean() - test_loss_log = {'test_loss': test_loss_mean} - else: - test_loss_log = {} - wer_num = 
torch.stack([x['test_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**test_loss_log, 'test_wer': wer_num.float() / wer_denom} - return {**test_loss_log, 'log': tensorboard_logs} - - def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': - """ - Setup function for a temporary data loader which wraps the provided video file. - - Args: - config: A python dictionary which contains the following keys: - paths2video_files: (a list) of paths to video files. The files should be relatively short fragments. \ - Recommended length per file is between 5 and 25 seconds. - batch_size: (int) batch size to use during inference. \ - Bigger will result in better throughput performance but would use more memory. - temp_dir: (str) A temporary directory where the video manifest is temporarily - stored. - - Returns: - A pytorch DataLoader for the given video file(s). - """ - if 'manifest_filepath' in config: - manifest_filepath = config['manifest_filepath'] - batch_size = config['batch_size'] - else: - manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') - batch_size = min(config['batch_size'], len(config['paths2video_files'])) - - dl_config = { - 'manifest_filepath': manifest_filepath, - 'labels': self.joint.vocabulary, - 'batch_size': batch_size, - 'trim_silence': False, - 'shuffle': False, - 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), - 'pin_memory': True, - } - - if config.get("augmentor"): - dl_config['augmentor'] = config.get("augmentor") - - temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) - return temporary_datalayer - - def on_after_backward(self): - super().on_after_backward() - if self._optim_variational_noise_std > 0 and self.global_step >= self._optim_variational_noise_start: - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - noise = torch.normal( - mean=0.0, - std=self._optim_variational_noise_std, - size=param.size(), - device=param.device, - dtype=param.dtype, - ) - param.grad.data.add_(noise) - - if self._optim_normalize_joint_txu: - T, U = self._optim_normalize_txu - if T is not None and U is not None: - for param_name, param in self.encoder.named_parameters(): - if param.grad is not None: - param.grad.data.div_(U) - - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - param.grad.data.div_(T) - - if self._optim_normalize_encoder_norm: - for param_name, param in self.encoder.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - if self._optim_normalize_decoder_norm: - for param_name, param in self.decoder.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - if self._optim_normalize_joint_norm: - for param_name, param in self.joint.named_parameters(): - if param.grad is not None: - norm = param.grad.norm() - param.grad.data.div_(norm) - - # EncDecRNNTModel is exported in 2 parts - def list_export_subnets(self): - return ['encoder', 'decoder_joint'] - - # for export - @property - def decoder_joint(self): - return RNNTDecoderJoint(self.decoder, self.joint) - - @classmethod - def list_available_models(cls) -> List[PretrainedModelInfo]: - """ - This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. - - Returns: - List of available pre-trained models. 
- """ - results = [] - - return results diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/__init__.py deleted file mode 100644 index bfc09080b274beac4fd8bb604daa0619f47692db..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.multimodal.speech_cv.modules.linear_projection_video_front_end import ( - LinearProjectionVideoFrontEnd, -) -from nemo.collections.multimodal.speech_cv.modules.resnet_video_front_end import ResNetVideoFrontEnd -from nemo.collections.multimodal.speech_cv.modules.video_augment import VideoAugmentation -from nemo.collections.multimodal.speech_cv.modules.video_preprocessing import VideoPreprocessor diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/linear_projection_video_front_end.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/linear_projection_video_front_end.py deleted file mode 100644 index 45e797171f2e7dc4bf0638088cc5eec396467bd3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/linear_projection_video_front_end.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import torch -from torch import nn - -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import LengthsType, NeuralType, VideoSignal - - -class LinearProjectionVideoFrontEnd(NeuralModule): - - """ - Linear Projection Video Front-End for Lip Reading - - The spatial dimension is flattened and projected to dim_output using a Linear layer. - This is equivalent to having a convolution layer with a kernel size of the size of the image. 
- Circle crop can be used as pre-processing to crop the image as a circle around lips and ignore corner pixels - - Args: - in_channels: number of inputs video channels, 1 for grayscale and 3 for RGB - in_height: image height - in_width: image width - dim_output: output feature dimension for linear projection - out_channels_first: Whether outputs should have channels_first format (Batch, Dout, Time) or channels_last (Batch, Time, Dout) - circle_crop: crop the image as a circle before the Linear layer, default to False - circle_radius: the circle radius, default to 1 for full circle - - """ - - def __init__( - self, - in_channels, - in_height, - in_width, - dim_output, - out_channels_first=True, - circle_crop=False, - circle_radius=1.0, - ): - super(LinearProjectionVideoFrontEnd, self).__init__() - - self.out_channels_first = out_channels_first - self.in_height = in_height - self.in_width = in_width - self.dim_output = dim_output - self.in_channels = in_channels - self.circle_crop = circle_crop - self.circle_radius = circle_radius - self.circle_indices = self.get_circle_indices() - - if self.dim_output is not None: - if self.circle_crop: - self.linear_proj = nn.Linear(in_channels * len(self.circle_indices), dim_output) - else: - self.linear_proj = nn.Linear(in_channels * in_height * in_width, dim_output) - else: - self.linear_proj = nn.Identity() - - @property - def input_types(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T', 'H', 'W'), VideoSignal()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def input_types_for_export(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "output_signal": NeuralType(('B', 'D', 'T'), NeuralType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - def get_circle_indices(self): - - """ return image indices inside circle of radius circle_radius """ - - # Create linspace - linspace_height = (torch.linspace(0, 2, steps=self.in_height) - 1).abs() - linspace_width = (torch.linspace(0, 2, steps=self.in_width) - 1).abs() - - # Repeat linspace along height/width - linspace_height = linspace_height.unsqueeze(dim=-1).repeat(1, self.in_width).flatten() - linspace_width = linspace_width.repeat(self.in_height) - - # Compute norm - dist = torch.sqrt(linspace_height.square() + linspace_width.square()) - - # Get circle indices - circle_indices = torch.nonzero(dist <= self.circle_radius).squeeze(dim=-1) - - return circle_indices - - def forward(self, input_signal, length): - - # Permute (B, C, T, H, W) -> (B, T, H, W, C) - input_signal = input_signal.permute(0, 2, 3, 4, 1) - - # Circle Crop - if self.circle_crop: - - # Flatten height, width (B, T, H, W, C) -> (B, T, H*W, C) - input_signal = input_signal.flatten(start_dim=2, end_dim=-2) - - # (B, T, H*W, C) -> (B, T, N circle, C) - input_signal = input_signal[:, :, self.circle_indices] - - # Flatten circle and channels (B, T, N circle, C) -> (B, T, N) - input_signal = input_signal.flatten(start_dim=2, end_dim=-1) - - # Flatten height, width and channels (B, T, H, W, C) -> (B, T, N) - else: - input_signal = input_signal.flatten(start_dim=2, end_dim=-1) - - # Project (B, T, N) -> (B, T, Dout) - input_signal = self.linear_proj(input_signal) - - # Transpose to channels_last format (Batch, Dout, Time) -> (Batch, Time, Dout) - if self.out_channels_first: - output_signal = input_signal.transpose(1, 2) - else: - output_signal = input_signal - - return output_signal, length diff 
--git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/resnet_video_front_end.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/resnet_video_front_end.py deleted file mode 100644 index 202c629e2b02ddacc249c2373992a5645b3bbbc2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/resnet_video_front_end.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import torch -from torch import nn - -from nemo.collections.multimodal.speech_cv.parts.submodules.resnet import ResNet -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import LengthsType, NeuralType, VideoSignal - - -class ResNetVideoFrontEnd(NeuralModule): - """ - Lip Reading / Visual Speech Recognition (VSR) ResNet Front-End Network - - Paper: - 'Audio-Visual Efficient Conformer for Robust Speech Recognition' by Burchi and Timofte - https://arxiv.org/abs/2301.01456 - - Args: - in_channels: number of inputs video channels, 1 for grayscale and 3 for RGB - model: model size in ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] - dim_output: output feature dimension for linear projection after spacial average pooling - out_channels_first: Whether outputs should have channels_first format (Batch, Dout, Time) or channels_last (Batch, Time, Dout) - """ - - def __init__(self, in_channels=1, model="ResNet18", dim_output=256, out_channels_first=True): - super(ResNetVideoFrontEnd, self).__init__() - - self.front_end = nn.Sequential( - nn.Conv3d( - in_channels=in_channels, out_channels=64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3) - ), - nn.BatchNorm3d(num_features=64), - nn.ReLU(), - nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)), - ResNet(include_stem=False, dim_output=dim_output, model=model), - ) - - self.out_channels_first = out_channels_first - - @property - def input_types(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "audio_signal": NeuralType(('B', 'D', 'T', 'H', 'W'), VideoSignal()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - @property - def input_types_for_export(self): - """Returns definitions of module input ports.""" - return OrderedDict( - { - "output_signal": NeuralType(('B', 'D', 'T'), NeuralType()), - "length": NeuralType(tuple('B'), LengthsType()), - } - ) - - def forward(self, input_signal, length): - - # Front-End Network (Batch, Din, Time, Height, Width) -> (Batch, Dout, Time) - input_signal = self.front_end(input_signal) - - # Transpose to channels_last format (Batch, Dout, Time) -> (Batch, Time, Dout) - if not self.out_channels_first: - input_signal = input_signal.transpose(1, 2) - - return input_signal, length diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_augment.py 
b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_augment.py deleted file mode 100644 index e191cc89d1a0621a985298570e5e6916faec3fcf..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_augment.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -from collections import OrderedDict - -import torch -from torch import nn - -from nemo.core.classes.module import NeuralModule -from nemo.core.neural_types import NeuralType, VideoSignal - - -try: - import torchvision - - TORCHVISION_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - TORCHVISION_AVAILABLE = False - - -class VideoAugmentation(NeuralModule): - - """ Video Augmentation for batched video input: input_signal shape (B, C, T, H, W) """ - - def __init__( - self, - random_crop, - crop_size, - horizontal_flip, - time_masking, - num_mask_second=1.0, - spatial_masking=False, - mean_frame=True, - ): - super().__init__() - - # Params - self.random_crop = random_crop - self.crop_size = crop_size - self.horizontal_flip = horizontal_flip - self.time_masking = time_masking - self.spatial_masking = spatial_masking - - self.training_augments = nn.ModuleList() - self.inference_augments = nn.ModuleList() - - # Random Crop - if self.random_crop: - if TORCHVISION_AVAILABLE: - self.training_augments.append(torchvision.transforms.RandomCrop(self.crop_size)) - self.inference_augments.append(torchvision.transforms.CenterCrop(self.crop_size)) - else: - raise Exception("RandomCrop transform requires torchvision") - - # Horizontal Flip - if self.horizontal_flip: - if TORCHVISION_AVAILABLE: - self.training_augments.append(torchvision.transforms.RandomHorizontalFlip()) - else: - raise Exception("RandomHorizontalFlip transform requires torchvision") - - # Time Masking - if self.time_masking: - self.training_augments.append(VideoFrameMasking(num_mask_second=num_mask_second, mean_frame=mean_frame)) - - # Spatial Masking - if self.spatial_masking: - self.training_augments.append(SpatialVideoMasking(mean_frame=mean_frame)) - - @property - def input_types(self): - """Returns definitions of module input ports.""" - return OrderedDict({"input_signal": NeuralType(('B', 'D', 'T', 'H', 'W'), VideoSignal()),}) - - @property - def input_types_for_export(self): - """Returns definitions of module input ports.""" - return OrderedDict({"output_signal": NeuralType(('B', 'D', 'T', 'H', 'W'), VideoSignal()),}) - - @torch.no_grad() - def forward(self, input_signal, length): - - if self.training: - augments = self.training_augments - else: - augments = self.inference_augments - - output_signal = input_signal - - for augment in augments: - if isinstance(augment, VideoFrameMasking) or isinstance(augment, SpatialVideoMasking): - output_signal = augment(output_signal, length) - else: - output_signal = augment(output_signal) - - return output_signal - - -class 
SpatialVideoMasking(NeuralModule): - - """ Spatial Video Mask - - Will mask videos frames in the spatial dimensions using horizontal and vertical masks - - params: - num_horizontal_masks: number of horizontal masks - num_vertical_masks: number of vertical masks - max_h: maximum width of horizontal mask - max_v: maximum width of vertical mask - mean_frame: mask using video mean instead of zeros - - """ - - def __init__(self, num_horizontal_masks=1, num_vertical_masks=1, max_h=30, max_v=30, mean_frame=True): - super().__init__() - - self.num_horizontal_masks = num_horizontal_masks - self.num_vertical_masks = num_vertical_masks - self.max_h = max_h - self.max_v = max_v - self.mean_frame = mean_frame - self.random = random.Random() - - def forward(self, input_signal, length): - - # (B, C, T, H, W) - shape = input_signal.shape - - # Batch loop - for b in range(shape[0]): - - # Mask Value - mask_value = input_signal[b, :, : length[b]].mean() if self.mean_frame else 0.0 - - # Horizontal Mask loop - for i in range(self.num_horizontal_masks): - - # Start index - x = self.random.randint(0, shape[3] - self.max_h) - - # Mask width - w = self.random.randint(0, self.max_h) - - # Apply mask - input_signal[b, :, :, x : x + w] = mask_value - - # Vertical Mask loop - for i in range(self.num_vertical_masks): - - # Start index - x = self.random.randint(0, shape[4] - self.max_v) - - # Mask width - w = self.random.randint(0, self.max_v) - - # Apply mask - input_signal[b, :, :, :, x : x + w] = mask_value - - return input_signal - - -class VideoFrameMasking(NeuralModule): - - """ Video Frame Mask: - - As explained in: - "Visual Speech Recognition for Multiple Languages in the Wild" - https://arxiv.org/abs/2202.13084 - - S6 Time Masking - We mask n consecutive frames with the mean frame of the video. - The duration tn is chosen from 0 to an upper bound nmax using a uniform distribution. - Since there is a large variance in the video lengths of the LRS2 and LRS3 datasets, we set the number of masks proportional to the sequence length. - Specifically, we use one mask per second, and for each mask, the maximum duration nmax is set to 0.4 seconds. - - """ - - def __init__(self, T_second=0.4, num_mask_second=1.0, fps=25.0, mean_frame=True): - super().__init__() - - self.T = int(T_second * fps) - self.num_mask_second = num_mask_second - self.mean_frame = mean_frame - self.fps = fps - self.random = random.Random() - - def forward(self, input_signal, length): - - # (B, C, T, H, W) - shape = input_signal.shape - - # Batch loop - for b in range(shape[0]): - - # Mask per second - mT = int(length[b] / self.fps * self.num_mask_second) - - # Mask Value - mask_value = input_signal[b, :, : length[b]].mean() if self.mean_frame else 0.0 - - # Mask loop - for i in range(mT): - - # Start left Frame - x_left = self.random.randint(0, length[b] - self.T) - - # Mask width - w = self.random.randint(0, self.T) - - # Apply mask - input_signal[b, :, x_left : x_left + w] = mask_value - - return input_signal diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_preprocessing.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_preprocessing.py deleted file mode 100644 index 30accea097dbd95b6ae00ff125e21c97ad1087f8..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/modules/video_preprocessing.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
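The time-masking arithmetic in `VideoFrameMasking` above is easy to sanity-check; the short sketch below (not part of the deleted module) works out the frame counts it implies for an assumed 25 fps, 4-second clip:

```
fps = 25.0
T_second, num_mask_second = 0.4, 1.0

T_max = int(T_second * fps)                       # longest single mask: 10 frames
length = 100                                      # hypothetical 4-second clip at 25 fps
num_masks = int(length / fps * num_mask_second)   # one mask per second of video -> 4 masks
print(T_max, num_masks)                           # 10 4
```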
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch import nn - -from nemo.collections.multimodal.speech_cv.parts.submodules.permute import Permute -from nemo.core.classes import NeuralModule, typecheck - -try: - import torchvision - - TORCHVISION_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - TORCHVISION_AVAILABLE = False - - -class VideoPreprocessor(NeuralModule): - - """ Video Pre-processing - - args: - grayscale: convert images to grayscale - normalize: normalize videos - resize: resize videos - resize_size: output image size for resize - norm_mean: normalize mean - norm_std: normalize std - - """ - - def __init__(self, grayscale, normalize, resize, resize_size, norm_mean, norm_std): - super().__init__() - - # Params - self.grayscale = grayscale - self.normalize = normalize - self.resize = resize - self.resize_size = resize_size - self.norm_mean = norm_mean - self.norm_std = norm_std - - self.transforms = nn.ModuleList() - - # Convert float32 [0:255] -> [0:1] - if TORCHVISION_AVAILABLE: - self.transforms.append(torchvision.transforms.ConvertImageDtype(dtype=torch.float32)) - else: - raise Exception("ConvertImageDtype transform requires torchvision") - - # Convert Channels First - self.transforms.append(Permute(dims=(0, 4, 1, 2, 3))) # (B, T, H, W, C) -> (B, C, T, H, W) - - # Resize - if self.resize: - self.transforms.append(ResizeVideo(self.resize_size)) # (B, C, T, H, W) -> (B, C, T, H', W') - - # Grayscale - if self.grayscale: - if TORCHVISION_AVAILABLE: - self.transforms.append( - nn.Sequential( - Permute(dims=(0, 2, 1, 3, 4)), # (B, C, T, H, W) -> (B, T, C, H, W) - torchvision.transforms.Grayscale(), - Permute(dims=(0, 2, 1, 3, 4)), # (B, T, C, H, W) -> (B, C, T, H, W) - ) - ) - else: - raise Exception("Grayscale transform requires torchvision") - - # Normalize - if self.normalize: - self.transforms.append(NormalizeVideo(mean=norm_mean, std=norm_std)) - - @typecheck() - @torch.no_grad() - def forward(self, input_signal, length): - - for transform in self.transforms: - input_signal = transform(input_signal) - - return input_signal, length - - -class NormalizeVideo(NeuralModule): - def __init__(self, mean, std): - super().__init__() - - self.register_buffer( - "mean", torch.tensor(mean, dtype=torch.float32).reshape(len(mean), 1, 1, 1), persistent=False - ) - self.register_buffer( - "std", torch.tensor(std, dtype=torch.float32).reshape(len(std), 1, 1, 1), persistent=False - ) - - def forward(self, x): - - x = (x - self.mean) / self.std - - return x - - -class ResizeVideo(NeuralModule): - def __init__(self, size): - super().__init__() - - self.size = size - if TORCHVISION_AVAILABLE: - self.resize = torchvision.transforms.Resize(size=self.size) - else: - raise Exception("Resize transform requires torchvision") - - def forward(self, x): - - # (B, C, T, H, W) - if x.dim() == 5: - - B, C = x.shape[:2] - x = x.flatten(start_dim=0, end_dim=1) - x = self.resize(x) - x = x.reshape((B, C) + x.shape[1:]) - - # (C, T, H, W) - elif x.dim() == 4: - x = 
self.resize(x) - - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/__init__.py deleted file mode 100644 index 4fc50543f1d247a021674eaea55ba5f4c224c56e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/preprocessing/features.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/preprocessing/features.py deleted file mode 100644 index 29b5268d6adf7e7e02d548b857eb8ad55a7da1ab..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/preprocessing/features.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
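The `video_augment.py` module deleted above implements the time-masking recipe from "Visual Speech Recognition for Multiple Languages in the Wild": one mask per second of video, each up to 0.4 s wide, filled with the clip mean. The following is a minimal, hedged sketch of that rule only, not the NeMo `VideoFrameMasking` class; the function name `time_mask` and its argument handling are illustrative, and it assumes the same (B, C, T, H, W) layout used throughout these modules.

```
import random

import torch


def time_mask(video: torch.Tensor, lengths: torch.Tensor,
              t_second: float = 0.4, masks_per_second: float = 1.0,
              fps: float = 25.0) -> torch.Tensor:
    """Mask random spans of frames with the clip's mean value (in place)."""
    max_width = int(t_second * fps)                       # longest mask, in frames
    for b in range(video.shape[0]):
        n_frames = int(lengths[b])
        n_masks = int(n_frames / fps * masks_per_second)  # masks proportional to clip length
        mask_value = video[b, :, :n_frames].mean()        # scalar mean, as in the deleted code
        for _ in range(n_masks):
            start = random.randint(0, n_frames - max_width)
            width = random.randint(0, max_width)
            video[b, :, start:start + width] = mask_value
    return video


if __name__ == "__main__":
    clips = torch.rand(2, 1, 100, 32, 32)                 # two 4 s clips at 25 fps
    out = time_mask(clips, torch.tensor([100, 75]))
    print(out.shape)                                      # torch.Size([2, 1, 100, 32, 32])
```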
- -import os -import tempfile - -try: - import torchvision - - TORCHVISION_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - TORCHVISION_AVAILABLE = False - - -class VideoFeaturizer(object): - def __init__(self): - pass - - def process(self, video_file, offset, duration): - - # Load Video - video = self.from_file(video_file, offset=offset, duration=duration) - - return video - - def from_file(self, video_file, offset, duration): - - if not TORCHVISION_AVAILABLE: - raise Exception("Reading Video requires torchvision") - - # Load from filename - if isinstance(video_file, str): - video, audio, infos = torchvision.io.read_video( - video_file, start_pts=offset, end_pts=offset + duration, pts_unit="sec" - ) - - # Load from bytes - elif isinstance(video_file, bytes): - - # webdataset.torch_video - with tempfile.TemporaryDirectory() as dirname: - fname = os.path.join(dirname, f"file.mp4") - with open(fname, "wb") as stream: - stream.write(video_file) - video, audio, infos = torchvision.io.read_video( - fname, start_pts=offset, end_pts=offset + duration, pts_unit="sec" - ) - else: - raise Exception("Unknown video data format") - - return video diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/__init__.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/__init__.py deleted file mode 100644 index 4fc50543f1d247a021674eaea55ba5f4c224c56e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/conv2d.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/conv2d.py deleted file mode 100644 index 25f6e5451b66c43c3c4fc991d816498eb4504fa3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/conv2d.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -from torch import nn -from torch.nn import init -from torch.nn.common_types import _size_2_t - - -class Conv2d(nn.Conv2d): - - """ - Conv2d layer with ResNet initialization: - - Reference: "Deep Residual Learning for Image Recognition" by He et al. 
- https://arxiv.org/abs/1512.03385 - - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: _size_2_t, - stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, - dilation: _size_2_t = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = 'zeros', - device=None, - dtype=None, - weight_init: str = "default", - bias_init: str = "default", - ): - - super(Conv2d, self).__init__( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode, - device=device, - dtype=dtype, - ) - - # Weight Init - assert weight_init in ["default", "he_normal"] - if weight_init == "he_normal": - init.kaiming_normal_(self.weight) - - # Bias Init - assert bias_init in ["default", "zeros"] - if self.bias is not None: - if bias_init == "zeros": - init.zeros_(self.bias) diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/global_avg_pool2d.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/global_avg_pool2d.py deleted file mode 100644 index 6c248d86564e1d16d3d40be7b89fa36028513b94..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/global_avg_pool2d.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from torch import nn - - -class GlobalAvgPool2d(nn.Module): - def __init__(self, dim=(2, 3), keepdim=False): - super(GlobalAvgPool2d, self).__init__() - self.dim = dim - self.keepdim = keepdim - - def forward(self, x): - - assert x.dim() == 4, "input signal should have 4 dims, has {}".format(x.dim()) - - return x.mean(dim=self.dim, keepdim=self.keepdim) diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/permute.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/permute.py deleted file mode 100644 index abd4ce34f4a80a7938af9883c79b2977920e26d9..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/permute.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
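The deleted `conv2d.py` wraps `nn.Conv2d` so that weights can optionally be re-initialized with He (Kaiming) normal initialization and biases zeroed, following He et al. (2015). A minimal sketch of that idea as a plain helper function; the name `make_he_conv2d` is illustrative and not part of NeMo.

```
import torch
from torch import nn
from torch.nn import init


def make_he_conv2d(in_channels: int, out_channels: int, kernel_size, **kwargs) -> nn.Conv2d:
    """Build a Conv2d with He-normal weights and zeroed bias."""
    conv = nn.Conv2d(in_channels, out_channels, kernel_size, **kwargs)
    init.kaiming_normal_(conv.weight)        # He normal initialization, good for ReLU nets
    if conv.bias is not None:
        init.zeros_(conv.bias)               # zero the bias, as in the deleted module
    return conv


if __name__ == "__main__":
    layer = make_he_conv2d(3, 64, kernel_size=7, stride=2, bias=False)
    x = torch.rand(1, 3, 224, 224)
    print(layer(x).shape)                    # torch.Size([1, 64, 109, 109])
```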
- -from torch import nn - - -class Permute(nn.Module): - def __init__(self, dims, make_contiguous=False): - super(Permute, self).__init__() - self.dims = dims - self.make_contiguous = make_contiguous - - def forward(self, x): - x = x.permute(self.dims) - if self.make_contiguous: - x = x.contiguous() - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet.py deleted file mode 100644 index c911db6f3abe6fe28e848707d807778ce19830de..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from torch import nn - -from nemo.collections.multimodal.speech_cv.parts.submodules.conv2d import Conv2d -from nemo.collections.multimodal.speech_cv.parts.submodules.global_avg_pool2d import GlobalAvgPool2d -from nemo.collections.multimodal.speech_cv.parts.submodules.resnet_block import ResNetBlock -from nemo.collections.multimodal.speech_cv.parts.submodules.resnet_bottleneck_block import ResNetBottleneckBlock - - -class ResNet(nn.Module): - - """ ResNet (ResNet18, ResNet34, ResNet50, ResNet101, ResNet152) - Models: 224 x 224 - ResNet18: 11,689,512 Params - ResNet34: 21,797,672 Params - ResNet50: 25,557,032 Params - ResNet101: 44,549,160 Params - Resnet152: 60,192,808 Params - Reference: "Deep Residual Learning for Image Recognition" by He et al. 
- https://arxiv.org/abs/1512.03385 - """ - - def __init__(self, dim_input=3, dim_output=1000, model="ResNet50", include_stem=True, include_head=True): - super(ResNet, self).__init__() - - assert model in ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] - - if model == "ResNet18": - dim_stem = 64 - dim_blocks = [64, 128, 256, 512] - num_blocks = [2, 2, 2, 2] - bottleneck = False - elif model == "ResNet34": - dim_stem = 64 - dim_blocks = [64, 128, 256, 512] - num_blocks = [3, 4, 6, 3] - bottleneck = False - elif model == "ResNet50": - dim_stem = 64 - dim_blocks = [256, 512, 1024, 2048] - num_blocks = [3, 4, 6, 3] - bottleneck = True - elif model == "ResNet101": - dim_stem = 64 - dim_blocks = [256, 512, 1024, 2048] - num_blocks = [3, 4, 23, 3] - bottleneck = True - elif model == "ResNet152": - dim_stem = 64 - dim_blocks = [256, 512, 1024, 2048] - num_blocks = [3, 8, 36, 3] - bottleneck = True - - self.stem = ( - nn.Sequential( - Conv2d( - in_channels=dim_input, - out_channels=dim_stem, - kernel_size=(7, 7), - stride=(2, 2), - weight_init="he_normal", - bias=False, - ), - nn.BatchNorm2d(num_features=dim_stem), - nn.ReLU(), - nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), - ) - if include_stem - else nn.Identity() - ) - - # Blocks - self.blocks = nn.ModuleList() - for stage_id in range(4): - - for block_id in range(num_blocks[stage_id]): - - # Projection Block - if block_id == 0: - if stage_id == 0: - stride = (1, 1) - bottleneck_ratio = 1 - in_features = dim_stem - else: - stride = (2, 2) - bottleneck_ratio = 2 - in_features = dim_blocks[stage_id - 1] - # Default Block - else: - stride = (1, 1) - in_features = dim_blocks[stage_id] - bottleneck_ratio = 4 - - if bottleneck: - self.blocks.append( - ResNetBottleneckBlock( - in_features=in_features, - out_features=dim_blocks[stage_id], - bottleneck_ratio=bottleneck_ratio, - kernel_size=(3, 3), - stride=stride, - ) - ) - else: - self.blocks.append( - ResNetBlock( - in_features=in_features, - out_features=dim_blocks[stage_id], - kernel_size=(3, 3), - stride=stride, - ) - ) - - # Head - self.head = ( - nn.Sequential( - GlobalAvgPool2d(), - nn.Linear(in_features=dim_blocks[-1], out_features=dim_output) - if dim_output is not None - else nn.Identity(), - ) - if include_head - else nn.Identity() - ) - - def forward(self, x): - - # Is Video - if x.dim() == 5: - - is_video = True - batch_size = x.shape[0] - video_frames = x.shape[2] - - # (B, Din, T, H, W) -> (B * T, Din, H, W) - x = x.transpose(1, 2).flatten(start_dim=0, end_dim=1) - - else: - is_video = False - - # (B, Din, H, W) -> (B, D0, H//4, W//4) - x = self.stem(x) - - # (B, D0, H//4, W//4) -> (B, D4, H//32, W//32) - for block in self.blocks: - x = block(x) - - # (B, D4, H//32, W//32) -> (B, Dout) - x = self.head(x) - - # Is Video - if is_video: - - # (B * T, Dout) -> (B, Dout, T) - if x.dim() == 2: - x = x.reshape(batch_size, video_frames, -1).transpose(1, 2) - - # (B * T, D4, H//32, W//32) -> (B, D4, T, H//32, W//32) - else: - x = x.reshape(batch_size, video_frames, x.shape[1], x.shape[2], x.shape[3]).transpose(1, 2) - - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_block.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_block.py deleted file mode 100644 index 19436311ccd7bdaaaf0c6add04691298db238c9e..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_block.py +++ /dev/null @@ -1,86 +0,0 
@@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch import nn -from torch.nn.modules.utils import _pair - -from nemo.collections.multimodal.speech_cv.parts.submodules.conv2d import Conv2d - - -class ResNetBlock(nn.Module): - - """ ResNet Residual Block used by ResNet18 and ResNet34 networks. - References: "Deep Residual Learning for Image Recognition", He et al. - https://arxiv.org/abs/1512.03385 - """ - - def __init__(self, in_features, out_features, kernel_size, stride, weight_init="he_normal", bias_init="zeros"): - super(ResNetBlock, self).__init__() - - # Convert to pair - kernel_size = _pair(kernel_size) - - # layers - self.layers = nn.Sequential( - Conv2d( - in_channels=in_features, - out_channels=out_features, - kernel_size=kernel_size, - stride=stride, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - padding=((kernel_size[0] - 1) // 2, kernel_size[1] // 2), - ), - nn.BatchNorm2d(out_features), - nn.ReLU(), - Conv2d( - in_channels=out_features, - out_channels=out_features, - kernel_size=kernel_size, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - padding=((kernel_size[0] - 1) // 2, kernel_size[1] // 2), - ), - nn.BatchNorm2d(out_features), - ) - - # Residual Block - if torch.prod(torch.tensor(stride)) > 1 or in_features != out_features: - self.residual = nn.Sequential( - Conv2d( - in_channels=in_features, - out_channels=out_features, - kernel_size=1, - stride=stride, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - ), - nn.BatchNorm2d(out_features), - ) - else: - self.residual = nn.Identity() - - # Joined Post Act - self.joined_post_act = nn.ReLU() - - def forward(self, x): - - # Forward Layers - x = self.joined_post_act(self.layers(x) + self.residual(x)) - - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_bottleneck_block.py b/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_bottleneck_block.py deleted file mode 100644 index 50cafa53343ca94f5ba3bead029360cccdc3a8d4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/multimodal/speech_cv/parts/submodules/resnet_bottleneck_block.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
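The `ResNetBlock` deleted just above follows the standard basic-block pattern: two 3x3 conv/batch-norm stages plus a shortcut that is projected with a 1x1 convolution whenever the stride or channel count changes, joined by a ReLU after the addition. A self-contained sketch of that pattern; the class name `BasicResidualBlock` is illustrative, not the NeMo class.

```
import torch
from torch import nn


class BasicResidualBlock(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, stride: int = 1):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
        )
        # Project the shortcut only when the output shape actually changes.
        if stride != 1 or in_ch != out_ch:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )
        else:
            self.shortcut = nn.Identity()
        self.act = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Joined post-activation: ReLU applied after the residual addition.
        return self.act(self.body(x) + self.shortcut(x))


if __name__ == "__main__":
    block = BasicResidualBlock(64, 128, stride=2)
    print(block(torch.rand(1, 64, 56, 56)).shape)   # torch.Size([1, 128, 28, 28])
```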
- -import torch -from torch import nn -from torch.nn.modules.utils import _pair - -from nemo.collections.multimodal.speech_cv.parts.submodules.conv2d import Conv2d - - -class ResNetBottleneckBlock(nn.Module): - - """ ResNet Bottleneck Residual Block used by ResNet50, ResNet101 and ResNet152 networks. - References: "Deep Residual Learning for Image Recognition", He et al. - https://arxiv.org/abs/1512.03385 - """ - - def __init__( - self, - in_features, - out_features, - bottleneck_ratio, - kernel_size, - stride, - weight_init="he_normal", - bias_init="zeros", - ): - super(ResNetBottleneckBlock, self).__init__() - - # Assert - assert in_features % bottleneck_ratio == 0 - - # Convert to pair - kernel_size = _pair(kernel_size) - - # layers - self.layers = nn.Sequential( - Conv2d( - in_channels=in_features, - out_channels=in_features // bottleneck_ratio, - kernel_size=1, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - ), - nn.BatchNorm2d(in_features // bottleneck_ratio), - nn.ReLU(), - Conv2d( - in_channels=in_features // bottleneck_ratio, - out_channels=in_features // bottleneck_ratio, - kernel_size=kernel_size, - stride=stride, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - padding=((kernel_size[0] - 1) // 2, kernel_size[1] // 2), - ), - nn.BatchNorm2d(in_features // bottleneck_ratio), - nn.ReLU(), - Conv2d( - in_channels=in_features // bottleneck_ratio, - out_channels=out_features, - kernel_size=1, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - ), - nn.BatchNorm2d(out_features), - ) - - # Joined Post Act - self.joined_post_act = nn.ReLU() - - # Residual Block - if torch.prod(torch.tensor(stride)) > 1 or in_features != out_features: - self.residual = nn.Sequential( - Conv2d( - in_channels=in_features, - out_channels=out_features, - kernel_size=1, - stride=stride, - bias=False, - weight_init=weight_init, - bias_init=bias_init, - ), - nn.BatchNorm2d(out_features), - ) - else: - self.residual = nn.Identity() - - def forward(self, x): - - # Forward Layers - x = self.joined_post_act(self.layers(x) + self.residual(x)) - - return x diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/__init__.py deleted file mode 100644 index f6e986d0e818c707a94b38d808ba1d6014afb918..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp import data, losses, models, modules -from nemo.package_info import __version__ - -# Set collection version equal to NeMo version. -__version = __version__ - -# Authorship. -__author__ = "NVIDIA Corporation" - -# Set collection name. 
-__description__ = "Natural Language Processing collection" diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/__init__.py deleted file mode 100644 index 78ed9ee2a2439618131a2567d9ebeb21aa7a85de..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.data_utils import * -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( - BertInformationRetrievalDataset, -) -from nemo.collections.nlp.data.language_modeling.l2r_lm_dataset import ( - L2RLanguageModelingDataset, - TarredL2RLanguageModelingDataset, -) -from nemo.collections.nlp.data.language_modeling.lm_bert_dataset import ( - BertPretrainingDataset, - BertPretrainingPreprocessedDataloader, -) -from nemo.collections.nlp.data.language_modeling.sentence_dataset import SentenceDataset, TarredSentenceDataset -from nemo.collections.nlp.data.machine_translation.machine_translation_dataset import ( - TarredTranslationDataset, - TranslationDataset, -) -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.text2sparql.text2sparql_dataset import Text2SparqlDataset -from nemo.collections.nlp.data.text_normalization.decoder_dataset import TextNormalizationDecoderDataset -from nemo.collections.nlp.data.text_normalization.tagger_dataset import TextNormalizationTaggerDataset -from nemo.collections.nlp.data.text_normalization.test_dataset import TextNormalizationTestDataset -from nemo.collections.nlp.data.token_classification.token_classification_dataset import ( - BertTokenClassificationDataset, - BertTokenClassificationInferDataset, -) -from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import ( - ZeroShotIntentDataset, - ZeroShotIntentInferenceDataset, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/__init__.py deleted file mode 100644 index 4e2ba234e6f5bbf517f836a3155f90cbc8d1a655..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.common.sequence_to_sequence_dataset import SequenceToSequenceDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py deleted file mode 100644 index d4ca85289a6f6b253f61484450060fc729c662e6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import numpy as np -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import ( - get_indexed_dataset_, - get_samples_mapping, -) -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import TextMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['SequenceToSequenceDataset', 'TextMemmapSequenceToSequenceDataset'] - - -class SequenceToSequenceDataset(Dataset): - """Sequence to Sequence Dataset in memory.""" - - def __init__( - self, - src_file_name: str, - tgt_file_name: str, - src_tokenizer: TokenizerSpec, - tgt_tokenizer: TokenizerSpec, - max_src_seq_length: int, - max_tgt_seq_length: int, - add_bos_to_input: bool = True, - add_eos_to_input: bool = True, - replace_bos_with_pad: bool = False, - ): - super().__init__() - self.src_file_name = src_file_name - self.tgt_file_name = tgt_file_name - self.src_tokenizer = src_tokenizer - self.tgt_tokenizer = tgt_tokenizer - self.max_src_seq_length = max_src_seq_length - self.max_tgt_seq_length = max_tgt_seq_length - self.add_bos_to_input = add_bos_to_input - self.add_eos_to_input = add_eos_to_input - self.replace_bos_with_pad = replace_bos_with_pad - assert self.max_src_seq_length > 0 - assert self.max_tgt_seq_length > 0 - self._check_files_exist() - self._get_examples() - - def _check_files_exist(self): - if not os.path.exists(self.src_file_name): - raise FileNotFoundError(f"Source file {self.src_file_name} not found") - if not os.path.exists(self.tgt_file_name): - raise FileNotFoundError(f"Source file {self.src_file_name} not found") - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx): - example = self.examples[idx] - text_enc = example['src'] - text_dec = example['tgt'][:-1] - labels = example['tgt'][1:] - return {'text_enc': text_enc, 'text_dec': text_dec, 'labels': labels} - - def _get_examples(self): - self.examples = [] - with open(self.src_file_name, encoding='utf8') as f_src, open(self.tgt_file_name, encoding='utf8') as f_tgt: - for i, (src, tgt) in enumerate(zip(f_src, f_tgt)): - if i % 10000 == 0 and i != 0: - logging.info(f"Read {i} lines from {self.src_file_name} & {self.tgt_file_name}") 
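The `__getitem__` of `SequenceToSequenceDataset`, shown a few lines above, produces decoder inputs and labels by shifting the same tokenized target by one position (teacher forcing). A toy illustration of that shift, with made-up token ids (1 = BOS, 2 = EOS in this example):

```
# Decoder sees the target shifted right; it learns to predict the left shift.
tgt = [1, 17, 42, 99, 2]          # BOS, tokens, EOS (already tokenized)

text_dec = tgt[:-1]               # [1, 17, 42, 99]  -> decoder input
labels = tgt[1:]                  # [17, 42, 99, 2]  -> prediction targets

for inp, lab in zip(text_dec, labels):
    print(f"given {inp:3d} predict {lab:3d}")
```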
- src = self.src_tokenizer.text_to_ids(src.strip()) - if self.add_bos_to_input: - src = [self.src_tokenizer.pad_id if self.replace_bos_with_pad else self.src_tokenizer.bos_id] + src - if self.add_eos_to_input: - src = src + [self.src_tokenizer.eos_id] - - tgt = ( - [self.tgt_tokenizer.pad_id if self.replace_bos_with_pad else self.tgt_tokenizer.bos_id] - + self.tgt_tokenizer.text_to_ids(tgt.strip()) - + [self.tgt_tokenizer.eos_id] - ) - # Truncate to max sequence length. - if len(src) > self.max_src_seq_length: - src = src[-self.max_src_seq_length + 1 :] - if len(tgt) > self.max_tgt_seq_length: - tgt = tgt[-self.max_tgt_seq_length + 1 :] - self.examples.append({'src': src, 'tgt': tgt}) - - logging.info(f'Dataset Length : {len(self.examples)}') - - def collate_fn(self, batch): - text_enc = [item['text_enc'] for item in batch] - text_dec = [item['text_dec'] for item in batch] - labels = [item['labels'] for item in batch] - - if isinstance(text_enc[0], np.ndarray): - text_enc = [x.tolist() for x in text_enc] - - if isinstance(text_dec[0], np.ndarray): - text_dec = [x.tolist() for x in text_dec] - - if isinstance(labels[0], np.ndarray): - labels = [x.tolist() for x in labels] - - max_dec_input_length = max([len(item) for item in text_dec]) if text_dec else 0 - max_enc_input_length = max([len(item) for item in text_enc]) if text_enc else 0 - max_label_length = max([len(item) for item in labels]) if labels else 0 - - loss_mask = [([1] * (len(item))) + ([0] * (max_label_length - len(item))) for item in labels] - text_enc = [item + [self.src_tokenizer.pad_id] * (max_enc_input_length - len(item)) for item in text_enc] - text_dec = [item + [self.tgt_tokenizer.pad_id] * (max_dec_input_length - len(item)) for item in text_dec] - labels = [item + [self.tgt_tokenizer.pad_id] * (max_label_length - len(item)) for item in labels] - - text_enc = torch.LongTensor(text_enc) - text_dec = torch.LongTensor(text_dec) - labels = torch.LongTensor(labels) - loss_mask = torch.LongTensor(loss_mask) - - enc_mask = (text_enc != self.src_tokenizer.pad_id).long() - dec_mask = (text_dec != self.tgt_tokenizer.pad_id).long() - - return { - 'text_enc': text_enc, - 'text_dec': text_dec, - 'labels': labels, - 'loss_mask': loss_mask, - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - } - - -class IndexedSequenceToSequenceDataset(SequenceToSequenceDataset): - """Abstract class for TextMemmapSequenceToSequenceDataset and BinarizedMemmapSequenceToSequenceDataset. - This class is not meant to be used standalone and just as an abstract class for the two subclasses. - """ - - def __init__( - self, - src_file_name: str, - tgt_file_name: str, - src_tokenizer: TokenizerSpec, - tgt_tokenizer: TokenizerSpec, - max_src_seq_length: int, - max_tgt_seq_length: int, - seed: int = 1234, - add_bos_to_enc: bool = True, - add_eos_to_enc: bool = True, - max_num_samples: int = None, - prepend_id: int = None, - ): - """ - src_file_name: Path to a single source file on disk. This is either the path to a raw text file or the prefix to the processed src_file_name.bin/idx files. - src_file_name: Path to a single target file on disk. This is either the path to a raw text file or the prefix to the processed tgt_file_name.bin/idx files. - src_tokenizer: Tokenizer for the source dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - tgt_tokenizer: Tokenizer for the target dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_src_seq_length: Maximum length of the source sequences. 
Lines above this length will be truncated. - max_tgt_seq_length: Maximum length of the target sequences. Lines above this length will be truncated. - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - prepend_id: If not None, prepend this id to the encoder input. - """ - super().__init__( - src_file_name=src_file_name, - tgt_file_name=tgt_file_name, - src_tokenizer=src_tokenizer, - tgt_tokenizer=tgt_tokenizer, - max_src_seq_length=max_src_seq_length, - max_tgt_seq_length=max_tgt_seq_length, - ) - self.seed = seed - self.max_num_samples = max_num_samples - self.add_bos_to_enc = add_bos_to_enc - self.add_eos_to_enc = add_eos_to_enc - self.prepend_id = prepend_id - - logging.info(f'Desired number of samples : {self.max_num_samples}') - logging.info(f'Source Dataset Length : {len(self.src_indexed_dataset)}') - logging.info(f'Target Dataset Length : {len(self.tgt_indexed_dataset)}') - - def __len__(self): - if self.max_num_samples is None: - return len(self.src_indexed_dataset) - else: - return self.max_num_samples - - def _get_sample(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - assert idx < len(self.src_indexed_dataset) - src = self.src_indexed_dataset[idx] - tgt = self.tgt_indexed_dataset[idx] - - return src, tgt - - def __getitem__(self, idx): - src, tgt = self._get_sample(idx) - offset = 0 - if self.add_bos_to_enc: - offset += 1 - if self.add_eos_to_enc: - offset += 1 - if self.prepend_id is not None: - offset += 1 - - if len(src) > self.max_src_seq_length - offset: - src = src[: self.max_src_seq_length - offset] - - if self.add_bos_to_enc: - src = np.concatenate([[self.src_tokenizer.bos_id], src]) - - if self.prepend_id is not None: - src = np.concatenate([[self.prepend_id], src]) - - if self.add_eos_to_enc: - src = np.concatenate([src, [self.src_tokenizer.eos_id]]) - - if len(tgt) > self.max_tgt_seq_length - 2: - tgt = tgt[: self.max_tgt_seq_length - 2] - - text_dec = np.concatenate([[self.tgt_tokenizer.bos_id], tgt]) - labels = np.concatenate([tgt, [self.tgt_tokenizer.eos_id]]) - - return {'text_enc': src, 'text_dec': text_dec, 'labels': labels} - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - # This means max src and max tgt sequence length need to be the same - if self.max_src_seq_length != self.max_tgt_seq_length: - raise ValueError( - f"max_src_seq_length ({self.max_src_seq_length}) != max_tgt_seq_length ({self.max_tgt_seq_length}). This is needed for max_samples based training for now." - ) - - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.src_indexed_dataset, - data_prefix=self.src_file_name, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_src_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.src_file_name.split('/')[-1], - binary_head=False, - ) - else: - self.samples_mapping = None - - -class TextMemmapSequenceToSequenceDataset(IndexedSequenceToSequenceDataset): - """Memory-mapped text sequence to sequence dataset. 
Operates on raw text files and tokenizes the text on-the-fly.""" - - def __init__( - self, - src_file_name: str, - tgt_file_name: str, - src_tokenizer: TokenizerSpec, - tgt_tokenizer: TokenizerSpec, - max_src_seq_length: int, - max_tgt_seq_length: int, - seed: int = 1234, - max_num_samples: int = None, - add_bos_to_enc: bool = True, - add_eos_to_enc: bool = True, - prepend_id: int = None, - ): - """ - src_file_name: Path to a single source file on disk. The file should contain one sentence per line and be raw text. - tgt_file_name: Path to a single target file on disk. The file should contain one sentence per line aligned with src_file_name and be raw text. - src_tokenizer: Tokenizer for the source dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - tgt_tokenizer: Tokenizer for the target dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_src_seq_length: Maximum length of the source sequences. Lines above this length will be truncated. - max_tgt_seq_length: Maximum length of the target sequences. Lines above this length will be truncated. - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - add_bos_to_enc: Add BOS token to the encoder input. - add_eos_to_enc: Add EOS token to the encoder input. - prepend_id: If not None, prepend this id to the encoder input. - """ - self.seed = seed - self.max_num_samples = max_num_samples - super().__init__( - src_file_name=src_file_name, - tgt_file_name=tgt_file_name, - src_tokenizer=src_tokenizer, - tgt_tokenizer=tgt_tokenizer, - max_src_seq_length=max_src_seq_length, - max_tgt_seq_length=max_tgt_seq_length, - seed=seed, - max_num_samples=max_num_samples, - add_bos_to_enc=add_bos_to_enc, - add_eos_to_enc=add_eos_to_enc, - prepend_id=prepend_id, - ) - - def _get_examples(self): - self.src_indexed_dataset = TextMemMapDataset( - dataset_paths=[self.src_file_name], tokenizer=self.src_tokenizer, header_lines=0 - ) - self.tgt_indexed_dataset = TextMemMapDataset( - dataset_paths=[self.tgt_file_name], tokenizer=self.tgt_tokenizer, header_lines=0 - ) - - assert len(self.src_indexed_dataset) == len( - self.tgt_indexed_dataset - ), "src and tgt has different number of lines" - self._build_samples_mapping() - - -class BinarizedMemmapSequenceToSequenceDataset(IndexedSequenceToSequenceDataset): - """Memory-mapped text sequence to sequence dataset. Operates pre-tokenized binarized data files.""" - - def __init__( - self, - src_dataset_prefix: str, - tgt_dataset_prefix: str, - src_tokenizer: TokenizerSpec, - tgt_tokenizer: TokenizerSpec, - max_src_seq_length: int, - max_tgt_seq_length: int, - seed: int = 1234, - max_num_samples: int = None, - add_bos_to_enc: bool = True, - add_eos_to_enc: bool = True, - prepend_id: int = None, - ): - """ - src_dataset_prefix: Path to the *prefix* of a single source bin/idx file on disk. This necessitates the existance src_file_prefix.bin and src_file_prefix.idx. - tgt_dataset_prefix: Path to the *prefix* of a single target aligned with source bin/idx file on disk. This necessitates the existance tgt_file_prefix.bin and tgt_file_prefix.idx. - src_tokenizer: Tokenizer for the source dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - tgt_tokenizer: Tokenizer for the target dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). 
- max_src_seq_length: Maximum length of the source sequences. Lines above this length will be truncated. - max_tgt_seq_length: Maximum length of the target sequences. Lines above this length will be truncated. - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - add_bos_to_enc: Add BOS token to the encoder input. - add_eos_to_enc: Add EOS token to the encoder input. - prepend_id: If not None, prepend this id to the encoder input. - """ - self.src_dataset_prefix = src_dataset_prefix - self.tgt_dataset_prefix = tgt_dataset_prefix - self.seed = seed - self.max_num_samples = max_num_samples - super().__init__( - src_file_name=src_dataset_prefix, - tgt_file_name=tgt_dataset_prefix, - src_tokenizer=src_tokenizer, - tgt_tokenizer=tgt_tokenizer, - max_src_seq_length=max_src_seq_length, - max_tgt_seq_length=max_tgt_seq_length, - seed=seed, - max_num_samples=max_num_samples, - add_bos_to_enc=add_bos_to_enc, - add_eos_to_enc=add_eos_to_enc, - prepend_id=prepend_id, - ) - - def _check_files_exist(self): - if not os.path.exists(self.src_dataset_prefix + ".bin") or not os.path.exists( - self.src_dataset_prefix + ".idx" - ): - raise FileNotFoundError(f"{self.src_dataset_prefix}.bin or {self.src_dataset_prefix}.idx not found") - if not os.path.exists(self.tgt_dataset_prefix + ".bin") or not os.path.exists( - self.tgt_dataset_prefix + ".idx" - ): - raise FileNotFoundError(f"{self.tgt_dataset_prefix}.bin or {self.tgt_dataset_prefix}.idx not found") - - def _get_examples(self): - self.src_indexed_dataset = self._get_indexed_dataset( - self.src_dataset_prefix, data_impl='mmap', skip_warmup=True - ) - self.tgt_indexed_dataset = self._get_indexed_dataset( - self.tgt_dataset_prefix, data_impl='mmap', skip_warmup=True - ) - assert len(self.src_indexed_dataset) == len(self.tgt_indexed_dataset) - self._build_samples_mapping() - - def _get_indexed_dataset(self, data_prefix, data_impl, skip_warmup): - indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) - return indexed_dataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/__init__.py deleted file mode 100644 index f57d67bdeaef1095aac15b0ec3586f8687bb1b50..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
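The `collate_fn` of the deleted `SequenceToSequenceDataset` pads every example to the longest sequence in the batch and derives binary masks by comparing the padded ids against the pad id. A minimal sketch of that padding and masking scheme; the `PAD_ID` value, helper name, and example batch are invented for illustration.

```
import torch

PAD_ID = 0


def pad_batch(sequences, pad_id=PAD_ID):
    """Pad a list of id lists to the batch max length; return ids and mask."""
    max_len = max(len(seq) for seq in sequences)
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in sequences]
    ids = torch.tensor(padded, dtype=torch.long)
    mask = (ids != pad_id).long()        # 1 for real tokens, 0 for padding
    return ids, mask


if __name__ == "__main__":
    batch = [[5, 6, 7], [8, 9], [10, 11, 12, 13]]
    ids, mask = pad_batch(batch)
    print(ids)       # padded (3, 4) tensor of token ids
    print(mask)      # matching 0/1 attention mask
```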
- -from nemo.collections.nlp.data.data_utils.data_preprocessing import * diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/data_preprocessing.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/data_preprocessing.py deleted file mode 100644 index 25884a8290536f7f76327aa487281be75a8825ff..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/data_utils/data_preprocessing.py +++ /dev/null @@ -1,623 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import csv -import json -import os -import pickle -import random -import re -import string -from collections import Counter - -import numpy as np -import torch -from tqdm.auto import tqdm - -from nemo.utils import logging -from nemo.utils.env_var_parsing import get_envint - -__all__ = [ - "DataProcessor", - "get_label_stats", - "get_multi_label_stats", - "partition_data", - "write_files", - "write_data", - "create_dataset", - "read_csv", - "get_dataset", - "partition", - "map_entities", - "get_entities", - "get_data", - "reverse_dict", - "get_intent_labels", - "get_stats", - "DATABASE_EXISTS_TMP", - "MODE_EXISTS_TMP", - "is_whitespace", - "write_vocab", - "if_exist", - "remove_punctuation_from_sentence", - "dataset_to_ids", - "get_freq_weights", - "get_freq_weights_bce_with_logits_loss", - "fill_class_weights", - "normalize_answer", - "get_labels_to_labels_id_mapping", - "get_vocab", - "find_newlines", - "load_data_indices", - "chinese_punctuation", - "check_chinese_char", - "normalize_chinese_answer", -] - -DATABASE_EXISTS_TMP = "{} dataset has already been processed and stored at {}" -MODE_EXISTS_TMP = "{} mode of {} dataset has already been processed and stored at {}" - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -chinese_punctuation = { - "——", - "‘", - "’", - "“", - "”", - "…", - "、", - "。", - "〈", - "〉", - "《", - "》", - "「", - "」", - "『", - "』", - "【", - "】", - "〔", - "〕", - "!", - "(", - ")", - ",", - ".", - ":", - ";", - "?", -} - - -def check_chinese_char(ch): - """Check if a character is in Chinese.""" - if "\u4e00" <= ch <= "\u9fff" or ch in chinese_punctuation: - return 
True - else: - return False - - -def normalize_chinese_answer(text): - """Remove the Chinese punctuation and separate Chinese answers to char-level""" - - def remove_punc(text): - exclude = chinese_punctuation - return "".join(ch for ch in text if ch not in exclude) - - def separate_char(text): - ch_list = [] - for ch in text: - ch_list.append(ch) - return ch_list - - return separate_char(remove_punc(text)) - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r"\b(a|an|the)\b", " ", text) - - def white_space_fix(text): - return " ".join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_label_stats(labels, outfile="stats.tsv", verbose=True): - """ - Args: - labels: list of all labels - outfile: path to the file where to save label stats - Returns: - total (int): total number of labels - label_frequencies (list of tuples): each tuple represent (label, label frequency) - max id of the labels - """ - labels = Counter(labels) - total = sum(labels.values()) - out = open(outfile, "w") - i = 0 - freq_dict = {} - label_frequencies = labels.most_common() - for k, v in label_frequencies: - out.write(f"{k}\t\t{round(v/total,5)}\t\t{v}\n") - if verbose and i < 3: - logging.info(f"label: {k}, {v} out of {total} ({(v / total)*100.0:.2f}%).") - i += 1 - freq_dict[k] = v - - return total, freq_dict, max(labels.keys()) - - -def get_multi_label_stats(labels, outfile="stats.tsv", verbose=True): - """ - Args: - labels: list of tuples containing labels for each utterance - Example: If there are 5 intents in total, then (0,1,1,1,0) represents the labels - for an individual utterance. (0,1,1,1,0) indicates that the utterance has labels - at index/line 1,2, and 3 in dict.intents. The list of tuples contain labels for - all utterances. - - outfile: path to the file where to save label stats - - Returns: - total (int): total number of labels - freq_dict (list of tuples): each tuple represents class counts in the form of (negative, positive) - """ - total = len(labels) - positive_class_total = 0 - class_count_dict = {} - - # Get the count of each label in the label dictionary, both the positive and negative classes - for label in labels: - for label_index, val in enumerate(label): - if label_index not in class_count_dict: - class_count_dict[label_index] = [0, 0] - - if val == 1: - positive_class_total += 1 - class_count_dict[label_index][1] += 1 - else: - class_count_dict[label_index][0] += 1 - - if verbose: - three_most_frequent_classes = sorted(class_count_dict, key=lambda idx: class_count_dict[idx][1], reverse=True) - - for cnt, idx in enumerate(three_most_frequent_classes): - if cnt > 2: - break - - positives = class_count_dict[idx][1] - logging.info( - f"label: {idx}, {positives} out of {positive_class_total} ({(positives / positive_class_total)*100.0:.2f}%)." 
- ) - - return total, class_count_dict, len(labels[0]) - 1 - - -def partition_data(intent_queries, slot_tags, split=0.1): - n = len(intent_queries) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] - - dev_intents.append("sentence\tlabel\n") - train_intents.append("sentence\tlabel\n") - - for i, item in enumerate(intent_queries): - if i in dev_idx: - dev_intents.append(item) - dev_slots.append(slot_tags[i]) - else: - train_intents.append(item) - train_slots.append(slot_tags[i]) - return train_intents, train_slots, dev_intents, dev_slots - - -def write_files(data, outfile): - with open(outfile, "w") as f: - for item in data: - item = f"{item.strip()}\n" - f.write(item) - - -def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): - intent_file = open(f"{outfold}/{mode}.tsv", "w") - intent_file.write("sentence\tlabel\n") - slot_file = open(f"{outfold}/{mode}_slots.tsv", "w") - for tokens, slots, intent in data: - text = " ".join(tokens) - if uncased: - text = text.lower() - intent_file.write(f"{text}\t{intent_dict[intent]}\n") - slots = [str(slot_dict[slot]) for slot in slots] - slot_file.write(" ".join(slots) + "\n") - intent_file.close() - slot_file.close() - - -def create_dataset(train, dev, slots, intents, uncased, outfold): - os.makedirs(outfold, exist_ok=True) - if "O" in slots: - slots.remove("O") - slots = sorted(list(slots)) + ["O"] - intents = sorted(list(intents)) - slots = write_vocab(slots, f"{outfold}/dict.slots.csv") - intents = write_vocab(intents, f"{outfold}/dict.intents.csv") - write_data(train, slots, intents, outfold, "train", uncased) - write_data(dev, slots, intents, outfold, "test", uncased) - - -def read_csv(file_path): - rows = [] - with open(file_path, "r") as csvfile: - read_csv = csv.reader(csvfile, delimiter=",") - for row in read_csv: - rows.append(row) - return rows - - -def get_dataset(files, dev_split=0.1): - # entity2value, value2entity = get_entities(files) - data, slots, intents = get_data(files) - if len(data) == 1: - train, dev = partition(data[0], split=dev_split) - else: - train, dev = data[0], data[1] - return train, dev, slots, intents - - -def partition(data, split=0.1): - n = len(data) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev, train = [], [] - - for i, item in enumerate(data): - if i in dev_idx: - dev.append(item) - else: - train.append(item) - return train, dev - - -def map_entities(entity2value, entities): - for key in entities: - if "data" in entities[key]: - if key not in entity2value: - entity2value[key] = set([]) - - values = [] - for value in entities[key]["data"]: - values.append(value["value"]) - values.extend(value["synonyms"]) - entity2value[key] = entity2value[key] | set(values) - - return entity2value - - -def get_entities(files): - entity2value = {} - for file in files: - with open(file, "r") as json_file: - data = json.load(json_file) - entity2value = map_entities(entity2value, data["entities"]) - - value2entity = reverse_dict(entity2value) - return entity2value, value2entity - - -def get_data(files): - all_data, all_slots, all_intents = [], set(["O"]), set() - for file in files: - file_data = [] - with open(file, "r") as json_file: - data = json.load(json_file) - for intent in data["intents"]: - all_intents.add(intent) - utterances = data["intents"][intent]["utterances"] - for utterance in utterances: - tokens, slots = [], [] - for frag in utterance["data"]: - frag_tokens = 
frag["text"].strip().split() - tokens.extend(frag_tokens) - if "slot_name" not in frag: - slot = "O" - else: - slot = frag["slot_name"] - all_slots.add(slot) - slots.extend([slot] * len(frag_tokens)) - file_data.append((tokens, slots, intent)) - all_data.append(file_data) - return all_data, all_slots, all_intents - - -def reverse_dict(entity2value): - value2entity = {} - for entity in entity2value: - for value in entity2value[entity]: - value2entity[value] = entity - return value2entity - - -def get_intent_labels(intent_file): - labels = {} - label = 0 - with open(intent_file, "r") as f: - for line in f: - intent = line.strip() - labels[intent] = label - label += 1 - return labels - - -def get_stats(lengths): - logging.info("Some stats of the lengths of the sequences:") - lengths = np.asarray(lengths) - logging.info( - f"Min: {np.min(lengths)} | \ - Max: {np.max(lengths)} | \ - Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}" - ) - logging.info(f"75 percentile: {np.percentile(lengths, 75):.2f}") - logging.info(f"99 percentile: {np.percentile(lengths, 99):.2f}") - - -def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def write_vocab(items, outfile): - vocab = {} - idx = 0 - with open(outfile, "w") as f: - for item in items: - f.write(item + "\n") - vocab[item] = idx - idx += 1 - return vocab - - -def get_labels_to_labels_id_mapping(file): - """ - Reads labels from the file and returns labels to id mapping dictionary - Args: - file: path to file - Returns: - labels to id mapping dictionary - """ - lines = open(file, "r").readlines() - lines = [line.strip() for line in lines if line.strip()] - label_ids = {lines[i]: i for i in range(len(lines))} - return label_ids - - -def if_exist(outfold, files): - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f"{outfold}/{file}"): - return False - return True - - -def remove_punctuation_from_sentence(sentence): - sentence = re.sub("[" + string.punctuation + "]", "", sentence) - sentence = sentence.lower() - return sentence - - -def dataset_to_ids( - dataset, - tokenizer, - cache_ids=False, - add_bos_eos=True, - cache_data_per_node=False, - use_cache=False, - remove_trailing_newline=False, -): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. - - Args: - dataset (str): path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids (bool): if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos (bool): whether to add and symbols (e.g., for NMT) - cache_data_per_node (bool): Cache data on local_rank 0. Use when there is not a shared-filesystem. - use_cache (bool): Use cached ids if they exist. - remove_trailing_newline (bool): Remove trailing newline character. 
- Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if use_cache and os.path.isfile(cached_ids_dataset): - logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - logging.info(f"Tokenizing dataset {dataset}...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in tqdm(data, desc="Tokenizing sentence"): - text = sentence.decode("utf-8") - if remove_trailing_newline: - text = text.rstrip("\n") - sent_ids = tokenizer.text_to_ids(text) - if add_bos_eos: - sent_ids = [tokenizer.bos_id] + sent_ids + [tokenizer.eos_id] - ids.append(sent_ids) - if cache_ids and ( - not torch.distributed.is_initialized() or (cache_data_per_node and get_envint("LOCAL_RANK", 0) == 0) - ): - logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def get_freq_weights(label_freq): - """ - Goal is to give more weight to the classes with less samples - so as to match the ones with the higher frequencies. We achieve this by - dividing the total frequency by the freq of each label to calculate its weight. - """ - total_size = 0 - for lf in label_freq.values(): - total_size += lf - weighted_slots = {label: (total_size / (len(label_freq) * freq)) for label, freq in label_freq.items()} - - return weighted_slots - - -def get_freq_weights_bce_with_logits_loss(label_freq): - """ - Calculate positive class weights to be passed to BCEWithLogitsLoss - https://pytorch.org/docs/1.9.1/generated/torch.nn.BCEWithLogitsLoss.html - - Args: - label_freq: dictionary of tuples where keys represents class id, and tuple represents counts of positive and negative classes, - positive classes are at index 1 and negative at index 0 - Returns: - weights: dictionary of labels with their weights - """ - weights = {} - - for label_id, class_values in label_freq.items(): - positive_class = class_values[1] - negative_class = class_values[0] - - if positive_class == 0: - weights[label_id] = 0 - - else: - weights[label_id] = float(negative_class) / float(positive_class) - - return weights - - -def fill_class_weights(weights, max_id=-1): - """ - Gets a dictionary of labels with their weights and creates a list with size of the labels filled with those weights. - Missing labels in the dictionary would get value 1. - - Args: - weights: dictionary of weights for labels, labels as keys and weights are their values - max_id: the largest label id in the dataset, default=-1 would consider the largest label in the weights dictionary as max_id - Returns: - weights_list: list of weights for labels - """ - if max_id < 0: - max_id = 0 - for l in weights.keys(): - max_id = max(max_id, l) - - all_weights = [1.0] * (max_id + 1) - for i in range(len(all_weights)): - if i in weights: - all_weights[i] = weights[i] - return all_weights - - -def get_vocab(file): - lines = open(file, "r").readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {i: lines[i] for i in range(len(lines))} - return labels - - -def find_newlines(contents): - """ - Finds all of the newline positions in a text file. 
- """ - start = 0 - - while True: - try: - # index and split are much faster than Python for loops - new_start = contents.index(b"\n", start) - line = ( - contents[start:new_start] - .replace(b"\xc2\x99", b" ") - .replace(b"\xc2\xa0", b" ") - .decode("utf-8", errors="ignore") - ) - - if len(line.split()) > 0: - yield start - - start = new_start + 1 - - except ValueError: - break - - -def load_data_indices(idx_file: str, data_file: str, savename: str): - """ - Loads dataset index file if it exsits - """ - data_dir = data_file[: data_file.rfind("/")] - mode = data_file[data_file.rfind("/") + 1 : data_file.rfind(".")] - idx_file = f"{data_dir}/{mode}_{savename}.pkl" - - if os.path.isfile(idx_file): - # If the sentence indices file already exists, load from it - with open(idx_file, "rb") as f: - indices = pickle.load(f) - - return indices, idx_file, data_dir - - return None, idx_file, data_dir diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/__init__.py deleted file mode 100644 index a3992ef599715143b374736347cfca2d581591f2..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset import ( - DialogueBERTDataset, - DialogueGPTClassificationDataset, - DialogueSGDBERTDataset, - DialogueZeroShotIntentDataset, -) -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/__init__.py deleted file mode 100644 index 2db92b2574167fb3436c079fd7941450ae0c316b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py deleted file mode 100644 index 98d24802189e6eaea829eae25adcefcdb1af7fc7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - -__all__ = ['DialogueAssistantDataProcessor'] - - -class DialogueAssistantDataProcessor(DialogueDataProcessor): - """Data Processor for Assistant dialogues.""" - - def __init__(self, data_dir: str, tokenizer: object, cfg): - """ - Constructs DialogueAssistantDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - """ - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - self.intents = self.open_file("dict.intents.csv") - if self.cfg.preprocess_intent_function == 'remove_domain': - self.intents = [ - DialogueAssistantDataProcessor.normalize_zero_shot_intent(intent) for intent in self.intents - ] - self.slots = self.open_file("dict.slots.csv") - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(self.slots) - self.slots = unified_slots - - self.bio_slot_ids_to_unified_slot_ids = bio_slot_ids_to_unified_slot_ids - self.services = sorted(list(set([intent.split('_')[0] for intent in self.intents]))) - self.empty_slot_id = [str(idx) for idx, slot_name in enumerate(self.slots) if slot_name == "O"][0] - - @staticmethod - def normalize_zero_shot_intent(label): - label = label.split('.')[1] - if label == 'nomatch': - return 'no match' - else: - return label.replace('_', ' ') - - def open_file(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - lines = [i.strip() for i in f.readlines()] - return lines - - @staticmethod - def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - - """ - Extract continuous spans of slot_ids - - To accomodate slots with distinct labels for B-label1 and I-label1, - slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - - Args: - Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 - Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" - Except for the empty_slot_id (54 in this case), we hope to extract the continuous 
spans of tokens,
-            each containing a start position and an exclusive end position
-            E.g. {18: [9, 10], 44: [11, 13], 46: [14, 16], 12: [17, 18]}
-        """
-        slot_id_stack = []
-        position_stack = []
-        for i in range(len(slot_ids)):
-            slot_id = slot_ids[i]
-
-            slot_id = bio_slot_ids_to_unified_slot_ids[slot_id]
-
-            if not slot_id_stack or slot_id != slot_id_stack[-1]:
-                slot_id_stack.append(slot_id)
-                position_stack.append([])
-            position_stack[-1].append(i)
-
-        slot_id_to_start_and_exclusive_end = {
-            slot_id_stack[i]: [position_stack[i][0], position_stack[i][-1] + 1]
-            for i in range(len(position_stack))
-            if slot_id_stack[i] != empty_slot_id
-        }
-
-        return slot_id_to_start_and_exclusive_end
-
-    @staticmethod
-    def map_bio_format_slots_to_unified_slots(slots):
-        """
-        Maps BIO-format slots to unified slots (meaning that B-alarm_time and I-alarm_time both map to alarm_time).
-        This is called even when the slots do not contain BIO prefixes, to keep the interface unified;
-        in that case slots == unified_slots and bio_slot_ids_to_unified_slot_ids is an identity mapping, i.e. {"0": "0", "1": "1"}
-        """
-        bio_slot_ids_to_unified_slot_ids = {}
-        unified_slots = []
-        unified_idx = -1
-        for idx, slot in enumerate(slots):
-            if slot.replace('I-', '').replace('B-', '') not in unified_slots:
-                unified_idx += 1
-                unified_slots.append(slot.replace('I-', '').replace('B-', ''))
-            bio_slot_ids_to_unified_slot_ids[str(idx)] = str(unified_idx)
-        return bio_slot_ids_to_unified_slot_ids, unified_slots
-
-    def get_dialog_examples(self, dataset_split: str):
-        """
-        Process raw files into DialogueInputExample
-        Args:
-            dataset_split: {train, dev, test}
-            For the assistant dataset, there is no explicit dev set (the test set is instead used as the dev set).
-            Therefore, this function creates a dev set and a new train set from the train set.
-            This is done by taking every 10th example and putting it into the dev set,
-            with all other examples going into the new train set.
- """ - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "test"} - - raw_examples_intent = self.open_file("{}.tsv".format(dataset_split_print[dataset_split])) - # removes header of tsv file - raw_examples_intent = raw_examples_intent[1:] - raw_examples_slots = self.open_file("{}_slots.tsv".format(dataset_split_print[dataset_split])) - - if dataset_split in ["train", "dev"]: - train_idx = [] - dev_idx = [] - for idx in range(len(raw_examples_intent)): - if idx % 10 == 0: - dev_idx.append(idx) - else: - train_idx.append(idx) - - if dataset_split == "train": - raw_examples_intent = [raw_examples_intent[idx] for idx in train_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in train_idx] - elif dataset_split == "dev": - raw_examples_intent = [raw_examples_intent[idx] for idx in dev_idx] - raw_examples_slots = [raw_examples_slots[idx] for idx in dev_idx] - - for i in range(len(raw_examples_intent)): - utterance, intent_id = raw_examples_intent[i].split('\t') - slot_ids = raw_examples_slots[i].split() - utterance_tokens = utterance.split() - intent = self.intents[int(intent_id)] - slot_id_to_start_and_exclusive_end = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, self.empty_slot_id, self.bio_slot_ids_to_unified_slot_ids - ) - - slot_to_start_and_exclusive_end = { - self.slots[int(slot_id)]: position for slot_id, position in slot_id_to_start_and_exclusive_end.items() - } - slot_to_words = { - slot: ' '.join(utterance_tokens[position[0] : position[1]]) - for slot, position in slot_to_start_and_exclusive_end.items() - } - input_example = { - "utterance": utterance, - "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, - "label_positions": { - "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} - for slot, position in slot_to_start_and_exclusive_end.items() - } - }, - "possible_labels": { - "service": self.services, - "intent": self.intents, - "slots": { - # this dataset does not support categorical slots (i.e. only extractive slots) - # therefore use empty list for all values - slot: [] - for slot in self.slots - }, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py deleted file mode 100644 index 2a4b21c7053537133e177e985a2a798ff5ef95dc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import random
-
-from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor
-
-__all__ = ['DialogueDataProcessor']
-
-
-class DialogueDataProcessor(DataProcessor):
-    """
-    Base class for Data Processing for all data sources
-
-    Data Processor is designed to be Model-independent (but Data-dependent), so that it:
-    - Encourages experimentation with a variety of models
-      (BERT-style; GPT-style; T5-style), which have different tokenization/preprocessing requirements
-    - Facilitates experiments with a variety of data sources,
-      as data is processed into a common format
-
-    Roles
-    1. Processes raw files into Dialogue Input Examples.
-    2. Keeps all potentially relevant information from the raw files, from which
-       the Dataset class can then determine which labels to use
-
-    """
-
-    def __init__(self):
-        raise NotImplementedError()
-
-    def get_train_examples(self):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_test_examples(self):
-        """Gets a collection of `InputExample`s for the test set."""
-        raise NotImplementedError()
-
-    @staticmethod
-    def get_relevant_idxs(dataset_split, n_samples, dev_proportion):
-        """
-        Obtain indices for the given dataset_split when the train and dev sets are not in separate files
-
-        Args:
-            dataset_split: train, dev or test
-            n_samples: total number of samples
-            dev_proportion: value from 1 to 99 that represents the proportion of data in the dev set
-        Returns:
-            idxs: indices for relevant samples
-        """
-
-        if dataset_split in ["train", "dev"]:
-            n_dev = int(n_samples * (dev_proportion / 100))
-            dev_idxs = random.sample(list(range(n_samples)), n_dev)
-            if dataset_split == "dev":
-                idxs = dev_idxs
-            else:
-                dev_idxs_set = set(dev_idxs)
-                train_idxs = [idx for idx in list(range(n_samples)) if idx not in dev_idxs_set]
-                idxs = train_idxs
-
-        elif dataset_split == "test":
-            idxs = list(range(n_samples))
-
-        else:
-            raise ValueError("please select dataset split from train, dev and test")
-
-        return idxs
diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py
deleted file mode 100644
index 5e58919b7652293614bdc10009aad7e3d26dc50d..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# Copyright 2019 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - -__all__ = ['DialogueDesignDataProcessor'] - - -class DialogueDesignDataProcessor(DialogueDataProcessor): - """Data Processor for Design Dataset""" - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueDesignDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_design_OV.csv') - # remove disabled examples - raw_examples = [raw_examples[i] for i in range(len(raw_examples)) if raw_examples[i]['disabled'] != 'yes'] - - n_samples = len(raw_examples) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - all_intents = sorted(list(set(raw_examples[i]['intent labels'] for i in range(len(raw_examples))))) - all_services = sorted(list(set(raw_examples[i]['domain'] for i in range(len(raw_examples))))) - for i in idxs: - raw_example = raw_examples[i] - utterances = [raw_example['example_{}'.format(i)] for i in range(1, 4)] - service = raw_example['domain'] - intent = raw_example['intent'] - intent_description = raw_example['intent labels'] - system_utterance = raw_example['response'] - - slot_names = [raw_example['slot{}'.format(i)] for i in range(1, 3)] - # these are possible slot values not ground truth slot values - slot_values = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - slot_questions = [raw_example['slot{}_values'.format(i)] for i in range(1, 3)] - - for j in range(1, 3): - value = raw_example['slot{}'.format(j)] - if isinstance(value, str): - system_utterance = system_utterance.replace('slot{}'.format(j), value) - - valid_slots_ids = [i for i, slot in enumerate(slot_names) if isinstance(slot, str)] - slot_names = [slot_names[i] for i in valid_slots_ids] - slot_values = [slot_values[i] if isinstance(slot_values[i], str) else '' for i in valid_slots_ids] - slot_questions = [slot_questions[i] if isinstance(slot_questions[i], str) else '' for i in valid_slots_ids] - - for utterance in utterances: - if not isinstance(utterance, str): - continue - input_example = { - "utterance": utterance, - "system_utterance": system_utterance, - "labels": { 
- "service": service, - "intent": intent_description, - "slots": { - slot: '' for slot in slot_names - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': all_intents, - "service": all_services, - "slots": {slot: slot_values[i] for i, slot in enumerate(slot_names)}, - }, - "description": { - "service": service, - "intent": intent_description, - "slots": {slot: slot_questions[i] for i, slot in enumerate(slot_names)}, - }, - } - - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py deleted file mode 100644 index 58814a8eee90d79a7c9acdf85110e3df164281d5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pandas as pd - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - -__all__ = ['DialogueMellonQADataProcessor'] - - -class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. - """ - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_csv(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - df = pd.read_csv(filename) - return df.to_dict(orient='index') - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. 
- Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting - """ - - examples = [] - - raw_examples = self.open_csv('mellon_qa_data.csv') - raw_examples = list(raw_examples.values()) - # filter out answers with no answer - raw_examples = [ - example - for example in raw_examples - if isinstance(example['Non Generative Question Answering '], str) - and isinstance(example['Generative Question Answering '], str) - ] - - n_samples = len(raw_examples) - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - for i in idxs: - utterance = str(raw_examples[i]['Question']) - answer = str(raw_examples[i]['Non Generative Question Answering ']) - well_formed_answer = str(raw_examples[i]['Generative Question Answering ']) - passage = raw_examples[i]['Passage'] - input_example = { - "utterance": utterance, - "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py deleted file mode 100644 index 78f434c1d5dda143314d72f4dc729fba7fc2315b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from ast import literal_eval - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - -__all__ = ['DialogueMSMarcoDataProcessor'] - - -class DialogueMSMarcoDataProcessor(DialogueDataProcessor): - """Data Processor for MS Marco dialogues. 
(https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz - """ - - def __init__(self, data_dir: str, tokenizer: object, cfg=None): - """ - Constructs DialogueMSMarcoDataProcessor - Args: - data_dir: path to data directory - tokenizer: tokenizer object - debug_mode: reduce number of samples to load in order to increase speed of processing - cfg: cfg container for dataset - """ - self.data_dir = data_dir - self._tokenizer = tokenizer - self.cfg = cfg - - def open_json(self, filename): - """ - Reads file into a list - """ - filename = os.path.join(self.data_dir, filename) - with open(filename, "r", encoding="UTF-8") as f: - data = json.load(f) - return data - - def get_dialog_examples(self, dataset_split: str): - """ - Process raw files into DialogueInputExample - Args: - dataset_split: {train, dev, test} - For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) - Therefore, this function creates a dev set and a new train set from the train set. - Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set - """ - - examples = [] - - dataset_split_print = {"train": "train", "dev": "train", "test": "dev"} - - raw_examples = self.open_json("{}_v2.1.json".format(dataset_split_print[dataset_split])) - - n_samples = len(raw_examples['answers']) - - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, self.cfg.dev_proportion) - - if self.cfg.debug_mode: - idxs = idxs[:100] - - for i in idxs: - utterance = raw_examples['query'][str(i)] - # answer need not be extracted from passage - # taking the first answer as the ground truth correct answer as only <1% has multiple answers - answer = raw_examples['answers'][str(i)] - answer = answer[0] if isinstance(answer, list) else answer - - well_formed_answer = raw_examples['wellFormedAnswers'][str(i)] - well_formed_answer = ( - well_formed_answer if isinstance(well_formed_answer, list) else literal_eval(well_formed_answer) - ) - well_formed_answer = well_formed_answer[0] if well_formed_answer else None - query_type = raw_examples['query_type'][str(i)] - candidate_passages = raw_examples['passages'][str(i)] - passage = [ - candidate_passage["passage_text"] - for candidate_passage in candidate_passages - if int(candidate_passage["is_selected"]) - ] - passage = passage[0] if passage else None - - possible_passages = [candidate_passage["passage_text"] for candidate_passage in candidate_passages] - - input_example = { - "utterance": utterance, - "example_id": i, - "labels": { - "service": query_type, - "response": answer, - "fluent_response": well_formed_answer, - "passage": passage, - }, - "possible_labels": { - "service": "LOCATION,NUMERIC,PERSON,DESCRIPTION,ENTITY".split(','), - "passage": possible_passages, - }, - } - example = DialogueInputExample(input_example) - examples.append(example) - return examples - - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") diff --git 
a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py deleted file mode 100644 index a78e1973e55fa85cba4543e10669d33a3c56e578..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ /dev/null @@ -1,568 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" -import collections -import json -import os -import pickle -import re -from typing import List - -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -__all__ = ['DialogueSGDDataProcessor'] - -FILE_RANGES = { - "sgd_single_domain": {"train": range(1, 44), "dev": range(1, 8), "test": range(1, 12)}, - "sgd_multi_domain": {"train": range(44, 128), "dev": range(8, 21), "test": range(12, 35)}, - "sgd_all": {"train": range(1, 128), "dev": range(1, 21), "test": range(1, 35)}, - "sgd_all_single": {"train": range(1, 128), "dev": range(1, 8), "test": range(1, 12)}, - "multiwoz": {"train": range(1, 18), "dev": range(1, 3), "test": range(1, 3)}, - "debug_sample": {"train": range(1, 2), "dev": range(1, 2), "test": range(1, 2)}, -} - - -class DialogueSGDDataProcessor(DialogueDataProcessor): - """Data Processor for SGD dialogues. - - More information at https://arxiv.org/abs/1909.05855 - - ***Downloading the dataset*** - # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git - - ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. - - In the following we will show an example for a service entry in the schema file. - * service_name - * description - * slots - * name - * description - * is_categorical - * possible values - * intents - * name - * description - * required_slots (not used) - * is_transactional (not used) - * optional_slots (not used) - * result_slots (not used) - - - In the following we will show an example for a dialogue. 
- * dialogue_id - * services - * turns - * frames - * actions - * act - * slot - * values - * service - * slots - * exclusive_end - * slot - * start - * state - * active_intent - * requeste_slots - * slot_values - * speaker - [USER, SYSTEM] - * utterance - - """ - - def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, - ): - """ - Constructs DialogueSGDDataProcessor - Args: - data_dir: path to data directory - dialogues_example_dir: path to store processed dialogue examples - tokenizer: tokenizer object - cfg: cfg container for dataset - """ - self.data_dir = data_dir - self.cfg = cfg - - self._task_name = self.cfg.task_name # e.g. "sgd_single_domain" - self._subsample = self.cfg.subsample - - all_schema_json_paths = [] - for dataset_split in ['train', 'test', 'dev']: - all_schema_json_paths.append(os.path.join(self.cfg.data_dir, dataset_split, "schema.json")) - self.schemas = Schema(all_schema_json_paths) - - self.schema_config = { - "MAX_NUM_CAT_SLOT": self.cfg.max_num_cat_slot, - "MAX_NUM_NONCAT_SLOT": self.cfg.max_num_noncat_slot, - "MAX_NUM_VALUE_PER_CAT_SLOT": self.cfg.max_value_per_cat_slot, - "MAX_NUM_INTENT": self.cfg.max_num_intent, - "NUM_TASKS": self.cfg.num_tasks, - "MAX_SEQ_LENGTH": self.cfg.max_seq_length, - } - - train_file_range = FILE_RANGES[self._task_name]["train"] - dev_file_range = FILE_RANGES[self._task_name]["dev"] - test_file_range = FILE_RANGES[self._task_name]["test"] - - self._file_ranges = { - "train": train_file_range, - "dev": dev_file_range, - "test": test_file_range, - } - - self._seen_services = { - "train": set(), - "dev": set(), - "test": set(), - } - - self._tokenizer = tokenizer - - self._dialogues_example_dir = dialogues_example_dir - - self.dial_files = {} - - # slots_relation_list.np would contain the candidate list of slots for each (service, slot) which would be - # looked into when a switch between two services happens in the dialogue and we can not find any value for a slot in the current user utterance. - # This file would get generated from the dialogues in the training set. - self.slots_relation_file = os.path.join( - dialogues_example_dir, f"{self._task_name}_train_slots_relation_list.np" - ) - for dataset in ["train", "dev", "test"]: - # Process dialogue files - dial_file = f"{self._task_name}_{dataset}_examples.json" - dial_file = os.path.join(dialogues_example_dir, dial_file) - self.dial_files[(self._task_name, dataset)] = dial_file - - dialog_paths = DialogueSGDDataProcessor.get_dialogue_files(data_dir, dataset, self._task_name) - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - for dialog in dialogs: - self._seen_services[dataset].update(set(dialog['services'])) - - if is_global_rank_zero(): - overwrite_dial_files = not self.cfg.use_cache - self.save_dialog_examples(overwrite_dial_files=overwrite_dial_files) - - def save_dialog_examples(self, overwrite_dial_files: bool): - """ - Preprocesses dialogues and saves to disk. 
- Args: - overwrite_dial_files: whether or not to overwrite saved file if already exists - """ - for dataset in ["train", "dev", "test"]: - dial_file = self.dial_files[(self._task_name, dataset)] - if not os.path.exists(dial_file) or overwrite_dial_files: - logging.info(f"Start generating the dialogue examples for {dataset} dataset.") - if not os.path.exists(self._dialogues_example_dir): - os.makedirs(self._dialogues_example_dir) - dial_examples, slots_relation_list = self._generate_dialog_examples( - dataset, self.schemas, self._subsample - ) - - with open(dial_file, "w", encoding="UTF-8") as f: - json.dump([i.data for i in dial_examples], f) - - if dataset == "train": - with open(self.slots_relation_file, "wb") as f: - pickle.dump(slots_relation_list, f) - logging.info(f"The slot carry-over list for train set is stored at {self.slots_relation_file}") - - logging.info(f"The dialogue examples for {dataset} dataset saved at {dial_file}") - logging.info(f"Finish generating the dialogue examples for {dataset} dataset.") - - # common interface for Data Processor - def get_train_examples(self): - """Gets a collection of `InputExample`s for the train set.""" - return self.get_dialog_examples("train") - - def get_dev_examples(self): - """Gets a collection of `InputExample`s for the dev set.""" - return self.get_dialog_examples("dev") - - def get_test_examples(self): - """Gets a collection of `InputExample`s for the test set.""" - return self.get_dialog_examples("test") - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - def get_dialog_examples(self, dataset_split: str) -> List[object]: - """ - Loads preprocessed dialogue examples from disk. - Args: - dataset_split: dataset split - Returns: - dial_examples: list of InputExample's. - """ - if (self._task_name, dataset_split) not in self.dial_files or not os.path.exists( - self.dial_files[(self._task_name, dataset_split)] - ): - raise ValueError( - f"{dataset_split} dialogue examples were not processed for {self._task_name} task. Re-initialize SGDDataProcessor and add {dataset_split} dataset split to datasets arg." - ) - dial_file = self.dial_files[(self._task_name, dataset_split)] - logging.info(f"Loading dialogue examples from {dial_file}.") - - with open(dial_file, "rb") as f: - dial_examples = json.load(f) - dial_examples = [DialogueInputExample(i) for i in dial_examples] - if not os.path.exists(self.slots_relation_file): - raise ValueError( - f"Slots relation file {self.slots_relation_file} does not exist. It is needed for the carry-over mechanism of state tracker for switches between services." - ) - if os.path.getsize(self.slots_relation_file) > 0: - with open(self.slots_relation_file, "rb") as f: - self.schemas._slots_relation_list = pickle.load(f) - logging.info( - f"Loaded the slot relation list for value carry-over between services from {self.slots_relation_file}." - ) - - return dial_examples - - def get_seen_services(self, dataset_split: str): - """ - Returns list of seen services, i.e. both in given and training split - Args: - dataset_split: data split - Returns: - seen_services: list of seen services - """ - seen_services = self._seen_services[dataset_split] - return seen_services - - def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsample: bool): - """ - Returns a list of `InputExample`s of the data splits' dialogues. - Args: - dataset_split: data split, can be "train", "dev", or "test". 
- schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - logging.info(f'Creating examples and slot relation list from the dialogues started...') - dialog_paths = [ - os.path.join(self.data_dir, dataset_split, "dialogues_{:03d}.json".format(i)) - for i in self._file_ranges[dataset_split] - ] - dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths) - - examples = [] - slot_carryover_candlist = collections.defaultdict(int) - for dialog_idx, dialog in enumerate(dialogs): - if dialog_idx % 1000 == 0: - logging.info(f'Processed {dialog_idx} dialogues.') - examples.extend( - self._create_examples_from_dialog(dialog, schemas, dataset_split, slot_carryover_candlist, subsample) - ) - - slots_relation_list = collections.defaultdict(list) - for slots_relation, relation_size in slot_carryover_candlist.items(): - if relation_size > 0: - slots_relation_list[(slots_relation[0], slots_relation[1])].append( - (slots_relation[2], slots_relation[3], relation_size) - ) - slots_relation_list[(slots_relation[2], slots_relation[3])].append( - (slots_relation[0], slots_relation[1], relation_size) - ) - - return examples, slots_relation_list - - def _create_examples_from_dialog( - self, dialog: dict, schemas: object, dataset_split: str, slot_carryover_candlist: dict, subsample: bool - ): - """ - Create examples for every turn in the dialogue. - Args: - dialog: dialogue example - schemas: schema for all services of all datasets - dataset_split: data split - slot_carryover_candlist: a dictionary to keep and count the number of carry-over cases between two slots from two different services - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - """ - dialog_id = dialog["dialogue_id"] - prev_states = {} - examples = [] - for turn_idx, turn in enumerate(dialog["turns"]): - # Generate an example for every frame in every user turn. 
- if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - user_frames = {f["service"]: f for f in turn["frames"]} - if self.cfg.system_utterance == 'prev_turn': - if turn_idx > 0: - system_turn = dialog["turns"][turn_idx - 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - else: - system_utterance = "" - system_frames = {} - else: # takes the system utterance of the next turn - system_turn = dialog["turns"][turn_idx + 1] - system_utterance = system_turn["utterance"] - system_frames = {f["service"]: f for f in system_turn["frames"]} - - turn_id = "{}-{}-{:02d}".format(dataset_split, dialog_id, turn_idx) - turn_examples, prev_states, slot_carryover_values = self._create_examples_from_turn( - turn_id, - system_utterance, - user_utterance, - system_frames, - user_frames, - prev_states, - schemas, - subsample, - ) - examples.extend(turn_examples) - - for value, slots_list in slot_carryover_values.items(): - if value in ["True", "False"]: - continue - if len(slots_list) > 1: - for service1, slot1 in slots_list: - for service2, slot2 in slots_list: - if service1 == service2: - continue - if service1 > service2: - service1, service2 = service2, service1 - slot1, slot2 = slot2, slot1 - slot_carryover_candlist[(service1, slot1, service2, slot2)] += 1 - return examples - - def _get_state_update(self, current_state: dict, prev_state: dict) -> dict: - """ - Updates dialogue state - Args: - current_state: slot values pairs for the current dialogue turn - prev_state: slot values pairs for the previous dialogue turns - Returns: - state_update: slot values pairs that are added/updated during the current dialogue turn - """ - state_update = dict(current_state) - for slot, values in current_state.items(): - if slot in prev_state and prev_state[slot][0] in values: - # Remove the slot from state if its value didn't change. - state_update.pop(slot) - return state_update - - @staticmethod - def convert_camelcase_to_lower(label): - """Converts camelcase to lowercase with spaces e.g. 'HelloWorld' --> 'hello world'""" - if label.lower() == "none": - return "none" - label = label.split("_")[0] - tokens = re.findall('[A-Z][^A-Z]*', label) - return ' '.join([token.lower() for token in tokens]) - - def preprocess_intent(self, intent, schemas, service): - if self.cfg.preprocess_intent_function == 'default': - return intent - elif self.cfg.preprocess_intent_function == 'lowercase': - return DialogueSGDDataProcessor.convert_camelcase_to_lower(intent) - elif self.cfg.preprocess_intent_function == 'description': - return schemas.get_service_schema(service).intent_descriptions[intent] - else: - raise ValueError( - 'Only default, lowercase and description are allowed for model.dataset.preprocess_intent_function for SGD task' - ) - - def _create_examples_from_turn( - self, - turn_id: int, - system_utterance: str, - user_utterance: str, - system_frames: dict, - user_frames: dict, - prev_states: dict, - schemas: object, - subsample: bool, - ): - """ - Creates an example for each frame in the user turn. 
- Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. {'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - system_user_utterance = system_utterance + ' ' + user_utterance - states = {} - - examples = [] - slot_carryover_values = collections.defaultdict(list) - - for service, user_frame in user_frames.items(): - - state = user_frame["state"]["slot_values"] - state_update = self._get_state_update(state, prev_states.get(service, {})) - states[service] = state - system_frame = system_frames.get(service, None) - dataset_split, dialog_id, turn_id_ = turn_id.split('-') - dialog_id_1, dialog_id_2 = dialog_id.split('_') - example_id = f"{turn_id}-{service}" - example_id_num = [ - int(dialog_id_1), - int(dialog_id_2), - int(turn_id_), - schemas.get_service_id(service), - ] - intent = user_frames[service]["state"]['active_intent'] - all_possible_slots = schemas.get_service_schema(service).slots - categorical_slots = schemas.get_service_schema(service).categorical_slots - one_example = { - "example_id": example_id, - "example_id_num": example_id_num, - "utterance": user_utterance, - "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, - "system_actions": system_frame["actions"] if system_frame is not None else None, - "labels": { - "service": service, - "intent": self.preprocess_intent(intent, schemas, service), - "slots": {slot: state[slot] for slot in state_update}, - }, - "label_positions": {"slots": {slot["slot"]: slot for slot in user_frames[service]["slots"]}}, - "possible_labels": { - "service": schemas.services, - "intent": [ - self.preprocess_intent(intent, schemas, service) - for intent in schemas.get_service_schema(service).intents - ], - "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] - for slot in all_possible_slots - }, - }, - "description": { - "service": schemas.get_service_schema(service).description, - "intent": schemas.get_service_schema(service).intent_descriptions[intent], - "slots": { - slot: schemas.get_service_schema(service).slot_descriptions[slot] for slot in state_update - }, - }, - } - - examples.append(DialogueInputExample(one_example)) - - if service not in prev_states and int(turn_id_) > 0: - for slot_name, values in state_update.items(): - for value in values: - slot_carryover_values[value].append((service, slot_name)) - for prev_service, prev_slot_value_list in prev_states.items(): - if prev_service == service: - continue - if prev_service in state: - prev_slot_value_list = state[prev_service] - for prev_slot_name, prev_values in prev_slot_value_list.items(): - for prev_value in prev_values: - slot_carryover_values[prev_value].append((prev_service, prev_slot_name)) - - return examples, states, slot_carryover_values - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - Find 
indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for slot_span in char_slot_spans: - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries - - @classmethod - def load_dialogues(cls, dialog_json_filepaths: List[str]) -> List[dict]: - """ - Obtain the list of all dialogues from specified json files. - Args: - dialog_json_filepaths: list of json files - Returns: - dialogs: the list of all dialogues - """ - dialogs = [] - for dialog_json_filepath in sorted(dialog_json_filepaths): - with open(dialog_json_filepath, 'r', encoding="UTF-8") as f: - dialogs.extend(json.load(f)) - f.close() - return dialogs - - @classmethod - def get_dialogue_files(cls, data_dir: str, dataset_split: str, task_name: str): - """ - Obtain the list of all dialogue json files - Args: - data_dir: path to the data folder - dataset_split: data split - task_name: SGD task name, see keys of the FILE_RANGES - Returns: - dialog: the list of all dialogue json files paths - """ - return [ - os.path.join(data_dir, dataset_split, 'dialogues_{:03d}.json'.format(fid)) - for fid in FILE_RANGES[task_name][dataset_split] - ] diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/__init__.py deleted file mode 100644 index 3352c7be2d9bb6ac6192aa6dfb6470749995ef53..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.nlp.data.dialogue.dataset.dialogue_bert_dataset import DialogueBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_zero_shot_intent_dataset import DialogueZeroShotIntentDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py deleted file mode 100644 index edab77a056e13cf840e94567592f6d84bd85dcfe..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] - - -class DialogueBERTDataset(DialogueDataset): - - """ - Creates a dataset to use for the task of joint intent - and slot classification with pretrained model. - - For a dataset to use during inference without labels, see - IntentSlotDataset. - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer - cfg: config container for dataset - """ - self.cfg = cfg - self.all_possible_labels = dialogues_processor.intents - self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} - self.all_possible_slots = dialogues_processor.slots - self.slot_name_to_slot_id = {self.all_possible_slots[i]: i for i in range(len(self.all_possible_slots))} - self.empty_slot_name = 'O' - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.features if self.cfg.num_samples == -1 else self.features[: self.cfg.num_samples] - - queries = [feature.data["utterance"] for feature in self.features] - if self.cfg.do_lowercase: - queries = [query.lower() for query in queries] - intents = [self.label_to_label_id[feature.data["labels"]["intent"]] for feature in self.features] - word_level_slots = [self.convert_slot_position_to_slot_ids(feature.data) for feature in self.features] - - features = DialogueBERTDataset.get_features( - queries, - self.cfg.max_seq_length, - tokenizer, - pad_label=self.cfg.pad_label, - word_level_slots=word_level_slots, - ignore_extra_tokens=self.cfg.ignore_extra_tokens, - ignore_start_end=self.cfg.ignore_start_end, - ) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = intents - - def convert_slot_position_to_slot_ids(self, feature): - slot_ids = [self.slot_name_to_slot_id[self.empty_slot_name] for i in range(len(feature["utterance"].split()))] - slot_name_to_positions = feature["label_positions"]["slots"] - - for slot_name in slot_name_to_positions: - slot_id = self.slot_name_to_slot_id[slot_name] - start = slot_name_to_positions[slot_name]["start"] - exclusive_end = slot_name_to_positions[slot_name]["exclusive_end"] - for to_replace_position in range(start, min(exclusive_end, len(slot_ids))): - slot_ids[to_replace_position] = slot_id - - return slot_ids - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - @staticmethod - def truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ): - - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - 
ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - return ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - @staticmethod - def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - word_level_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, - ): - """ - Convert queries (utterance, intent label and slot labels) to BERT input format - """ - - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = word_level_slots is not None - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - # mask all sub-word tokens except the first token in a word - # use the label for the first sub-word token as the label for the entire word to eliminate need for disambiguation - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([word_level_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - - # truncate and pad samples - ( - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) = DialogueBERTDataset.truncate_and_pad( - max_seq_length, - ignore_start_end, - with_label, - pad_label, - tokenizer, - all_slots, - all_subtokens, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_input_ids, - all_segment_ids, - ) - - # log examples for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - 
logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - if do_lower_case: - queries = [query.lower() for query in queries] - - features = DialogueBERTDataset.get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py deleted file mode 100644 index 5540dd3b19f75fb64099acec36babc498b3ab782..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_dataset.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.core.classes import Dataset - -__all__ = ['DialogueDataset'] - - -class DialogueDataset(Dataset): - ''' - Base class for Dialogue Datasets - 1. 
Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple data sources - 3. Users can configure which labels to use for modelling - (e.g. intent classification, slot filling or sequence generation etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, **kwargs): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError - - def __getitem__(self, idx: int): - raise NotImplementedError diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py deleted file mode 100644 index 1ac04a856a89ce737e6763403d4bff148a94a9cc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import random -from collections import defaultdict - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.utils import logging - - -class DialogueGPTClassificationDataset(DialogueDataset): - ''' - Designed for classification tasks such as intent/domain classification as well as slot tagging - - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple data sources - 3. Users can configure which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - cfg: cfg container for dataset - """ - self.cfg = cfg - - if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": - raise ValueError( - "slot-filling is not supported by eval_mode {}, please set model.dataset.eval_mode=generation instead".format( - self.cfg.eval_mode - ) - ) - if self.cfg.target_template != "with_slots" and self.cfg.field == "slots": - raise ValueError("please set model.dataset.target_template='with_slots' if model.dataset.field='slots'") - self.label_type = self.cfg.field - if self.cfg.target_template == "with_description": - self.label_to_description = defaultdict(str) - self.all_possible_labels = set() - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - self.max_candidates = 2 - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - self.features = dialogues_processor.get_dialog_examples(dataset_split) - for idx in range(len(self.features)): - self.preprocess_feature(idx) - if self.cfg.debug_mode: - self.features = self.features[:16] - # for few shot learning to append in the prompt - self.lm_features = self.get_lm_samples() - - def transform(self, label): - """ - Normalize labels by replacing underscore with space - - Args: - label: str - Returns: - normalized_label: str - """ - if self.cfg.task == "assistant" and self.cfg.prompt_template != "prompt_tuning": - label = label.replace('_', ' ') - return label - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def preprocess_feature(self, idx): - ex = self.features[idx].data - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - if self.label_type in ["service", "intent"]: - label = self.transform(label) - candidates = [self.transform(candidate) for candidate in candidates] - - self.features[idx].data["labels"][self.label_type] = label - self.features[idx].data["possible_labels"][self.label_type] = candidates - if self.cfg.target_template == "with_description": - description = ex["description"][self.label_type] - self.label_to_description[label] = description - for candidate in candidates: - self.all_possible_labels.add(candidate) - self.max_candidates = max(self.max_candidates, len(candidates)) - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - @staticmethod - def linearize_slots(slots): - """ - Serialize slots into a linear text - - Args: - slots: dict with each slot_name as key and possible slot values as value - Returns: - linear_slots: text based representation of slot names and values - """ - if not slots: - return "None" - return ", ".join( - ["{}({})".format(slot, value if isinstance(value, str) 
else value[0]) for slot, value in slots.items()] - ) - - def format_target(self, target, slots=None): - """ - Formats the back part of the training example, after the base_template - for instance, "restaurant" in " service: restaurant" - or "set alarm\nslots: (), ()" in \ - "\nintent: set alarm\nslots: (), ()" - """ - if self.cfg.target_template == "with_description": - return target + ' (' + self.label_to_description[target] + ')' - elif self.cfg.target_template == "default": - return target - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "intent": - return target + '\nslots: ' + DialogueGPTClassificationDataset.linearize_slots(slots) - elif self.cfg.target_template == "with_slots" and slots is not None and self.cfg.field == "slots": - return DialogueGPTClassificationDataset.linearize_slots(slots) - else: - raise ValueError("Please choose a target format from {default, with_description, with_slots}") - - def get_lm_samples(self): - max_sample_length = 0 - lm_features = [] - for idx in range(len(self.features)): - ex = self.features[idx].data - utterance = ex["utterance"] - label = ex["labels"][self.label_type] - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - lm_feature = self.format_prompt(utterance) + ' ' + self.format_target(label, slots=slots) - feature_len = self.get_n_tokens_in_sentence(lm_feature) - max_sample_length = max(max_sample_length, feature_len) - lm_features.append(lm_feature) - logging.info("max feature length per sample with label: ".format(max_sample_length)) - logging.info( - "please adjust max seq len to at least {} * ({} + 1) = {} but not too much more for efficiency".format( - max_sample_length, self.cfg.few_shot, max_sample_length * (1 + self.cfg.few_shot) - ) - ) - return lm_features - - def format_prompt(self, utterance, few_shot=0, idx=None): - if self.cfg.prompt_template == "default": - base_template = utterance + ' ' + self.label_type + ':' - elif self.cfg.prompt_template == "i_want_to": - base_template = utterance + ' ' + 'I want to' - elif self.cfg.prompt_template == "prompt_tuning": - base_template = utterance + '\n' + self.label_type + ':' - elif self.cfg.prompt_template == "prompt_tuning_with_options": - base_template = ( - 'possible intents: ' - + ', '.join(sorted(list(self.all_possible_labels))) - + '\n\n' - + utterance - + '\n' - + self.label_type - + ':' - ) - - if few_shot > 0: - few_shot_indices = random.sample(range(len(self.features)), few_shot + 1) - few_shot_indices = [i for i in few_shot_indices if i != idx][:few_shot] - few_shot_samples = [self.lm_features[i] for i in few_shot_indices] - base_template = ( - self.tokenizer.tokenizer.pad_token.join(few_shot_samples) - + self.tokenizer.tokenizer.pad_token - + base_template - ) - return base_template - - def collate_fn(self, batch): - """ - Truncates elements to max length in batch - """ - _, _, _, _, candidate_attn_masks, _, _, _ = zip(*batch) - # determine max length in batch - batch_max_length = 0 - for candidate_attn_mask in candidate_attn_masks: - for one_attn_mask in candidate_attn_mask: - batch_max_length = max(batch_max_length, torch.sum(one_attn_mask).item()) - # padding for tp=2 situation - if batch_max_length % 2: - batch_max_length += 1 - - all_items = [] - for item in zip(*batch): - if isinstance(item[0], int): - item = [torch.tensor(i) for i in item] - item_stack = torch.stack(item) - # if item_stack is 1d, elements refers to indexes and there is no need to truncate - if len(item_stack.size()) == 1: - 
all_items.append(item_stack) - # otherwise, truncate last dimension to max length in batch - else: - all_items.append(item_stack[..., :batch_max_length]) - return all_items - - def __getitem__(self, idx: int): - - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. service: restaurant - e.g. service: restaurant - e.g. \nintent: set alarm\nslots: (), () - - Generation example: - e.g. service: - - ''' - ex = self.features[idx].data - - utterance = ex["utterance"] - utterance_length = self.get_n_tokens_in_sentence(utterance) - - label = ex["labels"][self.label_type] - candidates = ex["possible_labels"][self.label_type] - - slots = ex["labels"]["slots"] if self.cfg.target_template == "with_slots" else None - - base_template = self.format_prompt(utterance, few_shot=self.cfg.few_shot, idx=idx) - - sentence_without_answer = base_template - - sentence = base_template + ' ' + self.format_target(label, slots=slots) - - if self.cfg.eval_mode == "binary_score": - candidate_sentences = [] - for candidate in candidates: - positive_answer = base_template + ' ' + candidate + ' Answer: ' + 'yes' - negative_answer = base_template + ' ' + candidate + ' Answer: ' + 'no' - if candidate == label: - correct_candidate = len(candidate_sentences) // 2 - candidate_sentences.append(positive_answer) - candidate_sentences.append(negative_answer) - else: - candidate_sentences.append(negative_answer) - candidate_sentences.append(positive_answer) - else: - correct_candidate = 0 - candidate_sentences = [ - base_template + ' ' + self.format_target(candidate, slots=slots) for candidate in candidates - ] - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - candidate_tokenized_sentences = [ - self.default_encode(candidate_sentence) for candidate_sentence in candidate_sentences - ] - - # ensure all samples have the same number of candidates for collating into tensor - while len(candidate_tokenized_sentences) < self.max_candidates: - candidate_tokenized_sentences.append(candidate_tokenized_sentences[0]) - - candidate_input_ids = torch.stack([i[1] for i in candidate_tokenized_sentences]) - candidate_attn_masks = torch.stack([i[2] for i in candidate_tokenized_sentences]) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return ( - input_ids, - attn_masks, - labels, - candidate_input_ids, - candidate_attn_masks, - training_mask_end, - utterance_length, - correct_candidate, - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py deleted file mode 100644 index 7de02d75c574446fdb5932962af5c33ba45507b4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset - - -class DialogueGPTGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - self.tokenizer.tokenizer.padding_side = "right" - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - def remove_invalid_samples(self, features): - valid_idxs = [] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - for i in range(len(features)): - features[i].data["labels"]["utterance"] = features[i].data["utterance"] - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field] or not features[i].data["labels"][field].strip(): - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - ex["labels"]["utterance"] = ex["utterance"] - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - - ''' - For each example, this function determines the format of input and output sequences based on user-specified conguration. 
- This is controlled by model.dataset.input_field and model.dataset.output_field - For instance: - If model.dataset.input_field == response and model.dataset.output_field == fluent_response: - Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) - If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) - If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) - ''' - ex = self.features[idx].data - - input_sentence = self.format_prompt(ex) - - utterance_length = self.get_n_tokens_in_sentence(input_sentence) - - output_sentence = ex["labels"][self.output_label_type] - - base_template = input_sentence - - sentence_without_answer = base_template + ' ' + self.output_label_type + ':' - - sentence = sentence_without_answer + ' ' + output_sentence - - encodings_dict, input_ids, attn_masks = self.default_encode(sentence) - - labels = copy.copy(torch.squeeze(encodings_dict['input_ids'])) - - training_mask_end = self.get_n_tokens_in_sentence(sentence_without_answer) - - labels.data = torch.tensor( - [-100 if i < training_mask_end else labels.data[i] for i in range(len(labels.data))] - ) - - return (input_ids, attn_masks, labels, training_mask_end, utterance_length) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py deleted file mode 100644 index 8618f2f8c7b4b6fa0376dcc2aa1b9ad0649f56d1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset - -__all__ = ['DialogueNearestNeighbourDataset'] - - -class DialogueNearestNeighbourDataset(DialogueDataset): - """ - Dataset for training a Nearest Neighbour model for zero shot intent recognition. 
- """ - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - """ - self.cfg = cfg - self.tokenizer = tokenizer - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.max_n = self.find_max_n_candidates() - self.examples = self._create_examples(self.raw_features) - - def find_max_n_candidates(self): - max_n = 0 - for idx in range(len(self.raw_features)): - ex = self.raw_features[idx].data - n = len(ex["possible_labels"]["intent"]) - max_n = max(max_n, n) - return max_n - - def _create_examples(self, raw_features): - """Creates examples for the training and dev sets.""" - examples = [] - seen_utterances = set() - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - if user_utterance in seen_utterances: - continue - seen_utterances.add(user_utterance) - intent = ex["labels"]["intent"] - sentences = [user_utterance] - labels = [-1] - for candidate_intent in ex["possible_labels"]["intent"]: - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - labels.append(label) - sentences.append(text_b) - - while self.max_n > len(labels) - 1: - labels.append(label) - sentences.append(text_b) - - encoded_input = self.tokenizer.tokenizer( - sentences, - padding='max_length', - truncation=True, - return_tensors='pt', - max_length=self.cfg.max_seq_length, - ) - examples.append((encoded_input['input_ids'], encoded_input['attention_mask'], torch.tensor(labels))) - return examples - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx: int): - return self.examples[idx] diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py deleted file mode 100644 index 78fda55edd2eab41dc1405372dd5e95349a8845b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset - - -class DialogueS2SGenerationDataset(DialogueDataset): - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation - - Args: - dataset_split: dataset split - dialogues_processor: dialogues processor - tokenizer: tokenizer - cfg: cfg container for dataset - """ - self.cfg = cfg - self.input_label_type = self.cfg.input_field - self.output_label_type = self.cfg.output_field - self.tokenizer = tokenizer - if not isinstance(dataset_split, str): - dataset_split = dataset_split[0] - - self.features = dialogues_processor.get_dialog_examples(dataset_split) - self.features = self.remove_invalid_samples(self.features) - - if self.cfg.debug_mode: - self.features = self.features[:16] - - @staticmethod - def format_actions(prompt_template, actions): - """ - Formats actions based on prompt_template - - Args: - prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions - actions: list of actions, each a dict containing keys 'act', 'slot' and 'values' with their corresponding values as their attribute-values - - Returns: - formatted_actions: string representations of actions, formatted based on the fields needed. - """ - actions_str = [] - for action in actions: - act = action['act'].lower() - slot = action['slot'] - value = action['values'][0] if action['values'] else '' - - if prompt_template == 'values': - action_str = value - elif prompt_template == 'slots_values': - if value: - action_str = '{} ({})'.format(slot, value) - else: - action_str = slot - elif prompt_template == 'acts_slots_values': - if value: - action_str = '{} {} ({})'.format(act, slot, value) - elif slot: - action_str = '{} {}'.format(act, slot) - else: - action_str = act - else: - raise ValueError( - "Please set model.dataset.prompt_template to acts_slots_values, slots_values or values" - ) - actions_str.append(action_str) - return ' '.join(actions_str) - - def remove_invalid_samples(self, features): - valid_idxs = [] - for i in range(len(features)): - for field in ['utterance', 'system_utterance', 'system_actions']: - if field in features[i].data: - features[i].data["labels"][field] = features[i].data[field] - all_fields = self.input_label_type.split('+') + self.output_label_type.split('+') - all_fields_non_empty = True - for field in all_fields: - if not features[i].data["labels"][field]: - all_fields_non_empty = False - if all_fields_non_empty: - valid_idxs.append(i) - return [features[i] for i in valid_idxs] - - def __len__(self): - return len(self.features) - - def get_n_tokens_in_sentence(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding=False, return_tensors="pt" - ) - output = torch.squeeze(encodings_dict['input_ids']) - return len(output) if len(output.size()) > 0 else 0 - - def default_encode(self, sentence): - encodings_dict = self.tokenizer.tokenizer( - sentence, truncation=True, max_length=self.cfg.max_seq_length, padding="max_length", return_tensors="pt" - ) - input_ids = torch.squeeze(encodings_dict['input_ids']) - attn_masks = torch.squeeze(encodings_dict['attention_mask']) - return encodings_dict, input_ids, attn_masks - - def format_prompt(self, ex): - ''' - Formats training prompt based on self.input_field_type - - Training example: - e.g. 
response: # input_label_type = response - e.g. utterance: # input_label_type = utterance - e.g. passage: utterance: # input_label_type = passage+utterance - ''' - parts = self.input_label_type.split('+') - input_sentence = ' '.join([part + ': ' + ex["labels"][part] for part in parts]) - return input_sentence - - def __getitem__(self, idx: int): - - ''' - State how the input and output samples look like - - This template can be changed - - Training example: - e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response - e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response - e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response - ''' - ex = self.features[idx].data - for field in ['utterance', 'system_utterance']: - if field in ex: - ex["labels"][field] = ex[field] - - if 'system_actions' in ex: - ex["labels"]['system_actions'] = DialogueS2SGenerationDataset.format_actions( - self.cfg.prompt_template, ex['system_actions'] - ) - - input_sentence = self.format_prompt(ex) - output_sentence = ex["labels"][self.output_label_type] - - _, input_ids, attn_masks = self.default_encode(input_sentence) - - _, labels, _ = self.default_encode(output_sentence) - - labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 - - return input_ids, attn_masks, labels diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py deleted file mode 100644 index 364998e8e58af3379e53190defb7cd2144e53310..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_sgd_bert_dataset.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst -""" - -import os -import re -from typing import List - -import numpy as np - -from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import SGDInputExample - -__all__ = ['DialogueSGDBERTDataset'] - - -class DialogueSGDBERTDataset(DialogueDataset): - ''' - Dataset Class - 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) - 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling - (e.g. 
intent classification, slot filling or both together etc) - ''' - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, schemas, schema_config, cfg): - """ Constructor - Args: - dataset_split: dataset split - dialogues_processor: Data generator for SGD dialogues - tokenizer: tokenizer - schemas: SGD schema for domain, intent and slots - schema_config: config dict for schemas - cfg: cfg container for dataset - """ - self.dataset_split = dataset_split - self.tokenizer = tokenizer - self.schemas = schemas - self.schema_config = schema_config - self.dialogues_processor = dialogues_processor - self.cfg = cfg - self.subsample = self.dialogues_processor._subsample - - dial_file = f"{dialogues_processor._task_name}_{dataset_split}_examples_bert.processed" - self.dial_file = os.path.join(self.cfg.data_dir, dial_file) - if self.cfg.use_cache and os.path.exists(self.dial_file): - self.load_features() - else: - self.process_features() - self.save_features() - - def load_features(self): - with open(self.dial_file, "rb") as f: - self.features = np.load(f, allow_pickle=True) - - def process_features(self): - self.features = [] - self.raw_features = self.dialogues_processor.get_dialog_examples(self.dataset_split) - for idx in range(len(self.raw_features)): - self.bert_process_one_sample(idx) - - def save_features(self): - with open(self.dial_file, "wb") as f: - np.save(f, self.features) - - def _tokenize(self, utterance: str): - """ - Tokenize the utterance - - Args: - utterance: A string containing the utterance to be tokenized. - - Returns: - bert_tokens: A list of tokens obtained by word-piece tokenization of the - utterance. - alignments: A dict mapping indices of characters corresponding to start - and end positions of words (not subwords) to corresponding indices in - bert_tokens list. - inverse_alignments: A list of size equal to bert_tokens. Each element is a - tuple containing the index of the starting and inclusive ending - character of the word corresponding to the subword. This list is used - during inference to map word-piece indices to spans in the original - utterance. - """ - # utterance = tokenization.convert_to_unicode(utterance) - - # After _naive_tokenize, spaces and punctuation marks are all retained, i.e. - # direct concatenation of all the tokens in the sequence will be the - # original string. - tokens = DialogueSGDBERTDataset._naive_tokenize(utterance) - # ['I', ' ', 'am', ' ', 'feeling', ' ', 'hungry', ' ', 'so', ' ', 'I', ' ', 'would', ' ', 'like', ' ', 'to', ' ', 'find', ' ', 'a', ' ', 'place', ' ', 'to', ' ', 'eat', '.'] - # Filter out empty tokens and obtain aligned character index for each token. - alignments = {} - char_index = 0 - bert_tokens = ( - [] - ) # ['I', 'am', 'feeling', 'hungry', 'so', 'I', 'would', 'like', 'to', 'find', 'a', 'place', 'to', 'eat', '.'] - # These lists store inverse alignments to be used during inference. - bert_tokens_start_chars = [] - bert_tokens_end_chars = [] - for token in tokens: - if token.strip(): - subwords = self.tokenizer.text_to_tokens(token) - # Store the alignment for the index of starting character and the - # inclusive ending character of the token. - alignments[char_index] = len(bert_tokens) - bert_tokens_start_chars.extend([char_index] * len(subwords)) - bert_tokens.extend(subwords) - # The inclusive ending character index corresponding to the word. 
- inclusive_char_end = char_index + len(token) - 1 - alignments[inclusive_char_end] = len(bert_tokens) - 1 - bert_tokens_end_chars.extend([inclusive_char_end] * len(subwords)) - char_index += len(token) - inverse_alignments = list(zip(bert_tokens_start_chars, bert_tokens_end_chars)) - return bert_tokens, alignments, inverse_alignments - - @classmethod - def _naive_tokenize(cls, s: str): - """ - Tokenizes a string, separating words, spaces and punctuations. - Args: - s: a string - Returns: - seq_tok: list of words, spaces and punctuations from the string - """ - # Spaces and punctuation marks are all retained, i.e. direct concatenation - # of all the tokens in the sequence will be the original string. - seq_tok = [tok for tok in re.split(r"([^a-zA-Z0-9])", s) if tok] - return seq_tok - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx: int): - ex = self.features[idx] - - return ( - np.array(ex.example_id_num), - np.array(ex.example_id_num[-1]), # service_id - np.array(ex.utterance_ids), - np.array(ex.utterance_segment), - np.array(ex.utterance_mask, dtype=np.long), - np.array(ex.intent_status, dtype=np.float32), - np.array(ex.requested_slot_status, dtype=np.float32), - np.array(ex.categorical_slot_status), - np.array(ex.categorical_slot_value_status, dtype=np.float32), - np.array(ex.noncategorical_slot_status), - np.array(ex.noncategorical_slot_value_start), - np.array(ex.noncategorical_slot_value_end), - np.array(ex.start_char_idx), # noncat_alignment_start - np.array(ex.end_char_idx), # noncat_alignment_end - np.array(ex.task_mask), # noncat_alignment_end - ) - - def bert_process_one_sample(self, idx): - """ - Creates an example for each frame in the user turn. - Args: - turn_id: turn number - system_utterance: last system utterance - user_utterance: lst user utterance - system_frames: all system utterances and slot - slot value pairs - user_frames: all user utterances and slot - slot value pairs - prev_states: slot - slot value pairs from the previous turns - schemas: schema for all services of all datasets - subsample: whether to balance postive and negative samples in the dataset - Returns: - examples: a list of `InputExample`s. - prev_states: updated dialogue state e.g. 
{'Restaurants_1': {'city': ['San Jose'], 'cuisine': ['American']}} - """ - - ex = self.raw_features[idx].data - example_id_num = ex["example_id_num"] - example_id = ex["example_id"] - user_utterance = ex["utterance"] - system_utterance = ex["system_utterance"] - service = ex["labels"]["service"] - schemas = self.schemas - state_update = ex["labels"]["slots"] - system_slots = ex["system_slots"] - - user_tokens, user_alignments, user_inv_alignments = self._tokenize(user_utterance) - system_tokens, system_alignments, system_inv_alignments = self._tokenize(system_utterance) - system_user_utterance = system_utterance + ' ' + user_utterance - system_user_tokens, system_user_alignments, system_user_inv_alignments = self._tokenize(system_user_utterance) - examples = [] - - base_example = SGDInputExample(schema_config=self.schema_config, tokenizer=self.tokenizer) - base_example.service_schema = self.schemas.get_service_schema(service) - base_example.service_id = example_id_num[-1] - - base_example.example_id = example_id - base_example.example_id_num = example_id_num - - for model_task in range(self.schema_config["NUM_TASKS"]): - if model_task == 0: - for intent_id, intent in enumerate(schemas.get_service_schema(service).intents): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.intent_id = intent_id - task_example.example_id += f"-{model_task}-{intent_id}-0" - task_example.example_id_num.extend([model_task, intent_id, 0]) - intent_description = ( - intent + " " + self.schemas.get_service_schema(service).intent_descriptions[intent] - ) - intent_tokens, intent_alignments, intent_inv_alignments = self._tokenize(intent_description) - task_example.add_utterance_features( - intent_tokens, - intent_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - intent_description, - system_user_utterance, - ) - - task_example.add_intents(ex) - examples.append(task_example) - - if model_task == 1: - for slot_id, slot in enumerate(schemas.get_service_schema(service).slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - task_example.requested_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + self.schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - user_tokens, - user_inv_alignments, - slot_description, - user_utterance, - ) - - task_example.add_requested_slots(ex) - examples.append(task_example) - - if model_task == 2: - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - - # assert task_example.task_mask == [0, 0, 1, 0, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - 
task_example.add_categorical_slots(state_update) - - if task_example.categorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - old_example = task_example - - for value_id, value in enumerate( - schemas.get_service_schema(service).get_categorical_slot_values(slot) - ): - if self.dataset_split != 'train' or task_example.categorical_slot_status == 1: - task_example = old_example.make_copy_of_categorical_features() - task_example.task_mask[3] = 1 - # assert task_example.task_mask == [0, 0, 0, 1, 0, 0] - task_example.categorical_slot_id = slot_id - task_example.categorical_slot_value_id = value_id - task_example.example_id = base_example.example_id + f"-3-{slot_id}-{value_id}" - task_example.example_id_num = base_example.example_id_num + [3, slot_id, value_id] - slot_description = slot + " " + value # add slot description - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - task_example.add_categorical_slots(state_update) - assert task_example.categorical_slot_status == old_example.categorical_slot_status - examples.append(task_example) - - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - if model_task == 4: # noncat slot status - off_slots = [] - on_slots = [] - for slot_id, slot in enumerate(schemas.get_service_schema(service).non_categorical_slots): - task_example = base_example.make_copy() - task_example.task_mask[model_task] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 1, 0] - task_example.noncategorical_slot_id = slot_id - task_example.example_id += f"-{model_task}-{slot_id}-0" - task_example.example_id_num.extend([model_task, slot_id, 0]) - slot_description = slot + " " + schemas.get_service_schema(service).slot_descriptions[slot] - slot_tokens, slot_alignments, slot_inv_alignments = self._tokenize(slot_description) - task_example.add_utterance_features( - slot_tokens, - slot_inv_alignments, - system_user_tokens, - system_user_inv_alignments, - slot_description, - system_user_utterance, - ) - - user_span_boundaries = self._find_subword_indices( - state_update, - user_utterance, - ex["label_positions"]["slots"], - user_alignments, - user_tokens, - 2 + len(slot_tokens) + len(system_tokens), - ) - - if system_slots is not None: - system_span_boundaries = self._find_subword_indices( - state_update, - system_utterance, - system_slots, - system_alignments, - system_tokens, - 2 + len(slot_tokens), - ) - else: - system_span_boundaries = {} - - task_example.add_noncategorical_slots(state_update, user_span_boundaries, system_span_boundaries) - if task_example.noncategorical_slot_status == 0: - off_slots.append(task_example) - else: - on_slots.append(task_example) - examples.append(task_example) - - if self.dataset_split != 'train' or task_example.noncategorical_slot_status == 1: - task_example = task_example.make_copy_of_non_categorical_features() - task_example.task_mask[5] = 1 - # assert task_example.task_mask == [0, 0, 0, 0, 0, 1] - task_example.example_id = base_example.example_id + f"-5-{slot_id}-0" - task_example.example_id_num = base_example.example_id_num + [5, slot_id, 0] - examples.append(task_example) 
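# The branch below balances categorical-slot examples during training: when subsampling is enabled,
# the inactive ("off") slot examples are down-sampled without replacement to at most
# max(num_on_slots, 1), while evaluation splits keep every example.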
- - if self.dataset_split == 'train' and self.subsample: - num_on_slots = len(on_slots) - examples.extend( - np.random.choice(off_slots, replace=False, size=min(max(num_on_slots, 1), len(off_slots))) - ) - else: - examples.extend(off_slots) - - for example in examples: - self.features.append(example) - - def _find_subword_indices( - self, - slot_values: dict, - utterance: str, - char_slot_spans: dict, - alignments: List[int], - subwords: List[str], - bias: int, - ) -> dict: - """ - Find indices for subwords corresponding to slot values. - Args: - slot_values: slot - slot value pairs - utterance: utterance - char_slot_spans: char - slot spans - alignments: alignments - subwords: subtokens mapping - bias: offset - Returns: - span_boundaries: span boundaries - """ - span_boundaries = {} - for slot, values in slot_values.items(): - # Get all values present in the utterance for the specified slot. - value_char_spans = {} - for key, slot_span in char_slot_spans.items(): - # print(key, slot, slot_span, char_slot_spans) - if slot_span["slot"] == slot: - value = utterance[slot_span["start"] : slot_span["exclusive_end"]] - start_tok_idx = alignments[slot_span["start"]] - end_tok_idx = alignments[slot_span["exclusive_end"] - 1] - if 0 <= start_tok_idx < len(subwords): - end_tok_idx = min(end_tok_idx, len(subwords) - 1) - value_char_spans[value] = (start_tok_idx + bias, end_tok_idx + bias) - for v in values: - if v in value_char_spans: - span_boundaries[slot] = value_char_spans[v] - break - return span_boundaries diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py deleted file mode 100644 index f2a0f58bcfac2c32658f8c1298b54c1f87c91852..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict, List, Optional, Union - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import InputExample -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['DialogueZeroShotIntentDataset'] - - -class DialogueZeroShotIntentDataset(GLUEDataset): - """ - Dataset for training a NLI model for zero shot intent recognition. Similar to GLUE/MNLI - dataset, but allows the user to specify which columns in the data files contain the - premise, hypothesis, and gold label. 
- """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'labels': NeuralType(tuple('B'), CategoricalValuesType()), - } - - def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ - Args: - dataset_split: dataset split - dialogues_processor: Data generator for dialogues - tokenizer: tokenizer to split text into sub-word tokens - cfg: config dict for dataset - num_classes: number of classes in the data (should be either 2 or 3, corresponding to - labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) - """ - self.cfg = cfg - self.tokenizer = tokenizer - if self.cfg.num_classes not in [2, 3]: - raise ValueError("num_classes must be either 2 or 3!") - self.label_list = ( - ["contradiction", "entailment", "neutral"] - if self.cfg.num_classes == 3 - else ['not_entailment', 'entailment'] - ) - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, - } - - self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) - self.examples = self._create_examples(self.raw_features, dataset_split) - self.features = self.convert_examples_to_features( - self.examples, - [0, 1, 2, 3], - self.cfg.max_seq_length, - tokenizer, - output_mode="classification", - **token_params, - ) - - def _create_examples(self, raw_features, dataset_split: str): - """Creates examples for the training and dev sets.""" - examples = [] - for idx in range(len(raw_features)): - ex = self.raw_features[idx].data - user_utterance = ex["utterance"] - intent = ex["labels"]["intent"] - for candidate_idx, candidate_intent in enumerate(ex["possible_labels"]["intent"]): - guid = "{}-{}-{}".format(dataset_split, idx, candidate_idx) - text_a = user_utterance - text_b = "{} {}".format(self.cfg.prompt_template, candidate_intent) - label = 1 if candidate_intent == intent else 0 - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . 
[SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - if hasattr(tokenizer, 'text_to_tokens'): - tokens_a = tokenizer.text_to_tokens(example.text_a) - else: - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - if hasattr(tokenizer, 'text_to_tokens'): - tokens_b = tokenizer.text_to_tokens(example.text_b) - else: - tokens_b = tokenizer.tokenize(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - if hasattr(tokenizer, 'tokens_to_ids'): - input_ids = tokenizer.tokens_to_ids(tokens) - else: - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - - if hasattr(tokenizer, 'tokens_to_ids'): - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - else: - pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] - - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - - return features - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/__init__.py deleted file mode 100644 index de4cf417e58c91fecf4834926fde710b95363766..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
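For reference, a minimal sketch of the [CLS] A [SEP] B [SEP] layout that convert_examples_to_features builds above, assuming a toy whitespace tokenizer and vocabulary (hypothetical stand-ins, not NeMo's TokenizerSpec):

from typing import Dict, List, Tuple

def toy_pair_features(
    text_a: str, text_b: str, vocab: Dict[str, int], max_seq_length: int = 16
) -> Tuple[List[int], List[int], List[int]]:
    # [CLS] + tokens_a + [SEP] carry segment id 0; tokens_b + [SEP] carry segment id 1.
    tokens_a, tokens_b = text_a.split(), text_b.split()
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    tokens += tokens_b + ["[SEP]"]
    segment_ids += [1] * (len(tokens_b) + 1)
    input_ids = [vocab.get(t, vocab["[UNK]"]) for t in tokens]
    input_mask = [1] * len(input_ids)            # 1 = real token, 0 = padding
    pad_len = max_seq_length - len(input_ids)    # right-pad (pad_on_left=False case)
    input_ids += [vocab["[PAD]"]] * pad_len
    input_mask += [0] * pad_len
    segment_ids += [0] * pad_len
    return input_ids, input_mask, segment_ids

vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "is": 4, "this": 5, "jack": 6, "no": 7, "it": 8, "not": 9}
ids, mask, segs = toy_pair_features("is this jack", "no it is not", vocab)
assert len(ids) == len(mask) == len(segs) == 16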
- -from nemo.collections.nlp.data.dialogue.input_example.assistant_input_example import DialogueAssistantInputExample -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import DialogueSGDInputExample, SGDInputExample diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py deleted file mode 100644 index c5574e8fa1030d3d5dc6894ae180c56bc099294f..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/assistant_input_example.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueAssistantInputExample(DialogueInputExample): - """ - Template for DialogueAssistantInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , - "intent": , - "slots": { - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - # note for the Assistant dataset, start and end are word positions rather than char position - # these are whitespace-delimited word positions rather than tokenization-specific sub-word tokens. - "exclusive_end": 3, - "slot": "restaurant_name", - "start": 1 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - # all slots for categorical variables - # empty list for extractive slots - # Assistant only support extractive slots - "": [], - "": [], - } - } - } - """ diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py deleted file mode 100644 index 80f3152cd82e158c9e9451f2406dadcfbd475d90..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/design_input_example.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueDesignInputExample(DialogueInputExample): - """ - Template for DialogueDesignInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "system_utterance": , - "labels": { - "service": , - "intent": , - "slots": { - : '', - : '', - }, # dataset does not contain ground truth slot values - }, - "possible_labels": { - 'intent': [, , ...], - "service": [, , ...], - "slots": { - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - "": "", - "": "", - } - }, - } - """ diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/input_example.py deleted file mode 100644 index 4920c2927f4611ff0338b6196e265d671d048ef4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/input_example.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ['DialogueInputExample'] - - -class DialogueInputExample(object): - """ - Generic Dialogue Input Example - Uses data: dict as a flexible interface to support various input types. - This ranges from classification labels, to complex nested labels such as those in SGD - - { - "utterance": , - "labels": { - "intent": , - "slots": { ... }, - } - } - """ - - def __init__(self, data: dict): - self.data = data - - def __repr__(self): - return self.data - - def __str__(self): - return self.data diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py deleted file mode 100644 index e6576d40460bdc698bd7fbe8eede2c0a743fe6ed..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/mellon_qa_input_example.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class MellonQAInputExample(DialogueInputExample): - """ - Template for MellonQAInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "utterance": , - "labels": { - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - } - } - """ diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py deleted file mode 100644 index ded84d3ece67c2467135a9f8c1809b55fd3f40de..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/ms_marco_input_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample - - -class DialogueMSMarcoInputExample(DialogueInputExample): - """ - Template for DialogueMSMarcoInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - - "utterance": , - "labels": { - "service": , # this is the domain - "example_id": , - "response": , - "fluent_response": , # written version of the response that is more fluent - "passage": , # passage which supports generating the response (answer) to the utterance (question) - }, - "possible_labels": { - "service": [, , ...], - "passage": [, , ...], - } - } - """ diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py deleted file mode 100644 index 9862a07baccdd473c8da3d83babc0189a7b3e315..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/input_example/sgd_input_example.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
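Tying the input-example templates above together: each variant is instantiated through the generic DialogueInputExample wrapper, which simply stores the dict. A made-up assistant-style example (all field values are illustrative):

from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample

example = DialogueInputExample(
    {
        "utterance": "book a table at antonio's for two",
        "labels": {
            "service": "restaurant",
            "intent": "book_table",
            "slots": {"restaurant_name": ["antonio's"], "party_size": ["two"]},
        },
        "possible_labels": {
            "service": ["restaurant", "taxi"],
            "intent": ["book_table", "find_restaurant"],
            "slots": {"restaurant_name": [], "party_size": []},
        },
    }
)
intent = example.data["labels"]["intent"]  # all fields are reached through .data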
- -""" -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/data_utils.py -""" - -from typing import List - -from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample -from nemo.utils import logging - -__all__ = [ - 'SGDInputExample', - 'STR_DONTCARE', - 'STATUS_OFF', - 'STATUS_ACTIVE', - 'STATUS_DONTCARE', -] - - -class DialogueSGDInputExample(DialogueInputExample): - - """ - Template for DialogueSGDInputExample - - Meant as a descriptor rather than to be instantiated - - Please instantiate using the base class 'DialogueInputExample' - - { - "example_id": , - "example_id_num": , - "utterance": , - "system_utterance": , - "system_slots": None or { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - "system_actions": None or [{ - "act": "INFORM", - "canonical_values": [ - "2019-03-02" - ], - "slot": "date", - "values": [ - "March 2nd" - ] - }, ...] - "labels": { - "service": , - "intent": , - "slots": { - #only non-empty slots - #most slot values are list of length 1 - #but there are some of length 2 as both are accepted - #e.g. 1930 and 7:30 pm - "": [, ], - "": [], - } - }, - "label_positions":{ - "slots": { - "": { - "exclusive_end": 46, - "slot": "restaurant_name", - "start": 34 - }, - } - }, - "possible_labels": { - "service": [, , ...], - "intent": [, , ...], - "slots": { - #all slots including empty - "": [, , ...], - "": [, , ...], - } - }, - "description": { - "service": , - "intent": , - "slots": { - #only non-empty slots - "": , - "": , - } - } - } - - """ - - -STR_DONTCARE = "dontcare" - -# These are used to represent the status of slots (off, active, dontcare) and -# intents (off, active) in dialogue state tracking. -STATUS_OFF = 0 -STATUS_ACTIVE = 1 -STATUS_DONTCARE = 2 - - -class SGDInputExample(object): - """An example for training/inference.""" - - def __init__( - self, - schema_config: dict, - tokenizer: object, - service_schema: object = None, - example_id: str = "NONE", - example_id_num: List[int] = [], - ): - """ - Constructs an InputExample. - Args: - schema_config: configuration - tokenizer: tokenizer object - service_schema: A ServiceSchema object wrapping the schema for the service - corresponding to this example. - example_id: Unique identifier for the example, like: 'train-1_00000-00-Restaurants_1' - example_id_num: dialogue_id and turn_id combined and service id combined into a list of ints, - like: [1, 0, 0, 18] - """ - self.schema_config = schema_config - self.service_schema = service_schema - self.service_id = None - if service_schema: - self.service_id = service_schema.service_id - self.example_id = example_id - self.example_id_num = example_id_num - self._max_seq_length = schema_config["MAX_SEQ_LENGTH"] - self._tokenizer = tokenizer - if self._tokenizer is None: - raise ValueError("Must specify tokenizer") - - self.user_utterance = '' - self.system_utterance = '' - # The id of each subword in the vocabulary for BERT. - self.utterance_ids = [0] * self._max_seq_length - # Denotes the identity of the sequence. Takes values 0 (schema description) and 1 (system and user utterance). - self.utterance_segment = [0] * self._max_seq_length - # Mask which takes the value 0 for padded tokens and 1 otherwise. - self.utterance_mask = [0] * self._max_seq_length - # Start and inclusive end character indices in the original utterance - # corresponding to the tokens. 
This is used to obtain the character indices - # from the predicted subword indices during inference. - # NOTE: A positive value indicates the character indices in the schema description - # whereas a negative value indicates the character indices in the - # utterance. The indices are offset by 1 to prevent ambiguity in the - # 0 index, which could be in either the schema description or utterance by the - # above convention. Now the 0 index corresponds to padded tokens. - self.start_char_idx = [0] * self._max_seq_length - self.end_char_idx = [0] * self._max_seq_length - - # Id of categorical slot present in the example or 0 if not present. - self.categorical_slot_id = 0 - # Id of non categorical slot present in the example or 0 if not present. - self.noncategorical_slot_id = 0 - # The status of categorical slot in the example. - self.categorical_slot_status = STATUS_OFF - # The status of non categorical slot in the example. - self.noncategorical_slot_status = STATUS_OFF - # Masks out tasks not represented by example - self.task_mask = [0] * schema_config["NUM_TASKS"] - - # The index of the starting subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_start = 0 - # The index of the ending (inclusive) subword corresponding to the slot span - # for a non-categorical slot value. - self.noncategorical_slot_value_end = 0 - - # Id of categorical slot value present in the example or 0 if not present. - self.categorical_slot_value_id = 0 - # The status of categorical slot value in the example. - self.categorical_slot_value_status = STATUS_OFF - # Id of requested slot present in the example or 0 if not present. - self.requested_slot_id = 0 - # Takes value 1 if the corresponding slot is requested, 0 otherwise. - self.requested_slot_status = STATUS_OFF - - # ID of intent present in the example. - self.intent_id = 0 - # Takes value 1 if the intent is active, 0 otherwise. - self.intent_status = STATUS_OFF - - @property - def readable_summary(self): - """Get a readable dict that summarizes the attributes of an InputExample.""" - seq_length = sum(self.utterance_mask) - utt_toks = self._tokenizer.ids_to_tokens(self.utterance_ids[:seq_length]) - utt_tok_mask_pairs = list(zip(utt_toks, self.utterance_segment[:seq_length])) - active_intent = ( - self.service_schema.get_intent_from_id(self.intent_id) if self.intent_status == STATUS_ACTIVE else "" - ) - slot_values_in_state = {} - if self.categorical_slot_status == STATUS_ACTIVE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = self.service_schema.get_categorical_slot_value_from_id( - self.categorical_slot_id, self.categorical_slot_value_id - ) - elif self.categorical_slot_status == STATUS_DONTCARE: - slot_values_in_state[ - self.service_schema.get_categorical_slot_from_id(self.categorical_slot_id) - ] = STR_DONTCARE - if self.noncategorical_slot_status == STATUS_ACTIVE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - start_id = self.noncategorical_slot_value_start[slot] - end_id = self.noncategorical_slot_value_end[slot] - # Token list is consisted of the subwords that may start with "##". We - # remove "##" to reconstruct the original value. Note that it's not a - # strict restoration of the original string. It's primarily used for - # debugging. - # ex. 
["san", "j", "##ose"] --> "san jose" - readable_value = " ".join(utt_toks[start_id : end_id + 1]).replace(" ##", "") - slot_values_in_state[slot] = readable_value - elif self.noncategorical_slot_status == STATUS_DONTCARE: - slot = self.service_schema.get_non_categorical_slot_from_id(self.noncategorical_slot_id) - slot_values_in_state[slot] = STR_DONTCARE - - summary_dict = { - "utt_tok_mask_pairs": utt_tok_mask_pairs, - "utt_len": seq_length, - "categorical_slot_id": self.categorical_slot_id, - "noncategorical_slot_id": self.noncategorical_slot_id, - "intent_id": self.intent_id, - "service_name": self.service_schema.service_name, - "active_intent": active_intent, - "slot_values_in_state": slot_values_in_state, - } - return summary_dict - - def add_utterance_features( - self, system_tokens, system_inv_alignments, user_tokens, user_inv_alignments, system_utterance, user_utterance - ): - """Add utterance related features input to InputExample. - - Note: this method modifies the system tokens and user_tokens in place to - make their total length <= the maximum input length for BERT model. - - Args: - system_tokens: a list of strings which represents schema description. - system_inv_alignments: a list of tuples which denotes the start and end - charater of the tpken that a bert token originates from in the original - schema description. - user_tokens: a list of strings which represents utterance. - user_inv_alignments: a list of tuples which denotes the start and end - charater of the token that a bert token originates from in the original - system and user utterance. - """ - # Input sequence length for utterance BERT encoder - max_utt_len = self._max_seq_length - - # Modify lengths of schema description & utterance so that length of total utt - # (including cls_token, setp_token, sep_token) is no more than max_utt_len - is_too_long = truncate_seq_pair(system_tokens, user_tokens, max_utt_len - 3) - if is_too_long: - logging.debug( - f'Utterance sequence truncated in example id - {self.example_id} from {len(system_tokens) + len(user_tokens)}.' - ) - - # Construct the tokens, segment mask and valid token mask which will be - # input to BERT, using the tokens for schema description (sequence A) and - # system and user utterance (sequence B). - utt_subword = [] - utt_seg = [] - utt_mask = [] - start_char_idx = [] - end_char_idx = [] - - utt_subword.append(self._tokenizer.cls_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(system_tokens): - utt_subword.append(subword) - utt_seg.append(0) - utt_mask.append(1) - st, en = system_inv_alignments[subword_idx] - start_char_idx.append(-(st + 1)) - end_char_idx.append(-(en + 1)) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(0) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - for subword_idx, subword in enumerate(user_tokens): - utt_subword.append(subword) - utt_seg.append(1) - utt_mask.append(1) - st, en = user_inv_alignments[subword_idx] - start_char_idx.append(st + 1) - end_char_idx.append(en + 1) - - utt_subword.append(self._tokenizer.sep_token) - utt_seg.append(1) - utt_mask.append(1) - start_char_idx.append(0) - end_char_idx.append(0) - - utterance_ids = self._tokenizer.tokens_to_ids(utt_subword) - - # Zero-pad up to the BERT input sequence length. 
- while len(utterance_ids) < max_utt_len: - utterance_ids.append(0) - utt_seg.append(0) - utt_mask.append(0) - start_char_idx.append(0) - end_char_idx.append(0) - self.utterance_ids = utterance_ids - self.utterance_segment = utt_seg - self.utterance_mask = utt_mask - self.start_char_idx = start_char_idx - self.end_char_idx = end_char_idx - - self.user_utterance = user_utterance - self.system_utterance = system_utterance - - def make_copy(self): - """Make a copy of the current example with utterance features.""" - new_example = SGDInputExample( - schema_config=self.schema_config, - service_schema=self.service_schema, - example_id=self.example_id, - example_id_num=self.example_id_num.copy(), - tokenizer=self._tokenizer, - ) - return new_example - - def make_copy_of_categorical_features(self): - """Make a copy of the current example with utterance and categorical features.""" - new_example = self.make_copy() - - new_example.categorical_slot_status = self.categorical_slot_status - return new_example - - def make_copy_of_non_categorical_features(self): - """Make a copy of the current example with utterance features and non categorical features.""" - new_example = self.make_copy() - new_example.noncategorical_slot_id = self.noncategorical_slot_id - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.utterance_ids = list(self.utterance_ids) - new_example.utterance_segment = list(self.utterance_segment) - new_example.utterance_mask = list(self.utterance_mask) - new_example.start_char_idx = list(self.start_char_idx) - new_example.end_char_idx = list(self.end_char_idx) - new_example.user_utterance = self.user_utterance - new_example.system_utterance = self.system_utterance - new_example.noncategorical_slot_status = self.noncategorical_slot_status - new_example.noncategorical_slot_value_start = self.noncategorical_slot_value_start - new_example.noncategorical_slot_value_end = self.noncategorical_slot_value_end - return new_example - - def add_categorical_slots(self, state_update: dict): - """Add features for categorical slots. - Args: - state_update: slot value pairs of the state update - """ - - categorical_slots = self.service_schema.categorical_slots - if not categorical_slots: - return - slot = categorical_slots[self.categorical_slot_id] - values = state_update.get(slot, []) - - if not values: - self.categorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.categorical_slot_status = STATUS_DONTCARE - else: - self.categorical_slot_status = STATUS_ACTIVE - self.categorical_slot_value_status = ( - self.categorical_slot_value_id == self.service_schema.get_categorical_slot_value_id(slot, values[0]) - ) - - def add_noncategorical_slots(self, state_update: dict, system_span_boundaries: dict, user_span_boundaries: dict): - """Add features for non-categorical slots. - Args: - state_update: slot value pairs of state update - system_span_boundaries: span boundaries of schema description - user_span_boundaries: span boundaries of utterance - """ - - noncategorical_slots = self.service_schema.non_categorical_slots - slot = noncategorical_slots[self.noncategorical_slot_id] - - values = state_update.get(slot, []) - if not values: - self.noncategorical_slot_status = STATUS_OFF - elif values[0] == STR_DONTCARE: - self.noncategorical_slot_status = STATUS_DONTCARE - else: - self.noncategorical_slot_status = STATUS_ACTIVE - # Add indices of the start and end tokens for the first encountered - # value. 
Spans in user utterance are prioritized over the system - # utterance. If a span is not found, the slot value is ignored. - if slot in user_span_boundaries: - start, end = user_span_boundaries[slot] - elif slot in system_span_boundaries: - start, end = system_span_boundaries[slot] - else: - # A span may not be found because the value was cropped out or because - # the value was mentioned earlier in the dialogue. Since this model - # only makes use of the last two utterances to predict state updates, - # it will fail in such cases. - logging.debug( - f'"Slot values {str(values)} not found in user or system utterance in example with id - {self.example_id}.' - ) - start = 0 - end = 0 - self.noncategorical_slot_value_start = start - self.noncategorical_slot_value_end = end - - def add_requested_slots(self, frame: dict): - """Add requested slots to InputExample - Args: - frame: frame object from which requested slots are extracted - """ - all_slots = self.service_schema.slots - slot = all_slots[self.requested_slot_id] - if slot in frame["labels"]["slots"]: - self.requested_slot_status = STATUS_ACTIVE - - def add_intents(self, frame): - """Add intents to InputExample - Args: - frame: frame object from which intents are extracted - """ - all_intents = self.service_schema.intents - intent = all_intents[self.intent_id] - if intent == frame["labels"]["intent"]: - self.intent_status = STATUS_ACTIVE - - -# Modified from run_classifier._truncate_seq_pair in the public bert model repo. -# https://github.com/google-research/bert/blob/master/run_classifier.py. -def truncate_seq_pair(tokens_a: List[int], tokens_b: List[int], max_length: int) -> bool: - """Truncate a seq pair in place so that their total length <= max_length. - Args: - tokens_a: first token sequence - tokens_b: second token sequence - max_length: truncated sequence length - Returns: - is_too_long: whether combined sequences exceed maximum sequence length - """ - is_too_long = False - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - is_too_long = True - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - return is_too_long diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/__init__.py deleted file mode 100644 index 9bc88d075659fee14d052b090e1b7fd0564b3c66..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
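A quick usage sketch of truncate_seq_pair defined above: it pops tokens from the longer list in place until the combined length fits, and reports whether anything was dropped (the token lists below are made up):

tokens_a = ["the", "quick", "brown", "fox", "jumps"]
tokens_b = ["over", "the", "dog"]
was_truncated = truncate_seq_pair(tokens_a, tokens_b, max_length=6)
# tokens_a is now ["the", "quick", "brown"]; tokens_b is unchanged
assert was_truncated and len(tokens_a) + len(tokens_b) == 6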
- -from nemo.collections.nlp.data.dialogue.sgd.evaluate import evaluate, get_in_domain_services -from nemo.collections.nlp.data.dialogue.sgd.schema import Schema diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/evaluate.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/evaluate.py deleted file mode 100644 index 0829543dcc516cfbfd40df7be5262fd6c9e94bd7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/evaluate.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Evaluate predictions JSON file, w.r.t. ground truth file. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/evaluate.py -""" - -import collections -import glob -import json -import os - -import numpy as np - -from nemo.collections.nlp.metrics.sgd_metrics import ( - ACTIVE_INTENT_ACCURACY, - JOINT_CAT_ACCURACY, - JOINT_GOAL_ACCURACY, - JOINT_NONCAT_ACCURACY, - NAN_VAL, - REQUESTED_SLOTS_F1, - REQUESTED_SLOTS_PRECISION, - REQUESTED_SLOTS_RECALL, - SLOT_TAGGING_F1, - SLOT_TAGGING_PRECISION, - SLOT_TAGGING_RECALL, - get_active_intent_accuracy, - get_average_and_joint_goal_accuracy, - get_requested_slots_f1, - get_slot_tagging_f1, -) -from nemo.utils import logging - -__all__ = ['get_in_domain_services'] - -ALL_SERVICES = "#ALL_SERVICES" -SEEN_SERVICES = "#SEEN_SERVICES" -UNSEEN_SERVICES = "#UNSEEN_SERVICES" - -# Name of the file containing all predictions and their corresponding frame metrics. -PER_FRAME_OUTPUT_FILENAME = "dialogues_and_metrics.json" - - -def get_service_set(schema_path: str) -> set: - """ - Get the set of all services present in a schema. - Args: - schema_path: schema file path - Returns: - service_set: set of services in file - """ - service_set = set() - with open(schema_path, encoding="UTF-8") as f: - schema = json.load(f) - for service in schema: - service_set.add(service["service_name"]) - f.close() - return service_set - - -def get_in_domain_services(schema_path: str, service_set: set) -> set: - """Get the set of common services between a schema and set of services. - Args: - schema_path: path to schema file - service_set: set of services - Returns: - joint_services: joint services between schema path file and service set - """ - joint_services = get_service_set(schema_path) & service_set - return joint_services - - -def get_dataset_as_dict(file_path_patterns) -> dict: - """Read the DSTC8/SGD json dialogue data as dictionary with dialog ID as keys. 
- Args: - file_path_patterns: list or directory of files - Returns: - dataset_dict: dataset dictionary with dialog ID as keys - """ - dataset_dict = {} - if isinstance(file_path_patterns, list): - list_fp = file_path_patterns - else: - list_fp = sorted(glob.glob(file_path_patterns)) - for fp in list_fp: - if PER_FRAME_OUTPUT_FILENAME in fp: - continue - logging.debug("Loading file: %s", fp) - with open(fp, encoding="UTF-8") as f: - data = json.load(f) - if isinstance(data, list): - for dial in data: - dataset_dict[dial["dialogue_id"]] = dial - elif isinstance(data, dict): - dataset_dict.update(data) - f.close() - return dataset_dict - - -def get_metrics( - dataset_ref: dict, - dataset_hyp: dict, - service_schemas: dict, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -): - """Calculate the DSTC8/SGD metrics. - Args: - dataset_ref: The ground truth dataset represented as a dict mapping dialogue id to the corresponding dialogue. - dataset_hyp: The predictions in the same format as `dataset_ref`. - service_schemas: A dict mapping service name to the schema for the service. - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - all_metric_aggregate: A dict mapping a metric collection name to a dict containing the values - for various metrics. Each metric collection aggregates the metrics across a specific set of frames in the dialogues. - per_frame_metric: metrics aggregated for each frame - """ - # Metrics can be aggregated in various ways, eg over all dialogues, only for - # dialogues containing unseen services or for dialogues corresponding to a - # single service. This aggregation is done through metric_collections, which - # is a dict mapping a collection name to a dict, which maps a metric to a list - # of values for that metric. Each value in this list is the value taken by - # the metric on a frame. - metric_collections = collections.defaultdict(lambda: collections.defaultdict(list)) - - # Ensure the dialogs in dataset_hyp also occur in dataset_ref. - assert set(dataset_hyp.keys()).issubset(set(dataset_ref.keys())) - logging.debug("len(dataset_hyp)=%d, len(dataset_ref)=%d", len(dataset_hyp), len(dataset_ref)) - - # Store metrics for every frame for debugging. - per_frame_metric = {} - - for dial_id, dial_hyp in dataset_hyp.items(): - dial_ref = dataset_ref[dial_id] - - if set(dial_ref["services"]) != set(dial_hyp["services"]): - raise ValueError( - "Set of services present in ground truth and predictions don't match " - "for dialogue with id {}".format(dial_id) - ) - - joint_metrics = [JOINT_GOAL_ACCURACY, JOINT_CAT_ACCURACY, JOINT_NONCAT_ACCURACY] - for turn_id, (turn_ref, turn_hyp) in enumerate(zip(dial_ref["turns"], dial_hyp["turns"])): - metric_collections_per_turn = collections.defaultdict(lambda: collections.defaultdict(lambda: 1.0)) - if turn_ref["speaker"] != turn_hyp["speaker"]: - raise ValueError("Speakers don't match in dialogue with id {}".format(dial_id)) - - # Skip system turns because metrics are only computed for user turns. 
- if turn_ref["speaker"] != "USER": - continue - - if turn_ref["utterance"] != turn_hyp["utterance"]: - logging.error("Ref utt: %s", turn_ref["utterance"]) - logging.error("Hyp utt: %s", turn_hyp["utterance"]) - raise ValueError("Utterances don't match for dialogue with id {}".format(dial_id)) - - hyp_frames_by_service = {frame["service"]: frame for frame in turn_hyp["frames"]} - - # Calculate metrics for each frame in each user turn. - for frame_ref in turn_ref["frames"]: - service_name = frame_ref["service"] - if service_name not in hyp_frames_by_service: - raise ValueError( - "Frame for service {} not found in dialogue with id {}".format(service_name, dial_id) - ) - service = service_schemas[service_name] - frame_hyp = hyp_frames_by_service[service_name] - - active_intent_acc = get_active_intent_accuracy(frame_ref, frame_hyp) - slot_tagging_f1_scores = get_slot_tagging_f1(frame_ref, frame_hyp, turn_ref["utterance"], service) - requested_slots_f1_scores = get_requested_slots_f1(frame_ref, frame_hyp) - goal_accuracy_dict = get_average_and_joint_goal_accuracy( - frame_ref, frame_hyp, service, use_fuzzy_match - ) - - frame_metric = { - ACTIVE_INTENT_ACCURACY: active_intent_acc, - REQUESTED_SLOTS_F1: requested_slots_f1_scores.f1, - REQUESTED_SLOTS_PRECISION: requested_slots_f1_scores.precision, - REQUESTED_SLOTS_RECALL: requested_slots_f1_scores.recall, - } - if slot_tagging_f1_scores is not None: - frame_metric[SLOT_TAGGING_F1] = slot_tagging_f1_scores.f1 - frame_metric[SLOT_TAGGING_PRECISION] = slot_tagging_f1_scores.precision - frame_metric[SLOT_TAGGING_RECALL] = slot_tagging_f1_scores.recall - frame_metric.update(goal_accuracy_dict) - - frame_id = "{:s}-{:03d}-{:s}".format(dial_id, turn_id, frame_hyp["service"]) - per_frame_metric[frame_id] = frame_metric - # Add the frame-level metric result back to dialogues. - frame_hyp["metrics"] = frame_metric - - # Get the domain name of the service. - domain_name = frame_hyp["service"].split("_")[0] - domain_keys = [ALL_SERVICES, frame_hyp["service"], domain_name] - if frame_hyp["service"] in in_domain_services: - domain_keys.append(SEEN_SERVICES) - - else: - domain_keys.append(UNSEEN_SERVICES) - for domain_key in domain_keys: - for metric_key, metric_value in frame_metric.items(): - if metric_value != NAN_VAL: - if joint_acc_across_turn and metric_key in joint_metrics: - metric_collections_per_turn[domain_key][metric_key] *= metric_value - else: - metric_collections[domain_key][metric_key].append(metric_value) - if joint_acc_across_turn: - # Conduct multiwoz style evaluation that computes joint goal accuracy - # across all the slot values of all the domains for each turn. - for domain_key in metric_collections_per_turn: - for metric_key, metric_value in metric_collections_per_turn[domain_key].items(): - metric_collections[domain_key][metric_key].append(metric_value) - - all_metric_aggregate = {} - for domain_key, domain_metric_vals in metric_collections.items(): - domain_metric_aggregate = {} - for metric_key, value_list in domain_metric_vals.items(): - if value_list: - # Metrics are macro-averaged across all frames. 
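The aggregation step above reduces each list of per-frame metric values to a single macro-averaged percentage; a stripped-down sketch with made-up values (the "NA" string below is just a placeholder for the missing-value constant):

import numpy as np

per_frame_values = {"active_intent_accuracy": [1.0, 0.0, 1.0], "requested_slots_f1": [1.0, 1.0, 0.5]}
aggregate = {
    metric: round(float(np.mean(values)) * 100.0, 2) if values else "NA"
    for metric, values in per_frame_values.items()
}
# aggregate == {"active_intent_accuracy": 66.67, "requested_slots_f1": 83.33}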
- domain_metric_aggregate[metric_key] = round(float(np.mean(value_list)) * 100.0, 2) - else: - domain_metric_aggregate[metric_key] = NAN_VAL - all_metric_aggregate[domain_key] = domain_metric_aggregate - return all_metric_aggregate, per_frame_metric - - -def evaluate( - prediction_dir: str, - data_dir: str, - eval_dataset: str, - in_domain_services: set, - joint_acc_across_turn: bool, - use_fuzzy_match: bool, -) -> dict: - """Calculate the DSTC8/SGD metrics for given data. - - Args: - prediction_dir: prediction location - data_dir: ground truth data location. - eval_dataset: evaluation data split - in_domain_services: The set of services which are present in the training set. - joint_acc_across_turn: Whether to compute joint goal accuracy across turn instead of across service. Should be set to True when conducting multiwoz style evaluation. - use_fuzzy_match: Whether to use fuzzy string matching when comparing non-categorical slot values. Should be set to False when conducting multiwoz style evaluation. - - Returns: - A dict mapping a metric collection name to a dict containing the values - for various metrics for all dialogues and all services - """ - - with open(os.path.join(data_dir, eval_dataset, "schema.json"), encoding="UTF-8") as f: - eval_services = {} - list_services = json.load(f) - for service in list_services: - eval_services[service["service_name"]] = service - f.close() - - dataset_ref = get_dataset_as_dict(os.path.join(data_dir, eval_dataset, "dialogues_*.json")) - dataset_hyp = get_dataset_as_dict(os.path.join(prediction_dir, "*.json")) - - # has ALLSERVICE, SEEN_SERVICES, UNSEEN_SERVICES, SERVICE, DOMAIN - all_metric_aggregate, _ = get_metrics( - dataset_ref, dataset_hyp, eval_services, in_domain_services, joint_acc_across_turn, use_fuzzy_match - ) - if SEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {SEEN_SERVICES} : {sorted(all_metric_aggregate[SEEN_SERVICES].items())}') - if UNSEEN_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {UNSEEN_SERVICES}: {sorted(all_metric_aggregate[UNSEEN_SERVICES].items())}') - if ALL_SERVICES in all_metric_aggregate: - logging.info(f'Dialog metrics for {ALL_SERVICES} : {sorted(all_metric_aggregate[ALL_SERVICES].items())}') - - # Write the per-frame metrics values with the corrresponding dialogue frames. - with open(os.path.join(prediction_dir, PER_FRAME_OUTPUT_FILENAME), "w", encoding="UTF-8") as f: - json.dump(dataset_hyp, f, indent=2, separators=(",", ": ")) - f.close() - return all_metric_aggregate[ALL_SERVICES] diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py deleted file mode 100644 index c9ddd2fd6f2336bd78e2f230e9887c5dcdf188a3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/prediction_utils.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Prediction and evaluation-related utility functions. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/baseline/pred_utils.py -""" - -import json -import os -from collections import OrderedDict, defaultdict -from typing import Dict, List, Optional - -from nemo.collections.nlp.data.dialogue.input_example.sgd_input_example import ( - STATUS_ACTIVE, - STATUS_DONTCARE, - STR_DONTCARE, -) -from nemo.utils import logging - -REQ_SLOT_THRESHOLD = 0.5 - - -__all__ = ['write_predictions_to_file'] - - -def set_cat_slot(predictions_status: dict, predictions_value: dict, cat_slot_values: Dict[str, List[str]]) -> dict: - """ - Extract predicted categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - cat_slot_values: possible categorical slots and their potential values for this service - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(cat_slot_values): - slot_status = predictions_status[slot_idx][0]["cat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tmp = predictions_value[slot_idx] - value_idx = max(tmp, key=lambda k: tmp[k]['cat_slot_value_status'][0].item()) - out_dict[slot] = cat_slot_values[slot][value_idx] - return out_dict - - -def set_noncat_slot( - predictions_status: dict, - predictions_value: dict, - non_cat_slots: List[str], - user_utterance: str, - sys_slots_agg: Optional[dict] = None, -) -> dict: - """ - Extract predicted non categorical slot information - Args: - predictions_status: predicted statuses - predictions_value: predicted slot values - non_cat_slots: list of possible non categorical slots for this service - user_utterance: system and user utterance - sys_slots_agg: system retrieval lookup table. Contains for each slot the most recent value seen in the history - Returns: - out_dict: predicted slot value pairs - """ - out_dict = {} - for slot_idx, slot in enumerate(non_cat_slots): - slot_status = predictions_status[slot_idx][0]["noncat_slot_status"] - if slot_status == STATUS_DONTCARE: - out_dict[slot] = STR_DONTCARE - elif slot_status == STATUS_ACTIVE: - tok_start_idx = predictions_value[slot_idx][0]["noncat_slot_start"] - tok_end_idx = predictions_value[slot_idx][0]["noncat_slot_end"] - ch_start_idx = predictions_value[slot_idx][0]["noncat_alignment_start"][tok_start_idx] - ch_end_idx = predictions_value[slot_idx][0]["noncat_alignment_end"][tok_end_idx] - if ch_start_idx > 0 and ch_end_idx > 0: - # Add span from the utterance. - out_dict[slot] = user_utterance[ch_start_idx - 1 : ch_end_idx] - elif sys_slots_agg and slot in sys_slots_agg: - # system retrieval - out_dict[slot] = sys_slots_agg[slot] - return out_dict - - -def get_predicted_dialog(dialog: dict, all_predictions: dict, schemas: object, state_tracker: str) -> dict: - """Overwrite the labels in the turn with the predictions from the model. For test set, these labels are missing from the data and hence they are added. - Args: - dialog: ground truth dialog - all_predictions: predictions - schemas: schema object of all services of all datasets - state_tracker: state tracker option, e.g. 
nemotracker - Returns: - dialog: dialog overwritten with prediction information - """ - dialog_id = dialog["dialogue_id"] - if state_tracker == "baseline": - sys_slots_agg = {} - else: - sys_slots_agg = defaultdict(OrderedDict) - all_slot_values = defaultdict(dict) - for turn_idx, turn in enumerate(dialog["turns"]): - if turn["speaker"] == "SYSTEM" and state_tracker == 'nemotracker': - for frame in turn["frames"]: - if frame["service"] not in sys_slots_agg: - sys_slots_agg[frame["service"]] = OrderedDict() - for action in frame["actions"]: - if action["slot"] and len(action["values"]) > 0: - sys_slots_agg[frame["service"]][action["slot"]] = action["values"][0] - if turn["speaker"] == "USER": - user_utterance = turn["utterance"] - system_utterance = dialog["turns"][turn_idx - 1]["utterance"] if turn_idx else "" - system_user_utterance = system_utterance + ' ' + user_utterance - turn_id = "{:02d}".format(turn_idx) - for frame in turn["frames"]: - - predictions = all_predictions[(dialog_id, turn_id, frame["service"])] - slot_values = all_slot_values[frame["service"]] - service_schema = schemas.get_service_schema(frame["service"]) - # Remove the slot spans and state if present. - frame.pop("slots", None) - frame.pop("state", None) - - # The baseline model doesn't predict slot spans. Only state predictions - # are added. - state = {} - - # Add prediction for active intent. No Offset is subtracted since schema has now NONE intent at index 0 - state["active_intent"] = get_predicted_intent( - predictions=predictions[0], intents=service_schema.intents - ) - # Add prediction for requested slots. - state["requested_slots"] = get_requested_slot(predictions=predictions[1], slots=service_schema.slots) - - # Add prediction for user goal (slot values). - # Categorical slots. - cat_out_dict = set_cat_slot( - predictions_status=predictions[2], - predictions_value=predictions[3], - cat_slot_values=service_schema.categorical_slot_values, - ) - for k, v in cat_out_dict.items(): - slot_values[k] = v - - # Non-categorical slots. - noncat_out_dict = set_noncat_slot( - predictions_status=predictions[4], - predictions_value=predictions[5], - non_cat_slots=service_schema.non_categorical_slots, - user_utterance=system_user_utterance, - sys_slots_agg=sys_slots_agg.get(frame["service"], None), - ) - for k, v in noncat_out_dict.items(): - slot_values[k] = v - # Create a new dict to avoid overwriting the state in previous turns - # because of use of same objects. 
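The categorical-slot selection in set_cat_slot above reduces to an argmax over per-value activation scores; a self-contained sketch with made-up scores:

scores = {0: 0.12, 1: 0.71, 2: 0.17}                # value index -> predicted status score (made up)
possible_values = ["cheap", "moderate", "expensive"]
best_value = possible_values[max(scores, key=scores.get)]
assert best_value == "moderate"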
- state["slot_values"] = {s: [v] for s, v in slot_values.items()} - frame["state"] = state - return dialog - - -def get_predicted_intent(predictions: dict, intents: List[str]) -> str: - """ - Returns intent name with maximum score - Args: - predictions: predictions - intents: list of possible intents for this service - Returns: - intent: predicted intent - """ - assert len(predictions) == len(intents) - active_intent_id = max(predictions, key=lambda k: predictions[k][0]['intent_status']) - intent = intents[active_intent_id] - return intent - - -def get_requested_slot(predictions: dict, slots: List[str]) -> List[str]: - """ - Returns list of slots which are predicted to be requested - Args: - predictions: predictions - slots: list of possible slots - Returns: - requested_slots: list of requested slots - """ - active_indices = [k for k in predictions if predictions[k][0]["req_slot_status"] > REQ_SLOT_THRESHOLD] - requested_slots = list(map(lambda k: slots[k], active_indices)) - return requested_slots - - -def write_predictions_to_file( - predictions: List[dict], - input_json_files: List[str], - output_dir: str, - schemas: object, - state_tracker: str, - eval_debug: bool, - in_domain_services: set, -): - """Save predicted dialogues as json files. - - Args: - predictions: An iterator containing model predictions. This is the output of - the predict method in the estimator. - input_json_files: A list of json paths containing the dialogues to run - inference on. - output_dir: The directory where output json files will be created. - schemas: Schemas to all services in the dst dataset - state_tracker: state tracker option - eval_debug: output evaluation debugging information - in_domain_services: in domain services - """ - logging.info(f"Writing predictions to {output_dir} started.") - - # Index all predictions. - all_predictions = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - for idx, prediction in enumerate(predictions): - eval_dataset, dialog_id, turn_id, service_name, model_task, slot_intent_id, value_id = prediction[ - 'example_id' - ].split('-') - all_predictions[(dialog_id, turn_id, service_name)][int(model_task)][int(slot_intent_id)][ - int(value_id) - ] = prediction - logging.info(f'Predictions for {idx} examples in {eval_dataset} dataset are getting processed.') - - # Read each input file and write its predictions. - for input_file_path in input_json_files: - with open(input_file_path, encoding="UTF-8") as f: - dialogs = json.load(f) - logging.debug(f'{input_file_path} file is loaded') - pred_dialogs = [] - for d in dialogs: - pred_dialog = get_predicted_dialog(d, all_predictions, schemas, state_tracker) - pred_dialogs.append(pred_dialog) - input_file_name = os.path.basename(input_file_path) - output_file_path = os.path.join(output_dir, input_file_name) - with open(output_file_path, "w", encoding="UTF-8") as f: - json.dump(pred_dialogs, f, indent=2, separators=(",", ": "), sort_keys=True) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/schema.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/schema.py deleted file mode 100644 index b12a11fdb63c2f5685504b1b4f29fed3872f1211..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/dialogue/sgd/schema.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Copyright 2019 The Google Research Authors. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Wrappers for schemas of different services. -This file contains code artifacts adapted from the original implementation: -https://github.com/google-research/google-research/blob/master/schema_guided_dst/schema.py -""" - -import json -from typing import List, Optional, Union - -from nemo.utils import logging - -__all__ = ['Schema'] - - -class ServiceSchema(object): - """A wrapper for schema for a service.""" - - def __init__(self, schema_json: dict, service_id: Optional[int] = None): - """ - Constructor for ServiceSchema. - Args: - schema_json: schema json dict - service_id: service ID - """ - self._service_name = schema_json["service_name"] - self._description = schema_json["description"] - self._schema_json = schema_json - self._service_id = service_id - - # Construct the vocabulary for intents, slots, categorical slots, - # non-categorical slots and categorical slot values. - self._intents = ["NONE"] + sorted(i["name"] for i in schema_json["intents"]) - self._intent_descriptions = {i["name"]: i["description"] for i in schema_json["intents"]} - self._intent_descriptions["NONE"] = "none" - self._slots = sorted(s["name"] for s in schema_json["slots"]) - self._slots_descriptions = {s["name"]: s["description"] for s in schema_json["slots"]} - self._categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if s["is_categorical"] and s["name"] in self.state_slots - ) - self._non_categorical_slots = sorted( - s["name"] for s in schema_json["slots"] if not s["is_categorical"] and s["name"] in self.state_slots - ) - slot_schemas = {s["name"]: s for s in schema_json["slots"]} - categorical_slot_values = {} - categorical_slot_value_ids = {} - categorical_slot_ids = {} - non_categorical_slot_ids = {} - for slot_id, slot in enumerate(self._categorical_slots): - slot_schema = slot_schemas[slot] - values = sorted(slot_schema["possible_values"]) - categorical_slot_values[slot] = values - value_ids = {value: idx for idx, value in enumerate(values)} - categorical_slot_value_ids[slot] = value_ids - categorical_slot_ids[slot] = slot_id - - for slot_id, slot in enumerate(self._non_categorical_slots): - non_categorical_slot_ids[slot] = slot_id - - self._categorical_slot_values = categorical_slot_values - self._categorical_slot_value_ids = categorical_slot_value_ids - - self._categorical_slot_ids = categorical_slot_ids - self._non_categorical_slot_ids = non_categorical_slot_ids - - @property - def schema_json(self) -> dict: - """Returns schema json dictionary""" - return self._schema_json - - @property - def state_slots(self) -> set: - """Set of slots which are permitted to be in the dialogue state.""" - state_slots = set() - for intent in self._schema_json["intents"]: - state_slots.update(intent["required_slots"]) - state_slots.update(intent["optional_slots"]) - return state_slots - - @property - def service_name(self): - return self._service_name - - @property - def service_id(self): - return self._service_id - - @property - def 
description(self): - return self._description - - @property - def slots(self): - return self._slots - - @property - def intents(self): - return self._intents - - @property - def intent_descriptions(self): - return self._intent_descriptions - - @property - def slot_descriptions(self): - return self._slots_descriptions - - @property - def categorical_slots(self): - return self._categorical_slots - - @property - def non_categorical_slots(self): - return self._non_categorical_slots - - @property - def categorical_slot_values(self): - return self._categorical_slot_values - - def get_categorical_slot_values(self, slot): - return self._categorical_slot_values[slot] - - def get_slot_from_id(self, slot_id): - return self._slots[slot_id] - - def get_intent_from_id(self, intent_id): - return self._intents[intent_id] - - def get_categorical_slot_from_id(self, slot_id): - return self._categorical_slots[slot_id] - - def get_non_categorical_slot_from_id(self, slot_id): - return self._non_categorical_slots[slot_id] - - def get_categorical_slot_value_from_id(self, slot_id, value_id): - slot = self._categorical_slots[slot_id] - return self._categorical_slot_values[slot][value_id] - - def get_categorical_slot_value_id(self, slot, value): - return self._categorical_slot_value_ids[slot][value] - - def get_categorical_slot_id(self, slot): - return self._categorical_slot_ids[slot] - - def get_non_categorical_slot_id(self, slot): - return self._non_categorical_slot_ids[slot] - - -class Schema(object): - """Wrapper for schemas for all services in a dataset.""" - - def __init__(self, schema_json_paths: Union[str, List[str]]): - """ - schema_json_paths: list of .json path to schema files of a single str with path to the json file. - """ - # Load the schema from the json file. 
- if isinstance(schema_json_paths, str): - with open(schema_json_paths, "r") as f: - all_schemas = json.load(f) - f.close() - else: - # load multiple schemas from the list of the json files - all_schemas = [] - completed_services = [] - for schema_json_path in schema_json_paths: - with open(schema_json_path, "r") as f: - schemas = json.load(f) - f.close() - logging.debug("Num of services in %s: %s", schema_json_path, len(schemas)) - - for service in schemas: - if service['service_name'] not in completed_services: - completed_services.append(service['service_name']) - all_schemas.append(service) - - self._services = sorted(schema["service_name"] for schema in all_schemas) - self._services_vocab = {v: k for k, v in enumerate(self._services)} - self._services_id_to_vocab = {v: k for k, v in self._services_vocab.items()} - service_schemas = {} - for schema in all_schemas: - service = schema["service_name"] - service_schemas[service] = ServiceSchema(schema, service_id=self.get_service_id(service)) - - self._service_schemas = service_schemas - self._schemas = all_schemas - self._slots_relation_list = {} - - def get_service_id(self, service: str): - return self._services_vocab[service] - - def get_service_from_id(self, service_id: int): - return self._services[service_id] - - def get_service_schema(self, service: str): - return self._service_schemas[service] - - @property - def services(self): - return self._services - - def save_to_file(self, file_path): - """ - Saves schema object to file - Args: - file_path: path to store schema object at - """ - with open(file_path, "w") as f: - json.dump(self._schemas, f, indent=2) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/__init__.py deleted file mode 100644 index 659718d71b82c71853c2f515bbcf5eb7bf09230a..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.entity_linking.entity_linking_dataset import EntityLinkingDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py deleted file mode 100644 index 3b1d97a354f0aac557921c50eff2712e8c25fba4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/entity_linking/entity_linking_dataset.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
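Editor's note: the deleted ServiceSchema/Schema wrappers above mostly build lookup vocabularies from an SGD-style schema.json. Below is a standalone illustration of the key derivations (state slots, intent vocabulary with a "NONE" entry, categorical slot value ids); it is not the NeMo class, and the toy schema dict is invented.

# Standalone sketch of the vocabularies ServiceSchema derives from one service entry.
schema_json = {
    "service_name": "Restaurants_1",
    "description": "Find and book restaurants",
    "intents": [
        {"name": "FindRestaurants", "description": "Search for a place to eat",
         "required_slots": ["city"], "optional_slots": {"price_range": "moderate"}},
    ],
    "slots": [
        {"name": "city", "description": "City of the restaurant",
         "is_categorical": False, "possible_values": []},
        {"name": "price_range", "description": "Price bucket",
         "is_categorical": True, "possible_values": ["cheap", "moderate", "expensive"]},
    ],
}

# Slots permitted in the dialogue state: required + optional slots of every intent.
state_slots = set()
for intent in schema_json["intents"]:
    state_slots.update(intent["required_slots"])
    state_slots.update(intent["optional_slots"])

intents = ["NONE"] + sorted(i["name"] for i in schema_json["intents"])
categorical = sorted(s["name"] for s in schema_json["slots"]
                     if s["is_categorical"] and s["name"] in state_slots)
value_ids = {s["name"]: {v: i for i, v in enumerate(sorted(s["possible_values"]))}
             for s in schema_json["slots"] if s["name"] in categorical}

print(intents)                   # ['NONE', 'FindRestaurants']
print(categorical)               # ['price_range']
print(value_ids["price_range"])  # {'cheap': 0, 'expensive': 1, 'moderate': 2}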
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import array -import pickle as pkl -from typing import Optional - -import torch - -from nemo.collections.nlp.data.data_utils.data_preprocessing import find_newlines, load_data_indices -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['EntityLinkingDataset'] - - -class EntityLinkingDataset(Dataset): - """ - Parent class for entity linking encoder training and index - datasets - - Args: - tokenizer (obj): huggingface tokenizer, - data_file (str): path to tab separated column file where data - pairs apear in the format - concept_ID\tconcept_synonym1\tconcept_synonym2\n - newline_idx_file (str): path to pickle file containing location - of data_file newline characters - max_seq_length (int): maximum length of a concept in tokens - is_index_data (bool): Whether dataset will be used for building - a nearest neighbors index - """ - - def __init__( - self, - tokenizer: object, - data_file: str, - newline_idx_file: Optional[str] = None, - max_seq_length: Optional[int] = 512, - is_index_data: bool = False, - ): - - self.tokenizer = tokenizer - - # Try and load pair indices file if already exists - newline_indices, newline_idx_file, _ = load_data_indices(newline_idx_file, data_file, "newline_indices") - - # If pair indices file doesn't exists, generate and store them - if newline_indices is None: - logging.info("Getting datafile newline indices") - - with open(data_file, "rb") as f: - contents = f.read() - newline_indices = find_newlines(contents) - newline_indices = array.array("I", newline_indices) - - # Store data file indicies to avoid generating them again - with open(newline_idx_file, "wb") as f: - pkl.dump(newline_indices, f) - - self.newline_indices = newline_indices - self.data_file = data_file - self.num_lines = len(newline_indices) - self.max_seq_length = max_seq_length - self.is_index_data = is_index_data - - logging.info(f"Loaded dataset with {self.num_lines} examples") - - def __len__(self): - return self.num_lines - - def __getitem__(self, idx): - - concept_offset = self.newline_indices[idx] - - with open(self.data_file, "r", encoding='utf-8-sig') as f: - # Find data pair within datafile using byte offset - f.seek(concept_offset) - concept = f.readline()[:-1] - concept = concept.strip().split("\t") - - if self.is_index_data: - concept_id, concept = concept - return (int(concept_id), concept) - - else: - concept_id, concept1, concept2 = concept - return (int(concept_id), concept1, concept2) - - def _collate_fn(self, batch): - """collate batch of input_ids, segment_ids, input_mask, and label - - Args: - batch: A list of tuples of format (concept_ID, concept_synonym1, concept_synonym2). 
- """ - if self.is_index_data: - concept_ids, concepts = zip(*batch) - concept_ids = list(concept_ids) - concepts = list(concepts) - - else: - concept_ids, concepts1, concepts2 = zip(*batch) - concept_ids = list(concept_ids) - concept_ids.extend(concept_ids) # Need to double label list to match each concept - concepts = list(concepts1) - concepts.extend(concepts2) - - batch = self.tokenizer( - concepts, - add_special_tokens=True, - padding=True, - truncation=True, - max_length=self.max_seq_length, - return_token_type_ids=True, - return_attention_mask=True, - return_length=True, - ) - - return ( - torch.LongTensor(batch["input_ids"]), - torch.LongTensor(batch["token_type_ids"]), - torch.LongTensor(batch["attention_mask"]), - torch.LongTensor(concept_ids), - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/__init__.py deleted file mode 100644 index 753411382bc1e65de5979285129d1a570ef8feeb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/data_processors.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/data_processors.py deleted file mode 100644 index 3d907f24eff806fa965504b1f6c4f7fc703aa6f5..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/data_processors.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
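Editor's note: the deleted EntityLinkingDataset above avoids loading the whole TSV by recording the byte offset of every line once and then seeking directly to line idx in __getitem__. The sketch below shows that random-access pattern under stated assumptions: newline_offsets is a local stand-in for NeMo's find_newlines helper, and the concept rows are invented.

# Sketch of byte-offset random access over a concept-synonym TSV file.
import tempfile

def newline_offsets(data: bytes):
    """Byte offsets at which each line starts (first line starts at 0)."""
    offsets, pos = [0], data.find(b"\n")
    while pos != -1 and pos + 1 < len(data):
        offsets.append(pos + 1)
        pos = data.find(b"\n", pos + 1)
    return offsets

rows = "C001\theart attack\tmyocardial infarction\nC002\tflu\tinfluenza\n"
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as f:
    f.write(rows)
    path = f.name

with open(path, "rb") as f:
    offsets = newline_offsets(f.read())

# Equivalent of __getitem__(1): jump to the second concept pair without reading line 0.
with open(path, "r", encoding="utf-8-sig") as f:
    f.seek(offsets[1])
    concept_id, syn1, syn2 = f.readline().rstrip("\n").split("\t")
print(concept_id, syn1, syn2)  # C002 flu influenza

The offsets themselves are what the deleted class pickles next to the data file, so the index is only computed once per corpus.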
- -import os - -from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor -from nemo.utils import logging - -__all__ = [ - 'ColaProcessor', - 'MnliProcessor', - 'MnliMismatchedProcessor', - 'MrpcProcessor', - 'Sst2Processor', - 'StsbProcessor', - 'QqpProcessor', - 'QnliProcessor', - 'RteProcessor', - 'WnliProcessor', - 'XNLIProcessor', -] - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mrpc sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return "equivalent" if label == "1" else "not equivalent" - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class XNLIProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[6] - text_b = line[7] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"mnli 
hypothesis: {text_a} premise: {text_b}" - - def label2string(self, label): - return label - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"cola sentence: {text_a}" - - def label2string(self, label): - return "acceptable" if label == "1" else "not acceptable" - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - assert text_b is None - return f"sst2 sentence: {text_a}" - - def label2string(self, label): - return "positive" if label == "1" else "negative" - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = 
"%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"stsb sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return '%.1f' % float(label) - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qqp question1: {text_a} question2: {text_b}" - - def label2string(self, label): - return "duplicate" if label == "1" else "not_duplicate" - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"qnli question: {text_a} sentence: {text_b}" - - def label2string(self, label): - return label - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, 
line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - return f"rte sentence1: {text_a} sentence2: {text_b}" - - def label2string(self, label): - return label - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_examples(self, file_path): - return self._create_examples(self._read_tsv(file_path), "example") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_t5_prompted_query(self, text_a, text_b): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - def label2string(self, label): - raise NotImplementedError("NeMo-Megatron T5 does not support WNLI at the moment.") - - -class InputExample(object): - """A single training/test example for simple sequence classification. - - Args: - guid: Unique id for the example. - text_a: The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label:The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - - def __init__(self, guid: int, text_a: str, text_b: str = None, label: str = None): - """Constructs a InputExample.""" - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - def __repr__(self): - return ( - f"InputExample(guid='{self.guid}', text_a='{self.text_a}', text_b='{self.text_b}', label='{self.label}')" - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py deleted file mode 100644 index 2a14aa5afc58e7f95c65ae7eb18d3548c802c86c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/glue_benchmark/glue_benchmark_dataset.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
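Editor's note: every processor deleted above follows the same pattern: read a TSV, wrap each row in an InputExample, and (for T5-style fine-tuning) emit a task-prefixed source string plus a verbalized label. A minimal standalone illustration of that flow for an RTE-like row follows; the row contents are invented and the dataclass is a stand-in for the deleted InputExample class.

# Sketch: TSV row -> InputExample -> (prefixed source, verbalized target).
from dataclasses import dataclass

@dataclass
class InputExample:
    guid: str
    text_a: str
    text_b: str = None
    label: str = None

def rte_t5_pair(example: InputExample):
    source = f"rte sentence1: {example.text_a} sentence2: {example.text_b}"
    target = example.label  # RteProcessor.label2string is the identity mapping
    return source, target

row = ["17", "The cat sat on the mat.", "A cat is on a mat.", "entailment"]
ex = InputExample(guid=f"train-{row[0]}", text_a=row[1], text_b=row[2], label=row[-1])
print(rte_t5_pair(ex))
# ('rte sentence1: The cat sat on the mat. sentence2: A cat is on a mat.', 'entailment')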
- -# Some code of this file was adapted from the HuggingFace library available at -# https://github.com/huggingface/transformers - -import os -import pickle -from typing import Dict, List, Optional, Union - -import numpy as np -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.glue_benchmark.data_processors import ( - ColaProcessor, - MnliMismatchedProcessor, - MnliProcessor, - MrpcProcessor, - QnliProcessor, - QqpProcessor, - RteProcessor, - Sst2Processor, - StsbProcessor, - WnliProcessor, - XNLIProcessor, -) -from nemo.core.classes import Dataset -from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType, RegressionValuesType -from nemo.utils import logging - -__all__ = ['GLUEDataset', 'TextToTextGLUEDataset', 'TextToTextXNLIDataset'] - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, - "xnli": XNLIProcessor, -} -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", - "xnli": "classification", -} -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} - - -class GLUEDataset(Dataset): - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - "labels": NeuralType( - tuple('B'), RegressionValuesType() if self.task_name == 'sts-b' else CategoricalValuesType() - ), - } - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: str, - use_cache: bool = True, - compute_features: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - """ - original_file_name = file_name - logging.info(f'Processing {file_name}') - data_dir, file_name = os.path.split(file_name) - file_name = file_name[:-4] - self.tokenizer = tokenizer - evaluate = False if 'train' in file_name else True - - if task_name not in processors: - raise ValueError(f'{task_name} not supported. Choose from {processors.keys()}') - - if task_name == 'mnli' and 'dev_mismatched' in file_name: - self.task_name = 'mnli-mm' - else: - self.task_name = task_name - - processor = processors[self.task_name]() - output_mode = output_modes[self.task_name] - self.label_list = processor.get_labels() - - # TODO: use a different variable to decide whether to trust the user provided filename. This is a temporary workaround for T5 GLUE and XNLI. 
- if not compute_features: - if not os.path.exists(original_file_name): - raise ValueError(f"Could not find file : {original_file_name}") - self.examples = processor.get_examples(original_file_name) - else: - self.examples = ( - processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - ) - processor_name = type(processor).__name__ - vocab_size = getattr(tokenizer, "vocab_size", 0) - if compute_features: - cached_features_file = os.path.join( - data_dir, - "cached_{}_{}_{}_{}_{}".format( - processor_name, file_name, tokenizer.name, str(max_seq_length), str(vocab_size) - ), - ) - - if use_cache and os.path.exists(cached_features_file): - logging.info(f"loading from {cached_features_file}") - with open(cached_features_file, "rb") as reader: - self.features = pickle.load(reader) - else: - token_params = { - 'bos_token': None, - 'eos_token': tokenizer.eos_token, - 'pad_token': tokenizer.pad_token, - 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token if 'roberta' in tokenizer.name.lower() else None, - } - - self.features = self.convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - if master_device: - logging.info(f'Saving train features into {cached_features_file}') - with open(cached_features_file, "wb") as writer: - pickle.dump(self.features, writer) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - def convert_examples_to_features( - self, - examples: List[str], - label_list: List[int], - max_seq_length: int, - tokenizer: TokenizerSpec, - output_mode: str, - bos_token: str = None, - eos_token: str = '[SEP]', - pad_token: str = '[PAD]', - cls_token: str = '[CLS]', - sep_token_extra: str = None, - cls_token_at_end: bool = False, - cls_token_segment_id: int = 0, - pad_token_segment_id: int = 0, - pad_on_left: bool = False, - mask_padding_with_zero: bool = True, - sequence_a_segment_id: int = 0, - sequence_b_segment_id: int = 1, - ): - """ - Loads a data file into a list of `InputBatch`s. - The `cls_token_at_end` defines the location of the CLS token: - - * False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - - The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - - The convention in BERT is: - - a. For sequence pairs: - * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - b. For single sequences: - * tokens: [CLS] the dog is hairy . [SEP] - * type_ids: 0 0 0 0 0 0 0 - - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. 
- - The convention for NMT is: - - a. For sequence pairs: - * tokens: is this jack ##ville ? no it is not . - * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - b. For single sequences: - * tokens: the dog is hairy . - * type_ids: 0 0 0 0 0 0 0 - - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if example.label == "-": # skip examples without a consensus label (e.g. in SNLI data set) - continue - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s" % (example.guid)) - logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) - ) - return features - - def _truncate_seq_pair(self, tokens_a: str, tokens_b: str, max_length: int): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class TextToTextGLUEDataset(GLUEDataset): - """GLUE Dataset in a text-to-text format.""" - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - pad_to_max_length: bool = True, - ): - """ - Processes GLUE datasets - Args: - file_name: path to file - task_name: GLUE task name - tokenizer: such as AutoTokenizer - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - use_cache: whether to use data cache - prefix_override: if you want to override default prompt for this task specify this via a string. - pad_to_max_length: If true, pad to the maximum length. 
- """ - super().__init__(file_name, task_name, tokenizer, max_seq_length, use_cache, compute_features=False) - self.max_seq_length = max_seq_length - self.max_seq_length_decoder = max_seq_length_decoder - self.pad_to_max_length = pad_to_max_length - self.processor = processors[self.task_name]() - self.prefix_override = prefix_override - self.features = self.convert_examples_to_features() - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx): - enc_query, dec_input, labels = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels} - - def collate_fn(self, batch): - enc_query = [item['text_enc'] for item in batch] - dec_input = [item['text_dec'] for item in batch] - labels = [item['labels'] for item in batch] - - max_enc_query_length = max([len(item) for item in enc_query]) if enc_query else 0 - max_dec_input_length = max([len(item) for item in dec_input]) if dec_input else 0 - max_label_length = max([len(item) for item in labels]) if labels else 0 - if self.pad_to_max_length: - assert max_enc_query_length <= self.max_seq_length - assert max_dec_input_length <= self.max_seq_length_decoder - assert max_label_length <= self.max_seq_length_decoder - max_enc_query_length = self.max_seq_length - max_dec_input_length = self.max_seq_length_decoder - max_label_length = self.max_seq_length_decoder - - loss_mask = [([1] * (len(item))) + ([0] * (max_label_length - len(item))) for item in labels] - enc_query = [item + [self.tokenizer.pad_id] * (max_enc_query_length - len(item)) for item in enc_query] - dec_input = [item + [self.tokenizer.pad_id] * (max_dec_input_length - len(item)) for item in dec_input] - labels = [item + [self.tokenizer.pad_id] * (max_label_length - len(item)) for item in labels] - - enc_query = torch.LongTensor(enc_query) - dec_input = torch.LongTensor(dec_input) - labels = torch.LongTensor(labels) - loss_mask = torch.LongTensor(loss_mask) - - enc_mask = (enc_query != self.tokenizer.pad_id).long() - dec_mask = (dec_input != self.tokenizer.pad_id).long() - - return { - 'text_enc': enc_query, - 'text_dec': dec_input, - 'labels': labels, - 'loss_mask': loss_mask, - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - } - - def make_history_mask_3d(self, block): - batch, length = block.shape - arange = np.arange(length) - history_mask = (arange[None,] <= arange[:, None])[ - None, - ] - history_mask = np.repeat(history_mask, batch, 0) - return history_mask - - def convert_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. 
- """ - features = [] - for ex_index, example in enumerate(self.examples): - if ex_index % 10000 == 0: - logging.info(f"Writing example {ex_index} of {len(self.examples)}") - - text_to_text_query = self.processor.get_t5_prompted_query(example.text_a, example.text_b) - enc_query = self.tokenizer.text_to_ids(text_to_text_query) - if len(enc_query) > self.max_seq_length: - enc_query = enc_query[: self.max_seq_length] - dec_query = ( - [self.tokenizer.bos_id] - + self.tokenizer.text_to_ids(self.processor.label2string(example.label)) - + [self.tokenizer.eos_id] - ) - - dec_input = dec_query[:-1] - labels = dec_query[1:] - - features.append([enc_query, dec_input, labels]) - - return features - - -class TextToTextXNLIDataset(TextToTextGLUEDataset): - """XNLI Dataset in a text-to-text format.""" - - def __init__( - self, - file_name: str, - task_name: str, - tokenizer: TokenizerSpec, - max_seq_length: int, - max_seq_length_decoder: int = 128, - use_cache: bool = True, - prefix_override: str = None, - lang_list: List[str] = None, - pad_to_max_length: bool = True, - ): - self.lang_list = set(lang_list) - super().__init__( - file_name, - task_name, - tokenizer, - max_seq_length, - max_seq_length_decoder, - use_cache, - prefix_override, - pad_to_max_length, - ) - if len(lang_list) <= 0 or lang_list is None: - raise ValueError(f"Found an empty or None lang_list for {self.task_name}") - self.features = self.convert_xnli_examples_to_features() - - def __getitem__(self, idx): - enc_query, dec_input, labels, lang = self.features[idx] - return {'text_enc': enc_query, 'text_dec': dec_input, 'labels': labels, 'lang': lang} - - def collate_fn(self, batch): - base_batch = super().collate_fn(batch) - base_batch['lang'] = [item['lang'] for item in batch] - return base_batch - - def convert_xnli_examples_to_features(self): - """ - Converts examples into Text-to-Text batches to be used with a model like T5. - Inputs are prefixed with a text prompt that indicates the task to perform. - """ - features = self.features - lang_filtered_features = [] - for ex_index, example in enumerate(self.examples): - language = example.guid.split('-')[1] - if language in self.lang_list: - lang_filtered_features.append(features[ex_index] + [language]) - return lang_filtered_features - - def __len__(self): - return len(self.features) - - -class InputFeatures(object): - """A single set of features of data. - - Args: - input_ids: input/token ids - input_mask: masks out subword tokens - segment_ids: distinguish one sentence from the other one (if present) - label_ids: label for the current example - """ - - def __init__( - self, input_ids: List[int], input_mask: List[int], segment_ids: List[int], label_id: Union[float, int] - ): - """Initialized InputFeatures.""" - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/__init__.py deleted file mode 100644 index a32196ee7c112a3c6236a487d1f67bbc2951b74c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
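Editor's note: the core of the deleted TextToTextGLUEDataset is (a) building a shifted decoder input/label pair from the verbalized target and (b) right-padding the batch while deriving loss and attention masks. The sketch below reproduces only that logic under stated assumptions: the token ids and the PAD/BOS/EOS values are invented, and a real run would use the NeMo tokenizer and max_seq_length settings instead.

# Sketch of per-example feature construction and batch collation for text-to-text GLUE.
import torch

PAD, BOS, EOS = 0, 1, 2  # assumed special-token ids

def make_features(enc_ids, label_ids):
    dec = [BOS] + label_ids + [EOS]
    return enc_ids, dec[:-1], dec[1:]          # enc_query, dec_input, labels

def collate(features, max_dec_len):
    enc, dec_in, labels = zip(*features)
    max_enc_len = max(len(x) for x in enc)
    pad = lambda seqs, n: [list(s) + [PAD] * (n - len(s)) for s in seqs]
    batch = {
        "text_enc": torch.tensor(pad(enc, max_enc_len)),
        "text_dec": torch.tensor(pad(dec_in, max_dec_len)),
        "labels": torch.tensor(pad(labels, max_dec_len)),
        "loss_mask": torch.tensor([[1] * len(l) + [0] * (max_dec_len - len(l)) for l in labels]),
    }
    batch["enc_mask"] = (batch["text_enc"] != PAD).long()
    batch["dec_mask"] = (batch["text_dec"] != PAD).long()
    return batch

feats = [make_features([5, 6, 7], [8]), make_features([5, 9], [10, 11])]
out = collate(feats, max_dec_len=4)
print(out["text_dec"])  # [[1, 8, 0, 0], [1, 10, 11, 0]]
print(out["labels"])    # [[8, 2, 0, 0], [10, 11, 2, 0]]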
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.nlp.data.information_retrieval.information_retrieval_dataset import ( - BertInformationRetrievalDataset, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py deleted file mode 100644 index 61e0a3cffc1f0c7496b4be520f5d728a9824d51d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/information_retrieval/information_retrieval_dataset.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import multiprocessing as mp -import os -import pickle -import random -from typing import Optional - -import numpy as np -from torch.utils.data import Dataset - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec - -__all__ = ["BertInformationRetrievalDataset"] - - -class BaseInformationRetrievalDataset(Dataset): - """ - Base information retrieval dataset on which other datasets are built. - - Args: - tokenizer: tokenizer - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - """ - - def __init__( - self, tokenizer: TokenizerSpec, max_query_length: Optional[int] = 31, max_passage_length: Optional[int] = 190, - ): - self.tokenizer = tokenizer - self.max_query_length = max_query_length - self.max_passage_length = max_passage_length - - def parse_npz(self, file, max_seq_length): - """ - Function which parses passages (documents) in npz format. - After pre-processing and tokenization, the dataset will be saved - as numpy matrix, i_th entry of which corresponds to i_th passage (document) - and has the following form: - [n, token_1, ..., token_n, 0, ..., 0] - where n is the passage length (in tokens) and 0s correspond to pad tokens. 
- - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".npz" - if os.path.isfile(cached_collection): - dataset_npz = np.load(cached_collection)["data"] - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - dataset_npz = np.zeros((len(dataset_dict), max_seq_length + 1)) - for key in dataset_dict: - dataset_npz[key][0] = len(dataset_dict[key]) - dataset_npz[key][1 : len(dataset_dict[key]) + 1] = dataset_dict[key] - np.savez(cached_collection, data=dataset_npz) - return dataset_npz - - def parse_pkl(self, file, max_seq_length): - """ - Function which parses passages (documents, queries) in pkl format. - After pre-processing and tokenization, the dataset will be saved - as pkl dict, i_th entry of which corresponds to i_th passage (document, query) - and has the following form: - {passage_id: [token_1, ..., token_n]} - where n is the passage length (in tokens). - - Args: - file: str, path to file with passages (documents) - max_seq_length: maximum length of sequence in tokens - """ - cached_collection = file + ".pkl" - if os.path.isfile(cached_collection): - dataset_dict = pickle.load(open(cached_collection, "rb")) - else: - dataset_dict = self.tokenize_dataset(file, max_seq_length) - pickle.dump(dataset_dict, open(cached_collection, "wb")) - return dataset_dict - - def tokenize_dataset(self, file, max_seq_length): - """ - Function which pre-tokenizes the dataset. - """ - lines = open(file, "r").readlines() - with mp.Pool() as pool: - dataset_dict = pool.map(self.preprocess_line, lines) - dataset_dict = {id_: tokens[:max_seq_length] for (id_, tokens) in dataset_dict} - return dataset_dict - - def preprocess_line(self, line): - """ - Parse a single entry (line) of tsv file. - """ - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - id_, text = line.split("\t") - token_ids = self.tokenizer.text_to_ids(text.strip()) - return int(id_), token_ids - - def construct_input(self, token_ids1, max_seq_length, token_ids2=None): - """ - Function which constructs a valid input to BERT from tokens. 
- - If only one list of tokens (token_ids1) is passed, the input will be - [CLS] token_ids1 [SEP] - - if two lists of tokens are passed, the input will be - [CLS] token_ids1 [SEP] token_ids2 [SEP] - """ - - input_ids = [self.tokenizer.pad_id] * max_seq_length - bert_input = [self.tokenizer.cls_id] + token_ids1 + [self.tokenizer.sep_id] - sentence1_length = len(bert_input) - if token_ids2 is not None: - bert_input = bert_input + token_ids2 + [self.tokenizer.sep_id] - - bert_input = bert_input[:max_seq_length] - - num_nonpad_tokens = len(bert_input) - - input_ids[:num_nonpad_tokens] = bert_input - input_ids = np.array(input_ids, dtype=np.long) - input_mask = input_ids != self.tokenizer.pad_id - input_type_ids = np.ones_like(input_ids) - input_type_ids[:sentence1_length] = 0 - - return input_ids, input_mask, input_type_ids - - def preprocess_bert(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into a tensor of size [k, max_length] with the following rows: - [CLS] Q_text [SEP] Pi_text [SEP], i = 1, ..., k - """ - - max_seq_length = self.max_query_length + self.max_passage_length + 3 - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self.queries[query_id], max_seq_length, self._psgid2tokens(psg_id)) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = np.stack(input_type_ids) - - return input_ids, input_mask, input_type_ids - - def preprocess_dpr(self, query_id, psg_ids): - """ - Transforms query id (Q) and a list of passages ids (P1, ..., Pk) - into two tensors of sizes [1, max_q_length] and [k, max_p_length] - with the following rows: - 1) [CLS] Q_text [SEP] - 2) [CLS] Pi_text [SEP], i = 1, ..., k - """ - - q_input_ids, q_input_mask, q_type_ids = self.construct_input(self.queries[query_id], self.max_query_length + 2) - input_ids, input_mask, input_type_ids = [], [], [] - for psg_id in psg_ids: - inputs = self.construct_input(self._psgid2tokens(psg_id), self.max_passage_length + 2) - input_ids.append(inputs[0]) - input_mask.append(inputs[1]) - input_type_ids.append(inputs[2]) - input_ids = np.stack(input_ids) - input_mask = np.stack(input_mask) - input_type_ids = np.stack(input_type_ids) - return ( - q_input_ids[None, ...], - q_input_mask[None, ...], - q_type_ids[None, ...], - input_ids, - input_mask, - input_type_ids, - ) - - def _psgid2tokens(self, psg_id): - """ - Internal function which maps passage id to its tokens. - """ - pass - - def psgid2tokens_npz(self, psg_id): - """ - Mapping from passage id to its tokens in case of npz cache format. - """ - seq_len = self.passages[psg_id][0] - return self.passages[psg_id][1 : seq_len + 1].tolist() - - def psgid2tokens_pkl(self, psg_id): - """ - Mapping from passage id to its tokens in case of pkl cache format. - """ - return self.passages[psg_id] - - -class BertInformationRetrievalDataset(BaseInformationRetrievalDataset): - def __init__( - self, - tokenizer: TokenizerSpec, - passages: str, - queries: str, - query_to_passages: str, - max_query_length: Optional[int] = 31, - max_passage_length: Optional[int] = 190, - num_negatives: Optional[int] = 10, - preprocess_fn: Optional[str] = "preprocess_bert", - psg_cache_format: Optional[str] = "npz", - ): - """ - Dataset for training information retrieval models. 
- - Args: - tokenizer: tokenizer - passages: path to tsv with [psg_id, psg_text] entries - queries: path to tsv with [query_id, query_text] entries - query_to_passages: path to tsv with - [query_id, pos_psg_id, neg_psg_id_1, ..., neg_psg_id_k] entries - max_query_length: maximum length of query in tokens - max_passage_length: maximum length of passage in tokens - num_negatives: number of negative passages per positive to use for training - preprocess_fn: either preprocess_bert or preprocess_dpr - preprocess_bert: joint input: [CLS] query [SEP] passage [SEP] - preprocess_dpr: separate inputs: [CLS] query [SEP], [CLS] passage [SEP] - psg_cache_format: either pkl or npz - """ - - super().__init__(tokenizer, max_query_length, max_passage_length) - self.num_negatives = num_negatives - - self.passages = getattr(self, f"parse_{psg_cache_format}")(passages, max_passage_length) - self._psgid2tokens = getattr(self, f"psgid2tokens_{psg_cache_format}") - self.queries = self.parse_pkl(queries, max_query_length) - self.idx2psgs = self.parse_query_to_passages(query_to_passages) - self._preprocess_fn = getattr(self, preprocess_fn) - - def __getitem__(self, idx): - query_and_psgs = self.idx2psgs[idx] - query_id, psg_ids = query_and_psgs[0], query_and_psgs[1:] - inputs = self._preprocess_fn(query_id, psg_ids) - return [*inputs, query_id, np.array(psg_ids)] - - def __len__(self): - return len(self.idx2psgs) - - def parse_query_to_passages(self, file): - """ - Function which parses query to passages correspondence file. - """ - idx2psgs = {} - idx = 0 - for line in open(file, "r").readlines(): - if "\t" not in line: - raise ValueError(f"Provided dataset does not have a form of tsv file") - query_and_psgs = line.split("\t") - query_and_psgs_ids = [int(id_) for id_ in query_and_psgs] - query_and_rel_psg_ids, irrel_psgs_ids = query_and_psgs_ids[:2], query_and_psgs_ids[2:] - random.shuffle(irrel_psgs_ids) - num_samples = len(irrel_psgs_ids) // self.num_negatives - for j in range(num_samples): - left = self.num_negatives * j - right = self.num_negatives * (j + 1) - idx2psgs[idx] = query_and_rel_psg_ids + irrel_psgs_ids[left:right] - idx += 1 - return idx2psgs diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/__init__.py deleted file mode 100644 index 3e1782e02e4fb3c15c85333e83503b8d0c4abcd4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
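Editor's note: the deleted parse_query_to_passages groups many negatives per (query, positive) pair into fixed-size training examples. Below is a small self-contained sketch of that grouping; the ids in the example line are invented.

# Sketch: one tsv line "qid \t pos \t neg1 ... negk" -> several [qid, pos, negs...] examples.
import random

def group_negatives(line: str, num_negatives: int):
    ids = [int(x) for x in line.split("\t")]
    query_and_pos, negatives = ids[:2], ids[2:]
    random.shuffle(negatives)                      # randomize which negatives share an example
    examples = []
    for j in range(len(negatives) // num_negatives):
        chunk = negatives[num_negatives * j : num_negatives * (j + 1)]
        examples.append(query_and_pos + chunk)
    return examples

random.seed(0)
line = "42\t7\t100\t101\t102\t103\t104\t105"
for ex in group_negatives(line, num_negatives=3):
    print(ex)  # two examples, each [42, 7, n1, n2, n3]

Leftover negatives that do not fill a complete group of num_negatives are simply dropped, matching the integer division in the deleted code.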
- - -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import ( - IntentSlotClassificationDataset, - IntentSlotInferenceDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_descriptor import ( - IntentSlotDataDesc, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_dataset import ( - MultiLabelIntentSlotClassificationDataset, -) -from nemo.collections.nlp.data.intent_slot_classification.multi_label_intent_slot_classification_descriptor import ( - MultiLabelIntentSlotDataDesc, -) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py deleted file mode 100644 index cf0081f7bd834a9db06c0d9a092e35716843ced7..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_dataset.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional - -import numpy as np - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils import get_stats -from nemo.core.classes import Dataset -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType -from nemo.utils import logging - -__all__ = ['IntentSlotClassificationDataset', 'IntentSlotInferenceDataset'] - - -def get_features( - queries, - max_seq_length, - tokenizer, - pad_label=128, - raw_slots=None, - ignore_extra_tokens=False, - ignore_start_end=False, -): - all_subtokens = [] - all_loss_mask = [] - all_subtokens_mask = [] - all_segment_ids = [] - all_input_ids = [] - all_input_mask = [] - sent_lengths = [] - all_slots = [] - - with_label = False - if raw_slots is not None: - with_label = True - - for i, query in enumerate(queries): - words = query.strip().split() - subtokens = [tokenizer.cls_token] - loss_mask = [1 - ignore_start_end] - subtokens_mask = [0] - if with_label: - slots = [pad_label] - - for j, word in enumerate(words): - word_tokens = tokenizer.text_to_tokens(word) - - # to handle emojis that could be neglected during tokenization - if len(word.strip()) > 0 and len(word_tokens) == 0: - word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)] - - subtokens.extend(word_tokens) - - loss_mask.append(1) - loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1)) - - subtokens_mask.append(1) - subtokens_mask.extend([0] * (len(word_tokens) - 1)) - - if with_label: - slots.extend([raw_slots[i][j]] * len(word_tokens)) - - subtokens.append(tokenizer.sep_token) - loss_mask.append(1 - ignore_start_end) - subtokens_mask.append(0) - sent_lengths.append(len(subtokens)) - all_subtokens.append(subtokens) - all_loss_mask.append(loss_mask) - all_subtokens_mask.append(subtokens_mask) - all_input_mask.append([1] * len(subtokens)) - if with_label: - slots.append(pad_label) - all_slots.append(slots) - - max_seq_length_data = max(sent_lengths) - max_seq_length = min(max_seq_length, max_seq_length_data) if max_seq_length > 0 else max_seq_length_data - logging.info(f'Setting max length to: {max_seq_length}') - get_stats(sent_lengths) - too_long_count = 0 - - for i, subtokens in enumerate(all_subtokens): - if len(subtokens) > max_seq_length: - subtokens = [tokenizer.cls_token] + subtokens[-max_seq_length + 1 :] - all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :] - all_loss_mask[i] = [1 - ignore_start_end] + all_loss_mask[i][-max_seq_length + 1 :] - all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :] - - if with_label: - all_slots[i] = [pad_label] + all_slots[i][-max_seq_length + 1 :] - too_long_count += 1 - - all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens]) - - if len(subtokens) < max_seq_length: - extra = max_seq_length - len(subtokens) - all_input_ids[i] = all_input_ids[i] + [0] * extra - all_loss_mask[i] = all_loss_mask[i] + [0] * extra - all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra - all_input_mask[i] = all_input_mask[i] + [0] * extra - - if with_label: - all_slots[i] = all_slots[i] + [pad_label] * extra - - all_segment_ids.append([0] * max_seq_length) - - logging.info(f'{too_long_count} are longer than {max_seq_length}') - - # May be useful for debugging - logging.debug("*** Some Examples of Processed Data ***") - for i in range(min(len(all_input_ids), 5)): - logging.debug("i: %s" % (i)) - logging.debug("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - 
logging.debug("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - logging.debug("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - logging.debug("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) - if with_label: - logging.debug("slots_label: %s" % " ".join(list(map(str, all_slots[i])))) - - return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) - - -class IntentSlotClassificationDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - For dataset to use during inference without labels, see - IntentSlotDataset. - - Args: - input_file: file to sequence + label. the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - slot_file: file to slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': NeuralType(('B'), LabelsType()), - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - query = ' '.join(parts[:-1]) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, 
idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - self.all_intents[idx], - np.array(self.all_slots[idx]), - ) - - -class IntentSlotInferenceDataset(Dataset): - """ - Creates dataset to use for the task of joint intent - and slot classification with pretrained model. - This is to be used during inference only. - It uses list of queries as the input. - - Args: - queries (list): list of queries to run inference on - max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP] - tokenizer (Tokenizer): such as NemoBertTokenizer - pad_label (int): pad value use for slot labels. - by default, it's the neutral label. - - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """ - Returns definitions of module output ports. - """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - } - - def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): - if do_lower_case: - for idx, query in enumerate(queries): - queries[idx] = queries[idx].lower() - - features = get_features(queries, max_seq_length, tokenizer) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - - def __len__(self): - return len(self.all_input_ids) - - def __getitem__(self, idx): - return ( - np.array(self.all_input_ids[idx]), - np.array(self.all_segment_ids[idx]), - np.array(self.all_input_mask[idx], dtype=np.long), - np.array(self.all_loss_mask[idx]), - np.array(self.all_subtokens_mask[idx]), - ) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py deleted file mode 100644 index 544b5e1db8586436710c48b8e51acb14d24fd13d..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/intent_slot_classification_descriptor.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_label_stats, - if_exist, -) -from nemo.utils import logging - - -class IntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - IntentSlotDataDesc. - - By default, the None label for slots is 'O'. 
- - IntentSlotDataDesc requires two files: - - input_file: file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ['train', 'test', 'dev'], - none_slot_label: str = 'O', - pad_label: int = -1, - ): - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - - self.intents_label_ids = IntentSlotDataDesc.label2idx(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = IntentSlotDataDesc.label2idx(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f'{mode}.tsv']): - logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - logging.info(f' Stats calculating for {mode} mode...') - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - - logging.info(f'Three most popular intents in {mode} mode:') - total_intents, intent_label_freq, max_id = get_label_stats( - raw_intents, infold + f'/{mode}_intent_stats.tsv' - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f'Three most popular slots in {mode} mode:') - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - logging.info(f'Total Number of Intents: {total_intents}') - logging.info(f'Intent Label Frequencies: {intent_label_freq}') - logging.info(f'Total Number of Slots: {slots_total}') - logging.info(f'Slots Label Frequencies: {slots_label_freq}') - - if mode == 'train': - intent_weights_dict = get_freq_weights(intent_label_freq) - logging.info(f'Intent Weights: {intent_weights_dict}') - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f'Slot Weights: {slot_weights_dict}') - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = self.slots_label_ids[none_slot_label] - - @staticmethod - def label2idx(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - @staticmethod - def intent_slot_dicts(data_dir): - ''' - Return Intent and slot dictionaries - ''' - intent_dict_file = data_dir + '/dict.intents.csv' - slot_dict_file = data_dir + '/dict.slots.csv' - - intents_labels = open(intent_dict_file, 'r').readlines() - intents_labels = [line.strip() for line in intents_labels if line.strip()] - - slots_labels = open(slot_dict_file, 'r').readlines() - slots_labels = [line.strip() for line in slots_labels if line.strip()] - - return intents_labels, slots_labels diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py deleted file mode 100644 index 32a72d10719326583cac683604006c280d4f97b1..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_dataset.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and -# The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, Optional - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.intent_slot_classification import IntentSlotClassificationDataset -from nemo.collections.nlp.data.intent_slot_classification.intent_slot_classification_dataset import get_features -from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType - -__all__ = ['MultiLabelIntentSlotClassificationDataset'] - - -class MultiLabelIntentSlotClassificationDataset(IntentSlotClassificationDataset): - """ - Creates dataset to use for the task of multi-label joint intent - and slot classification with pretrained model. - - Converts from raw data to an instance that can be used by - NMDataLayer. - - Args: - input_file: file containing sentences + labels. The first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label can be multiple labels separated by a comma - slot_file: file containing slot labels, each line corresponding to slot labels for a sentence in input_file. No header. - num_intents: total number of intents in dict.intents file - max_seq_length: max sequence length minus 2 for [CLS] and [SEP] - tokenizer: such as NemoBertTokenizer - num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing. - pad_label: pad value use for slot labels. by default, it's the neutral label. - ignore_extra_tokens: whether to ignore extra tokens in the loss_mask. - ignore_start_end: whether to ignore bos and eos tokens in the loss_mask. - do_lower_case: convert query to lower case or not - """ - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ - return { - 'input_ids': NeuralType(('B', 'T'), ChannelType()), - 'segment_ids': NeuralType(('B', 'T'), ChannelType()), - 'input_mask': NeuralType(('B', 'T'), MaskType()), - 'loss_mask': NeuralType(('B', 'T'), MaskType()), - 'subtokens_mask': NeuralType(('B', 'T'), MaskType()), - 'intent_labels': [NeuralType(('B'), LabelsType())], - 'slot_labels': NeuralType(('B', 'T'), LabelsType()), - } - - def __init__( - self, - input_file: str, - slot_file: str, - num_intents: int, - max_seq_length: int, - tokenizer: TokenizerSpec, - num_samples: int = -1, - pad_label: int = 128, - ignore_extra_tokens: bool = False, - ignore_start_end: bool = False, - do_lower_case: bool = False, - ): - if num_samples == 0: - raise ValueError("num_samples has to be positive", num_samples) - - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] - - assert len(slot_lines) == len(input_lines) - - dataset = list(zip(slot_lines, input_lines)) - - if num_samples > 0: - dataset = dataset[:num_samples] - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - raw_slots.append([int(slot) for slot in slot_line.strip().split()]) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(num_intents)] - raw_intents.append(tuple(parts)) - tokens = input_line.strip().split("\t")[0].split() - query = ' '.join(tokens) - if do_lower_case: - query = query.lower() - queries.append(query) - - features = get_features( - queries, - max_seq_length, - tokenizer, - pad_label=pad_label, - raw_slots=raw_slots, - ignore_extra_tokens=ignore_extra_tokens, - ignore_start_end=ignore_start_end, - ) - - self.all_input_ids = features[0] - self.all_segment_ids = features[1] - self.all_input_mask = features[2] - self.all_loss_mask = features[3] - self.all_subtokens_mask = features[4] - self.all_slots = features[5] - self.all_intents = raw_intents diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py deleted file mode 100644 index ddde1a2896de499d6bcc5cc5ada763daef3ad8be..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/intent_slot_classification/multi_label_intent_slot_classification_descriptor.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import itertools -from typing import List - -from nemo.collections.nlp.data.data_utils.data_preprocessing import ( - fill_class_weights, - get_freq_weights, - get_freq_weights_bce_with_logits_loss, - get_label_stats, - get_labels_to_labels_id_mapping, - get_multi_label_stats, - if_exist, -) -from nemo.utils import logging - - -class MultiLabelIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - MultiLabelIntentSlotDataDesc. - - By default, the None label for slots is 'O'. - - MultiLabelIntentSlotDataDesc requires two files: - - input_file: file containing sentences + labels. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] where label is a string of comma separated values. - Example: 1 or 1,2 are both valid labels - - slot_file: file containing slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir: the directory of the dataset - modes: ['train', 'test', 'dev'], - none_slot_label: the label for slots that aren't identified defaulted to 'O' - pad_label: the int used for padding. If set to -1, it'll be set to the whatever the None label is. - """ - - def __init__( - self, - data_dir: str, - modes: List[str] = ["train", "test", "dev"], - none_slot_label: str = "O", - pad_label: int = -1, - ): - if not if_exist(data_dir, ["dict.intents.csv", "dict.slots.csv"]): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by MultiLabelIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." - ) - - self.data_dir = data_dir - self.intent_dict_file = self.data_dir + "/dict.intents.csv" - self.slot_dict_file = self.data_dir + "/dict.slots.csv" - - self.intents_label_ids = get_labels_to_labels_id_mapping(self.intent_dict_file) - self.num_intents = len(self.intents_label_ids) - self.slots_label_ids = get_labels_to_labels_id_mapping(self.slot_dict_file) - self.num_slots = len(self.slots_label_ids) - - infold = self.data_dir - for mode in modes: - if not if_exist(self.data_dir, [f"{mode}.tsv"]): - logging.info(f" Stats calculation for {mode} mode" f" is skipped as {mode}.tsv was not found.") - continue - logging.info(f" Stats calculating for {mode} mode...") - slot_file = f"{self.data_dir}/{mode}_slots.tsv" - with open(slot_file, "r") as f: - slot_lines = f.readlines() - - input_file = f"{self.data_dir}/{mode}.tsv" - with open(input_file, "r") as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." 
- ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, raw_intents = [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split("\t")[1:][0] - parts = list(map(int, parts.split(","))) - parts = [1 if label in parts else 0 for label in range(self.num_intents)] - raw_intents.append(tuple(parts)) - - logging.info(f"Three most popular intents in {mode} mode:") - total_intents, intent_label_freq, max_id = get_multi_label_stats( - raw_intents, infold + f"/{mode}_intent_stats.tsv" - ) - - merged_slots = itertools.chain.from_iterable(raw_slots) - logging.info(f"Three most popular slots in {mode} mode:") - slots_total, slots_label_freq, max_id = get_label_stats(merged_slots, infold + f"/{mode}_slot_stats.tsv") - - logging.info(f"Total Number of Intent Labels: {total_intents}") - logging.info(f"Intent Label Frequencies: {intent_label_freq}") - logging.info(f"Total Number of Slots: {slots_total}") - logging.info(f"Slots Label Frequencies: {slots_label_freq}") - - if mode == "train": - intent_weights_dict = get_freq_weights_bce_with_logits_loss(intent_label_freq) - logging.info(f"Intent Weights: {intent_weights_dict}") - slot_weights_dict = get_freq_weights(slots_label_freq) - logging.info(f"Slot Weights: {slot_weights_dict}") - - self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1) - self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1) - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in self.slots_label_ids: - raise ValueError(f"none_slot_label {none_slot_label} not " f"found in {self.slot_dict_file}.") - self.pad_label = self.slots_label_ids[none_slot_label] diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/__init__.py deleted file mode 100644 index 16831be8574136062e3606034733f180574767c3..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
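During the train pass the descriptor derives per-class weights from the frequency dictionaries via get_freq_weights and fill_class_weights, whose implementations live in data_utils.data_preprocessing and are not shown here. The snippet below is only a rough inverse-frequency sketch of the general idea, with hypothetical counts, not the library's exact formula.

def inverse_freq_weights(label_freq: dict, max_label_id: int):
    # Assumed scheme: weight = total_count / (num_classes * class_count);
    # unseen class ids up to max_label_id default to a weight of 1.0.
    total = sum(label_freq.values())
    num_classes = len(label_freq)
    weights = [1.0] * (max_label_id + 1)
    for label, count in label_freq.items():
        weights[label] = total / (num_classes * count)
    return weights

print(inverse_freq_weights({0: 80, 1: 15, 2: 5}, max_label_id=3))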
- -from nemo.collections.nlp.data.language_modeling.l2r_lm_dataset import L2RLanguageModelingDataset -from nemo.collections.nlp.data.language_modeling.lm_bert_dataset import ( - BertPretrainingDataset, - BertPretrainingPreprocessedDataloader, -) -from nemo.collections.nlp.data.language_modeling.sentence_dataset import SentenceDataset, TarredSentenceDataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/l2r_lm_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/l2r_lm_dataset.py deleted file mode 100644 index adb6f126cd78c16166e7f6ee05af822227715ec4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/l2r_lm_dataset.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import json -from typing import Optional - -import braceexpand -import numpy as np -import webdataset as wd -from torch.utils.data import Dataset, IterableDataset - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.data_utils import dataset_to_ids -from nemo.utils import logging - -__all__ = ['L2RLanguageModelingDataset', 'TarredL2RLanguageModelingDataset'] - - -class L2RLanguageModelingDataset(Dataset): - """ - Dataset for training and evaluating left-to-right language models. - - Args: - tokenizer: tokenizer, such as WordTokenizer or CharTokenizer - dataset: path to data - max_seq_length: maximum sequence length (in tokens) of input tensors - batch_step: distance (in tokens) between two successive sequences of - the text. By default, it is equal to max_seq_length which corresponds - to splitting text into disjoint segments covering full dataset - use_cache: bool value, defaults to False. Determines whether the preprocessed, - tokenized dataset should be cached into a pickle file. If true, cache is saved - at the path provided in `dataset`. - """ - - def __init__( - self, - tokenizer: TokenizerSpec, - dataset: str, - max_seq_length: Optional[int] = 512, - batch_step: Optional[int] = None, - use_cache: bool = False, - ): - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - self.batch_step = batch_step or self.max_seq_length - ids = dataset_to_ids(dataset, tokenizer, cache_ids=use_cache, add_bos_eos=False) - self.ids = np.array([j for i in ids for j in i]) - - def __len__(self): - return (len(self.ids) - self.max_seq_length) // self.batch_step - - def __getitem__(self, idx): - left = idx * self.batch_step - right = left + self.max_seq_length - src_ids = self.ids[left:right] - labels = self.ids[left + 1 : right + 1] - src_mask = (src_ids != self.tokenizer.pad_id).astype(np.float32) - return src_ids, src_mask, labels - - -class TarredL2RLanguageModelingDataset(IterableDataset): - """ - A similar Dataset to the L2RLanguageModelingDataset, but which loads tarred tokenized numpy files. 
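L2RLanguageModelingDataset above flattens the corpus into a single token stream and serves windows of max_seq_length tokens spaced batch_step apart, with labels shifted one position to the right. A toy reproduction of that indexing, using a dummy integer stream and assuming a pad id of 0:

import numpy as np

ids = np.arange(1, 21)                    # stand-in token stream: 1..20
max_seq_length, batch_step = 8, 4
num_windows = (len(ids) - max_seq_length) // batch_step

for idx in range(num_windows):
    left = idx * batch_step
    right = left + max_seq_length
    src_ids = ids[left:right]
    labels = ids[left + 1 : right + 1]    # next-token targets
    src_mask = (src_ids != 0).astype(np.float32)
    print(src_ids, labels, src_mask)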
- Accepts a single JSON metadata manifest file as well as the path(s) to the tarball(s) containing the wav files. - The manifest should contain information such as the number of shards, the number of tokens in the corpus, - and the number of tokens contained within each shard of the tarfile(s). - - Valid formats for the text_tar_filepaths argument include: - (1) a single string that can be brace-expanded, e.g. 'path/to/text.tar' or 'path/to/text_{1..100}.tar.gz', or - (2) a list of file paths that will not be brace-expanded, e.g. ['text_1.tar', 'text_2.tar', ...]. - - Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference. - This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements. - Supported opening braces - { <=> (, [, < and the special tag _OP_. - Supported closing braces - } <=> ), ], > and the special tag _CL_. - For SLURM based tasks, we suggest the use of the special tags for ease of use. - See the WebDataset documentation for more information about accepted data and input formats. - - If using multiple processes the number of shards should be divisible by the number of workers to ensure an - even split among workers. If it is not divisible, logging will give a warning but training will proceed. - - Additionally, please note that the len() of this DataLayer is assumed to be the number of tokens - of the text data. An incorrect manifest length may lead to some DataLoader issues down the line. - - Args: - text_tar_filepaths: Either a list of tokenized text tarball filepaths, or a - string (can be brace-expandable). - metadata_path (str): Path to the metadata manifest. - tokenizer: tokenizer, such as WordTokenizer or CharTokenizer - dataset: path to data - max_seq_length: maximum sequence length (in tokens) of input tensors - batch_step: distance (in tokens) between two successive sequences of - the text. By default, it is equal to max_seq_length which corresponds - to splitting text into disjoint segments covering full dataset - shuffle_n (int): How many samples to look ahead and load to be shuffled. - See WebDataset documentation for more details. - Defaults to 0. - shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp. - - `scatter`: The default shard strategy applied by WebDataset, where each node gets - a unique set of shards, which are permanently pre-allocated and never changed at runtime. - - `replicate`: Optional shard strategy, where each node gets all of the set of shards - available in the tarred dataset, which are permanently pre-allocated and never changed at runtime. - The benefit of replication is that it allows each node to sample data points from the entire - dataset independently of other nodes, and reduces dependence on value of `shuffle_n`. - - .. warning:: - Replicated strategy allows every node to sample the entire set of available tarfiles, - and therefore more than one node may sample the same tarfile, and even sample the same - data points! As such, there is no assured guarantee that all samples in the dataset will be - sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific - occasions (when the number of shards is not divisible with ``world_size``), will not sample - the entire dataset. For these reasons it is not advisable to use tarred datasets as validation - or test datasets. - global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. 
- world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. - """ - - def __init__( - self, - text_tar_filepaths: str, - metadata_path: str, - tokenizer, - max_seq_length: int = 512, - batch_step: int = None, - shuffle_n: int = 1, - shard_strategy: str = "scatter", - global_rank: int = 0, - world_size: int = 0, - ): - super(TarredL2RLanguageModelingDataset, self).__init__() - - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - self.batch_step = batch_step or self.max_seq_length - - valid_shard_strategies = ['scatter', 'replicate'] - if shard_strategy not in valid_shard_strategies: - raise ValueError( - f"Invalid shard strategy of type {type(shard_strategy)} " - f"{repr(shard_strategy) if len(repr(shard_strategy)) < 100 else repr(shard_strategy)[:100] + '...'}! " - f"Allowed values are: {valid_shard_strategies}." - ) - - with open(metadata_path, 'r') as f: - metadata = json.load(f) - - self.metadata = metadata - - if isinstance(text_tar_filepaths, str): - # Replace '(', '[', '<' and '_OP_' with '{' - brace_keys_open = ['(', '[', '<', '_OP_'] - for bkey in brace_keys_open: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "{") - - # Replace ')', ']', '>' and '_CL_' with '}' - brace_keys_close = [')', ']', '>', '_CL_'] - for bkey in brace_keys_close: - if bkey in text_tar_filepaths: - text_tar_filepaths = text_tar_filepaths.replace(bkey, "}") - - if isinstance(text_tar_filepaths, str): - # Brace expand - text_tar_filepaths = list(braceexpand.braceexpand(text_tar_filepaths)) - - if shard_strategy == 'scatter': - logging.info("All tarred dataset shards will be scattered evenly across all nodes.") - - if len(text_tar_filepaths) % world_size != 0: - logging.warning( - f"Number of shards in tarred dataset ({len(text_tar_filepaths)}) is not divisible " - f"by number of distributed workers ({world_size})." - ) - - begin_idx = (len(text_tar_filepaths) // world_size) * global_rank - end_idx = begin_idx + (len(text_tar_filepaths) // world_size) - text_tar_filepaths = text_tar_filepaths[begin_idx:end_idx] - logging.info( - "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx - ) - - elif shard_strategy == 'replicate': - logging.info("All tarred dataset shards will be replicated across all nodes.") - - else: - raise ValueError(f"Invalid shard strategy ! 
Allowed values are : {valid_shard_strategies}") - - self.tarpath = text_tar_filepaths - - # Put together WebDataset - self._dataset = wd.WebDataset(urls=text_tar_filepaths, nodesplitter=None) - - if shuffle_n > 0: - self._dataset = self._dataset.shuffle(shuffle_n) - else: - logging.info("WebDataset will not shuffle files within the tar files.") - - self._dataset = self._dataset.rename(npy='npy', key='__key__').to_tuple('npy', 'key').map(f=self._build_sample) - - def _build_sample(self, tup): - # Load file - npy, filepath = tup - npy = io.BytesIO(npy) - data = np.load(npy) # loads np.int64 vector - npy.close() - - # Select random contiguous subsegment - idx = np.random.randint(0, (len(data) - self.max_seq_length) // self.batch_step) - - # Slice of data chunk - left = idx * self.batch_step - right = left + self.max_seq_length - data = data[left : right + 1] - - # Create batch - src_ids = data[:-1] - labels = data[1:] - src_mask = (src_ids != self.tokenizer.pad_id).astype(np.float32) - return src_ids, src_mask, labels - - def __iter__(self): - # We need to wrap an infinite generator since the actual files - # within the tar files contains large chunks of contiguous data. - # This prevents PTL from early exiting the train loop after exhausting - # all of the files in one iteration (though the actual dataset is many - # times larger due to each file containing a large chunk of data). - dl_iter = iter(self._dataset) - while True: - try: - batch = next(dl_iter) - yield batch - except StopIteration: - dl_iter = iter(self._dataset) - continue - - def __len__(self): - return (self.metadata['num_text'] - self.max_seq_length) // self.batch_step diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py deleted file mode 100644 index 4196f9736da10e808c49d00868366778257e45fc..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/lm_bert_dataset.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
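Before building the WebDataset, the tarred __init__ above rewrites the SLURM-safe brace tags (_OP_, _CL_ and the other accepted opening/closing characters) into real braces, brace-expands the path, and, under the default 'scatter' strategy, gives each rank a contiguous slice of the shards. A condensed sketch of those two steps with a hypothetical shard pattern (it relies on the braceexpand package already imported above):

import braceexpand

path = "shards/text__OP_1..6_CL_.tar"          # hypothetical SLURM-safe pattern
for src, dst in [("_OP_", "{"), ("_CL_", "}")]:
    path = path.replace(src, dst)
shards = list(braceexpand.braceexpand(path))   # shards/text_1.tar ... shards/text_6.tar

world_size, global_rank = 2, 1
per_rank = len(shards) // world_size
begin = per_rank * global_rank
print(shards[begin : begin + per_rank])        # this rank's slice of shards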
- -import array -import os -import pickle -import random -from typing import Dict, List, Optional - -import h5py -import numpy as np -from torch.utils.data import DataLoader, DistributedSampler -from tqdm import tqdm - -from nemo.collections.nlp.data.data_utils.data_preprocessing import find_newlines, load_data_indices -from nemo.core.classes import Dataset - -__all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataloader'] - - -def load_h5(input_file: str): - return h5py.File(input_file, "r") - - -class BertPretrainingDataset(Dataset): - """ - Dataset for bert pretraining when using data preprocessing including tokenization - """ - - def __init__( - self, - tokenizer: object, - data_file: str, - max_seq_length: Optional[int] = 128, - mask_prob: Optional[float] = 0.15, - short_seq_prob: Optional[float] = 0.1, - seq_a_ratio: Optional[float] = 0.6, - sentence_idx_file: Optional[str] = None, - ): - """ - Args: - tokenizer: tokenizer - data_file: path to data - max_seq_length: maximum sequence length of input tensors - mask_probability: proability to mask token - short_seq_prob: probability to create a sequence shorter than max_seq_length - seq_a_ratio: ratio between lengths of first and second sequence - sentence_idx_file: sentence indices file for caching - """ - self.tokenizer = tokenizer - - # Loading enormous datasets into RAM isn't always feasible -- for - # example, the pubmed corpus is 200+ GB, which doesn't fit into RAM on - # most computers. To get around this, we store the indices of newlines - # in each file so we can seek to and retrieve sentences immediately - # from main memory when needed during training. - - # Try and load sentence indices file if already exists - sentence_indices, sentence_idx_file, data_dir = load_data_indices( - sentence_idx_file, data_file, "sentence_indices" - ) - - # If sentence indices file doesn't exists, generate and store sentence indices - if sentence_indices is None: - sentence_indices = {} - filenames = [data_file] - - for filename in tqdm(filenames): - with open(filename, "rb") as f: - contents = f.read() - newline_indices = find_newlines(contents) - - if os.path.isdir(data_dir): - # Only keep the parts of the filepath that are invariant to - # the dataset's location on disk - filename = os.path.basename(filename) - - # In python, arrays are much more space-efficient than lists - sentence_indices[filename] = array.array("I", newline_indices) - - # Save sentence indices so we don't have to do this again - with open(sentence_idx_file, "wb") as f: - pickle.dump(sentence_indices, f) - - corpus_size = 0 - empty_files = [] - - # Find total number of newlines across entire corpus and remove files - # without any newlines - for filename in sentence_indices: - if len(sentence_indices[filename]) <= 1: - empty_files.append(filename) - else: - corpus_size += len(sentence_indices[filename]) - - for filename in empty_files: - del sentence_indices[filename] - - self.corpus_size = corpus_size - self.dataset = data_dir - self.filenames = list(sentence_indices.keys()) - self.mask_probability = mask_prob - self.max_seq_length = max_seq_length - self.sentence_indices = sentence_indices - self.vocab_size = self.tokenizer.vocab_size - self.short_seq_prob = short_seq_prob - self.seq_a_ratio = seq_a_ratio - - def __len__(self): - return self.corpus_size - - def __getitem__(self, idx: int, min_doc_length: Optional[int] = 16): - # Each sequence has three special tokens, as follows: - # tokenizer.cls_token tokenizer.sep_token tokenizer.eos_token - 
num_special_tokens = 3 - - max_num_tokens = self.max_seq_length - num_special_tokens - target_seq_length = max_num_tokens - if random.random() < self.short_seq_prob: - # TODO: maybe introduce an argument to control this. - target_seq_length = random.randint(2, max_num_tokens) - - # prefer the seq_a to be slightly longer than seq_b, 0.6 by default - target_seq_length_a = int(round(target_seq_length * self.seq_a_ratio)) - target_seq_length_b = target_seq_length - target_seq_length_a - - def get_document(filepath, offset): - # Retrieve a specific line from a file and return as a document - if os.path.isdir(self.dataset): - filepath = os.path.join(self.dataset, filepath) - - with open(filepath, "rb") as f: - f.seek(offset) - doc_text = f.readline()[:-1].decode("utf-8", errors="ignore") - document = self.tokenizer.text_to_ids(doc_text) - - return document - - def match_target_seq_length( - document: str, target_seq_length: int, filename: str, line_idx: int, sentence_indices: Dict[str, dict] - ): - # If document is shorter than target sequence length, - # append the next line or take a random line as replacement. - num_lines = len(sentence_indices[filename]) - - while len(document) < target_seq_length: - if line_idx < (num_lines - 1): - # append the next line - line_idx += 1 - else: - # current line is the last line, take a random one - line_idx = random.randrange(num_lines) - document = [] - - offset = sentence_indices[filename][line_idx] - document += get_document(filename, offset) - - return document, line_idx - - # Take sequence A from a random file and a random line - a_filename = random.choice(self.filenames) - a_line_idx = random.randrange(len(self.sentence_indices[a_filename])) - a_line_offset = self.sentence_indices[a_filename][a_line_idx] - a_document = get_document(a_filename, a_line_offset) - a_document, a_line_idx = match_target_seq_length( - a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices - ) - - is_last_line = a_line_idx >= (len(self.sentence_indices[a_filename]) - 1) - # About 50% of the time, B is a random sentence from the corpus - take_random_b = (random.random() < 0.5) or is_last_line - - if take_random_b: - # This should rarely go for more than one iteration for large - # corpora. However, just to be careful, we try to make sure that - # the random document is not the same as the document - # we're processing. 
- for _ in range(10): - b_filename = random.choice(self.filenames) - b_line_idx = random.choice(range(len(self.sentence_indices[b_filename]))) - if b_filename != a_filename: - break - else: - # Take another line from the same file - b_line_pos = self.sentence_indices[b_filename][b_line_idx] - a_line_pos = self.sentence_indices[a_filename][a_line_idx] - # TODO unclear about the following check - if abs(b_line_pos - a_line_pos) > max_num_tokens: - break - else: - pass - else: - b_filename = a_filename - b_line_idx = a_line_idx + 1 - - is_next = int(not take_random_b) - b_line_pos = self.sentence_indices[b_filename][b_line_idx] - b_document = get_document(b_filename, b_line_pos) - b_document, b_line_idx = match_target_seq_length( - b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices - ) - - def truncate_seq_pair(a, b, max_num_tokens): - # Truncates a pair of sequences to a maximum sequence length - while (len(a) + len(b)) > max_num_tokens: - # Truncate the longer sequence - if len(a) > len(b): - trunc_document = a - else: - trunc_document = b - - if len(trunc_document) <= 1: - raise ValueError( - "Input text corpora probably too small. " - "Failed to truncate sequence pair to " - "maximum sequence legnth." - ) - - # Randomly truncate from the front or the back - if random.random() < 0.5: - del trunc_document[0] - else: - trunc_document.pop() - - truncate_seq_pair(a_document, b_document, max_num_tokens) - - output_ids = ( - [self.tokenizer.cls_id] + a_document + [self.tokenizer.sep_id] + b_document + [self.tokenizer.eos_id] - ) - - input_ids, output_mask = self.mask_ids(output_ids) - - input_mask = np.zeros(self.max_seq_length, dtype=np.long) - input_mask[: len(input_ids)] = 1 - - input_type_ids = np.zeros(self.max_seq_length, dtype=np.int) - input_type_ids[len(a_document) + 2 : len(output_ids) + 1] = 1 - - padding_length = max(0, self.max_seq_length - len(input_ids)) - if padding_length > 0: - input_ids.extend([self.tokenizer.pad_id] * padding_length) - output_ids.extend([self.tokenizer.pad_id] * padding_length) - output_mask.extend([0] * padding_length) - - # TODO: wrap the return value with () for consistent style. - return ( - np.array(input_ids), - input_type_ids, - np.array(input_mask, dtype=np.long), - np.array(output_ids), - np.array(output_mask, dtype=np.float32), - is_next, - ) - - def mask_ids(self, ids: List[int]): - """ - Args: - ids: list of token ids representing a chunk of text - Returns: - masked_ids: list of input tokens with some of the entries masked - according to the following protocol from the original BERT paper: - each token is masked with a probability of 15% and is replaced with - 1) the [MASK] token 80% of the time, - 2) random token 10% of the time, - 3) the same token 10% of the time. - output_mask: list of binary variables which indicate what tokens has - been masked (to calculate the loss function for these tokens only) - """ - - # Whole-word masking by default, as it gives better performance. 
- cand_indexes = [[ids[0]]] - for tid in ids[1:]: - token = self.tokenizer.ids_to_tokens([tid])[0] - is_suffix = token.startswith('\u2581') - if is_suffix: - # group together with its previous token to form a whole-word - cand_indexes[-1].append(tid) - else: - cand_indexes.append([tid]) - - masked_ids, output_mask = [], [] - mask_id = self.tokenizer.token_to_id("[MASK]") - - for word_ids in cand_indexes: - is_special = (word_ids[0] == self.tokenizer.cls_id) or (word_ids[0] == self.tokenizer.sep_id) - if is_special or (random.random() > self.mask_probability): - output_mask.extend([0] * len(word_ids)) - masked_ids.extend(word_ids) - else: - output_mask.extend([1] * len(word_ids)) - p = random.random() - # for 80%, replace with mask - if p < 0.8: - masked_ids.extend([mask_id] * len(word_ids)) - # for 10%, replace by a random token - elif p < 0.9: - for _ in word_ids: - # randomly select a valid word - random_word = random.randrange(self.vocab_size) - while random_word in (self.tokenizer.cls_id, self.tokenizer.sep_id): - random_word = random.randrange(self.vocab_size) - masked_ids.append(random_word) - # for 10%, use same token - else: - masked_ids.extend(word_ids) - - return masked_ids, output_mask - - -class BertPretrainingPreprocessedDataset(Dataset): - """ - Dataset for already preprocessed data. - """ - - def __init__(self, input_file: str, max_predictions_per_seq: int): - """ - Args: - input_file: data file in hdf5 format with preprocessed data in array format - max_predictions_per_seq: maximum number of masked tokens per sequence. Need to be consistent with data in input file. - """ - self.input_file = input_file - self.max_predictions_per_seq = max_predictions_per_seq - f = load_h5(input_file) - keys = [ - 'input_ids', - 'input_mask', - 'segment_ids', - 'masked_lm_positions', - 'masked_lm_ids', - 'next_sentence_labels', - ] - self.inputs = [np.asarray(f[key][:]) for key in keys] - f.close() - - def __len__(self): - 'Denotes the total number of samples' - return len(self.inputs[0]) - - def __getitem__(self, index: int): - [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ - input[index].astype(np.int64) for input in self.inputs - ] - - output_mask = np.zeros_like(input_ids) - output_ids = input_ids.copy() - - index = self.max_predictions_per_seq - padded_mask_indices = (masked_lm_positions == 0).nonzero() - if len(padded_mask_indices[0]) != 0: - index = padded_mask_indices[0][0] - - output_mask[masked_lm_positions[:index]] = 1.0 - output_ids[masked_lm_positions[:index]] = masked_lm_ids[:index] - - # input_mask = np.asarray(input_mask, dtype=np.float32) - # output_mask = np.asarray(output_mask, dtype=np.float32) - return (input_ids, segment_ids, input_mask, output_ids, output_mask, next_sentence_labels) - - -class BertPretrainingPreprocessedDataloader(DataLoader): - """ - Dataloader for already preprocessed data in hdf5 files that is already in the format expected by BERT model. - """ - - def __init__(self, data_files: List[str], max_predictions_per_seq: int, batch_size: int, seed: Optional[int] = 42): - """ - Args: - data_files: list of data files in hdf5 format with preprocessed data in array format - max_predictions_per_seq: maximum number of masked tokens per sequence. Need to be consistent with data in input file. 
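mask_ids above first groups sub-tokens into whole-word candidates and then applies the standard BERT corruption: a word is selected with probability mask_prob, and a selected word is replaced by [MASK] 80% of the time, by random tokens 10% of the time, and kept unchanged 10% of the time, with the loss mask set only on selected positions. A distilled sketch of just that per-word decision; mask_id=103 and vocab_size below are stand-in assumptions, not values taken from this file.

import random

def corrupt_word(word_ids, mask_id=103, vocab_size=30000, mask_prob=0.15):
    # Returns (possibly corrupted ids, per-token loss mask) for one whole word.
    if random.random() > mask_prob:
        return list(word_ids), [0] * len(word_ids)       # untouched, no loss
    p = random.random()
    if p < 0.8:                                          # 80%: replace with [MASK]
        new_ids = [mask_id] * len(word_ids)
    elif p < 0.9:                                        # 10%: replace with random tokens
        new_ids = [random.randrange(vocab_size) for _ in word_ids]
    else:                                                # 10%: keep the original tokens
        new_ids = list(word_ids)
    return new_ids, [1] * len(word_ids)                  # loss computed on these positions

print(corrupt_word([2023, 2003]))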
- batch_size: batch size per gpu per forward pass - seed: seed to ensure each gpu process opens the same data file in each iteration - """ - super().__init__(None, batch_size=batch_size) - self.random = random.Random(seed) - self.data_files = data_files - self.max_predictions_per_seq = max_predictions_per_seq - - # def __len__(self): - # return sum([len(load_h5(data_file)['input_ids']) for data_file in self.data_files])//(self.batch_size) - - def __iter__(self): - self.random.shuffle(self.data_files) - for data_file in self.data_files: - train_data = BertPretrainingPreprocessedDataset( - input_file=data_file, max_predictions_per_seq=self.max_predictions_per_seq - ) - train_sampler = DistributedSampler(train_data) - # print("---") - # print(os.getpid(), train_sampler.rank, train_sampler.num_replicas, train_sampler.num_samples) - # print("---") - train_dataloader = DataLoader( - dataset=train_data, sampler=train_sampler, batch_size=self.batch_size, shuffle=False, - ) - for x in train_dataloader: - yield x diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/Makefile b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/Makefile deleted file mode 100644 index 150939026443cbb832123fefd44a2e16b44042e4..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color -CPPFLAGS += $(shell python3 -m pybind11 --includes) -LIBNAME = helpers -LIBEXT = $(shell python3-config --extension-suffix) - -default: $(LIBNAME)$(LIBEXT) - -%$(LIBEXT): %.cpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/__init__.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/__init__.py deleted file mode 100644 index a45f68fdea3bfb3f5995103f04941161f62f10b6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections.nlp.data.language_modeling.megatron.bert_dataset import BertDataset -from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import GPTDataset -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import IndexedDataset, MMapIndexedDataset -from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import T5Dataset diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bart_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bart_dataset.py deleted file mode 100644 index b6a046ba85140d735fc57206279a1d985f6863eb..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bart_dataset.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""BART Style dataset.""" - -import numpy as np - -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping, -) -from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import T5Dataset - - -class BARTDataset(T5Dataset): - # account for added tokens - MAX_SEQ_LENGTH_DELTA = 2 - - def __init__( - self, - cfg, - trainer, - tokenizer, - name, - indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - max_seq_length, - seed, - masked_lm_prob=0.15, - short_seq_prob=0.1, - max_ngram_size=10, - mean_ngram_size=None, - geometric_dist=True, - permutation=False, - whole_word_masking=True, - favor_long_ngrams=False, - delete_mask_prob=0, - respect_document_boundaries=True, - documents=None, - ): - super().__init__( - cfg=cfg, - trainer=trainer, - tokenizer=tokenizer, - name=name, - indexed_dataset=indexed_dataset, - data_prefix=data_prefix, - num_epochs=num_epochs, - max_num_samples=max_num_samples, - max_seq_length=max_seq_length, - max_seq_length_dec=None, - seed=seed, - masked_lm_prob=masked_lm_prob, - short_seq_prob=short_seq_prob, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - respect_document_boundaries=respect_document_boundaries, - documents=documents, - ) - - # Params to store. - self.delete_mask_prob = delete_mask_prob - - def _build(self): - """ - Class-specific build method to be overridden by child classes. - """ - pass - - def __getitem__(self, idx): - np_rng = np.random.RandomState(seed=(self.seed + idx)) - - sample, seq_length = self._get_sample(idx) - - # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] - - # Truncate to `target_sequence_length`. - max_num_tokens = seq_length - tokens = tokens[:max_num_tokens] - - # Masking. 
- max_predictions_per_seq = self.masked_lm_prob * max_num_tokens - - lm_pred = create_masked_lm_predictions( - tokens=tokens, - vocab_id_list=self.vocab_id_list, - vocab_id_to_token_dict=self.vocab_id_to_token_dict, - masked_lm_prob=self.masked_lm_prob, - cls_id=self.cls_id, - sep_id=self.sep_id, - mask_id=self.mask_id, - max_predictions_per_seq=max_predictions_per_seq, - np_rng=np_rng, - max_ngram_size=self.max_ngram_size, - whole_word_masking=self.whole_word_masking, - favor_long_ngrams=self.favor_long_ngrams, - mean_ngram_size=self.mean_ngram_size, - permutation=self.permutation, - geometric_dist=self.geometric_dist, - masking_style="t5", - tokenizer_type=self.tokenizer_type, - ) - - if self.masked_lm_prob == 0: - (output_tokens, masked_positions, masked_labels, _) = lm_pred - masked_spans = None - else: - (output_tokens, masked_positions, masked_labels, _, masked_spans) = lm_pred - - # Padding. - tokens_enc, tokens_dec_in, labels, enc_mask, dec_mask, loss_mask = self.pad_and_convert_to_numpy( - tokens=tokens, - output_tokens=output_tokens, - masked_positions=masked_positions, - masked_labels=masked_labels, - masked_spans=masked_spans, - np_rng=np_rng, - ) - - train_sample = { - 'text_enc': tokens_enc, - 'text_dec': tokens_dec_in, - 'labels': labels, - 'loss_mask': loss_mask, - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - } - - return train_sample - - def pad_and_convert_to_numpy( - self, tokens, output_tokens, masked_positions, masked_labels, masked_spans=None, np_rng=None, - ): - """Pad sequences and convert them to numpy.""" - bart_decoder_in = [self.bos_id] + tokens - bart_decoder_out = tokens + [self.eos_id] - - if masked_spans is not None: - # construct bart input by collapsing multiple into one, and delete randomly - bart_input = [] - (start_index, end_index) = (0, None) - for span in masked_spans: - end_index = span.index[0] - bart_input.extend(output_tokens[start_index:end_index]) - # delete mask with probability delete_mask_prob - if np_rng.rand() >= self.delete_mask_prob: - bart_input.append(self.mask_id) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - # Add the remaining tokens to the BART input - bart_input.extend(output_tokens[start_index:]) - else: - bart_input = output_tokens - - # Some checks. - # Encoder-side padding mask. - num_tokens = len(bart_input) - padding_length = self.max_seq_length - num_tokens - assert padding_length >= 0 - assert len(masked_positions) == len(masked_labels) - - # Tokens.. - filler = [self.pad_id] * padding_length - tokens_enc = np.array(bart_input + filler, dtype=np.int64) - - # Decoder-side padding mask. - num_tokens_dec = len(bart_decoder_in) - padding_length_dec = self.max_seq_length - num_tokens_dec - assert padding_length_dec >= 0 - filler_dec = [self.pad_id] * padding_length_dec - tokens_dec_in = np.array(bart_decoder_in + filler_dec, dtype=np.int64) - - # Create attention masks - enc_mask = (tokens_enc != self.pad_id).astype(np.int64) - dec_mask = (tokens_dec_in != self.pad_id).astype(np.int64) - - # Labels mask. 
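The span-collapsing loop above is the core of the BART-style input construction: each masked span contributes at most one mask token, and with probability `delete_mask_prob` even that token is dropped. A self-contained sketch of just that step, with spans simplified to plain lists of positions instead of the `MaskedLmInstance` namedtuples used above, and `MASK_ID` as a placeholder id:

```
import numpy as np

MASK_ID = 103  # placeholder mask token id


def collapse_masked_spans(output_tokens, masked_spans, delete_mask_prob, np_rng):
    """Replace each masked span with a single mask token; with probability
    `delete_mask_prob` drop the mask token entirely (BART infilling/deletion)."""
    bart_input = []
    start = 0
    for span_positions in masked_spans:                # each span is a list of token positions
        end = span_positions[0]
        bart_input.extend(output_tokens[start:end])    # keep the unmasked prefix
        if np_rng.rand() >= delete_mask_prob:          # usually keep one mask token
            bart_input.append(MASK_ID)
        start = span_positions[-1] + 1                 # skip over the rest of the span
    bart_input.extend(output_tokens[start:])           # trailing unmasked tokens
    return bart_input


# Toy check: positions 3-4 and 7 are masked spans.
rng = np.random.RandomState(0)
tokens = list(range(10))
print(collapse_masked_spans(tokens, [[3, 4], [7]], delete_mask_prob=0.0, np_rng=rng))
# -> [0, 1, 2, 103, 5, 6, 103, 8, 9]
```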
- labels = bart_decoder_out + ([-1] * padding_length_dec) - labels = np.array(labels, dtype=np.int64) - - # Loss mask - loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) - loss_mask = np.array(loss_mask, dtype=np.int64) - - return tokens_enc, tokens_dec_in, labels, enc_mask, dec_mask, loss_mask diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_dataset_utils.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_dataset_utils.py deleted file mode 100644 index 96b7f57f7dc7292005899230eef425b15f1c8c00..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_dataset_utils.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - - -def get_datasets_weights_and_num_samples(data_prefix, num_samples): - - # The data prefix should be in the format of: - # weight-1, data-prefix-1, weight-2, data-prefix-2, .. - assert len(data_prefix) % 2 == 0 - num_datasets = len(data_prefix) // 2 - weights = [0] * num_datasets - prefixes = [0] * num_datasets - for i in range(num_datasets): - weights[i] = float(data_prefix[2 * i]) - prefixes[i] = (data_prefix[2 * i + 1]).strip() - # Normalize weights - weight_sum = 0.0 - for weight in weights: - weight_sum += weight - assert weight_sum > 0.0 - weights = [weight / weight_sum for weight in weights] - - # Add 0.5% (the 1.005 factor) so in case the bleding dataset does - # not uniformly distribute the number of samples, we still have - # samples left to feed to the network. - # TODO: check data leakage between train/val/test? - datasets_train_valid_test_num_samples = [] - for weight in weights: - # Comes here when we have seperate train,test and validation datasets. - if isinstance(num_samples, int): - datasets_train_valid_test_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) - else: - datasets_train_valid_test_num_samples.append([int(math.ceil(val * weight * 1.005)) for val in num_samples]) - - return prefixes, weights, datasets_train_valid_test_num_samples - - -def get_train_valid_test_split_(splits_string, size): - """ Get dataset splits from comma or '/' separated string list.""" - - splits = [] - if splits_string.find(',') != -1: - splits = [float(s) for s in splits_string.split(',')] - elif splits_string.find('/') != -1: - splits = [float(s) for s in splits_string.split('/')] - else: - splits = [float(splits_string)] - if len(splits) != 3: - raise ValueError(f"Invalid splits string: {splits_string}. 
Expected 3 comma separated values.") - while len(splits) < 3: - splits.append(0.0) - splits = splits[:3] - splits_sum = sum(splits) - assert splits_sum > 0.0 - splits = [split / splits_sum for split in splits] - splits_index = [0] - for index, split in enumerate(splits): - splits_index.append(splits_index[index] + int(round(split * float(size)))) - diff = splits_index[-1] - size - for index in range(1, len(splits_index)): - splits_index[index] -= diff - assert len(splits_index) == 4 - assert splits_index[-1] == size - return splits_index diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py deleted file mode 100644 index 5d985466ff6cb23a3aced8074f8deb58df46726b..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.core import Dataset -from nemo.utils import logging - -__all__ = ['BasePromptLearningDataset'] - - -class BasePromptLearningDataset(Dataset): - """ - The base dataset class for prompt-tuning or p-tuning. - TODO: (@adithyare) should be merged into GPTPromptLearningDataset - """ - - def __init__( - self, - datasets, - tokenizer, - virtual_prompt_source: VirtualPromptSource, - task_templates: dict, - pseudo_tokens, - pad_token_id: str, - max_seq_length: int, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - for_train: bool = True, - ): - self.tokenizer = tokenizer - self.virtual_prompt_source = virtual_prompt_source - self.task_templates = task_templates - self.pseudo_tokens = pseudo_tokens - self.pseudo_token_ids = set(self.tokenizer.tokens_to_ids(self.pseudo_tokens)) - self.pad_token_id = pad_token_id - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.for_train = for_train - self.examples = [] - - assert self.min_seq_length <= max_seq_length, "Min sequence length should be less than or equal to max" - assert self.max_seq_length > 0, "Max sequence length should be greater than 0" - - logging.info("Loading and tokenizing dataset ... 
") - - # Datasets is just a list of json dicts - if isinstance(datasets[0], dict): - self.load_data(datasets) - - # Datasets are a list of file path strings to .json or .jsonl files - elif isinstance(datasets[0], str): - for path in datasets: - dataset = open(path, 'r', encoding='utf-8') - self.load_data(dataset) - else: - raise ValueError("Datasets must be a list of dicts or a list of filepath strings") - - def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ - total_inserted_tokens = 0 - - for idx in range(len(virtual_token_splits)): - split_start = total_inserted_tokens - split_end = total_inserted_tokens + virtual_token_splits[idx] - pseudo_tokens_for_split = "".join(self.pseudo_tokens[split_start:split_end]) - input_example = input_example.replace(f'<|VIRTUAL_PROMPT_{idx}|>', pseudo_tokens_for_split) - total_inserted_tokens = split_end - - return input_example - - def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ - logging.info( - f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" - ) - - # Truncate the text ids in this part of input to try and fit max sequence length - if truncation_field is not None and truncation_field in doc.keys(): - truncation_length = len(input_ids) - self.max_seq_length - field_text = doc[truncation_field] - field_text = self._add_leading_space(taskname, truncation_field, field_text) - - # Truncate field text - field_text_ids = self.tokenizer.text_to_ids(field_text) - truncated_text_ids = field_text_ids[: -min(truncation_length, len(field_text_ids))] - - # Replace original text ids with truncated text ids - field_start, field_end = find_subsequence_location(input_ids, field_text_ids) - input_ids = input_ids[:field_start] + truncated_text_ids + input_ids[field_end + 1 :] - else: - if not self.for_train: - # Hack alert! 
Slash and burn - # @TODO (@adithyare) need a more graceful truncation here, we should not skip examples in test - input_ids = ( - input_ids[:total_virtual_tokens] - + input_ids[total_virtual_tokens:][-self.max_seq_length + total_virtual_tokens :] - ) - - return input_ids - - def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ - prompt_template = self.task_templates[taskname]["prompt_template"] - field_text_start = prompt_template.find("{" + field_name + "}") - if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": - field_text = " " + field_text - - return field_text - - def __len__(self): - return len(self.examples) - - def __getitem__(self, idx): - return self.examples[idx] - - def _input_sanity_checks( - self, - total_virtual_tokens, - virtual_token_splits, - prompt_template, - prompt_template_fields, - truncation_field, - answer_field, - doc, - answer_only_loss=None, - ): - # Sanity check amount of virtual token - assert ( - total_virtual_tokens < self.max_seq_length - ), "virtual prompt tokens should not exceed max sequence length" - - # Make sure virtual token splits add up to the total number of virtual tokens - assert ( - sum(virtual_token_splits) == total_virtual_tokens - ), "Sum of prompt token split values must equal total number of prompt tokens" - - # Make sure number of virtual prompt locations match the number of virtual prompt splits - assert prompt_template.count('<|VIRTUAL_PROMPT_') == len( - virtual_token_splits - ), "The number of '<|VIRTUAL_PROMPT_n|>' markers and the number of prompt token splits must match" - - # Check if input example has fields not present in template - keys_not_in_template = list(set(doc.keys()) - set(prompt_template_fields) - set(['taskname'])) - assert ( - len(keys_not_in_template) == 0 - ), f"Examples in your dataset contain the fields: {keys_not_in_template} that are not in the task template." - - # Check answer field - if self.for_train: - assert answer_field is not None, "An answer_field must be given" - assert answer_field in doc.keys(), f"The given answer_field '{answer_field}' is not in data json" - assert truncation_field != answer_field, "Answer field and truncation field should not match" - - answer_placeholder = "{" + answer_field + "}" - answer_placeholder_len = len(answer_placeholder) - placeholder_start = len(prompt_template) - answer_placeholder_len - assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" - - def pad_taskname_ids(self, taskname_ids): - # Pad taskname_ids to be the same length for the prompt encoder - if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: - max_taskname_length = max(len(ids) for ids in taskname_ids) - taskname_ids = [ids + [self.pad_token_id] * (max_taskname_length - len(ids)) for ids in taskname_ids] - taskname_ids = torch.tensor(taskname_ids) - - # Task ids are just used for a look up embeddings for prompt-table - elif self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT: - taskname_ids = torch.tensor(taskname_ids) - - return taskname_ids - - -def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. 
- """ - assert len(sequence) >= len(subsequence), "subsequence too long" - - start_idx = None - next_subseq_token = subsequence[0] - next_subsequence_idx = 1 - - for seq_idx, token in enumerate(sequence): - if token == next_subseq_token: - if start_idx is None: - start_idx = seq_idx - - if next_subsequence_idx == len(subsequence): - end_idx = seq_idx - return start_idx, end_idx - else: - next_subseq_token = subsequence[next_subsequence_idx] - next_subsequence_idx += 1 - else: - start_idx = None - next_subseq_token = subsequence[0] - next_subsequence_idx = 1 - - raise ValueError("Subsequence not found in sequence") diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bert_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bert_dataset.py deleted file mode 100644 index e8aa23261ffd441378d1ce09ca94c69eeeca4407..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/bert_dataset.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""BERT Style dataset.""" - -import os -from typing import Any, Optional - -import numpy as np -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import ( - create_masked_lm_predictions, - create_tokens_and_tokentypes, - get_a_and_b_segments, - get_samples_mapping, - truncate_segments, -) -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import MMapIndexedDataset - - -class BertDataset(torch.utils.data.Dataset): - def __init__( - self, - cfg: dict, - name: str, - indexed_dataset: MMapIndexedDataset, - data_prefix: str, - num_epochs: Optional[int], - max_num_samples: int, - masked_lm_prob: float, - max_seq_length: int, - short_seq_prob: float, - seed: int, - binary_head: bool, - tokenizer: Any, - ): - - # Params to store. - self.name = name - self.seed = seed - self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length - self.binary_head = binary_head - - # Dataset. - self.indexed_dataset = indexed_dataset - - # save index mappings to a configurable dir - self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) - - # create index_mapping_dir on rank 0 - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): - os.makedirs(self.index_mapping_dir) - torch.distributed.barrier() - - # Build the samples mapping. - self.samples_mapping = get_samples_mapping( - self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 3, # account for added tokens - short_seq_prob, - self.seed, - self.name, - self.binary_head, - index_mapping_dir=self.index_mapping_dir, - ) - - # Vocab stuff. 
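The truncation path in `_truncate_input` above works by locating the field's token ids inside the full `input_ids`, trimming that field from the right just enough to fit, and splicing the rest back together. A simplified, self-contained re-implementation of the idea (not the NeMo helpers themselves):

```
def find_subsequence(sequence, subsequence):
    """Return (start, end) of the first occurrence of `subsequence` in
    `sequence` (end inclusive). Raises if it is not present."""
    n, m = len(sequence), len(subsequence)
    for start in range(n - m + 1):
        if sequence[start:start + m] == subsequence:
            return start, start + m - 1
    raise ValueError("subsequence not found")


def truncate_field(input_ids, field_ids, max_seq_length):
    """Shorten `field_ids` (from the right) just enough for the example to
    fit into `max_seq_length`, then splice it back into `input_ids`."""
    overflow = len(input_ids) - max_seq_length
    if overflow <= 0:
        return input_ids
    keep = max(len(field_ids) - overflow, 0)
    start, end = find_subsequence(input_ids, field_ids)
    return input_ids[:start] + field_ids[:keep] + input_ids[end + 1:]


# Toy example: the "context" field is tokens [20, 21, 22, 23].
ids = [1, 2, 20, 21, 22, 23, 9]
print(truncate_field(ids, [20, 21, 22, 23], max_seq_length=5))
# -> [1, 2, 20, 21, 9]
```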
- self.vocab_id_list = list(tokenizer.ids_to_tokens.keys()) - self.vocab_id_to_token_dict = tokenizer.ids_to_tokens - self.cls_id = tokenizer.cls_token_id - self.sep_id = tokenizer.sep_token_id - self.mask_id = tokenizer.mask_token_id - self.pad_id = tokenizer.pad_token_id - - def __len__(self): - return self.samples_mapping.shape[0] - - def __getitem__(self, idx): - start_idx, end_idx, seq_length = self.samples_mapping[idx] - sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 - np_rng = np.random.RandomState(seed=((self.seed + idx) % 2 ** 32)) - return build_training_sample( - sample, - seq_length, - self.max_seq_length, # needed for padding - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, - self.sep_id, - self.mask_id, - self.pad_id, - self.masked_lm_prob, - np_rng, - self.binary_head, - ) - - -def build_training_sample( - sample, - target_seq_length, - max_seq_length, - vocab_id_list, - vocab_id_to_token_dict, - cls_id, - sep_id, - mask_id, - pad_id, - masked_lm_prob, - np_rng, - binary_head, - whole_word_masking=True, - skip_masking_id=None, -): - """Biuld training sample. - - Arguments: - sample: A list of sentences in which each sentence is a list token ids. - target_seq_length: Desired sequence length. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - whole_word_masking: Whether to mask only whole words instead of independent subwords. - skip_mask_id: ID of a token that should not be masked. #TODO: make this a list of tokens. - """ - if binary_head: - # We assume that we have at least two sentences in the sample - assert len(sample) > 1 - assert target_seq_length <= max_seq_length - - # Divide sample into two segments (A and B). - if binary_head: - tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng) - else: - tokens_a = [] - for j in range(len(sample)): - tokens_a.extend(sample[j]) - tokens_b = [] - is_next_random = False - - # Truncate to `target_sequence_length`. - max_num_tokens = target_seq_length - truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b), max_num_tokens, np_rng) - - # Build tokens and toketypes. - tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id) - - # Masking. - max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( - tokens, - vocab_id_list, - vocab_id_to_token_dict, - masked_lm_prob, - cls_id, - sep_id, - mask_id, - max_predictions_per_seq, - np_rng, - whole_word_masking=whole_word_masking, - skip_masking_id=skip_masking_id, - ) - - # Padding. 
- tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np = pad_and_convert_to_numpy( - tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length - ) - - train_sample = { - 'text': tokens_np, - 'types': tokentypes_np, - 'labels': labels_np, - 'is_random': int(is_next_random), - 'loss_mask': loss_mask_np, - 'padding_mask': padding_mask_np, - 'truncated': int(truncated), - } - return train_sample - - -def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - assert len(tokentypes) == num_tokens - assert len(masked_positions) == len(masked_labels) - - # Tokens and token types. - filler = [pad_id] * padding_length - tokens_np = np.array(tokens + filler, dtype=np.int64) - tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) - - # Padding mask. - padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64) - - # Lables and loss mask. - labels = [-1] * max_seq_length - loss_mask = [0] * max_seq_length - for i in range(len(masked_positions)): - assert masked_positions[i] < num_tokens - labels[masked_positions[i]] = masked_labels[i] - loss_mask[masked_positions[i]] = 1 - labels_np = np.array(labels, dtype=np.int64) - loss_mask_np = np.array(loss_mask, dtype=np.int64) - - return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py deleted file mode 100644 index ae2b5fff6be1d398d9fc49f3119ed5d57ddf5504..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Blendable dataset.""" - -import time - -import numpy as np -import torch - -from nemo.utils import logging -from nemo.utils.app_state import AppState - - -class BlendableDataset(torch.utils.data.Dataset): - def __init__(self, datasets, weights, size): - - self.datasets = datasets - num_datasets = len(datasets) - assert num_datasets == len(weights) - - self.size = size - - # Normalize weights. - weights = np.array(weights, dtype=np.float64) - sum_weights = np.sum(weights) - assert sum_weights > 0.0 - weights /= sum_weights - - # Build indecies. 
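`BlendableDataset` above defers index construction to the compiled `helpers.build_blending_indices` routine. As a rough pure-Python picture of what the two index arrays contain, here is a sketch that assumes each global sample is assigned to whichever dataset is currently furthest below its weighted share; this illustrates the idea and is not necessarily the exact compiled algorithm:

```
import numpy as np


def build_blending_indices(weights, size):
    """For each global sample, pick the dataset furthest behind its target
    share, and record a running per-dataset sample counter."""
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current = np.zeros(len(weights), dtype=np.int64)
    for i in range(size):
        # error = how many samples dataset d "should" have contributed so far
        # minus how many it actually has contributed.
        errors = weights * (i + 1) - current
        d = int(np.argmax(errors))
        dataset_index[i] = d
        dataset_sample_index[i] = current[d]
        current[d] += 1
    return dataset_index, dataset_sample_index


idx, sample_idx = build_blending_indices([0.7, 0.3], size=10)
print(idx.tolist())         # mostly dataset 0, with dataset 1 interleaved ~30% of the time
print(sample_idx.tolist())  # position within the chosen dataset
```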
- start_time = time.time() - assert num_datasets < 255 - self.dataset_index = np.zeros(self.size, dtype=np.uint8) - self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) - app_state = AppState() - try: - if app_state.local_rank == 0: - from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper - - compile_helper() - torch.distributed.barrier() - from nemo.collections.nlp.data.language_modeling.megatron import helpers - except ImportError: - raise ImportError( - f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.' - ) - - helpers.build_blending_indices( - self.dataset_index, - self.dataset_sample_index, - weights, - num_datasets, - self.size, - torch.distributed.get_rank() == 0, - ) - logging.info( - '> elapsed time for building blendable dataset indices: ' '{:.2f} (sec)'.format(time.time() - start_time) - ) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - dataset_idx = self.dataset_index[idx] - sample_idx = self.dataset_sample_index[idx] - return self.datasets[dataset_idx][sample_idx] - - def create_data_mmap(self): - for dataset in self.datasets: - dataset.create_data_mmap() - - -class MemoryEfficientBlendableDataset(torch.utils.data.Dataset): - """ - A BlendableDataset implementation that uses less memory than the original implementation. - Indices are computed algorithmically instead of storing them in memory. - - To test call: MemoryEfficientBlendableDataset.test_index_blending() - """ - - def __init__(self, datasets, weights, size, weight_bins=100): - self.datasets = datasets - num_datasets = len(datasets) - assert num_datasets == len(weights) - - weight_bins = min(weight_bins, size) - - self.size = size - self.weight_bins = weight_bins - - # Normalize weights. - weights = np.array(weights, dtype=np.float64) - assert (weights > 0.0).all() - sum_weights = np.sum(weights) - assert sum_weights > 0.0 - self.weights = weights / sum_weights - - # create ds index based on weights - ds_index = [] - ds_bias = [] - for i, w in enumerate(self.weights): - n = int(w * weight_bins) - ds_index.extend([i] * n) - ds_bias.extend(range(n)) - # make sure arrays have length of weight_bins - n = weight_bins - len(ds_index) - ds_index.extend([i] * n) - ds_bias.extend(range(ds_bias[-1], ds_bias[-1] + n)) - - self.ds_index = np.array(ds_index, dtype=np.uint32) - self.ds_index_size = np.array([(self.ds_index == i).sum() for i in range(num_datasets)], dtype=np.uint32) - assert ( - self.ds_index_size > 0 - ).all(), f"Some datasets have no samples in the blendable dataset, increase weight_bins or the offending weight. 
ds_index_size = {self.ds_index_size}" - self.ds_bias = np.array(ds_bias, dtype=np.uint32) - - self.ds_size = np.array([len(ds) for ds in datasets], dtype=np.uint32) - - def get_ds_sample_idx(self, idx): - """Returns ds index and sample index (within the ds) for the given index in the blendable dataset.""" - - bin = idx % self.weight_bins - ds_idx = self.ds_index[bin] - sample_idx = (self.ds_bias[bin] + (idx // self.weight_bins) * self.ds_index_size[ds_idx]) % self.ds_size[ - ds_idx - ] - - return ds_idx, sample_idx - - def __len__(self): - return self.size - - def __getitem__(self, idx): - ds_idx, sample_idx = self.get_ds_sample_idx(idx) - - return self.datasets[ds_idx][sample_idx] - - @classmethod - def test_index_blending(cls): - """Visualize indices of blended dataset""" - - import matplotlib.pyplot as plt - - plt.ion() - - class DS(torch.utils.data.Dataset): - def __init__(self, size, data): - self.size = size - self.data = data - - def __len__(self): - return self.size - - def __getitem__(self, idx): - return self.data[idx] - - for weight_bins in [10, 100]: - blend_ds = MemoryEfficientBlendableDataset( - [DS(10, "a"), DS(10, "b"), DS(10, "c")], [0.5, 0.3, 0.2], 50, weight_bins=weight_bins - ) - - ds_sample_idx_list = [blend_ds.get_ds_sample_idx(i) for i in range(50)] - ds_list = list(zip(*ds_sample_idx_list))[0] - sample_list = list(zip(*ds_sample_idx_list))[1] - - plt.figure() - plt.plot(ds_list, label="ds idx") - plt.plot(sample_list, label="sample") - plt.legend() - plt.grid() - plt.title(f"weight_bins={weight_bins}") diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py deleted file mode 100644 index 7df915533492533991c4044962b6254c4e45e9aa..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Dataloaders.""" - -import abc -from itertools import chain -from typing import Optional - -import torch - -from nemo.utils import logging - - -class BaseMegatronSampler: - def __init__( - self, - total_samples: int, - consumed_samples: int, - micro_batch_size: int, - data_parallel_rank: int, - data_parallel_size: int, - drop_last: bool = True, - global_batch_size: Optional[int] = None, - rampup_batch_size: Optional[list] = None, - pad_samples_to_global_batch_size: Optional[bool] = False, - ) -> None: - # Sanity checks. 
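Returning to `MemoryEfficientBlendableDataset.get_ds_sample_idx` above, the bin arithmetic is easier to follow in isolation. A self-contained sketch with made-up lookup tables for two datasets (weights 0.6/0.4, ten weight bins):

```
import numpy as np

# Precomputed lookup tables: 6 bins point at dataset 0 and 4 bins at dataset 1.
weight_bins = 10
ds_index = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=np.uint32)
ds_bias = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3], dtype=np.uint32)
ds_index_size = np.array([6, 4], dtype=np.uint32)   # bins owned by each dataset
ds_size = np.array([100, 50], dtype=np.uint32)      # real dataset lengths


def get_ds_sample_idx(idx):
    """Map a blendable-dataset index to (dataset, sample-within-dataset)
    without storing a full index array."""
    bin_ = idx % weight_bins        # which bin this global index falls into
    ds_idx = ds_index[bin_]         # dataset owning that bin
    # advance by `ds_index_size[ds_idx]` samples on every full sweep of the
    # bins, wrapping around the real dataset length
    sample_idx = (ds_bias[bin_] + (idx // weight_bins) * ds_index_size[ds_idx]) % ds_size[ds_idx]
    return int(ds_idx), int(sample_idx)


for i in range(12):
    print(i, get_ds_sample_idx(i))
```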
- if total_samples <= 0: - raise RuntimeError("no sample to consume: {}".format(total_samples)) - if consumed_samples >= total_samples: - raise RuntimeError("no samples left to consume: {}, {}".format(consumed_samples, total_samples)) - if micro_batch_size <= 0: - raise RuntimeError(f"micro_batch_size size must be greater than 0, but {micro_batch_size}") - if data_parallel_size <= 0: - raise RuntimeError(f"data parallel size must be greater than 0, but {data_parallel_size}") - if data_parallel_rank >= data_parallel_size: - raise RuntimeError( - "data_parallel_rank should be smaller than data size, but {} >= {}".format( - data_parallel_rank, data_parallel_size - ) - ) - if global_batch_size is not None and rampup_batch_size is None: - if global_batch_size % (micro_batch_size * data_parallel_size) != 0: - raise RuntimeError( - f"`global_batch_size` ({global_batch_size}) is not divisible by " - f"`micro_batch_size ({micro_batch_size}) x data_parallel_size " - f"({data_parallel_size})`" - ) - if pad_samples_to_global_batch_size and global_batch_size is None: - raise RuntimeError( - f"`pad_samples_to_global_batch_size` can be `True` only when " - f"`global_batch_size` is set to an integer value" - ) - - # Keep a copy of input params for later use. - self.total_samples = total_samples - self.consumed_samples = consumed_samples - self.micro_batch_size = micro_batch_size - self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size - self.drop_last = drop_last - self.global_batch_size = global_batch_size - self.pad_samples_to_global_batch_size = pad_samples_to_global_batch_size - - logging.info( - f'Instantiating MegatronPretrainingSampler with total_samples: {total_samples} and consumed_samples: {consumed_samples}' - ) - - def __len__(self): - num_available_samples: int = self.total_samples - self.consumed_samples - if self.global_batch_size is not None: - if self.drop_last: - return num_available_samples // self.global_batch_size - else: - return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size - else: - return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 - - @abc.abstractmethod - def __iter__(self): - ... 
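The `__len__` logic above reduces to a little integer arithmetic: divide the remaining samples by the global batch size (rounding up when `drop_last` is false), or fall back to counting micro-batch-times-data-parallel chunks. A worked sketch:

```
def num_batches(total_samples, consumed_samples, micro_batch_size,
                data_parallel_size, global_batch_size=None, drop_last=True):
    """Mirror of BaseMegatronSampler.__len__: how many batches remain."""
    remaining = total_samples - consumed_samples
    if global_batch_size is not None:
        if drop_last:
            return remaining // global_batch_size
        # round up so the final partial global batch is still served
        return (remaining + global_batch_size - 1) // global_batch_size
    # without a global batch size, count micro-batch * data-parallel chunks
    return (remaining - 1) // (micro_batch_size * data_parallel_size) + 1


# 10_000 samples left, micro batch 4, 8 data-parallel ranks, global batch 256
print(num_batches(10_000, 0, 4, 8, global_batch_size=256))                    # 39 full batches
print(num_batches(10_000, 0, 4, 8, global_batch_size=256, drop_last=False))   # 40
```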
- - -class MegatronPretrainingSampler(BaseMegatronSampler): - def get_start_end_idx(self): - start_idx = self.data_parallel_rank * self.micro_batch_size - end_idx = start_idx + self.micro_batch_size - return start_idx, end_idx - - def __iter__(self): - batch = [] - # Last batch will be dropped if drop_last is not set False - indices = range(self.consumed_samples, self.total_samples) - if (not self.drop_last) and self.pad_samples_to_global_batch_size: - pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) - indices = chain(indices, pad_indices) - - for idx in indices: - batch.append(idx) - if len(batch) == self.micro_batch_times_data_parallel_size: - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] - batch = [] - - # Check the last partial batch and see drop_last is set - if len(batch) > 0 and not self.drop_last: - assert ( - not self.pad_samples_to_global_batch_size - ), 'with pad_samples_to_global_batch_size all batches should be complete' - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] - - -class MegatronPretrainingRandomSampler(BaseMegatronSampler): - def __init__( - self, - total_samples: int, - consumed_samples: int, - micro_batch_size: int, - data_parallel_rank: int, - data_parallel_size: int, - drop_last: bool = True, - global_batch_size: Optional[int] = None, - pad_samples_to_global_batch_size: Optional[bool] = False, - ) -> None: - super().__init__( - total_samples=total_samples, - consumed_samples=consumed_samples, - micro_batch_size=micro_batch_size, - data_parallel_rank=data_parallel_rank, - data_parallel_size=data_parallel_size, - drop_last=drop_last, - global_batch_size=global_batch_size, - pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, - ) - assert ( - pad_samples_to_global_batch_size == False - ), "`MegatronPretrainingRandomSampler` does not support sample padding" - self.last_batch_size = self.total_samples % self.micro_batch_times_data_parallel_size - - def __iter__(self): - active_total_samples = self.total_samples - self.last_batch_size - self.epoch = self.consumed_samples // active_total_samples - current_epoch_samples = self.consumed_samples % active_total_samples - assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 - - # data sharding and random sampling - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) * self.micro_batch_size - bucket_offset = current_epoch_samples // self.data_parallel_size - start_idx = self.data_parallel_rank * bucket_size - - g = torch.Generator() - g.manual_seed(self.epoch) - random_idx = torch.randperm(bucket_size, generator=g).tolist() - idx_range = [start_idx + x for x in random_idx[bucket_offset:]] - - batch = [] - # Last batch if not complete will be dropped. 
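`MegatronPretrainingSampler` above accumulates `micro_batch_size * data_parallel_size` indices per step and then yields only the slice belonging to the current rank. A stand-alone sketch that prints what each rank would receive:

```
def rank_slices(total_samples, micro_batch_size, data_parallel_size):
    """Yield, per global step, the index slice each data-parallel rank gets."""
    chunk = micro_batch_size * data_parallel_size
    batch = []
    for idx in range(total_samples):
        batch.append(idx)
        if len(batch) == chunk:
            yield [batch[r * micro_batch_size:(r + 1) * micro_batch_size]
                   for r in range(data_parallel_size)]
            batch = []


# 2 ranks, micro batch of 3: rank 0 gets [0, 1, 2], rank 1 gets [3, 4, 5], and so on.
for step, slices in zip(range(2), rank_slices(100, micro_batch_size=3, data_parallel_size=2)):
    print(step, slices)
```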
- for idx in idx_range: - batch.append(idx) - if len(batch) == self.micro_batch_size: - self.consumed_samples += self.micro_batch_times_data_parallel_size - yield batch - batch = [] - - # Check the last partial batch and see drop_last is set - if len(batch) > 0 and not self.drop_last: - yield batch diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py deleted file mode 100644 index f286bb9a8adf6cb238d487ea6811a6f7b6b212c6..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/dataset_utils.py +++ /dev/null @@ -1,1351 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2018 The Google AI Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Most of the code here has been copied from: -# https://github.com/google-research/albert/blob/master/create_pretraining_data.py -# with some modifications. 
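The Makefile removed earlier builds the pybind11 `helpers` extension that this file relies on, and `compile_helper` (defined below in the same file) invokes it at runtime from a single process. A hedged sketch of that pattern; the directory layout here is hypothetical:

```
import os
import subprocess
import sys


def compile_native_helpers(module_dir):
    """Run `make` in `module_dir` to build the pybind11 helpers extension.
    Call this from a single process (e.g. local rank 0) before importing."""
    result = subprocess.run(["make", "-C", module_dir])
    if result.returncode != 0:
        print("building C++ dataset helpers failed", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Assumes the Makefile and helpers.cpp sit next to this script.
    compile_native_helpers(os.path.abspath(os.path.dirname(__file__)))
```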
- -import collections -import os -import subprocess -import time -from typing import Any - -import numpy as np -import torch -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig - -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, - get_train_valid_test_split_, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import deallocate_indexed_dataset_memory -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset as make_indexed_dataset -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_indexed_dataset_compatibility -from nemo.collections.nlp.data.language_modeling.megatron.length_distribution_type import LengthDistribution -from nemo.collections.nlp.data.language_modeling.megatron.lm_adapted_t5_dataset import T5LMAdaptedDataset -from nemo.utils import logging -from nemo.utils.get_rank import is_global_rank_zero - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -DSET_TYPE_BERT = 'standard_bert' -DSET_TYPE_ICT = 'ict' -DSET_TYPE_T5 = 't5' -DSET_TYPE_T5_LM = 't5_prefix_lm' -DSET_TYPE_BART = 'bart' -DSET_TYPE_UL2 = 'ul2' - -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_T5_LM, DSET_TYPE_BART, DSET_TYPE_UL2] - - -def compile_helper(): - """Compile helper function ar runtime. Make sure this - is invoked on a single process.""" - - path = os.path.abspath(os.path.dirname(__file__)) - ret = subprocess.run(['make', '-C', path]) - if ret.returncode != 0: - logging.error("Making C++ dataset helpers module failed, exiting.") - import sys - - sys.exit(1) - - -def get_a_and_b_segments(sample, np_rng): - """Divide sample into a and b segments.""" - - # Number of sentences in the sample. - n_sentences = len(sample) - # Make sure we always have two sentences. - assert n_sentences > 1, 'make sure each sample has at least two sentences.' - - # First part: - # `a_end` is how many sentences go into the `A`. - a_end = 1 - if n_sentences >= 3: - # Note that randin in numpy is exclusive. - a_end = np_rng.randint(1, n_sentences) - tokens_a = [] - for j in range(a_end): - tokens_a.extend(sample[j]) - - # Second part: - tokens_b = [] - for j in range(a_end, n_sentences): - tokens_b.extend(sample[j]) - - # Random next: - is_next_random = False - if np_rng.random() < 0.5: - is_next_random = True - tokens_a, tokens_b = tokens_b, tokens_a - - return tokens_a, tokens_b, is_next_random - - -def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): - """Truncates a pair of sequences to a maximum sequence length.""" - # print(len_a, len_b, max_num_tokens) - assert len_a > 0 - if len_a + len_b <= max_num_tokens: - return False - while len_a + len_b > max_num_tokens: - if len_a > len_b: - len_a -= 1 - tokens = tokens_a - else: - len_b -= 1 - tokens = tokens_b - if np_rng.random() < 0.5: - del tokens[0] - else: - tokens.pop() - return True - - -def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): - """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" - - tokens = [] - tokentypes = [] - # [CLS]. - tokens.append(cls_id) - tokentypes.append(0) - # Segment A. - for token in tokens_a: - tokens.append(token) - tokentypes.append(0) - # [SEP]. 
- tokens.append(sep_id) - tokentypes.append(0) - # Segment B. - for token in tokens_b: - tokens.append(token) - tokentypes.append(1) - if tokens_b: - # [SEP]. - tokens.append(sep_id) - tokentypes.append(1) - - return tokens, tokentypes - - -MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"]) - - -def is_start_piece(piece): - """Check if the current word piece is the starting piece. (BERT)""" - # When a word has been split into - # WordPieces, the first token does not have any marker and any subsequence - # tokens are prefixed with ##. So whenever we see the ## token, we - # append it to the previous set of word indexes. - return not piece.startswith("##") - - -def create_masked_lm_predictions( - tokens, - vocab_id_list, - vocab_id_to_token_dict, - masked_lm_prob, - cls_id, - sep_id, - mask_id, - max_predictions_per_seq, - np_rng, - max_ngram_size=3, - mean_ngram_size=None, - whole_word_masking=True, - favor_long_ngrams=False, - permutation=False, - geometric_dist=False, - masking_style="bert", - tokenizer_type="wordpiece", - skip_masking_id=None, -): - """Creates the predictions for the masked LM objective. - Note: Tokens here are vocab ids and not text tokens.""" - if not geometric_dist and mean_ngram_size is not None: - raise ValueError(f"Mean ngram size is only supported for geometric distribution.") - - cand_indexes = [] - # Note(mingdachen): We create a list for recording if the piece is - # the starting piece of current token, where 1 means true, so that - # on-the-fly whole word masking is possible. - token_boundary = [0] * len(tokens) - skip_mask_idx = None # Store the index of token that cannot be masked. - for (i, token) in enumerate(tokens): - if token == skip_masking_id: - skip_mask_idx = i - if token == cls_id or token == sep_id: - token_boundary[i] = 1 - continue - # Whole Word Masking means that if we mask all of the wordpieces - # corresponding to an original word. - # - # Note that Whole Word Masking does *not* change the training code - # at all -- we still predict each WordPiece independently, softmaxed - # over the entire vocabulary. - if whole_word_masking and len(cand_indexes) >= 1 and not is_start_piece(vocab_id_to_token_dict[token]): - cand_indexes[-1].append(i) - else: - cand_indexes.append([i]) - if is_start_piece(vocab_id_to_token_dict[token]): - token_boundary[i] = 1 - - output_tokens = list(tokens) - - masked_lm_positions = [] - masked_lm_labels = [] - - if masked_lm_prob == 0: - return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - - num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - if masking_style != "bert": - num_to_predict = max(1, num_to_predict) - if num_to_predict < 1: - logging.warning( - F'Number of tokens is : {len(tokens)} and mask_probability is {masked_lm_prob}. None of the tokens will be masked' - ) - - ngrams = np.arange(1, max_ngram_size + 1, dtype=np.int64) - if not geometric_dist: - # Note(mingdachen): - # By default, we set the probilities to favor shorter ngram sequences. - pvals = 1.0 / np.arange(1, max_ngram_size + 1) - pvals /= pvals.sum(keepdims=True) - if favor_long_ngrams: - pvals = pvals[::-1] - - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = {} - for n in ngrams: - # Skip this ngram if it contains the index of token that should not be masked. - # TODO: (sandeepsub) Generalize this to be a list of tokens that cannot be masked. 
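Whole-word masking above hinges on grouping WordPiece continuation tokens (those starting with `##`) with the piece that opened the word, so a word is masked or kept as a unit. A simplified sketch that operates on text tokens rather than vocab ids:

```
def build_whole_word_candidates(tokens, special_tokens=("[CLS]", "[SEP]")):
    """Group WordPiece tokens into whole-word candidate index lists."""
    candidates = []
    for i, token in enumerate(tokens):
        if token in special_tokens:
            continue  # never mask special tokens
        if token.startswith("##") and candidates:
            candidates[-1].append(i)  # continuation piece joins the previous word
        else:
            candidates.append([i])    # starting piece opens a new candidate
    return candidates


tokens = ["[CLS]", "un", "##believ", "##able", "story", "[SEP]"]
print(build_whole_word_candidates(tokens))
# -> [[1, 2, 3], [4]]  (the three pieces of "unbelievable" form one candidate)
```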
- if skip_mask_idx is not None and skip_mask_idx >= idx and skip_mask_idx <= idx + n: - continue - ngram_index[n] = cand_indexes[idx : idx + n] - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) - - (masked_lms, masked_spans) = ([], []) - covered_indexes = set() - for cand_index_set in ngram_indexes: - if len(masked_lms) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. - for index_set in cand_index_set[1]: - for index in index_set: - if index in covered_indexes: - continue - - if not geometric_dist: - # Not all ngrams are available because of skip_masking_id that prevents a certain ID from being masked. - available_ngrams = list(cand_index_set.keys()) - # n - 1 because pvals is 0-indexed and available ngrams are 1-indexed. - pvals_current = np.array([pvals[n - 1] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) - else: - # Sampling "n" from the geometric distribution and clipping it to - # the max_ngrams. Using p=0.2 default from the SpanBERT paper - # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) - - # The expectation of a geometric distribution is E[X] = 1 / p - p = 1 / mean_ngram_size if mean_ngram_size is not None else 0.2 - n = min(np_rng.geometric(p), max_ngram_size) - # n may not be in the candidate index set because of skip_masking_id. - # we try to find the nearest one in the candidate index set. - if n not in cand_index_set: - n = _truncate_to_nearest(cand_index_set, n) - index_set = sum(cand_index_set[n], []) - n -= 1 - # Note(mingdachen): - # Repeatedly looking for a candidate that does not exceed the - # maximum number of predictions by trying shorter ngrams. - while len(masked_lms) + len(index_set) > num_to_predict: - if n == 0: - break - if n - 1 in cand_index_set: - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. 
- if len(masked_lms) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - covered_indexes.add(index) - masked_token = None - if masking_style == "bert": - # 80% of the time, replace with [MASK] - if np_rng.random() < 0.8: - masked_token = mask_id - else: - # 10% of the time, keep original - if np_rng.random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word - else: - masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] - elif masking_style == "t5": - masked_token = mask_id - elif masking_style == "bart": - masked_token = mask_id - else: - raise ValueError("invalid value of masking style") - - output_tokens[index] = masked_token - masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) - - masked_spans.append(MaskedLmInstance(index=index_set, label=[tokens[index] for index in index_set])) - - assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) - - select_indexes = set() - if permutation: - if skip_masking_id is not None: - raise ValueError(f"permutation=True is not supported when skip_masking_id is not None.") - for cand_index_set in ngram_indexes: - if len(select_indexes) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. - for index_set in cand_index_set[0]: - for index in index_set: - if index in covered_indexes or index in select_indexes: - continue - - n = np.random.choice( - ngrams[: len(cand_index_set)], - p=pvals[: len(cand_index_set)] / pvals[: len(cand_index_set)].sum(keepdims=True), - ) - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - - while len(select_indexes) + len(index_set) > num_to_predict: - if n == 0: - break - index_set = sum(cand_index_set[n - 1], []) - n -= 1 - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. 
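The replacement branch above follows the standard BERT recipe: 80% of selected positions become `[MASK]`, 10% keep the original token, and 10% receive a random vocabulary token. A compact single-draw equivalent of that decision (the original uses two draws, which yields the same distribution); the ids here are placeholders:

```
import numpy as np

MASK_ID = 103                       # placeholder mask id
VOCAB_IDS = list(range(1000, 1100)) # placeholder vocabulary


def choose_replacement(original_token, np_rng):
    """BERT-style replacement: 80% [MASK], 10% keep, 10% random token."""
    r = np_rng.rand()
    if r < 0.8:
        return MASK_ID
    if r < 0.9:
        return original_token
    return VOCAB_IDS[np_rng.randint(0, len(VOCAB_IDS))]


rng = np.random.RandomState(1234)
replacements = [choose_replacement(42, rng) for _ in range(10_000)]
print(sum(t == MASK_ID for t in replacements) / len(replacements))  # ~0.8
print(sum(t == 42 for t in replacements) / len(replacements))       # ~0.1
```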
- if len(select_indexes) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes or index in select_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - select_indexes.add(index) - assert len(select_indexes) <= num_to_predict - - select_indexes = sorted(select_indexes) - permute_indexes = list(select_indexes) - np_rng.shuffle(permute_indexes) - orig_token = list(output_tokens) - - for src_i, tgt_i in zip(select_indexes, permute_indexes): - output_tokens[src_i] = orig_token[tgt_i] - masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) - - masked_lms = sorted(masked_lms, key=lambda x: x.index) - # Sort the spans by the index of the first span - masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) - - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) - - -def _truncate_to_nearest(cand_index_set, n): - min_dist = 9999 - for key in cand_index_set: - if abs(key - n) < min_dist: - n = key - min_dist = abs(key - n) - - return n - - -def create_extreme_masked_lm_predictions( - tokens, - masked_lm_prob, - mask_id, - max_predictions_per_seq, - np_rng, - max_ngram_size=10, - min_ngram_size=2, - mean_ngram_size=5, - span_length_distribution=LengthDistribution.uniform, - skip_masking_id=None, -): - """Creates the predictions for the extreme span-masking UL2 objective. - Note: Tokens here are vocab ids and not text tokens.""" - output_tokens = list(tokens) - - masked_lm_positions = [] - masked_lm_labels = [] - - num_to_predict = int(min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))) - # If the number of tokens to predict is less than the min ngram size, clam it to max predictions. - min_ngram_size = int(min(num_to_predict, min_ngram_size)) - - ngrams = np.arange(min_ngram_size, max_ngram_size + 1, dtype=np.int64) - if span_length_distribution == "uniform": - pvals = np.array([1.0 / (max_ngram_size - min_ngram_size + 1)] * (max_ngram_size - min_ngram_size + 1)) - - ngram_indexes = [] - if skip_masking_id is not None: - skip_mask_idx = None - for idx in range(len(tokens)): - if tokens[idx] == skip_masking_id: - skip_mask_idx = idx - break - else: - skip_mask_idx = None - - cand_indexes = [[i] for i in range(len(tokens))] - for idx in range(len(cand_indexes)): - ngram_index = {} - for n in ngrams: - # Skip this ngram if it contains the index of token that should not be masked. - # TODO: (sandeepsub) Generalize this to be a list of tokens that cannot be masked. - if skip_mask_idx is not None and skip_mask_idx >= idx and skip_mask_idx <= idx + n: - continue - ngram_index[n] = cand_indexes[idx : idx + n] - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) - - (masked_lms, masked_spans) = ([], []) - covered_indexes = set() - for cand_index_set in ngram_indexes: - if len(masked_lms) >= num_to_predict: - break - if not cand_index_set: - continue - # Note(mingdachen): - # Skip current piece if they are covered in lm masking or previous ngrams. 
- for index_set in cand_index_set[min_ngram_size]: - for index in index_set: - if index in covered_indexes: - continue - - if span_length_distribution == LengthDistribution.uniform: - available_ngrams = list(cand_index_set.keys()) - pvals_current = np.array([pvals[n] for n in available_ngrams]) - n = np_rng.choice(available_ngrams, p=pvals_current / pvals_current.sum(keepdims=True),) - elif span_length_distribution == LengthDistribution.geometric: - # Sampling "n" from the geometric distribution and clipping it to - # the max_ngrams. Using p=0.2 default from the SpanBERT paper - # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) - - # The expectation of a geometric distribution is E[X] = 1 / p - p = 1 / mean_ngram_size if mean_ngram_size is not None else 0.2 - n = min(np_rng.geometric(p), max_ngram_size) - # n may not be in the candidate index set because of skip_masking_id. - # we try to find the nearest one in the candidate index set. - if n not in cand_index_set: - n = _truncate_to_nearest(cand_index_set, n) - n = int(np.clip(n, min_ngram_size, max_ngram_size)) - elif span_length_distribution == LengthDistribution.truncated_normal: - # Sampling "n" from a truncated normal distribution. - mu = mean_ngram_size if mean_ngram_size is not None else (max_ngram_size - min_ngram_size) // 2 - n = int(np.clip(np_rng.normal(loc=mu, scale=np.sqrt(mu)), min_ngram_size, max_ngram_size)) - if n not in cand_index_set: - n = _truncate_to_nearest(cand_index_set, n) - n = int(np.clip(n, min_ngram_size, max_ngram_size)) - - index_set = sum(cand_index_set[n], []) - n -= 1 - # Note(mingdachen): - # Repeatedly looking for a candidate that does not exceed the - # maximum number of predictions by trying shorter ngrams. - while len(masked_lms) + len(index_set) > num_to_predict: - if n < min_ngram_size: - break - if n in cand_index_set: - index_set = sum(cand_index_set[n], []) - n -= 1 - - # If adding a whole-word mask would exceed the maximum number of - # predictions, then just skip this candidate. - if len(masked_lms) + len(index_set) > num_to_predict: - continue - is_any_index_covered = False - for index in index_set: - if index in covered_indexes: - is_any_index_covered = True - break - if is_any_index_covered: - continue - for index in index_set: - covered_indexes.add(index) - output_tokens[index] = mask_id - masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) - - masked_spans.append(MaskedLmInstance(index=index_set, label=[tokens[index] for index in index_set])) - - assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) - - masked_lms = sorted(masked_lms, key=lambda x: x.index) - # Sort the spans by the index of the first span - masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) - - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - return (output_tokens, masked_lm_positions, masked_lm_labels, masked_spans) - - -def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - assert len(tokentypes) == num_tokens - assert len(masked_positions) == len(masked_labels) - - # Tokens and token types. - filler = [pad_id] * padding_length - tokens_np = np.array(tokens + filler, dtype=np.int64) - tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) - - # Padding mask. 
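The extreme-masking branch above draws span lengths from a uniform, geometric (p = 1/mean, so E[X] equals the requested mean, as in SpanBERT), or truncated-normal distribution, then clips to the allowed range. A small sketch comparing the three:

```
import numpy as np


def sample_span_length(np_rng, distribution, min_ngram, max_ngram, mean_ngram=None):
    """Sample one span length, clipped to [min_ngram, max_ngram]."""
    if distribution == "uniform":
        return int(np_rng.randint(min_ngram, max_ngram + 1))
    if distribution == "geometric":
        # E[X] = 1/p, so p = 1/mean gives the requested mean span length
        p = 1.0 / mean_ngram if mean_ngram else 0.2
        return int(np.clip(np_rng.geometric(p), min_ngram, max_ngram))
    if distribution == "truncated_normal":
        mu = mean_ngram if mean_ngram else (min_ngram + max_ngram) / 2
        return int(np.clip(np_rng.normal(loc=mu, scale=np.sqrt(mu)), min_ngram, max_ngram))
    raise ValueError(distribution)


rng = np.random.RandomState(0)
for dist in ("uniform", "geometric", "truncated_normal"):
    lengths = [sample_span_length(rng, dist, 2, 10, mean_ngram=5) for _ in range(10_000)]
    print(dist, round(float(np.mean(lengths)), 2))
```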
- padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64) - - # Lables and loss mask. - labels = [-1] * max_seq_length - loss_mask = [0] * max_seq_length - for i in range(len(masked_positions)): - assert masked_positions[i] < num_tokens - labels[masked_positions[i]] = masked_labels[i] - loss_mask[masked_positions[i]] = 1 - labels_np = np.array(labels, dtype=np.int64) - loss_mask_np = np.array(loss_mask, dtype=np.int64) - - return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np - - -def get_dataset( - indexed_dataset, - start_index, - end_index, - cfg, - trainer, - num_samples, - masked_lm_prob, - short_seq_prob, - binary_head, - max_seq_length_dec, - dataset_type='standard_bert', - tokenizer=None, - max_ngram_size=3, - mean_ngram_size=None, - geometric_dist=True, - permutation=False, - whole_word_masking=True, - favor_long_ngrams=False, - delete_mask_prob=0, # This flag is used in BART only, and will not have effect on T5/BERT - respect_document_boundaries=True, - **kwargs, -): - - if dataset_type not in DSET_TYPES: - raise ValueError("Invalid dataset_type: ", dataset_type) - - # from nemo.collections.nlp.data.language_modeling.megatron.ict_dataset import ICTDataset - from nemo.collections.nlp.data.language_modeling.megatron.bart_dataset import BARTDataset - from nemo.collections.nlp.data.language_modeling.megatron.bert_dataset import BertDataset - from nemo.collections.nlp.data.language_modeling.megatron.length_distribution_type import LengthDistribution - from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import T5Dataset - from nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset import UL2Dataset - - if dataset_type == DSET_TYPE_ICT: - raise NotImplementedError("ICT dataset is not implemented yet.") - ''' - dataset = ICTDataset( - block_dataset=indexed_dataset, - title_dataset=title_dataset, - query_in_block_prob=args.query_in_block_prob, - use_one_sent_docs=args.use_one_sent_docs, - binary_head=binary_head, - **kwargs, - ) - ''' - elif dataset_type == DSET_TYPE_T5: - assert tokenizer is not None, "Tokenizer is required for T5 dataset" - logging.info("Instatiating T5 Dataset ...") - documents = np.arange(start=start_index, stop=end_index, step=1, dtype=np.int32) - dataset = T5Dataset( - cfg=cfg, - trainer=trainer, - tokenizer=tokenizer, - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - max_seq_length_dec=max_seq_length_dec, - short_seq_prob=short_seq_prob, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - documents=documents, - respect_document_boundaries=respect_document_boundaries, - **kwargs, - ) - elif dataset_type == DSET_TYPE_BERT: - logging.info("Instatiating BERT Dataset ...") - dataset = BertDataset( - cfg=cfg, - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - short_seq_prob=short_seq_prob, - binary_head=binary_head, - tokenizer=tokenizer, - **kwargs, - ) - elif dataset_type == DSET_TYPE_T5_LM: - documents = np.arange(start=start_index, stop=end_index, step=1, dtype=np.int32) - logging.info("Instatiating T5 Prefix-LM Dataset ...") - dataset = T5LMAdaptedDataset( - cfg=cfg, - trainer=trainer, - tokenizer=tokenizer, - documents=documents, - indexed_dataset=indexed_dataset, - num_samples=num_samples, - max_seq_length_encoder=kwargs["max_seq_length"], - max_seq_length_decoder=max_seq_length_dec, - **kwargs, - 
) - elif dataset_type == DSET_TYPE_BART: - assert tokenizer is not None, "Tokenizer is required for BART dataset" - documents = np.arange(start=start_index, stop=end_index, step=1, dtype=np.int32) - logging.info("Instatiating BART Dataset ...") - dataset = BARTDataset( - cfg=cfg, - trainer=trainer, - tokenizer=tokenizer, - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - short_seq_prob=short_seq_prob, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - documents=documents, - respect_document_boundaries=respect_document_boundaries, - **kwargs, - ) - elif dataset_type == DSET_TYPE_UL2: - assert tokenizer is not None, "Tokenizer is required for UL2 dataset" - documents = np.arange(start=start_index, stop=end_index, step=1, dtype=np.int32) - logging.info("Instatiating UL2 Dataset ...") - extreme_ngram_span_length_distribution = cfg.data.get( - "extreme_ngram_span_length_distribution", "truncated_normal" - ) - ngram_span_length_distribution = cfg.data.get("ngram_span_length_distribution", "geometric") - if extreme_ngram_span_length_distribution == "truncated_normal": - extreme_ngram_span_length_distribution = LengthDistribution.truncated_normal - elif extreme_ngram_span_length_distribution == "uniform": - extreme_ngram_span_length_distribution = LengthDistribution.uniform - elif extreme_ngram_span_length_distribution == "geometric": - extreme_ngram_span_length_distribution = LengthDistribution.geometric - - if ngram_span_length_distribution == "truncated_normal": - ngram_span_length_distribution = LengthDistribution.truncated_normal - elif ngram_span_length_distribution == "uniform": - ngram_span_length_distribution = LengthDistribution.uniform - elif ngram_span_length_distribution == "geometric": - ngram_span_length_distribution = LengthDistribution.geometric - - dataset = UL2Dataset( - cfg=cfg, - trainer=trainer, - tokenizer=tokenizer, - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - max_seq_length_dec=max_seq_length_dec, - short_seq_prob=short_seq_prob, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - ngram_span_length_distribution=ngram_span_length_distribution, - extreme_ngram_span_length_distribution=extreme_ngram_span_length_distribution, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - extreme_masked_lm_prob=cfg.data.get("extreme_masked_lm_prob", 0.5), - extreme_max_ngram_size=cfg.data.get("extreme_max_ngram_size", 128), - extreme_mean_ngram_size=cfg.data.get("extreme_mean_ngram_size", 64), - extreme_min_ngram_size=cfg.data.get("extreme_min_ngram_size", 32), - prefix_lm_pivot_mean=cfg.data.get("prefix_lm_pivot_mean", 0.25), - respect_document_boundaries=respect_document_boundaries, - documents=documents, - **kwargs, - ) - else: - raise NotImplementedError(f"Dataset type {dataset_type} not fully implemented.") - return dataset - - -def build_dataset( - cfg, - trainer, - data_prefix, - data_impl, - num_samples, - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - name, - dataset_type, - tokenizer, - max_ngram_size, - mean_ngram_size, - geometric_dist, - permutation, - whole_word_masking, - favor_long_ngrams, - delete_mask_prob, - respect_document_boundaries, - data_impl_kwargs, -): - def 
_build_dataset(current_data_prefix, current_num_samples): - indexed_dataset = get_indexed_dataset_( - current_data_prefix, data_impl, skip_warmup, data_impl_kwargs=data_impl_kwargs - ) - total_num_of_documents = indexed_dataset.sizes.shape[0] - # Print stats about the splits. - logging.info(' > dataset split:') - logging.info(' Total {} documents is : {} '.format(name, total_num_of_documents)) - if hasattr(indexed_dataset, 'get_doc_idx'): - doc_idx_ptr = indexed_dataset.get_doc_idx() - indexed_dataset.set_doc_idx(doc_idx_ptr[0:total_num_of_documents]) - - kwargs = dict( - name=name, - data_prefix=current_data_prefix, - num_epochs=None, - max_num_samples=int(current_num_samples), - max_seq_length=max_seq_length, - seed=seed, - ) - - dataset = get_dataset( - indexed_dataset, - 0, - total_num_of_documents, - cfg, - trainer, - current_num_samples, - masked_lm_prob, - short_seq_prob, - binary_head, - max_seq_length_dec, - dataset_type, - tokenizer, - max_ngram_size, - mean_ngram_size, - geometric_dist, - permutation, - whole_word_masking, - favor_long_ngrams, - delete_mask_prob, - respect_document_boundaries, - **kwargs, - ) - - # Set the original pointer so dataset remains the main dataset. - if hasattr(indexed_dataset, 'set_doc_idx'): - indexed_dataset.set_doc_idx(doc_idx_ptr) - # Checks. - assert indexed_dataset.doc_idx[0] == 0 - assert indexed_dataset.doc_idx.shape[0] == (total_num_of_documents + 1) - return dataset - - if len(data_prefix) == 1: - return _build_dataset(data_prefix[0], num_samples) - - else: - output = get_datasets_weights_and_num_samples(data_prefix, num_samples) - prefixes, weights, datasets_num_samples = output - datasets = [] - for i in range(len(prefixes)): - dataset = _build_dataset(prefixes[i], datasets_num_samples[i]) - datasets.append(dataset) - return BlendableDataset(datasets, weights, num_samples) - - -def build_train_valid_test_datasets( - cfg, - trainer, - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head=False, - max_seq_length_dec=None, - dataset_type='standard_bert', - tokenizer=None, - max_ngram_size=3, - mean_ngram_size=None, - geometric_dist=True, - permutation=False, - whole_word_masking=True, - favor_long_ngrams=False, - delete_mask_prob=0, - respect_document_boundaries=True, - data_impl_kwargs={}, -): - # for VSC and text memmap we need to provide a tokenizer, if not given - if data_impl in ["text_mmap", "csv_mmap"]: - if "tokenizer" not in data_impl_kwargs: - if isinstance(data_impl_kwargs, DictConfig): - data_impl_kwargs = OmegaConf.to_object(data_impl_kwargs) - else: - # prevent updating the default - data_impl_kwargs = data_impl_kwargs.copy() - - data_impl_kwargs["tokenizer"] = tokenizer - - if not respect_document_boundaries and data_impl_kwargs != {}: - raise ValueError( - "respect_document_boundaries=False is not compatible with text_memmap and csv_memmap (data_impl_kwargs != {})" - ) - - if data_impl in ["mock"]: - logging.info(f'Initializing mock dataset, type {dataset_type}, for train, validate, and test') - if len(data_prefix) != 0: - # Files from this location will not be read; mock data will be generated instead. - logging.warning(f"Requested data_impl={data_impl}, so ignoring data_prefix setting: {data_prefix}") - if dataset_type == DSET_TYPE_T5: - from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import MockT5Dataset - - if tokenizer is None: - # Tokenizer is used to infer vocabulary size for mock data. 
- raise ValueError("Tokenizer is required for a mock T5 dataset") - train_ds = MockT5Dataset( - cfg, - tokenizer, - "train", - int(train_valid_test_num_samples[0]), - max_seq_length, - max_seq_length_dec, - seed, - ) - valid_ds = MockT5Dataset( - cfg, - tokenizer, - "valid", - int(train_valid_test_num_samples[1]), - max_seq_length, - max_seq_length_dec, - seed, - ) - test_ds = MockT5Dataset( - cfg, tokenizer, "test", int(train_valid_test_num_samples[2]), max_seq_length, max_seq_length_dec, seed, - ) - return train_ds, valid_ds, test_ds - else: - raise NotImplementedError(f"Mock dataset is not implemented for requested type: {dataset_type}") - - if isinstance(data_prefix, DictConfig): - assert ( - data_prefix.get('train') is not None - and data_prefix.get('test') is not None - and data_prefix.get('validation') is not None - ), f"Data prefix dictionary should have train, test and validation keys. data_prefix currently has only {data_prefix.keys()}" - if cfg.data.splits_string is not None: - logging.warning(cfg.data.splits_string + " ignored since data prefix is of type dictionary.") - train_ds = build_dataset( - cfg, - trainer, - data_prefix["train"], - data_impl, - int(train_valid_test_num_samples[0]), - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - "train", - dataset_type=dataset_type, - tokenizer=tokenizer, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - respect_document_boundaries=respect_document_boundaries, - data_impl_kwargs=data_impl_kwargs, - ) - validation_ds = build_dataset( - cfg, - trainer, - data_prefix["validation"], - data_impl, - int(train_valid_test_num_samples[1]), - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - "valid", - dataset_type=dataset_type, - tokenizer=tokenizer, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - respect_document_boundaries=respect_document_boundaries, - data_impl_kwargs=data_impl_kwargs, - ) - test_ds = build_dataset( - cfg, - trainer, - data_prefix["test"], - data_impl, - int(train_valid_test_num_samples[2]), - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - "test", - dataset_type=dataset_type, - tokenizer=tokenizer, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - respect_document_boundaries=respect_document_boundaries, - data_impl_kwargs=data_impl_kwargs, - ) - return train_ds, validation_ds, test_ds - - else: - - if len(data_prefix) == 1: - return _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix[0], - data_impl, - splits_string, - train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - dataset_type=dataset_type, - tokenizer=tokenizer, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - 
permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - respect_document_boundaries=respect_document_boundaries, - data_impl_kwargs=data_impl_kwargs, - ) - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - cfg, - trainer, - prefixes[i], - data_impl, - splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - dataset_type=dataset_type, - tokenizer=tokenizer, - max_ngram_size=max_ngram_size, - mean_ngram_size=mean_ngram_size, - geometric_dist=geometric_dist, - permutation=permutation, - whole_word_masking=whole_word_masking, - favor_long_ngrams=favor_long_ngrams, - delete_mask_prob=delete_mask_prob, - respect_document_boundaries=respect_document_boundaries, - data_impl_kwargs=data_impl_kwargs, - ) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) - - return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) - - -def _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, - short_seq_prob, - seed, - skip_warmup, - binary_head, - max_seq_length_dec, - dataset_type='standard_bert', - tokenizer=None, - max_ngram_size=3, - mean_ngram_size=None, - geometric_dist=True, - permutation=False, - whole_word_masking=True, - favor_long_ngrams=False, - delete_mask_prob=0, # This flag is used in BART only, and will not have effect on T5/BERT - respect_document_boundaries=True, - data_impl_kwargs={}, -): - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup, data_impl_kwargs=data_impl_kwargs) - - # if dataset_type == DSET_TYPE_ICT: - # title_dataset = get_indexed_dataset_(args.titles_data_path, data_impl, skip_warmup) - - # Get start and end indices of train/valid/train into doc-idx - # Note that doc-idx is desinged to be num-docs + 1 so we can - # easily iterate over it. - total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) - ) - start_index = indexed_dataset.doc_idx[splits[index]] - end_index = indexed_dataset.doc_idx[splits[index + 1]] - logging.info( - ' sentence indices in [{}, {}) total of {} ' - 'sentences'.format(start_index, end_index, end_index - start_index) - ) - - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - # from nemo.collections.nlp.data.language_modeling.megatron.ict_dataset import ICTDataset - from nemo.collections.nlp.data.language_modeling.megatron.bart_dataset import BARTDataset - from nemo.collections.nlp.data.language_modeling.megatron.bert_dataset import BertDataset - from nemo.collections.nlp.data.language_modeling.megatron.length_distribution_type import LengthDistribution - from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import T5Dataset - from nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset import UL2Dataset - - dataset = None - if splits[index + 1] > splits[index]: - # Get the pointer to the original doc-idx so we can set it later. - if hasattr(indexed_dataset, 'get_doc_idx'): - doc_idx_ptr = indexed_dataset.get_doc_idx() - # Slice the doc-idx - start_index = splits[index] - # Add +1 so we can index into the dataset to get the upper bound. - end_index = splits[index + 1] + 1 - # New doc_idx view. - if hasattr(indexed_dataset, 'set_doc_idx'): - indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) - # Build the dataset accordingly. - kwargs = dict( - name=name, - data_prefix=data_prefix, - num_epochs=None, - max_num_samples=int(train_valid_test_num_samples[index]), - max_seq_length=max_seq_length, - seed=seed, - ) - - dataset = get_dataset( - indexed_dataset, - splits[index], - splits[index + 1], - cfg, - trainer, - int(train_valid_test_num_samples[index]), - masked_lm_prob, - short_seq_prob, - binary_head, - max_seq_length_dec, - dataset_type, - tokenizer, - max_ngram_size, - mean_ngram_size, - geometric_dist, - permutation, - whole_word_masking, - favor_long_ngrams, - delete_mask_prob, - respect_document_boundaries, - **kwargs, - ) - - # Set the original pointer so dataset remains the main dataset. - if hasattr(indexed_dataset, 'set_doc_idx'): - indexed_dataset.set_doc_idx(doc_idx_ptr) - # Checks. 
- if getattr(indexed_dataset, 'doc_idx', None) is not None: - assert indexed_dataset.doc_idx[0] == 0 - assert indexed_dataset.doc_idx.shape[0] == (total_num_of_documents + 1) - - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup, data_impl_kwargs={}): - - logging.info(' > building dataset index ...') - - start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup, impl_kwargs=data_impl_kwargs) - if data_impl in ['text_mmap', 'csv_mmap']: - # make csv/text memmap compatible with Megatron sampling - make_indexed_dataset_compatibility(indexed_dataset) - - assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] - logging.info(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time)) - - logging.info(' > indexed dataset stats:') - logging.info(' number of documents: {}'.format(indexed_dataset.doc_idx.shape[0] - 1)) - logging.info(' number of sentences: {}'.format(indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -def get_samples_mapping( - indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - max_seq_length, - short_seq_prob, - seed, - name, - binary_head, - index_mapping_dir: str = None, - samples_mapping: Any = None, -): - """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" - - if not num_epochs: - if not max_num_samples: - raise ValueError("Need to specify either max_num_samples " "or num_epochs") - num_epochs = np.iinfo(np.int32).max - 1 - if not max_num_samples: - max_num_samples = np.iinfo(np.int64).max - 1 - - # Filename of the index mapping - if index_mapping_dir is not None: - indexmap_filename = os.path.join(index_mapping_dir, os.path.basename(data_prefix)) - else: - indexmap_filename = data_prefix - indexmap_filename += '_{}_indexmap'.format(name) - if num_epochs != (np.iinfo(np.int32).max - 1): - indexmap_filename += '_{}ep'.format(num_epochs) - if max_num_samples != (np.iinfo(np.int64).max - 1): - indexmap_filename += '_{}mns'.format(max_num_samples) - indexmap_filename += '_{}msl'.format(max_seq_length) - indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) - indexmap_filename += '_{}s'.format(seed) - indexmap_filename += '.npy' - - # Build the indexed mapping if not exist and not provided externally. - if samples_mapping is None and torch.distributed.get_rank() == 0 and not os.path.isfile(indexmap_filename): - # Fake index mapping if missing - if (getattr(indexed_dataset, 'doc_idx', None) is None) and (getattr(indexed_dataset, 'sizes', None) is None): - make_indexed_dataset_compatibility(indexed_dataset) - - print( - ' > WARNING: could not find index map file {}, building ' - 'the indices on rank 0 ...'.format(indexmap_filename) - ) - - # Make sure the types match the helpers input types. - assert indexed_dataset.doc_idx.dtype == np.int64 - assert indexed_dataset.sizes.dtype == np.int32 - - # Build samples mapping - verbose = torch.distributed.get_rank() == 0 - start_time = time.time() - logging.info(' > building samples index mapping for {} ...'.format(name)) - # First compile and then import. 
-        try:
-            if is_global_rank_zero():
-                compile_helper()
-            from nemo.collections.nlp.data.language_modeling.megatron import helpers
-        except ImportError:
-            raise ImportError(
-                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
-            )
-        samples_mapping = helpers.build_mapping(
-            indexed_dataset.doc_idx,
-            indexed_dataset.sizes,
-            num_epochs,
-            max_num_samples,
-            max_seq_length,
-            short_seq_prob,
-            seed,
-            verbose,
-            2 if binary_head else 1,
-        )
-        logging.info(' > done building samples index mapping')
-        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
-        logging.info(' > saved the index mapping in {}'.format(indexmap_filename))
-        # Make sure all the ranks have built the mapping
-        logging.info(
-            ' > elapsed time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time)
-        )
-    torch.distributed.barrier()
-    counts = torch.cuda.LongTensor([1])
-    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
-    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
-    assert counts[0].item() == (
-        torch.distributed.get_world_size()
-        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group())
-    )
-    # Load the samples mapping if it was not given externally.
-    if samples_mapping is None:
-        logging.info(' > loading indexed mapping from {}'.format(indexmap_filename))
-        start_time = time.time()
-        samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
-        logging.info(' loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
-        logging.info(' total number of samples: {}'.format(samples_mapping.shape[0]))
-
-    # Deallocate temporary numpy arrays that were created for `get_samples_mapping()` when needed
-    if hasattr(indexed_dataset, 'doc_idx') and hasattr(indexed_dataset, 'sizes'):
-        deallocate_indexed_dataset_memory(indexed_dataset)
-
-    return samples_mapping
diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py
deleted file mode 100644
index e86eb73add749a39dc4f314b8ce36420ebd0810a..0000000000000000000000000000000000000000
--- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py
+++ /dev/null
@@ -1,824 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
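The deleted `get_samples_mapping()` helper above follows a common pattern for cached index maps: rank 0 builds the samples-mapping `.npy` file if it is missing, every rank waits on a barrier, and then all ranks memory-map the cached file. Below is a minimal, self-contained sketch of that pattern, assuming a simplified filename scheme; `index_map_filename`, `load_or_build_samples_mapping`, and `build_fn` are illustrative names rather than NeMo APIs, and the real helper additionally verifies the world size with all-reduces and delegates the heavy lifting to the compiled `helpers.build_mapping` extension.

```python
import os

import numpy as np
import torch


def index_map_filename(data_prefix, name, max_num_samples, max_seq_length, short_seq_prob, seed):
    # Encode the parameters that determine the mapping into the cache filename,
    # mirroring the scheme used above (the epoch-count component is omitted for brevity).
    return f"{data_prefix}_{name}_indexmap_{max_num_samples}mns_{max_seq_length}msl_{short_seq_prob:0.2f}ssp_{seed}s.npy"


def load_or_build_samples_mapping(filename, build_fn):
    distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
    rank = torch.distributed.get_rank() if distributed else 0

    # Only rank 0 pays the cost of building the mapping and writing it to disk.
    if rank == 0 and not os.path.isfile(filename):
        np.save(filename, build_fn(), allow_pickle=True)

    # All other ranks wait here until the cache file exists.
    if distributed:
        torch.distributed.barrier()

    # mmap_mode='r' keeps the (potentially large) mapping out of each rank's RAM.
    return np.load(filename, allow_pickle=True, mmap_mode='r')
```

In a single-process run, for example, `load_or_build_samples_mapping(index_map_filename('/data/corpus', 'train', 1000, 512, 0.1, 1234), build_fn=lambda: np.zeros((1000, 3), dtype=np.int64))` would build and cache a placeholder mapping on the first call and memory-map it on later calls.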
- -"""GPT style dataset.""" - -import os -import time - -import numpy as np -import torch -from omegaconf.dictconfig import DictConfig - -from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( - get_datasets_weights_and_num_samples, - get_train_valid_test_split_, -) -from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import deallocate_indexed_dataset_memory -from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset as make_indexed_dataset -from nemo.core import Dataset -from nemo.utils import logging - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False - - -def build_dataset(cfg, trainer, data_prefix, data_impl, num_samples, seq_length, seed, skip_warmup, tokenizer, name): - def _build_dataset(current_data_prefix, current_num_samples): - delay_data_mmap = cfg.data.get('delay_data_mmap', False) - indexed_dataset = get_indexed_dataset_(current_data_prefix, data_impl, skip_warmup, delay_data_mmap) - total_num_of_documents = indexed_dataset.sizes.shape[0] - # Print stats about the splits. - logging.info(' > dataset split:') - logging.info(' Total {} documents is : {} '.format(name, total_num_of_documents)) - drop_last = True - if name == "valid": - drop_last = cfg.data.get("validation_drop_last", True) - dataset = GPTDataset( - cfg, - trainer, - tokenizer, - name, - current_data_prefix, - np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32), - indexed_dataset, - current_num_samples, - seq_length, - seed, - drop_last=drop_last, - ) - return dataset - - if len(data_prefix) == 1: - return _build_dataset(data_prefix[0], num_samples) - - else: - output = get_datasets_weights_and_num_samples(data_prefix, num_samples) - prefixes, weights, datasets_num_samples = output - datasets = [] - for i in range(len(prefixes)): - dataset = _build_dataset(prefixes[i], datasets_num_samples[i]) - datasets.append(dataset) - return BlendableDataset(datasets, weights, num_samples) - - -def build_train_valid_test_datasets( - cfg, - trainer, - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - tokenizer, -): - if data_impl in ['mock']: - logging.info('Initializing mock GPT dataset for train, validate, and test') - if data_prefix is not None and len(data_prefix) != 0: - # Mock data will be generated instead of loading files. - logging.warning(f"Requested data_impl={data_impl}, so ignoring data_prefix setting: {data_prefix}") - if tokenizer is None: - # Vocabulary size is inferred from tokenizer. - raise ValueError("Tokenizer is required for a mock GPT dataset") - train_ds = MockGPTDataset(cfg, tokenizer, "train", int(train_valid_test_num_samples[0]), seq_length, seed,) - valid_ds = MockGPTDataset(cfg, tokenizer, "valid", int(train_valid_test_num_samples[1]), seq_length, seed,) - test_ds = MockGPTDataset(cfg, tokenizer, "test", int(train_valid_test_num_samples[2]), seq_length, seed,) - return train_ds, valid_ds, test_ds - - if isinstance(data_prefix, DictConfig): - assert ( - data_prefix.get('train') is not None - and data_prefix.get('test') is not None - and data_prefix.get('validation') is not None - ), f"Data prefix dictionary should have train, test and validation keys. 
data_prefix currently has only {data_prefix.keys()}" - if cfg.data.splits_string is not None: - logging.warning(cfg.data.splits_string + " ignored since data prefix is of type dictionary.") - train_ds = build_dataset( - cfg, - trainer, - data_prefix["train"], - data_impl, - int(train_valid_test_num_samples[0]), - seq_length, - seed, - skip_warmup, - tokenizer, - "train", - ) - validation_ds = build_dataset( - cfg, - trainer, - data_prefix["validation"], - data_impl, - int(train_valid_test_num_samples[1]), - seq_length, - seed, - skip_warmup, - tokenizer, - "valid", - ) - test_ds = build_dataset( - cfg, - trainer, - data_prefix["test"], - data_impl, - int(train_valid_test_num_samples[2]), - seq_length, - seed, - skip_warmup, - tokenizer, - "test", - ) - return train_ds, validation_ds, test_ds - - else: - # Single dataset. - if len(data_prefix) == 1: - return _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix[0], - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - tokenizer, - ) - - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - cfg, - trainer, - prefixes[i], - data_impl, - splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, - skip_warmup, - tokenizer, - ) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) - - return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) - - -def _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - tokenizer, -): - """Build train, valid, and test datasets.""" - - # Indexed dataset. - delay_data_mmap = cfg.data.get('delay_data_mmap', False) - indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup, delay_data_mmap) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) - ) - - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - drop_last = True - if name == "valid": - drop_last = cfg.data.get("validation_drop_last", True) - dataset = GPTDataset( - cfg, - trainer, - tokenizer, - name, - data_prefix, - documents, - indexed_dataset, - train_valid_test_num_samples[index], - seq_length, - seed, - drop_last=drop_last, - ) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup, delay_data_mmap=False): - """Build indexed dataset.""" - logging.info(' > building dataset index ...') - - start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup, delay_data_mmap=delay_data_mmap) - logging.info(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time)) - logging.info(' number of documents: {}'.format(indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -class GPTDataset(Dataset): - def __init__( - self, - cfg, - trainer, - tokenizer, - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - drop_last=True, - ): - if not HAVE_MEGATRON_CORE: - raise ImportError( - "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - - super().__init__() - self.name = name - self.indexed_dataset = indexed_dataset - self.drop_last = drop_last - self.seq_length = seq_length - - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] - - self.reset_position_ids = cfg.data.get('reset_position_ids', False) - self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) - self.eod_mask_loss = cfg.data.get('eod_mask_loss', False) - self.create_inputs = any([self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss]) - self.cached_inputs = False - self.eos_id = tokenizer.eos_id - self.no_seqlen_plus_one_input_tokens = cfg.data.get('no_seqlen_plus_one_input_tokens', False) - self.add_extra_token = 1 - if self.no_seqlen_plus_one_input_tokens: - self.add_extra_token = 0 - self.shuffle_documents = cfg.data.get('shuffle_documents', True) - self.exchange_indices_distributed = cfg.data.get('exchange_indices_distributed', False) - - # save index mappings to a configurable dir - self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) - - # create index_mapping_dir on rank 0 - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): - os.makedirs(self.index_mapping_dir) - torch.distributed.barrier() - - # Build index mappings. 
- self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, - data_prefix, - documents, - self.indexed_dataset.sizes, - num_samples, - seq_length, - seed, - index_mapping_dir=self.index_mapping_dir, - drop_last=drop_last, - add_extra_token=self.add_extra_token, - shuffle_documents=self.shuffle_documents, - exchange_indices_distributed=self.exchange_indices_distributed, - ) - deallocate_indexed_dataset_memory(self.indexed_dataset) - - def create_data_mmap(self): - self.indexed_dataset.create_data_mmap() - - def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 - - def _get_text(self, idx: int) -> np.ndarray: - - # Get the shuffled index. - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get( - self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + self.add_extra_token - ) - else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. - sample_list.append( - self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + self.add_extra_token) - ) - sample = np.concatenate(sample_list) - if len(sample) != (self.seq_length + self.add_extra_token): - logging.info( - F' > WARNING: Got sample of length: {len(sample)} for sequence length={self.seq_length+self.add_extra_token}, padding the sample to match sequence length' - ) - sample = np.array(sample, dtype=np.int64) - sample = np.pad( - sample, (0, self.seq_length + self.add_extra_token - len(sample)), mode='constant', constant_values=-1 - ) - return sample.astype(np.int64) - - def __getitem__(self, idx): - text = torch.from_numpy(self._get_text(idx)) - if self.add_extra_token: - tokens = text[:-1].contiguous() - labels = text[1:].contiguous() - else: - tokens = text - labels = torch.roll(text, shifts=-1, dims=0) - labels[-1] = -1 - if self.create_inputs or not self.cached_inputs: - attention_mask, loss_mask, position_ids = _create_ltor_masks_and_position_ids( - tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss, - ) - if not self.create_inputs: - self.cached_attention_mask = attention_mask - self.cached_loss_mask = loss_mask - self.cached_position_ids = position_ids - self.cached_inputs = True - else: - attention_mask = self.cached_attention_mask - loss_mask = self.cached_loss_mask - position_ids = self.cached_position_ids - loss_mask[labels == -1] = 0.0 - tokens[tokens == -1] = 0 - labels[labels == -1] = 0 - - # Negative index comes when we pad the last batch in MegatronPretrainingBatchSampler - # We make the loss_mask zero to mask out loss from these samples - if idx < 0: - logging.debug('Got negative index. 
Masking loss from this sample')
-            loss_mask = torch.zeros_like(loss_mask)
-
-        return {
-            'tokens': tokens,
-            'labels': labels,
-            'attention_mask': attention_mask,
-            'loss_mask': loss_mask,
-            'position_ids': position_ids,
-        }
-
-
-class MockGPTDataset(Dataset):
-    def __init__(
-        self, cfg, tokenizer, name, num_samples, seq_length, seed,
-    ):
-        if not HAVE_MEGATRON_CORE:
-            raise ImportError(
-                "Megatron core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
-            )
-
-        super().__init__()
-        self.name = name
-        self.seq_length = seq_length
-        self.vocab_size = tokenizer.vocab_size
-        self.length = num_samples
-        self.seed = seed
-
-        self.attention_mask = torch.tril(torch.ones((self.seq_length, self.seq_length))).unsqueeze(0)
-        self.attention_mask = self.attention_mask < 0.5
-        self.loss_mask = torch.ones(self.seq_length, dtype=torch.float)
-        self.position_ids = torch.arange(self.seq_length, dtype=torch.int64)
-
-    def __len__(self):
-        return self.length
-
-    def _get_text(self, idx: int) -> np.ndarray:
-        np_gen = np.random.default_rng(seed=(self.seed + idx))
-        return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)
-
-    def __getitem__(self, idx):
-        # Generate data of the expected size and datatype (based on GPTDataset).
-        np_gen = np.random.default_rng(seed=(self.seed + idx))
-        tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64))
-        labels = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64))
-
-        return {
-            'tokens': tokens,
-            'labels': labels,
-            'attention_mask': self.attention_mask,
-            'loss_mask': self.loss_mask,
-            'position_ids': self.position_ids,
-        }
-
-
-@torch.no_grad()
-def _create_ltor_masks_and_position_ids(
-    tokens: torch.Tensor, eod_token: int, reset_position_ids: bool, reset_attention_mask: bool, eod_mask_loss: bool,
-):
-    """Create `attention_mask`, `loss_mask`, and `position_ids`.
-
-    This function is a modified version of :func:`get_ltor_masks_and_position_ids` in
-    nemo/collections/nlp/modules/common/megatron/utils.py:
-    `get_ltor_masks_and_position_ids` assumes a microbatch of ``tokens``, i.e. a 2D tensor, while
-    this function assumes ``tokens`` to be a 1D tensor.
-
-    Args:
-        tokens: A 1D tensor that holds the indices of tokens.
-        eod_token: Index of the end-of-document (EOD) token.
-        reset_position_ids: Whether to restart position ids after each EOD token.
-        reset_attention_mask: Whether to block attention across EOD boundaries.
-        eod_mask_loss: Whether to zero out the loss at EOD tokens.
-    """
-    assert tokens.ndim == 1
-    seq_length = tokens.numel()
-    # `attention_mask` has the shape of [1, seq_length, seq_length]
-    attention_mask = torch.tril(torch.ones((seq_length, seq_length))).unsqueeze(0)
-    loss_mask = torch.ones(seq_length, dtype=torch.float)
-    if eod_mask_loss:
-        loss_mask[tokens == eod_token] = 0.0
-
-    position_ids = torch.arange(seq_length, dtype=torch.int64)
-    if reset_position_ids:
-        position_ids = position_ids.clone()
-
-    if reset_position_ids or reset_attention_mask:
-        # Find indices where EOD token is. `tokens` is 1D here, so no batch index is needed.
-        eod_index = position_ids[tokens == eod_token]
-        # Detach indices from positions if going to modify positions.
-        if reset_position_ids:
-            eod_index = eod_index.clone()
-        prev_index = 0
-        for j in range(eod_index.numel()):
-            i = eod_index[j]
-            if reset_attention_mask:
-                attention_mask[0, (i + 1) :, : (i + 1)] = 0
-            if reset_position_ids:
-                position_ids[(i + 1) :] -= i + 1 - prev_index
-                prev_index = i + 1
-    # Convert attention mask to binary.
- attention_mask = attention_mask < 0.5 - return attention_mask, loss_mask, position_ids - - -def _build_index_mappings( - name, - data_prefix, - documents, - sizes, - num_samples, - seq_length, - seed, - index_mapping_dir: str = None, - drop_last: bool = True, - add_extra_token: int = 1, - shuffle_documents: bool = True, - exchange_indices_distributed: bool = False, -): - """Build doc-idx, sample-idx, and shuffle-idx. - doc-idx: is an array (ordered) of documents to be used in training. - sample-idx: is the start document index and document offset for each - training sample. - shuffle-idx: maps the sample index into a random index into sample-idx. - """ - # Number of tokens in each epoch and number of required epochs. - tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples, add_extra_token) - # rng state - np_rng = np.random.RandomState(seed=seed) - - # Filename of the index mappings. - if index_mapping_dir is not None: - _filename = os.path.join(index_mapping_dir, os.path.basename(data_prefix)) - else: - _filename = data_prefix - _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) - _filename += '_{}sl'.format(seq_length) - _filename += '_{}s'.format(seed) - doc_idx_filename = _filename + '_doc_idx.npy' - sample_idx_filename = _filename + '_sample_idx.npy' - shuffle_idx_filename = _filename + '_shuffle_idx.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0: - using_cached_indices = True - if ( - (not os.path.isfile(doc_idx_filename)) - or (not os.path.isfile(sample_idx_filename)) - or (not os.path.isfile(shuffle_idx_filename)) - ): - using_cached_indices = False - logging.info(' > WARNING: could not find index map files, building ' 'the indices on rank 0 ...') - - # For the last epoch, decide whether include the entire epoch - # in the global shuffle or not. - - # If we need only one epoch, then separating last epoch does - # not mean anything. - if num_epochs == 1: - separate_last_epoch = False - print(' > only one epoch required, setting ' 'separate_last_epoch to False', flush=True) - - else: - # Get the number of samples for the last epoch - num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - add_extra_token - ) // seq_length - last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.' - num_samples_per_epoch = (tokens_per_epoch - add_extra_token) // seq_length - assert last_epoch_num_samples <= ( - num_samples_per_epoch + 1 - ), 'last epoch number of samples exceeded max value.' - # If we have less than 80% of the samples for the last epoch, - # seperate out the epoch and treat it differently. - # Note: the 80% number is just based on common sense and can - # be adjusted if needed. - separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch) - if separate_last_epoch: - string = ( - ' > last epoch number of samples ({}) is smaller ' - 'than 80% of number of samples per epoch ({}), ' - 'setting separate_last_epoch to True' - ) - else: - string = ( - ' > last epoch number of samples ({}) is larger ' - 'than 80% of number of samples per epoch ({}), ' - 'setting separate_last_epoch to False' - ) - print(string.format(last_epoch_num_samples, num_samples_per_epoch), flush=True) - - # doc-idx. 
-            start_time = time.time()
-            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch, shuffle_documents)
-            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
-            logging.info(
-                ' > elapsed time to build and save doc-idx mapping '
-                '(seconds): {:4f}'.format(time.time() - start_time)
-            )
-            # sample-idx.
-            start_time = time.time()
-            # Use C++ implementation for speed.
-            # First compile and then import.
-            assert doc_idx.dtype == np.int32
-            assert sizes.dtype == np.int32
-            try:
-                from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper
-
-                compile_helper()
-                from nemo.collections.nlp.data.language_modeling.megatron import helpers
-            except ImportError:
-                raise ImportError(
-                    f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
-                )
-
-            sample_idx = helpers.build_sample_idx(
-                sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch, drop_last, add_extra_token
-            )
-            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
-            #                                num_epochs, tokens_per_epoch, drop_last, add_extra_token)
-            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
-            logging.info(
-                ' > elapsed time to build and save sample-idx mapping '
-                '(seconds): {:4f}'.format(time.time() - start_time)
-            )
-            # shuffle-idx.
-            start_time = time.time()
-            # -1 is due to data structure used to retrieve the index:
-            #     sample i --> [sample_idx[i], sample_idx[i+1])
-            if separate_last_epoch:
-                num_samples_ = num_samples_from_epochs_minus_one
-            else:
-                num_samples_ = sample_idx.shape[0] - 1
-            shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng)
-            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
-            logging.info(
-                ' > elapsed time to build and save shuffle-idx mapping'
-                ' (seconds): {:4f}'.format(time.time() - start_time)
-            )
-
-    torch.distributed.barrier()
-    counts = torch.cuda.LongTensor([1])
-    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
-    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
-    assert counts[0].item() == (
-        torch.distributed.get_world_size()
-        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group())
-    )
-
-    if not exchange_indices_distributed or (torch.distributed.get_rank() == 0 and using_cached_indices):
-        # Load mappings.
-        start_time = time.time()
-        logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename))
-        doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
-        logging.info(' > loading sample-idx mapping from {}'.format(sample_idx_filename))
-        sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
-        logging.info(' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename))
-        shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
-        logging.info(' loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
-        logging.info(' total number of samples: {}'.format(sample_idx.shape[0]))
-        logging.info(' total number of epochs: {}'.format(num_epochs))
-
-    if exchange_indices_distributed:
-        if torch.distributed.get_rank() == 0:
-            indices = [(doc_idx, sample_idx, shuffle_idx)]
-        else:
-            indices = [None]
-        torch.distributed.broadcast_object_list(indices)
-        doc_idx, sample_idx, shuffle_idx = indices[0]
-
-    return doc_idx, sample_idx, shuffle_idx
-
-
-def _num_tokens(documents, sizes):
-    """Total number of tokens in the dataset."""
-    return np.sum(sizes[documents])
-
-
-def _num_epochs(tokens_per_epoch, seq_length, num_samples, add_extra_token=1):
-    """Based on number of samples and sequence length, calculate how many
-    epochs will be needed."""
-    num_epochs = 0
-    total_tokens = 0
-    while True:
-        num_epochs += 1
-        total_tokens += tokens_per_epoch
-        # -1 is because we need to retrieve seq_length + 1 token each time
-        # but the last token will overlap with the first token of the next
-        # sample except for the last sample.
-        if ((total_tokens - add_extra_token) // seq_length) >= num_samples:
-            return num_epochs
-
-
-def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch, shuffle=True):
-    """Build an array with length = number-of-epochs * number-of-documents.
-    Each index is mapped to a corresponding document."""
-    if not separate_last_epoch or num_epochs == 1:
-        doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1]
-        doc_idx[:] = documents
-        doc_idx = doc_idx.reshape(-1)
-        doc_idx = doc_idx.astype(np.int32)
-        if shuffle:
-            np_rng.shuffle(doc_idx)
-        else:
-            logging.info('Document shuffling disabled')
-        return doc_idx
-
-    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False, shuffle)
-    doc_idx_last = _build_doc_idx(documents, 1, np_rng, False, shuffle)
-    return np.concatenate((doc_idx_first, doc_idx_last))
-
-
-def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch, drop_last=True, add_extra_token=1):
-    """Sample index mapping is a 2D array with sizes
-    [number-of-samples + 1, 2] where [..., 0] contains
-    the index into `doc_idx` and [..., 1] is the
-    starting offset in that document."""
-
-    # Total number of samples. For -1 see comments in `_num_epochs`.
-    if not drop_last:
-        num_samples = -(-(num_epochs * tokens_per_epoch - add_extra_token) // seq_length)
-    else:
-        num_samples = (num_epochs * tokens_per_epoch - add_extra_token) // seq_length
-    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)
-
-    # Index into sample_idx.
-    sample_index = 0
-    # Index into doc_idx.
-    doc_idx_index = 0
-    # Beginning offset for each document.
-    doc_offset = 0
-    # Start with first document and no offset.
-    sample_idx[sample_index][0] = doc_idx_index
-    sample_idx[sample_index][1] = doc_offset
-    sample_index += 1
-    while sample_index <= num_samples:
-        # Start with a fresh sequence.
- remaining_seq_length = seq_length + add_extra_token - while remaining_seq_length != 0: - # Get the document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - # And add it to the current sequence. - remaining_seq_length -= doc_length - # If we have more than a full sequence, adjust offset and set - # remaining length to zero so we return from the while loop. - # Note that -1 here is for the same reason we have -1 in - # `_num_epochs` calculations. - if remaining_seq_length <= 0: - doc_offset += remaining_seq_length + doc_length - add_extra_token - remaining_seq_length = 0 - else: - # Otherwise, start from the begining of the next document. - if doc_idx_index == (len(doc_idx) - 1): - assert ( - sample_index == num_samples - ), F"sample_index={sample_index} and num_samples={num_samples} should be the same" - doc_offset = sizes[doc_idx[doc_idx_index]] - add_extra_token - break - doc_idx_index += 1 - doc_offset = 0 - # Record the sequence. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - return sample_idx - - -def _build_shuffle_idx(num_samples, total_size, np_rng): - """Build the range [0, size) and shuffle.""" - print( - ' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), - flush=True, - ) - - dtype_ = np.uint32 - if total_size >= (np.iinfo(np.uint32).max - 1): - dtype_ = np.int64 - - shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_first) - if num_samples == total_size: - return shuffle_idx_first - - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_last) - - return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py deleted file mode 100644 index 4b1b4f61d4391a9f774c4c3cdcbe30ef88440b0c..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import pickle - -import torch -from tqdm.auto import tqdm - -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids -from nemo.core import Dataset -from nemo.utils import AppState, logging - -__all__ = ['GPTPromptLearningDataset'] - - -class GPTPromptLearningDataset(Dataset): - """ - The dataset class for prompt-tuning or p-tuning pretrained GPT models. 
- - Args: - data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example - tokenizer (tokenizer): Tokenizer from frozen language model - virtual_prompt_source (Enum): Either VirtualPromptSource.NO_PROMPTS or VirtualPromptSource.PROMPT_ENCODER - task_templates (dict): Dictionary containing all task template information needed to format prompts. Created in the GPTPromptLearningModel class. - pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens - pad_token_id (int): ID of pad token from tokenizer - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - for_train (bool): Whether you're creating a dataset for training or inference - tokens_to_generate (int): (inference only) Number of tokens to generate during inference - """ - - def __init__( - self, - data, - tokenizer, - virtual_prompt_source: VirtualPromptSource, - task_templates: dict, - pseudo_tokens, - pad_token_id: int, - max_seq_length: int, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - for_train: bool = True, - tokens_to_generate=None, - cache_data_path: str = None, # the cache file - load_cache: bool = True, # whether to load from the cache if it is available - ): - self.tokenizer = tokenizer - self.virtual_prompt_source = virtual_prompt_source - self.task_templates = task_templates - self.pseudo_tokens = pseudo_tokens - self.pseudo_token_ids = set(self.tokenizer.tokens_to_ids(self.pseudo_tokens)) - self.pad_token_id = pad_token_id - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.for_train = for_train - self.examples = [] - - if not self.for_train: - self.tokens_to_generate = tokens_to_generate - - assert self.min_seq_length <= max_seq_length, "Min sequence length should be less than or equal to max" - assert self.max_seq_length > 0, "Max sequence length should be greater than 0" - - logging.info("Loading and tokenizing dataset ... 
") - - if load_cache and cache_data_path is not None and os.path.exists(cache_data_path): - # load it from the cache - logging.info(f'load the data from the cache file {cache_data_path}') - with open(cache_data_path, 'rb') as f: - self.examples = pickle.load(f) - else: - # Data is just a list of dicts already loaded from a json file or passed in directly as a dict - if isinstance(data[0], dict): - self.load_data(data) - - # Datasets are a list of file path strings to .json or .jsonl files - elif isinstance(data[0], str): - for path in data: - dataset = open(path, 'r', encoding='utf-8') - self.load_data(dataset) - else: - raise ValueError("Datasets must be a list of filepath strings or a list of data example dicts") - if cache_data_path is not None: - # the first worker save the results into the cache file - app_state = AppState() - if app_state._global_rank == 0: - with open(cache_data_path, 'wb') as f: - pickle.dump(self.examples, f) - logging.info(f'save the data to the cache file {cache_data_path}') - - def load_data(self, dataset): - """ - Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. - - params: - dataset: A list of json objects or a dictionary objects each - containing the information needed for a training example - """ - skipped = 0 - - for json_line in tqdm(dataset): - - # Read example dict or load the information for a single example from .json file - if type(json_line) == dict: - doc = json_line - else: - doc = json.loads(json_line) - - taskname = doc["taskname"] - prompt_template = self.task_templates[taskname]["prompt_template"] - prompt_template_fields = self.task_templates[taskname]["prompt_template_fields"] - total_virtual_tokens = self.task_templates[taskname]["total_virtual_tokens"] - virtual_token_splits = self.task_templates[taskname]["virtual_token_splits"] - truncation_field = self.task_templates[taskname]['truncate_field'] - answer_only_loss = self.task_templates[taskname]["answer_only_loss"] - answer_field = self.task_templates[taskname]["answer_field"] - - input_example = prompt_template - - self._input_sanity_checks( - total_virtual_tokens, - virtual_token_splits, - prompt_template, - prompt_template_fields, - truncation_field, - answer_only_loss, - answer_field, - doc, - ) - - # Format the input example according to the template - input_example = self._insert_text_in_template(input_example, prompt_template_fields, doc) - input_example = self._insert_virtual_token_placeholders(input_example, virtual_token_splits) - input_ids = self.tokenizer.text_to_ids(input_example) - - # Add BOS/EOS if desired, adds EOS by default - if self.add_bos: - input_ids = [self.tokenizer.bos_id] + input_ids - if self.add_eos: - input_ids = input_ids + [self.tokenizer.eos_id] - - # Try to truncate input text to fit into the max sequence length - if len(input_ids) > self.max_seq_length: - input_ids = self._truncate_input( - truncation_field, - input_ids, - taskname, - doc, - prompt_template, - prompt_template_fields, - virtual_token_splits, - ) - - # Skip example if the final length doesn't fit length requirements even after truncation - if self.min_seq_length <= len(input_ids) <= self.max_seq_length: - if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER: - taskname_id = self.tokenizer.text_to_ids(taskname) - elif 
self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT: - taskname_id = -1 - else: - raise ValueError("Invalid virtual prompt source specified") - - # Find answer field indices if training and answer_only_loss is True - answer_start_idx = None - if answer_only_loss and self.for_train: - answer_start_idx = self._find_answer_start(taskname, input_ids, answer_field, doc) - - self.examples.append((taskname_id, input_ids, answer_start_idx)) - else: - skipped += 1 - - logging.info(f'Skipped {skipped} sentences, sequence length too short or too long even after truncation') - - def _input_sanity_checks( - self, - total_virtual_tokens, - virtual_token_splits, - prompt_template, - prompt_template_fields, - truncation_field, - answer_only_loss, - answer_field, - doc, - ): - # Sanity check amount of virtual token - assert ( - total_virtual_tokens < self.max_seq_length - ), "virtual prompt tokens should not exceed max sequence length" - - # Make sure virtual token splits add up to the total number of virtual tokens - assert ( - sum(virtual_token_splits) == total_virtual_tokens - ), "Sum of prompt token split values must equal total number of prompt tokens" - - # Make sure number of virtual prompt locations match the number of virtual prompt splits - assert prompt_template.count('<|VIRTUAL_PROMPT_') == len( - virtual_token_splits - ), "The number of '<|VIRTUAL_PROMPT_n|>' markers and the number of prompt token splits must match" - - # Check if input example has fields not present in template - keys_not_in_template = list(set(doc.keys()) - set(prompt_template_fields) - set(['taskname'])) - assert ( - len(keys_not_in_template) == 0 - ), f"Examples in your dataset contain the fields: {keys_not_in_template} that are not in the task template." - - # Answer field checks - if answer_only_loss and self.for_train: - assert answer_field is not None, "If answer_only_loss=True, an answer_field must be given" - assert ( - answer_field in doc.keys() - ), f"answer_only_loss=True but the given answer_field '{answer_field}' is not in data json" - assert truncation_field != answer_field, "Answer field and truncation field should not match" - - answer_placeholder = "{" + answer_field + "}" - answer_placeholder_len = len(answer_placeholder) - placeholder_start = len(prompt_template) - answer_placeholder_len - assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" - - def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ - for field in prompt_template_fields: - if field in doc.keys(): - field_text = doc[field] - input_example = input_example.replace('{' + field + '}', field_text) - - # If some fields from the template aren't present, e.g. 
{answer} during inference - # just remove that field from the template, leaving the space blank - else: - input_example = input_example.replace('{' + field + '}', "") - - return input_example.strip(" ") - - def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ - total_inserted_tokens = 0 - - for idx in range(len(virtual_token_splits)): - split_start = total_inserted_tokens - split_end = total_inserted_tokens + virtual_token_splits[idx] - pseudo_tokens_for_split = "".join(self.pseudo_tokens[split_start:split_end]) - input_example = input_example.replace(f'<|VIRTUAL_PROMPT_{idx}|>', pseudo_tokens_for_split) - total_inserted_tokens = split_end - - return input_example - - def _truncate_input( - self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits - ): - """ Try to truncate input text to fit into the max sequence length """ - logging.info( - f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" - ) - - # Truncate the text ids in this part of input to try and fit max sequence length - if truncation_field is not None and truncation_field in doc.keys(): - truncation_length = (len(input_ids) - self.max_seq_length) + 1 - field_text = doc[truncation_field] - - # Truncate field text - field_text_ids = self.tokenizer.text_to_ids(field_text) - truncated_text_ids = field_text_ids[: -min(truncation_length, len(field_text_ids))] - truncated_field_text = self.tokenizer.ids_to_text(truncated_text_ids) - doc[truncation_field] = truncated_field_text - - # Re-insert the truncated text string into the text prompt - input_example = prompt_template - input_example = self._insert_text_in_template(input_example, prompt_template_fields, doc) - input_example = self._insert_virtual_token_placeholders(input_example, virtual_token_splits) - - # Re-tokenize the whole prompt - input_ids = self.tokenizer.text_to_ids(input_example) - - return input_ids - - def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. 
-        """
-        answer_text = doc[answer_field]
-        answer_text = self._add_leading_space(taskname, answer_field, answer_text)
-        answer_text_ids = self.tokenizer.text_to_ids(answer_text)
-        num_answer_text_ids = len(answer_text_ids)
-
-        if self.add_eos:
-            num_answer_text_ids += 1
-
-        answer_start_idx = len(input_ids) - num_answer_text_ids
-
-        return answer_start_idx
-
-    def _add_leading_space(self, taskname, field_name, field_text):
-        """ Add a leading space to the text if there is a space before it in the template """
-        prompt_template = self.task_templates[taskname]["prompt_template"]
-        field_text_start = prompt_template.find("{" + field_name + "}")
-        if field_text_start != 0 and prompt_template[field_text_start - 1] == " ":
-            field_text = " " + field_text
-
-        return field_text
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, idx):
-        return self.examples[idx]
-
-    def _ceil_to_nearest(self, n, m):
-        return (n + m - 1) // m * m
-
-    def collate_fn(self, batch, tp_workers=0):
-        """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for the global batch """
-        taskname_ids, input_ids, answer_starts = zip(*batch)
-
-        # Pad taskname_ids to the same length for the prompt encoder
-        if self.virtual_prompt_source == VirtualPromptSource.PROMPT_ENCODER:
-            max_taskname_length = max(len(ids) for ids in taskname_ids)
-            taskname_ids = [ids + [self.pad_token_id] * (max_taskname_length - len(ids)) for ids in taskname_ids]
-            taskname_ids = torch.tensor(taskname_ids)
-
-        # Task ids are just used to look up embeddings in the prompt table
-        elif self.virtual_prompt_source == VirtualPromptSource.NO_PROMPT:
-            taskname_ids = torch.tensor(taskname_ids)
-
-        # Get the max sequence length of the batch
-        batch_max = max(len(ids) for ids in input_ids)
-
-        if tp_workers > 1:
-            # Make sure the sequence length is a multiple of the number of tp_workers, needed for sequence parallelism.
-            resi_padding = (tp_workers - (batch_max - 1) % tp_workers) % tp_workers
-        else:
-            resi_padding = 0
-        batch_max += resi_padding
-        ceil_batch_max = self._ceil_to_nearest(
-            batch_max, 8
-        )  # @adithyare this padding does not conflict with the tp_workers padding above,
-        # since tp_workers is always a multiple of 2. Padding to a multiple of 8 ensures a memory-optimized softmax is used.
-        batch_max = ceil_batch_max + 1
-        input_ids, loss_mask = self.pad_batch_and_build_loss_mask(input_ids, batch_max, answer_starts)
-        # There should be a label for every token in the batch; the label is the next token
-        labels = input_ids[:, 1:].contiguous()
-        input_ids = input_ids[:, :-1].contiguous()
-        batch_max -= 1  # @adithyare this decrement accounts for the two lines above, which remove one position from the input_ids sequence.
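An aside to make the padding and label-shift arithmetic in `collate_fn` above concrete: the minimal sketch below uses toy token ids, and `ceil_to_nearest` is a hypothetical stand-in for `_ceil_to_nearest`; it is not part of the original file.

```python
# Minimal sketch of the collate_fn length arithmetic, assuming toy token id lists.

def ceil_to_nearest(n, m):
    # Round n up to the nearest multiple of m, mirroring _ceil_to_nearest above.
    return (n + m - 1) // m * m

batch = [[5, 6, 7], [5, 6, 7, 8, 9]]           # toy token ids, no real tokenizer involved
batch_max = max(len(ids) for ids in batch)      # 5
batch_max = ceil_to_nearest(batch_max, 8) + 1   # 9: pad to a multiple of 8, plus one extra position

# After padding every example to batch_max, inputs and labels are built by shifting:
# inputs drop the last position, labels drop the first (next-token targets).
padded = [ids + [0] * (batch_max - len(ids)) for ids in batch]
inputs = [ids[:-1] for ids in padded]           # length 8 each
labels = [ids[1:] for ids in padded]            # length 8 each
print(len(inputs[0]), len(labels[0]))           # 8 8
```

This is why the effective sequence length ends up one shorter than the padded length.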
- - # Loss mask should align with labels - loss_mask = loss_mask[:, 1:].contiguous() - - # Using causal attention mask for whole input - batch_size = len(input_ids) - attention_mask = torch.tril(torch.ones((batch_size, batch_max, batch_max))).view( - batch_size, 1, batch_max, batch_max - ) - - # Convert attention mask from float to bool - attention_mask = attention_mask < 0.5 - position_ids = build_position_ids(input_ids) - - return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids - - def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ - batch_loss_masks = [] - padded_input_ids = [] - for ids, answer_start_idx in zip(input_ids, answer_starts): - if answer_start_idx is not None: - # Loss mask where answer tokens are 1.0 and all other tokens are 0.0 - loss_mask = [float(idx >= answer_start_idx) for idx in range(len(ids))] - else: - # Loss mask where virtual tokens are 0.0 and all other tokens are 1.0 - loss_mask = [float(token_id not in self.pseudo_token_ids) for token_id in ids] - - # Pad to max length - input_length = len(ids) - padding_length = batch_max - input_length - pad_extend = [self.pad_token_id] * padding_length - ids = ids + pad_extend - padded_input_ids.append(ids) - - # Account for padding in loss mask - loss_mask.extend([0.0] * padding_length) - batch_loss_masks.append(torch.tensor(loss_mask, dtype=torch.float)) - - # Make into torch tensors - padded_input_ids = torch.tensor(padded_input_ids, dtype=torch.long) - batch_loss_masks = torch.stack(batch_loss_masks) - - return padded_input_ids, batch_loss_masks - - def inference_collate_fn(self, batch): - """ - Used for loading inference data. - """ - task_id_nums, input_ids, answer_starts = zip(*batch) - input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) - task_id_nums = torch.cuda.LongTensor(task_id_nums) - batch_max = input_lengths.max().item() - batch_max += self.tokens_to_generate - - input_ids, _ = self.pad_batch_and_build_loss_mask(input_ids, batch_max, answer_starts) - input_ids = input_ids.cuda() - input_ids = torch.cuda.LongTensor(input_ids) - - return task_id_nums, (input_ids, input_lengths) diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py deleted file mode 100644 index 96cc57a300b863d052ce50dbcb4b72e73850e340..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
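As a quick illustration of the loss-mask rules implemented in `pad_batch_and_build_loss_mask` above, the following self-contained sketch uses made-up token ids, `pad_token_id = 0`, and a hypothetical set of pseudo (virtual prompt) token ids; it is an illustration, not the library code.

```python
# Sketch of the loss-mask construction, under the stated toy assumptions.
pad_token_id = 0
pseudo_token_ids = {90, 91}

def build_loss_mask(ids, answer_start_idx, batch_max):
    if answer_start_idx is not None:
        # Only answer tokens contribute to the loss.
        mask = [float(i >= answer_start_idx) for i in range(len(ids))]
    else:
        # All non-virtual-prompt tokens contribute to the loss.
        mask = [float(t not in pseudo_token_ids) for t in ids]
    # Padding never contributes to the loss.
    mask.extend([0.0] * (batch_max - len(ids)))
    return mask

ids = [90, 91, 11, 12, 13]                      # two virtual tokens followed by text tokens
print(build_loss_mask(ids, answer_start_idx=3, batch_max=8))
# [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]
print(build_loss_mask(ids, answer_start_idx=None, batch_max=8))
# [0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0]
```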
- -import copy - -import torch - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset -from nemo.utils import logging - -__all__ = ['GPTSFTChatDataset', 'get_prompt_template_example'] - - -PREFIX_STR = ( - "\x00" # the prefix string used in the tokenizer to deal with the added empty token for some of the tokenizers -) - -IGNORE_INDEX = -100 -SYSTEM_TOKEN = "System" - -TYPE_INSTRUCTION = { - 'TEXT_TO_VALUE': "", - 'VALUE_TO_TEXT': '', -} - - -def _get_header_conversation_type_mask_role(source, special_tokens): - END_SIGNAL = special_tokens['end_of_turn'] - END_NAME_SIGNAL = special_tokens['end_of_name'] - - data_type = None - if 'type' in source: - data_type = source['type'] - if data_type is not None: - assert data_type in TYPE_INSTRUCTION, f"source type {data_type} not supported" - # add end signal and concatenate together - conversation = source['system'] - if data_type is not None: - if TYPE_INSTRUCTION[data_type] != '': - conversation = conversation + '\n' + TYPE_INSTRUCTION[data_type] - mask_role = source.get('mask', 'User') - header = f"{special_tokens['system_turn_start']}{SYSTEM_TOKEN}{END_NAME_SIGNAL}{conversation}{END_SIGNAL}" - conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, data_type, special_tokens) - return header, conversation, data_type, mask_role - - -def get_prompt_template_example(special_tokens): - source = { - 'system': '{system message}', - 'conversations': [ - {'from': 'User', 'value': '{turn 1 user message}', 'label': None}, - {'from': 'Assistant', 'value': '{turn 1 assistant message}', 'label': '{turn 1 assistant label}'}, - {'from': 'User', 'value': '{turn 2 user message}', 'label': None}, - {'from': 'Assistant', 'value': '{turn 2 assistant message}', 'label': '{turn 2 assistant label}'}, - ], - "mask": "User", - "type": "VALUE_TO_TEXT", - } - _, conversation, _, _ = _get_header_conversation_type_mask_role(source, special_tokens) - return conversation - - -def identify_start_index_of_subsequence(subsequence, sequence): - """ find the location of the small tensor in the large tensor. - e.g. small = [1,3], large = [2,3,1,3], returns 2 - small = [3,2], large = [2,3,1,3], returns -1 - Args: - small (tensor): small tensor - large (tensor): large tensor - """ - for i in range(sequence.size(0) - subsequence.size(0) + 1): - if torch.equal(sequence[i : i + subsequence.size(0)], subsequence): - return i - return -1 - - -def _mask_targets( - target, - tokenized_lens, - speakers, - header_len, - s_ids, - tokenizer, - mask_role, - gtype, - name_end_token_ids, - special_tokens, - label_start_ids, - num_turn_start_tokens, -): - """ This function masks the tokens so the loss is computed only on the non-masked role's responses. - For 'TEXT_TO_VALUE' type, the loss is computed on the value attributes. - - Args: - target (Tensor): input ids - tokenized_lens (List[int]): array of lengths of each turns - speakers (List[str]): array of speakers of each turns - header_len (int): the system prompt length - s_ids (List[Tensor]): array of tokenized ids of each turns - tokenizer (TokenizerSpec): tokenizer object - mask_role (str): the speaker id to be masked from loss computation - gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT' - name_end_token_ids (int): end of name token ids - special_tokens (dict): special tokens used for the chat prompt. 
 It has the keys: system_turn_start, turn_start, label_start, end_of_turn
-        label_start_ids (list): list of label start token ids
-        num_turn_start_tokens (int): number of tokens in the turn_start string
-    """
-    TURN_TOKEN = special_tokens['turn_start']
-    END_NAME_SIGNAL = special_tokens['end_of_name']
-    label_start_ids = torch.tensor(label_start_ids)
-    name_end_token_ids = torch.tensor(name_end_token_ids)
-
-    cur_idx = header_len
-    tgt_len = target.shape[0]
-    for i, (tokenized_len, speaker, s_id) in enumerate(zip(tokenized_lens, speakers, s_ids)):
-        # note: SentencePiece adds an extra empty token in front, so compute the length difference
-        id1 = tokenizer.text_to_ids(PREFIX_STR)
-        id2 = tokenizer.text_to_ids(PREFIX_STR + TURN_TOKEN + speaker + END_NAME_SIGNAL)
-        skip_name_len = len(id2) - len(
-            id1
-        )  # s_ids[:skip_name_len] is the name part of the prompt 'TURN_TOKEN + speaker + END_NAME_SIGNAL'
-        # get the position of the label start string in this turn
-        location = identify_start_index_of_subsequence(label_start_ids, s_id)
-
-        if location >= 0:
-            # this turn contains the label start tokens
-            if gtype == 'VALUE_TO_TEXT':
-                # handles the case of conditioning on labels to generate the response:
-                # the next token after the name part of the prompt is the beginning of the label start tokens
-                assert skip_name_len == location
-                # find the first newline token after the label part, which marks the end of the whole label string
-                # newline_loc = torch.where((s_id[skip_name_len:] == name_end_token_ids))[0]
-                newline_loc = identify_start_index_of_subsequence(name_end_token_ids, s_id[skip_name_len:])
-                if newline_loc < 0:
-                    # cannot find a newline token, which means the whole turn is just a partial label string; mask the whole turn
-                    target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
-                    continue
-                # skip the label part and the newline token
-                more_skip_len = newline_loc + len(name_end_token_ids)
-                # skip the name part and the label part
-                skip_name_len += more_skip_len
-            elif gtype == 'TEXT_TO_VALUE':
-                # handles the case of conditioning on the response to generate the label:
-                # skip the name part, the response and the label start tokens; the remainder is the label string without the label start, e.g. 'quality:9,toxicity:8...'
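The `PREFIX_STR` bookkeeping above is easier to see with a toy tokenizer. The sketch below uses a hypothetical `toy_text_to_ids` (not the real SentencePiece tokenizer) to show how subtracting the two tokenizations isolates the ids of the turn header alone.

```python
# Hypothetical stand-in for tokenizer.text_to_ids: prepends an extra id (99),
# the way some SentencePiece models prepend an empty/underscore token.
def toy_text_to_ids(text):
    return [99] + [ord(c) for c in text]

PREFIX_STR = "\x00"
turn_header = "<turn>User\n"                     # TURN_TOKEN + speaker + END_NAME_SIGNAL, made up here

id1 = toy_text_to_ids(PREFIX_STR)                # [99, 0]
id2 = toy_text_to_ids(PREFIX_STR + turn_header)  # [99, 0, ...header ids...]
skip_name_len = len(id2) - len(id1)              # number of ids that belong to the header itself
print(skip_name_len == len(turn_header))         # True for this toy tokenizer
```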
- skip_name_len = location + len(label_start_ids) - if cur_idx >= tgt_len: - break - elif cur_idx + tokenized_len < tgt_len: - # Check whether the mask is applied to the correct position, the first token is turn start tokens - if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[1:]): - logging.warning("a sentence mismatches the corresponding piece " "in the conversation") - if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None): - # mask the first turn completely to provide at least one turn as context for the rest - target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE': - # leave the first turn start tag unmasked, servers severs as the end of turn signal - target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i > 1): - # leave the first turn start tag unmasked, which severs as the end of turn signal - target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i <= 1): - # mask out everything in the second turn - target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX - else: - # mask up to name part, label part for VALUE_TO_TEXT, or name part, response and label start tokens for TEXT_TO_VALUE, or just the name part if gtype is None - target[cur_idx : cur_idx + skip_name_len] = IGNORE_INDEX - cur_idx += tokenized_len - - -def response_value_formater(label, label_start, end_signal): - if isinstance(label, str): - return label_start + label + end_signal - elif label is None: - return '' - else: - raise ValueError(f'Unknown label type {type(label)}, only str type is supported') - - -def _add_speaker_and_signal(header, source, mask_role, gtype, special_tokens): - TURN_TOKEN = special_tokens['turn_start'] - END_SIGNAL = special_tokens['end_of_turn'] - LABEL_START = special_tokens['label_start'] - END_NAME_SIGNAL = special_tokens['end_of_name'] - - """Add speaker and start/end signal on each round.""" - BEGIN_SIGNAL = "" - conversation = header - for i, sentence in enumerate(source): - sentence_from = sentence["from"] - role_token = TURN_TOKEN - if gtype is None: - sentence["value"] = ( - BEGIN_SIGNAL + role_token + sentence_from + END_NAME_SIGNAL + sentence["value"] + END_SIGNAL - ) - elif gtype == "VALUE_TO_TEXT": - sentence["value"] = ( - BEGIN_SIGNAL - + role_token - + sentence_from - + END_NAME_SIGNAL - + ( - response_value_formater(sentence['label'], LABEL_START, END_NAME_SIGNAL) - if 'label' in sentence - else '' - ) - + sentence["value"] - + END_SIGNAL - ) - elif gtype == "TEXT_TO_VALUE": - sentence["value"] = ( - BEGIN_SIGNAL - + role_token - + sentence_from - + END_NAME_SIGNAL - + sentence["value"] - + END_SIGNAL - + ( - response_value_formater(sentence['label'], LABEL_START, END_NAME_SIGNAL) - if 'label' in sentence - else '' - ) - ) - else: - raise ValueError( - f"source type {gtype} not supported, only 'VALUE_TO_TEXT' and 'TEXT_TO_VALUE' are supported" - ) - conversation += sentence["value"] - # if the last turn is not masked, add next token start token to the end, which will be included for loss calculation - if sentence_from != mask_role and i == len(source) - 1: - conversation += TURN_TOKEN - return conversation - - -def preprocess( - source: dict, - tokenizer: TokenizerSpec, - name_end_token_ids: int, - label_start_ids: list, - special_tokens: dict, - num_turn_start_tokens: int, -): - """ - Given a conversation list. This transform: - 1. 
Add signal '### ' at the beginning each sentence, with end signal '\n'; - 2. Concatenate conversations together; - 3. Tokenize the concatenated conversation; - 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. - """ - header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role(source, special_tokens) - # tokenize conversations - input_ids = tokenizer.text_to_ids(conversation) - target = copy.deepcopy(input_ids) - header_tokens = tokenizer.text_to_ids(header) - header_len = len(header_tokens) - - ids = [] - tokenized_lens = [] - assert torch.equal(torch.tensor(target[:header_len]), torch.tensor(header_tokens)) - for s in source['conversations']: - # hack to remove the extra empty token in front - id1 = tokenizer.text_to_ids(PREFIX_STR + s["value"]) - id2 = tokenizer.text_to_ids(PREFIX_STR) - tokenized_sentence = id1[len(id2) :] - ids.append(torch.tensor(tokenized_sentence)) - tokenized_lens.append(len(tokenized_sentence)) - speakers = [sentence["from"] for sentence in source['conversations']] - assert mask_role in speakers, "mask role not in the conversation" - target = torch.LongTensor(target) - # not going to train on the header - target[:header_len] = IGNORE_INDEX - input_ids = torch.LongTensor(input_ids) - _mask_targets( - target, - tokenized_lens, - speakers, - header_len, - ids, - tokenizer, - mask_role, - data_type, - name_end_token_ids, - special_tokens, - label_start_ids, - num_turn_start_tokens, - ) - mask = (target != IGNORE_INDEX).bool() - assert mask.sum().item() != 0, "mask is empty" - # Choose the last conversation as answer other history are context - last_ignore_index_pos = torch.nonzero(target == IGNORE_INDEX)[-1].item() + 1 - context_ids = input_ids[:last_ignore_index_pos] - answer_ids = input_ids[last_ignore_index_pos:] - return dict(input_ids=input_ids, mask=mask, context_ids=context_ids, answer_ids=answer_ids) - - -class GPTSFTChatDataset(GPTSFTDataset): - def _maybe_validate_prompt_template(self): - pass - - def _build_samples_mapping(self): - super()._build_samples_mapping() - assert hasattr(self.tokenizer, "vocab"), "tokenizer should have vocab property, not supported" - LABEL_START = self.special_tokens['label_start'] - END_NAME_SIGNAL = self.special_tokens['end_of_name'] - - id1 = self.tokenizer.text_to_ids(PREFIX_STR) - id2 = self.tokenizer.text_to_ids(PREFIX_STR + LABEL_START) - self.label_start_tokens = id2[len(id1) :] - - id1 = self.tokenizer.text_to_ids(PREFIX_STR + END_NAME_SIGNAL) - id2 = self.tokenizer.text_to_ids(PREFIX_STR) - self.name_end_token_ids = id1[len(id2) :] - - id1 = self.tokenizer.text_to_ids(PREFIX_STR + self.special_tokens['turn_start']) - id2 = self.tokenizer.text_to_ids(PREFIX_STR) - self.num_turn_start_tokens = len(id1) - len(id2) - - def _process_example(self, example): - """ - Create an example by concatenating text and answer. - Truncation is carried out when needed, but it is performed only on the prompt side. - BOS, EOS, and SEP, are added if specified. 
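To see the conversation layout that `preprocess` consumes, one can render the reference template from `get_prompt_template_example` with a `special_tokens` mapping. The token strings below are placeholders chosen for illustration only; the real values come from the model configuration.

```python
special_tokens = {
    "system_turn_start": "<extra_id_0>",  # illustrative values, not the required ones
    "turn_start": "<extra_id_1>",
    "label_start": "<extra_id_2>",
    "end_of_turn": "\n",
    "end_of_name": "\n",
}

# Renders the two-turn example template defined in get_prompt_template_example,
# showing where the system prompt, user/assistant turns, and optional labels land.
print(get_prompt_template_example(special_tokens))
```

The rendered string is exactly the kind of text that `preprocess` tokenizes and then masks turn by turn.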
- """ - result = preprocess( - example, - self.tokenizer, - self.name_end_token_ids, - self.label_start_tokens, - self.special_tokens, - self.num_turn_start_tokens, - ) - - # store metadata in dataset, in case user may have keys required in the prediction json files - metadata = {k: v for k, v in example.items() if k not in ['conversations']} - result['metadata'] = metadata - - return result - - def collate_fn(self, batch): - input_ids = [item['input_ids'][:-1].tolist() for item in batch] - labels = [item['input_ids'][1:].tolist() for item in batch] - contexts = [item['context_ids'].tolist() for item in batch] - answers = [item['answer_ids'].tolist() for item in batch] - loss_mask = [item['mask'][1:].tolist() for item in batch] - metadata = [item['metadata'] for item in batch] - - max_length = max(max([len(x) for x in input_ids]), max([len(x) for x in contexts]) + self.tokens_to_generate) - if max_length > self.max_seq_length: - # truncate the sequences if it is longer than max_seq_length - input_ids = [x[: self.max_seq_length] for x in input_ids] - labels = [x[: self.max_seq_length] for x in labels] - loss_mask = [x[: self.max_seq_length] for x in loss_mask] - contexts = [x[: self.max_seq_length] for x in contexts] - answers = [x[: self.max_seq_length] for x in answers] - - # increase max length to nearest multiple of 4 or 8 - if self.pad_to_max_length: - max_length = self.max_seq_length - else: - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) - assert max_length <= self.max_seq_length - - attention_mask = [self._create_attention_mask(max_length) for _ in batch] - attention_mask = torch.stack(attention_mask) - position_ids = [list(range(max_length)) for _ in batch] - position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor( - self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) - ) - labels = torch.LongTensor(self._collate_item(labels, max_length=max_length, pad_id=self.tokenizer.eos_id)) - loss_mask = torch.LongTensor(self._collate_item(loss_mask, max_length=max_length, pad_id=0)) - context_lengths = torch.LongTensor([len(x) for x in contexts]) - contexts = torch.LongTensor(self._collate_item(contexts, max_length=max_length, pad_id=self.tokenizer.eos_id)) - answers = torch.LongTensor(self._collate_item(answers, max_length=max_length, pad_id=self.tokenizer.eos_id)) - - processed_batch = { - 'tokens': input_ids, - 'labels': labels, - 'attention_mask': attention_mask, - 'loss_mask': loss_mask, - 'position_ids': position_ids, - 'contexts': contexts, - 'context_lengths': context_lengths, - 'answers': answers, - 'metadata': metadata, - } - - return processed_batch diff --git a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py deleted file mode 100644 index 9c6e50f5e43ff8351c76fe6bba417da074e5d2af..0000000000000000000000000000000000000000 --- a/SoundScribe/SpeakerID/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -from typing import List, Mapping, Optional - -import datasets -import numpy as np -import torch - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory='.': True -from datasets import load_dataset - -from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec -from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping -from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset -from nemo.core.classes import Dataset -from nemo.utils import logging - -__all__ = ['GPTSFTDataset'] - - -class GPTSFTDataset(Dataset): - def __init__( - self, - file_path: str, - tokenizer: TokenizerSpec, - max_seq_length: int = 1024, - min_seq_length: int = 1, - add_bos: bool = False, - add_eos: bool = True, - add_sep: bool = False, - sep_id: int = None, - max_num_samples: int = None, - seed: int = 1234, - label_key: str = "answer", - answer_only_loss: bool = True, - truncation_field: str = "text", - pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. - index_mapping_dir: str = None, - prompt_template: str = None, - virtual_tokens: int = 0, - tokens_to_generate: int = 0, - memmap_workers: Optional[int] = None, - hf_dataset: bool = False, - truncation_method: str = 'right', - special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} - ): - """ - file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - add_sep (bool): Whether to add a separation token to each data example (goes between prompt and answer) - tokens_to_generate (int): (inference only) Number of tokens to generate during inference - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - seed: int = 1234, - label_key: Key to use for the label in your JSONL file - answer_only_loss: If True, will compute the loss only on the answer part of the input. If False, will compute the loss on the entire input. 
- truncation_field: Field to use for truncation. (Options: keys in prompt_template). Field to be used for truncation if the combined length exceeds the max sequence length. - pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. - prompt_template: Prompt template to inject via an fstring. Formatted like Q: {context_key}\n\nA: {label_key} - hf_dataset: Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} - """ - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.add_bos = add_bos - self.add_eos = add_eos - self.add_sep = add_sep - self.sep_id = sep_id - self.max_num_samples = max_num_samples - self.seed = seed - self.label_key = label_key - self.answer_only_loss = answer_only_loss - self.truncation_fields = truncation_field.split(',') - self.pad_to_max_length = pad_to_max_length - self.index_mapping_dir = index_mapping_dir - self.prompt_template = prompt_template - self.virtual_tokens = virtual_tokens - self.tokens_to_generate = tokens_to_generate - self.truncation_method = truncation_method - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - - if hf_dataset: - self.indexed_dataset = load_dataset( - 'json', data_files=file_path, cache_dir=index_mapping_dir, num_proc=memmap_workers, split='train' - ) - else: - self.indexed_dataset = JSONLMemMapDataset( - dataset_paths=[file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=index_mapping_dir, - workers=memmap_workers, - ) - - # Validate prompt template - self._maybe_validate_prompt_template() - - # Will be None after this call if `max_num_samples` is None - self._build_samples_mapping() - - def _maybe_validate_prompt_template(self): - assert ( - self.prompt_template is not None - ), f'we need prompt_template to combine contexts and label {self.label_key}' - # When providing things like newlines in the prompt template via the CLI, they are escaped. This line unescapes them. - self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape') - self.prompt_template_keys = re.findall(r'{(.*?)}', self.prompt_template) - - label_placeholder = f'{{{self.label_key}}}' - assert ( - self.prompt_template[-len(label_placeholder) :] == label_placeholder - ), f'{label_placeholder} must be at the end of prompt_template.' 
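The template validation above is compact, so here is a standalone sketch with a toy template and `label_key`; it mirrors the unescaping, the regex key extraction, and the end-of-template check in `_maybe_validate_prompt_template`, but the specific strings are assumptions for illustration.

```python
import re

# A CLI-style template where newlines arrive escaped; unescaping mirrors the code above.
cli_template = "Q: {input}\\n\\nA: {output}"
prompt_template = cli_template.encode('utf-8').decode('unicode_escape')

label_key = "output"                                 # hypothetical label key for this sketch
prompt_template_keys = re.findall(r'{(.*?)}', prompt_template)
print(prompt_template_keys)                          # ['input', 'output']

# The label placeholder must sit at the very end of the template, since everything
# before it is treated as context and only the label/answer part is generated.
label_placeholder = f'{{{label_key}}}'
assert prompt_template[-len(label_placeholder):] == label_placeholder
```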
- - # Legacy checkpoints has self.truncation_fields = ['context'] and self.prompt_template_keys = ['input', 'output'] - if self.prompt_template_keys[0] == 'input' and self.truncation_fields[0] == 'context': - self.truncation_fields[0] = self.prompt_template_keys[0] - - assert set(self.truncation_fields).issubset( - self.prompt_template_keys - ), f'truncation_fields {self.truncation_fields} must in {self.prompt_template_keys}' - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - self.samples_mapping = get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split('/')[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - ) - else: - self.samples_mapping = None - - def __len__(self): - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx = idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, np.uint32): - idx = idx.item() - - assert idx < len(self.indexed_dataset) - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example['__AUTOGENERATED__'] = True - except Exception as e: - logging.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _separate_template(self, prompt_template_values: List[str]): - """ - Combine contexts and label based on prompt_template into a list of strings and a list of keys. - - Args: - prompt_template_values (List[str]): the list of context and label strings extrated from jsonl file with prompt_template_keys. - - Returns: - template_strings (List[str]): separated prompt_template with contexts/label placeholder filled with corresponding strings - template_strings_keys (List[str]): strings point to placeholder keys or