upd src

Browse files

Files changed (8) hide show

src/{run_base.sh → bash_runners/run_base.sh} +0 -0
src/bash_runners/run_eval_cv11.sh +9 -0
src/{run_small.sh → bash_runners/run_small.sh} +7 -8
src/{run_tiny_debug.sh → bash_runners/run_tiny_debug.sh} +0 -0
src/belarusian_text_normalizer.py +41 -0
src/readme.md +10 -0
src/run_eval_whisper_streaming.py +165 -0
src/run_speech_recognition_seq2seq_streaming.py +1 -38

src/{run_base.sh → bash_runners/run_base.sh} RENAMED Viewed

File without changes

src/bash_runners/run_eval_cv11.sh ADDED Viewed

	@@ -0,0 +1,9 @@

+# Evaluate the checkpoint in the current directory on the Belarusian test split of Common Voice 11.
+# The script file is `run_eval_whisper_streaming.py`; the `.py` extension is required by `python`.
+python src/run_eval_whisper_streaming.py \
+	--model_id="." \
+	--language="be" \
+	--dataset="mozilla-foundation/common_voice_11_0" \
+	--config="be" \
+	--split="test" \
+	--device="0" \
+	--batch_size="32" \
+	--streaming="True"

src/{run_small.sh → bash_runners/run_small.sh} RENAMED Viewed

@@ -1,5 +1,5 @@
 python src/run_speech_recognition_seq2seq_streaming.py \
-	--model_name_or_path="openai/whisper-small" \
 	--dataset_name="mozilla-foundation/common_voice_11_0" \
 	--dataset_config_name="be" \
 	--language="be" \
@@ -7,14 +7,14 @@ python src/run_speech_recognition_seq2seq_streaming.py \
 	--eval_split_name="validation" \
 	--model_index_name="Whisper Small Belarusian" \
     \
-	--max_steps="12000" \
 	--output_dir="./" \
 	--per_device_train_batch_size="64" \
-	--per_device_eval_batch_size="64" \
 	--logging_steps="50" \
 	--logging_first_step \
-	--learning_rate="1e-4" \
-	--warmup_steps="500" \
 	--evaluation_strategy="steps" \
 	--eval_steps="1000" \
 	--save_strategy="steps" \
@@ -39,6 +39,5 @@ python src/run_speech_recognition_seq2seq_streaming.py \
 	--do_normalize_eval \
 	--streaming_train="True" \
 	--streaming_eval="False" \
-	--use_auth_token \
-	--push_to_hub \
-	--hub_model_id="ales/whisper-small-belarusian"

 python src/run_speech_recognition_seq2seq_streaming.py \
+	--model_name_or_path="ales/whisper-small-belarusian" \
 	--dataset_name="mozilla-foundation/common_voice_11_0" \
 	--dataset_config_name="be" \
 	--language="be" \
 	--eval_split_name="validation" \
 	--model_index_name="Whisper Small Belarusian" \
     \
+	--max_steps="6000" \
 	--output_dir="./" \
 	--per_device_train_batch_size="64" \
+	--per_device_eval_batch_size="32" \
 	--logging_steps="50" \
 	--logging_first_step \
+	--learning_rate="3.5e-5" \
+	--warmup_steps="0" \
 	--evaluation_strategy="steps" \
 	--eval_steps="1000" \
 	--save_strategy="steps" \
 	--do_normalize_eval \
 	--streaming_train="True" \
 	--streaming_eval="False" \
+	--seed="43" \
+	--use_auth_token

src/{run_tiny_debug.sh → bash_runners/run_tiny_debug.sh} RENAMED Viewed

File without changes

src/belarusian_text_normalizer.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import re
+import regex
+import unicodedata
+from typing import Iterable
+class BelarusianTextNormalizer:
+    """
+    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
+    but with support not to remove certain characters.
+    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed using BasicTextNormalizer.
+
+    Instances are callable: `normalizer(text)` returns the normalized text.
+    """
+    def __init__(self, split_letters: bool = False):
+        # When True, __call__ separates every grapheme cluster (regex `\X`) with a single space.
+        self.split_letters = split_letters
+        # Characters that `clean` keeps even though they fall into a Unicode M/S/P category.
+        self.allowed_symbols = ("'",)
+    @staticmethod
+    def clean(s: str, allowed_symbols: Iterable[str] = None):
+        """
+        Replace any other markers, symbols, punctuations with a space, keeping diacritics
+
+        The input is NFKC-normalized first. Every character whose Unicode category starts with
+        M, S or P is then replaced by a space, unless it is contained in `allowed_symbols`.
+        Passing `allowed_symbols=None` means that no character is exempt.
+        """
+        if allowed_symbols is None:
+            allowed_symbols = []
+        res = "".join(" " if unicodedata.category(c)[0] in "MSP" and c not in allowed_symbols else c
+                      for c in unicodedata.normalize("NFKC", s))
+        return res
+    def __call__(self, s: str):
+        """
+        Normalize `s` and return the result.
+
+        Steps, in order: lowercase; drop spans enclosed in <...> / [...] and (...); replace
+        symbols through `clean` (keeping `self.allowed_symbols`); optionally split into grapheme
+        clusters; collapse every whitespace run into one space. The result is not stripped, so
+        it may still start or end with a single space.
+        """
+        s = s.lower()
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        # NOTE(review): lower() is applied a second time after `clean` - presumably because the
+        # NFKC normalization done there can yield characters that still need lowercasing; confirm.
+        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()
+        if self.split_letters:
+            s = " ".join(regex.findall(r"\X", s, regex.U))
+        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
+        return s

src/readme.md CHANGED Viewed

@@ -39,6 +39,9 @@ The code in this repository is a modified version of code from
 ## Resuming training from existing checkpoint
 When resuming training from existing checkpoint:
 * it's better to save all `checkpoint-\d+` dirs. better not to rely on data saved to `output_dir` because:
   * not all data is saved to `output_dir`. e.g. following files are not saved to `output_dir`:
     `optimizer.pt`, `rng_state.pth`, `scaler.pt`, `scheduler.pt`. so can't resume training in a correct way from
@@ -70,9 +73,16 @@ When resuming training from existing checkpoint:
   but does StreamingDataset have any epochs?
 * does streaming mode support parallel data load and processing?<br>
   when using non-streaming mode we can use `dataset.map(..., num_proc=<num_proc>)`
 ## Notes:
 * using CommonVoice 11 dataset in a streaming way.<br>
   use `streaming=True` for train & validation & test.<br>
   as an alternative, we can use `streaming=False` for validation & test sets to save time on data processing.

 ## Resuming training from existing checkpoint
 When resuming training from existing checkpoint:
+* when using streaming, the epoch gets reset to 0. that means the order of items passed to the model will be the same
+  if the seed does not change. the actual train_dataloader seed is determined by the epoch, which is set via:
+  `train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)`
 * it's better to save all `checkpoint-\d+` dirs. better not to rely on data saved to `output_dir` because:
   * not all data is saved to `output_dir`. e.g. following files are not saved to `output_dir`:
     `optimizer.pt`, `rng_state.pth`, `scaler.pt`, `scheduler.pt`. so can't resume training in a correct way from
   but does StreamingDataset have any epochs?
 * does streaming mode support parallel data load and processing?<br>
   when using non-streaming mode we can use `dataset.map(..., num_proc=<num_proc>)`
+* I got a CUDA out of memory error when I tried to launch a second training run for the Whisper Small model.
+  training params are almost the same: `--per_device_train_batch_size="64"`
+  the only thing that changed is that the evaluation dataset now doesn't use streaming.
 ## Notes:
+* Common Voice 11 dataset
+  [uploaded to HuggingFace](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)
+  has only a single recording of each sentence in each split (train, validation, test).<br>
+  Many more audio files should be available on Common Voice, so that each sentence is voiced multiple times by different people.
 * using CommonVoice 11 dataset in a streaming way.<br>
   use `streaming=True` for train & validation & test.<br>
   as an alternative, we can use `streaming=False` for validation & test sets to save time on data processing.

src/run_eval_whisper_streaming.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import argparse
+from transformers import pipeline
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+from datasets import load_dataset, Audio
+import evaluate
+from belarusian_text_normalizer import BelarusianTextNormalizer
+wer_metric = evaluate.load("wer")
+def is_target_text_in_range(ref):
+    if ref.strip() == "ignore time segment in scoring":
+        return False
+    else:
+        return ref.strip() != ""
+def get_text(sample):
+    if "text" in sample:
+        return sample["text"]
+    elif "sentence" in sample:
+        return sample["sentence"]
+    elif "normalized_text" in sample:
+        return sample["normalized_text"]
+    elif "transcript" in sample:
+        return sample["transcript"]
+    elif "transcription" in sample:
+        return sample["transcription"]
+    else:
+        raise ValueError(
+            f"Expected transcript column of either 'text', 'sentence', 'normalized_text' or 'transcript'. Got sample of "
+            ".join{sample.keys()}. Ensure a text column name is present in the dataset."
+        )
+whisper_norm = BelarusianTextNormalizer()
+def normalise(batch):
+    batch["norm_text"] = whisper_norm(get_text(batch))
+    return batch
+def data(dataset):
+    for i, item in enumerate(dataset):
+        yield {**item["audio"], "reference": item["norm_text"]}
+def main(args):
+    batch_size = args.batch_size
+    whisper_asr = pipeline(
+        "automatic-speech-recognition", model=args.model_id, device=args.device
+    )
+    whisper_asr.model.config.forced_decoder_ids = (
+        whisper_asr.tokenizer.get_decoder_prompt_ids(
+            language=args.language, task="transcribe"
+        )
+    )
+    dataset = load_dataset(
+        args.dataset,
+        args.config,
+        split=args.split,
+        streaming=args.streaming,
+        use_auth_token=True,
+    )
+    # Only uncomment for debugging
+    dataset = dataset.take(args.max_eval_samples)
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+    dataset = dataset.map(normalise)
+    dataset = dataset.filter(is_target_text_in_range, input_columns=["norm_text"])
+    predictions = []
+    references = []
+    # run streamed inference
+    for out in whisper_asr(data(dataset), batch_size=batch_size):
+        predictions.append(whisper_norm(out["text"]))
+        references.append(out["reference"][0])
+    wer = wer_metric.compute(references=references, predictions=predictions)
+    wer = round(100 * wer, 2)
+    print("WER:", wer)
+    evaluate.push_to_hub(
+        model_id=args.model_id,
+        metric_value=wer,
+        metric_type="wer",
+        metric_name="WER",
+        dataset_name=args.dataset,
+        dataset_type=args.dataset,
+        dataset_split=args.split,
+        dataset_config=args.config,
+        task_type="automatic-speech-recognition",
+        task_name="Automatic Speech Recognition"
+    )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        required=True,
+        help="Model identifier. Should be loadable with 🤗 Transformers",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="mozilla-foundation/common_voice_11_0",
+        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="Config of the dataset. *E.g.* `'en'` for the English split of Common Voice",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Split of the dataset. *E.g.* `'test'`",
+    )
+    parser.add_argument(
+        "--device",
+        type=int,
+        default=-1,
+        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=16,
+        help="Number of samples to go through each streamed batch.",
+    )
+    parser.add_argument(
+        "--max_eval_samples",
+        type=int,
+        default=None,
+        help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
+    )
+    parser.add_argument(
+        "--streaming",
+        type=bool,
+        default=True,
+        help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        required=True,
+        help="Two letter language code for the transcription language, e.g. use 'en' for English.",
+    )
+    args = parser.parse_args()
+    main(args)

src/run_speech_recognition_seq2seq_streaming.py CHANGED Viewed

@@ -24,9 +24,6 @@ import logging
 import os
 import sys
 import datetime
-import re
-import regex
-import unicodedata
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union, Iterable
@@ -54,6 +51,7 @@ from transformers.trainer_utils import get_last_checkpoint, is_main_process
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.25.0.dev0")
@@ -230,41 +228,6 @@ class DataTrainingArguments:
     )
-class BelarusianTextNormalizer:
-    """
-    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
-    but with support not to remove certain characters.
-    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed using BasicTextNormalizer.
-    """
-    def __init__(self, split_letters: bool = False):
-        self.split_letters = split_letters
-        self.allowed_symbols = ("'",)
-    @staticmethod
-    def clean(s: str, allowed_symbols: Iterable[str] = None):
-        """
-        Replace any other markers, symbols, punctuations with a space, keeping diacritics
-        """
-        if allowed_symbols is None:
-            allowed_symbols = []
-        res = "".join(" " if unicodedata.category(c)[0] in "MSP" and c not in allowed_symbols else c
-                      for c in unicodedata.normalize("NFKC", s))
-        return res
-    def __call__(self, s: str):
-        s = s.lower()
-        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
-        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
-        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()
-        if self.split_letters:
-            s = " ".join(regex.findall(r"\X", s, regex.U))
-        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
-        return s
 @dataclass
 class DataCollatorSpeechSeq2SeqWithPadding:

 import os
 import sys
 import datetime
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union, Iterable
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
+from belarusian_text_normalizer import BelarusianTextNormalizer
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.25.0.dev0")
     )
 @dataclass
 class DataCollatorSpeechSeq2SeqWithPadding: