AndrewMcDowell committed · Commit 9494cbb · Parent(s): 3a54f37
Training in progress, step 1000
Files changed:
- .ipynb_checkpoints/eval-checkpoint.py +134 -0
- .ipynb_checkpoints/eval_results-checkpoint.json +10 -0
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_ja_test_predictions-checkpoint.txt +0 -0
- .ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_ja_test_targets-checkpoint.txt +0 -0
- .ipynb_checkpoints/log_speech-recognition-community-v2_dev_data_ja_validation_predictions-checkpoint.txt +0 -0
- .ipynb_checkpoints/log_speech-recognition-community-v2_dev_data_ja_validation_targets-checkpoint.txt +0 -0
- .ipynb_checkpoints/mozilla-foundation_common_voice_8_0_ja_test_eval_results-checkpoint.txt +2 -0
- .ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py +1 -1
- .ipynb_checkpoints/run_training-checkpoint.sh +3 -2
- pytorch_model.bin +1 -1
- run_speech_recognition_ctc_bnb.py +1 -1
- run_training.sh +3 -2
- special_tokens_map.json +1 -1
- training_args.bin +1 -1
.ipynb_checkpoints/eval-checkpoint.py
ADDED
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
from datasets import load_dataset, load_metric, Audio, Dataset
from transformers import pipeline, AutoFeatureExtractor
import re
import argparse
import unicodedata
from typing import Dict


def log_results(result: Dataset, args: Dict[str, str]):
    """DO NOT CHANGE. This function computes and logs the result metrics."""

    log_outputs = args.log_outputs
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metrics
    wer = load_metric("wer")
    cer = load_metric("cer")

    # compute metrics
    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])

    # print & log results
    result_str = (
        f"WER: {wer_result}\n"
        f"CER: {cer_result}"
    )
    print(result_str)

    with open(f"{dataset_id}_eval_results.txt", "w") as f:
        f.write(result_str)

    # log all results in a text file; possibly interesting for analysis
    if log_outputs is not None:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        with open(pred_file, "w") as p, open(target_file, "w") as t:

            # mapping function to write output
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            result.map(write_to_file, with_indices=True)


def normalize_text(text: str) -> str:
    """DO ADAPT FOR YOUR USE CASE. This function normalizes the target text."""

    from pykakasi import kakasi

    kakasi = kakasi()
    kakasi.setMode('J', 'H')  # convert from kanji to hiragana
    conv = kakasi.getConverter()
    chars_to_ignore_regex = '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!\/\「\」\『\』]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training

    # remove punctuation, then convert kanji to hiragana
    text = conv.do(re.sub(chars_to_ignore_regex, "", text))

    # in addition, normalize the target text, e.g. by removing newline characters;
    # note that order is important here!
    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]

    for t in token_sequences_to_ignore:
        text = " ".join(text.split(t))

    return text


def main(args):
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first few examples
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    asr = pipeline("automatic-speech-recognition", model=args.model_id)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log results; do not change the function below
    log_results(result, args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
    )
    parser.add_argument(
        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
    )
    parser.add_argument(
        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
    )
    parser.add_argument(
        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
    )
    parser.add_argument(
        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
    )
    args = parser.parse_args()

    main(args)
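For reference, a minimal sketch of the kanji-to-hiragana step that normalize_text above relies on, using the same legacy pykakasi API as the script (the variable names and sample string here are purely illustrative, not part of the commit):

from pykakasi import kakasi

kks = kakasi()
kks.setMode('J', 'H')  # 'J' (kanji) -> 'H' (hiragana)
conv = kks.getConverter()
print(conv.do("日本語"))  # -> "にほんご"
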
.ipynb_checkpoints/eval_results-checkpoint.json
ADDED
@@ -0,0 +1,10 @@
{
    "epoch": 50.0,
    "eval_cer": 0.1826705782774121,
    "eval_loss": 0.6643062829971313,
    "eval_runtime": 307.697,
    "eval_samples": 4466,
    "eval_samples_per_second": 14.514,
    "eval_steps_per_second": 1.817,
    "eval_wer": 1.0241664801969121
}
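Note that eval_wer is above 1.0. WER counts insertions as errors, so it is not bounded by 1; and since normalize_text produces largely unsegmented Japanese, each sentence is effectively a single "word", making the character-level eval_cer (~0.18) the more informative figure here. A minimal sketch with the same datasets metric API the eval script uses (the reference/prediction strings are made up for illustration):

from datasets import load_metric

wer = load_metric("wer")
# one reference "word", three hypothesis "words": 1 substitution + 2 insertions -> WER = 3.0
print(wer.compute(references=["ねこ"], predictions=["いぬ が なく"]))
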
.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_ja_test_predictions-checkpoint.txt
ADDED
The diff for this file is too large to render. See raw diff.

.ipynb_checkpoints/log_mozilla-foundation_common_voice_8_0_ja_test_targets-checkpoint.txt
ADDED
The diff for this file is too large to render. See raw diff.

.ipynb_checkpoints/log_speech-recognition-community-v2_dev_data_ja_validation_predictions-checkpoint.txt
ADDED
The diff for this file is too large to render. See raw diff.

.ipynb_checkpoints/log_speech-recognition-community-v2_dev_data_ja_validation_targets-checkpoint.txt
ADDED
The diff for this file is too large to render. See raw diff.
.ipynb_checkpoints/mozilla-foundation_common_voice_8_0_ja_test_eval_results-checkpoint.txt
ADDED
@@ -0,0 +1,2 @@
WER: 0.9675266903914591
CER: 0.30694865529668464
.ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py
CHANGED
@@ -155,7 +155,7 @@ class DataTrainingArguments:
     eval_split_name: str = field(
         default="test",
         metadata={
-            "help": "The name of the training data set split to use (via the datasets library). Defaults to '
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'test'"
         },
     )
     audio_column_name: str = field(
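This change completes a help string that was previously left unterminated (the truncated line would not even parse), closing it as 'test'. The same one-line fix is applied to the non-checkpoint copy, run_speech_recognition_ctc_bnb.py, further down.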
.ipynb_checkpoints/run_training-checkpoint.sh
CHANGED
@@ -7,8 +7,9 @@ python run_speech_recognition_ctc_bnb.py \
 --num_train_epochs="50" \
 --per_device_train_batch_size="32" \
 --per_device_eval_batch_size="8" \
-
---
+--gradient_accumulation_steps="4" \
+--learning_rate="7.5e-5" \
+--warmup_steps="1500" \
 --length_column_name="input_length" \
 --evaluation_strategy="steps" \
 --text_column_name="sentence" \
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:490634cd84fbf3811afe86fb73dee322c6704b2e70e34a9b04adc71e593d0f24
 size 3851240177
run_speech_recognition_ctc_bnb.py
CHANGED
@@ -155,7 +155,7 @@ class DataTrainingArguments:
     eval_split_name: str = field(
         default="test",
         metadata={
-            "help": "The name of the training data set split to use (via the datasets library). Defaults to '
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'test'"
         },
     )
     audio_column_name: str = field(
run_training.sh
CHANGED
@@ -7,8 +7,9 @@ python run_speech_recognition_ctc_bnb.py \
 --num_train_epochs="50" \
 --per_device_train_batch_size="32" \
 --per_device_eval_batch_size="8" \
-
---
+--gradient_accumulation_steps="4" \
+--learning_rate="7.5e-5" \
+--warmup_steps="1500" \
 --length_column_name="input_length" \
 --evaluation_strategy="steps" \
 --text_column_name="sentence" \
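The three added flags change the optimization schedule: with --per_device_train_batch_size="32" and --gradient_accumulation_steps="4", each optimizer step now sees 32 × 4 = 128 samples per device, and the 7.5e-5 learning rate is warmed up over the first 1500 steps.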
special_tokens_map.json
CHANGED
@@ -1 +1 @@
-{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
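The only difference in this line is that additional_special_tokens grows from two <s>/</s> pairs to three. The duplicates look like the special tokens being re-appended each time the tokenizer is saved during training rather than an intentional vocabulary change; they should be harmless for CTC decoding but are worth deduplicating in a final release.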
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0786d1d55e0806ed6c3ec835e9f4c65da62f2a569bf56129fbdf16fbc6e4d544
 size 2991