kingabzpro committed on
Commit
e829dfa
1 Parent(s): fb98e92

add LM preprocessing

Browse files
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - id
4
+
5
+ license: apache-2.0
6
+ tags:
7
+ - automatic-speech-recognition
8
+ - robust-speech-event
9
+ datasets:
10
+ - mozilla-foundation/common_voice_7_0
11
+ metrics:
12
+ - wer
13
+ - cer
14
+ model-index:
15
+ - name: wav2vec2-large-xls-r-300m-Indonesian
16
+ results:
17
+ - task:
18
+ type: automatic-speech-recognition # Required. Example: automatic-speech-recognition
19
+ name: Speech Recognition # Optional. Example: Speech Recognition
20
+ dataset:
21
+ type: mozilla-foundation/common_voice_7_0 # Required. Example: common_voice. Use dataset id from https://hf.co/datasets
22
+ name: Common Voice id # Required. Example: Common Voice zh-CN
23
+ args: id # Optional. Example: zh-CN
24
+ metrics:
25
+ - type: wer # Required. Example: wer
26
+ value: 25.06 # Required. Example: 20.90
27
+ name: Test WER # Optional. Example: Test WER
28
+
29
+ - type: cer # Required. Example: wer
30
+ value: 6.50 # Required. Example: 20.90
31
+ name: Test CER # Optional. Example: Test WER
32
+
33
+
34
+ ---
35
+
36
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
37
+ should probably proofread and complete it, then remove this comment. -->
38
+
39
+ # wav2vec2-large-xls-r-300m-Indonesian
40
+
41
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the Indonesian (`id`) subset of the mozilla-foundation/common_voice_7_0 dataset.
42
+ It achieves the following results on the evaluation set:
43
+ - Loss: 0.4087
44
+ - Wer: 0.2461
45
+ - Cer: 0.0666
46
+
47
+
48
+
49
+ ### Training hyperparameters
50
+
51
+ The following hyperparameters were used during training:
52
+ - learning_rate: 0.0003
53
+ - train_batch_size: 64
54
+ - eval_batch_size: 8
55
+ - seed: 42
56
+ - gradient_accumulation_steps: 2
57
+ - total_train_batch_size: 128
58
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
+ - lr_scheduler_type: linear
60
+ - lr_scheduler_warmup_steps: 400
61
+ - num_epochs: 50
62
+ - mixed_precision_training: Native AMP
63
+
64
+ ### Training results
65
+
66
+ | Training Loss | Epoch | Step | Validation Loss | Wer | Cer |
67
+ |:-------------:|:-----:|:----:|:---------------:|:------:|:------:|
68
+ | 5.0788 | 4.26 | 200 | 2.9389 | 1.0 | 1.0 |
69
+ | 2.8288 | 8.51 | 400 | 2.2535 | 1.0 | 0.8004 |
70
+ | 0.907 | 12.77 | 600 | 0.4558 | 0.4243 | 0.1095 |
71
+ | 0.4071 | 17.02 | 800 | 0.4013 | 0.3468 | 0.0913 |
72
+ | 0.3 | 21.28 | 1000 | 0.4167 | 0.3075 | 0.0816 |
73
+ | 0.2544 | 25.53 | 1200 | 0.4132 | 0.2835 | 0.0762 |
74
+ | 0.2145 | 29.79 | 1400 | 0.3878 | 0.2693 | 0.0729 |
75
+ | 0.1923 | 34.04 | 1600 | 0.4023 | 0.2623 | 0.0702 |
76
+ | 0.1681 | 38.3 | 1800 | 0.3984 | 0.2581 | 0.0686 |
77
+ | 0.1598 | 42.55 | 2000 | 0.3982 | 0.2493 | 0.0663 |
78
+ | 0.1464 | 46.81 | 2200 | 0.4087 | 0.2461 | 0.0666 |
79
+
80
+
81
+ ### Framework versions
82
+
83
+ - Transformers 4.17.0.dev0
84
+ - Pytorch 1.10.2+cu102
85
+ - Datasets 1.18.2.dev0
86
+ - Tokenizers 0.11.0
.ipynb_checkpoints/alphabet-checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e9", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
README.md CHANGED
@@ -24,11 +24,11 @@ model-index:
24
  metrics:
25
  - type: wer # Required. Example: wer
26
  value: 25.06 # Required. Example: 20.90
27
- name: Test WER # Optional. Example: Test WER
28
 
29
  - type: cer # Required. Example: wer
30
  value: 6.50 # Required. Example: 20.90
31
- name: Test CER # Optional. Example: Test WER
32
 
33
 
34
  ---
 
24
  metrics:
25
  - type: wer # Required. Example: wer
26
  value: 25.06 # Required. Example: 20.90
27
+ name: Test WER With LM # Optional. Example: Test WER
28
 
29
  - type: cer # Required. Example: wer
30
  value: 6.50 # Required. Example: 20.90
31
+ name: Test CER With LM # Optional. Example: Test WER
32
 
33
 
34
  ---
eval.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from datasets import load_dataset, load_metric, Audio, Dataset
3
+ from transformers import pipeline, AutoFeatureExtractor
4
+ import re
5
+ import argparse
6
+ import unicodedata
7
+ from typing import Dict
8
+
9
+
10
def log_results(result: Dataset, args: argparse.Namespace):
    """ DO NOT CHANGE. This function computes and logs the result metrics.

    Computes WER and CER between ``result["target"]`` and
    ``result["prediction"]``, prints them, and writes them to
    ``<dataset_id>_eval_results.txt`` in the current working directory.
    When ``args.log_outputs`` is truthy, every prediction and target is also
    written (preceded by its index) to ``log_<dataset_id>_predictions.txt``
    and ``log_<dataset_id>_targets.txt``.

    Args:
        result: Dataset exposing "prediction" and "target" columns.
        args: Parsed command-line arguments; reads ``dataset``, ``config``,
            ``split`` and ``log_outputs``.
    """

    log_outputs = args.log_outputs
    # e.g. "mozilla-foundation/common_voice_7_0" + "id" + "test"
    #   -> "mozilla-foundation_common_voice_7_0_id_test"
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # compute metrics
    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])

    # print & log results
    result_str = (
        f"WER: {wer_result}\n"
        f"CER: {cer_result}"
    )
    print(result_str)

    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(f"{dataset_id}_eval_results.txt", "w", encoding="utf-8") as f:
        f.write(result_str)

    # log all results in text file. Possibly interesting for analysis
    # `--log_outputs` is a store_true flag, so it is False (never None) when
    # omitted; test truthiness so the logs are only written on request.
    if log_outputs:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        # Transcripts may contain non-ASCII characters; write them as UTF-8.
        with open(pred_file, "w", encoding="utf-8") as p, open(target_file, "w", encoding="utf-8") as t:

            # mapping function to write output
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            result.map(write_to_file, with_indices=True)
49
+
50
+
51
def normalize_text(text: str) -> str:
    """ DO ADAPT FOR YOUR USE CASE. this function normalizes the target text.

    Steps, in order: lowercase, apply Unicode NFKC normalization, delete the
    ignored punctuation characters, and collapse every run of whitespace
    (including newlines and tabs) into a single space with no leading or
    trailing whitespace.

    Args:
        text: Raw reference transcript.

    Returns:
        The normalized transcript.
    """

    # IMPORTANT: this should correspond to the chars that were ignored during training
    # Raw string so the backslashes reach the regex engine unchanged;
    # \ufffd is the Unicode replacement character.
    chars_to_ignore_regex = r'[,?.!\-;:"“%‘”\ufffd\'\\&—’()]'

    text = text.lower()
    # normalize non-standard (stylized) unicode characters
    text = unicodedata.normalize('NFKC', text)

    # Drop the ignored characters. This runs after NFKC so that compatibility
    # forms of punctuation (e.g. full-width variants) are removed as well.
    text = re.sub(chars_to_ignore_regex, "", text)

    # Let's also make sure we split on all kinds of newlines, spaces, etc...
    text = " ".join(text.split())

    return text
64
+
65
+
66
def main(args):
    """Run the ASR model over one dataset split and report the metrics.

    Loads the requested split, resamples its "audio" column to the sampling
    rate of the model's feature extractor, transcribes every example with an
    automatic-speech-recognition pipeline, and hands the predictions together
    with the normalized reference sentences to ``log_results``.

    Args:
        args: Parsed command-line arguments; reads ``dataset``, ``config``,
            ``split``, ``model_id``, ``chunk_length_s`` and
            ``stride_length_s`` (and is forwarded to ``log_results``).
    """
    # fetch the evaluation split
    eval_set = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first ten examples
    # eval_set = eval_set.select(range(10))

    # the feature extractor tells us which sampling rate the model expects
    target_rate = AutoFeatureExtractor.from_pretrained(args.model_id).sampling_rate

    # resample the audio to that rate
    eval_set = eval_set.cast_column("audio", Audio(sampling_rate=target_rate))

    # build the inference pipeline
    asr = pipeline("automatic-speech-recognition", model=args.model_id)

    def map_to_pred(batch):
        # transcribe one example and attach the normalized reference text
        output = asr(
            batch["audio"]["array"],
            chunk_length_s=args.chunk_length_s,
            stride_length_s=args.stride_length_s,
        )
        batch["prediction"] = output["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on every example, keeping only the columns added above
    predictions = eval_set.map(map_to_pred, remove_columns=eval_set.column_names)

    # compute and log the metrics
    log_results(predictions, args)
97
+
98
+
99
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser()

    # Required string options, registered in the order shown by --help.
    required_str_options = (
        ("--model_id", "Model identifier. Should be loadable with 🤗 Transformers"),
        ("--dataset", "Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"),
        ("--config", "Config of the dataset. *E.g.* `'en'` for Common Voice"),
        ("--split", "Split of the dataset. *E.g.* `'test'`"),
    )
    for flag, description in required_str_options:
        parser.add_argument(flag, type=str, required=True, help=description)

    # Optional float options controlling chunked inference.
    optional_float_options = (
        (
            "--chunk_length_s",
            "Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds.",
        ),
        (
            "--stride_length_s",
            "Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds.",
        ),
    )
    for flag, description in optional_float_options:
        parser.add_argument(flag, type=float, default=None, help=description)

    # Boolean switch: present -> True, absent -> False.
    parser.add_argument(
        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
    )

    args = parser.parse_args()

    main(args)