versae committed on
Commit
0d4a063
1 Parent(s): 0539da3

Update eval.py

Browse files
Files changed (1) hide show
  1. eval.py +37 -21
eval.py CHANGED
@@ -1,6 +1,5 @@
1
  #!/usr/bin/env python3
2
  import argparse
3
- import os
4
  import re
5
  from typing import Dict
6
 
@@ -12,6 +11,8 @@ from slugify import slugify
12
  from transformers import AutoFeatureExtractor, AutoModelForCTC, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
13
  # from pyctcdecode import BeamSearchDecoderCTC
14
 
 
 
15
 
16
  def log_results(result: Dataset, args: Dict[str, str]):
17
  """DO NOT CHANGE. This function computes and logs the result metrics."""
@@ -19,7 +20,11 @@ def log_results(result: Dataset, args: Dict[str, str]):
19
  log_outputs = args.log_outputs
20
  lm = "withLM" if args.use_lm else "noLM"
21
  model_id = args.model_id.replace("/", "_").replace(".", "")
22
- dataset_id = "_".join([model_id] + args.dataset.split("/") + [args.config, slugify(args.filter), args.split, lm])
 
 
 
 
23
 
24
  # load metric
25
  wer = load_metric("wer")
@@ -58,24 +63,36 @@ def normalize_text(original_text: str, dataset: str) -> str:
58
  text = original_text.lower()
59
  if dataset.lower().endswith("fleurs"):
60
  replacements = (
61
- ("e.kr", "etter kristus fødsel"),
62
- ("f.kr", "før kristi fødsel"),
63
- ("km/t", "kilometer i timen"),
64
- ("km", "kilometer"),
65
- ("cm", "centimeter"),
66
- ("mm", "millimeter"),
67
- ("kl.", "klokka"),
68
- ("f.eks", "for eksempel"),
 
69
  )
70
  for abrev, expasion in replacements:
71
- text = re.sub(f' {abrev}', f" {expasion}", text)
72
- text = re.sub(':00', '', text)
73
- text = re.sub(r"(\d{1,2})[ .](\d{3})", r"\1\2", text)
74
- text = re.sub(r"(\d{2}):(\d{2})", r"\1 \2", text)
75
- text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text.replace(".", ""))
76
-
77
- chars_to_ignore_regex = '[«»\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
78
- text = re.sub(chars_to_ignore_regex, " ", text) + " "
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if dataset.lower().endswith("nst"):
81
  text = text.lower()
@@ -109,6 +126,7 @@ def normalize_text(original_text: str, dataset: str) -> str:
109
  text = re.sub('[ö]', 'ø', text)
110
  text = re.sub('[ç]', 'c', text)
111
  text = re.sub('[úùüû]', 'u', text)
 
112
  text = re.sub('\s+', ' ', text)
113
  text = re.sub('<ee>', 'eee', text)
114
  text = re.sub('<qq>', 'qqq', text)
@@ -131,8 +149,7 @@ def main(args):
131
  if args.filter:
132
  attribute, value = list(map(str.strip, args.filter.split(":")))
133
  dataset = dataset.filter(
134
- lambda x: x[attribute] == value,
135
- num_proc=os.cpu_count() // 2,
136
  desc=f"Filtering on {args.filter}",
137
  )
138
  # for testing: only process the first two examples as a test
@@ -233,4 +250,3 @@ if __name__ == "__main__":
233
  args = parser.parse_args()
234
 
235
  main(args)
236
-
 
1
  #!/usr/bin/env python3
2
  import argparse
 
3
  import re
4
  from typing import Dict
5
 
 
11
  from transformers import AutoFeatureExtractor, AutoModelForCTC, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
12
  # from pyctcdecode import BeamSearchDecoderCTC
13
 
14
+ from .cardinal_numbers import convert_nums
15
+
16
 
17
  def log_results(result: Dataset, args: Dict[str, str]):
18
  """DO NOT CHANGE. This function computes and logs the result metrics."""
 
20
  log_outputs = args.log_outputs
21
  lm = "withLM" if args.use_lm else "noLM"
22
  model_id = args.model_id.replace("/", "_").replace(".", "")
23
+ if args.filter:
24
+ extra_args = [args.config, slugify(args.filter), args.split, lm]
25
+ else:
26
+ extra_args = [args.config, args.split, lm]
27
+ dataset_id = "_".join([model_id] + args.dataset.split("/") + extra_args)
28
 
29
  # load metric
30
  wer = load_metric("wer")
 
63
  text = original_text.lower()
64
  if dataset.lower().endswith("fleurs"):
65
  replacements = (
66
+ (r"\be\.kr", "etter kristus fødsel"),
67
+ (r"\bf\.kr", "før kristi fødsel"),
68
+ (r"\bca[.]?\b", "circa"),
69
+ (r"(\d)\s*km/t", r"\1 kilometer i timen"),
70
+ (r"(\d)\s*km", r"\1 kilometer"),
71
+ (r"(\d)\s*cm", r"\1 centimeter"),
72
+ (r"(\d)\s*mm", r"\1 millimeter"),
73
+ (r"kl\.", "klokka"),
74
+ (r"f\.eks", "for eksempel"),
75
  )
76
  for abrev, expasion in replacements:
77
+ text = re.sub(abrev, expasion, text)
78
+ text = re.sub(r'(\d+)[-–](\d+)', r'\1 til \2', text) # 1-89, 70-90
79
+ text = re.sub(r'(\d{2}):00', r'\1', text) # 21:00
80
+ text = re.sub(r"(\d{2}):0(\d{1})", r"\1 null \2", text) # 17:03
81
+ text = re.sub(r"(\d{1,2}):(\d{1,2})", r"\1 \2", text) # 17:23 (time), 4:3 (aspect ratios)
82
+ text = re.sub(r"(1[1-9])00", r"\1 hundre", text) # 1800, 1900
83
+ text = re.sub(r"(1[1-9])0([1-9])", r"\1 null \2 ", text) # 1901, 1909
84
+ text = re.sub(r"(1[1-9])([1-9]\d)", r"\1 \2 ", text) # 1911, 1987
85
+ text = re.sub(r"(20)0([1-9])", r"\1 null \2 ", text) # 2009
86
+ text = re.sub(r"(20)(\d{2})", r"\1 \2 ", text) # 2009
87
+ text = re.sub(r"(\d{1,3})[.](\d{1,2})", r"\1 dot \2 ", text) # 802.11n, 2.5ghz (in English)
88
+ text = re.sub(r"(\d{1,2})[ .](\d{3})", r"\1\2", text) # 10 000, 32.000
89
+ text = re.sub(r'(\w+)-(\w+)', r'\1 \2', text) # n-standard
90
+ # text = re.compile(r"-?0?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text.replace(".", ""))
91
+ text = re.compile(r"-?0?[1-9][\d.]*").sub(lambda x: convert_nums(int(x.group(0)), nn=True), text.replace(".", ""))
92
+
93
+
94
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
95
+ text = re.sub(chars_to_ignore_regex, "", text) + " "
96
 
97
  if dataset.lower().endswith("nst"):
98
  text = text.lower()
 
126
  text = re.sub('[ö]', 'ø', text)
127
  text = re.sub('[ç]', 'c', text)
128
  text = re.sub('[úùüû]', 'u', text)
129
+ text = re.sub('[«»]', '', text)
130
  text = re.sub('\s+', ' ', text)
131
  text = re.sub('<ee>', 'eee', text)
132
  text = re.sub('<qq>', 'qqq', text)
 
149
  if args.filter:
150
  attribute, value = list(map(str.strip, args.filter.split(":")))
151
  dataset = dataset.filter(
152
+ lambda x: x[attribute == value],
 
153
  desc=f"Filtering on {args.filter}",
154
  )
155
  # for testing: only process the first two examples as a test
 
250
  args = parser.parse_args()
251
 
252
  main(args)