versae committed on
Commit
bb05670
1 Parent(s): a1e8de0

Update eval.py

Browse files
Files changed (1) hide show
  1. eval.py +23 -6
eval.py CHANGED
@@ -50,11 +50,29 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  result.map(write_to_file, with_indices=True)
51
 
52
 
53
- def normalize_text(text: str, dataset: str) -> str:
54
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
55
 
56
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
57
- text = re.sub(chars_to_ignore_regex, "", text.lower()) + " "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  if dataset.lower().endswith("nst"):
60
  text = text.lower()
@@ -79,7 +97,7 @@ def normalize_text(text: str, dataset: str) -> str:
79
  text = re.sub('[ç]', 'c', text)
80
  text = re.sub('[úùüû]', 'u', text)
81
  text = re.sub('\s+', ' ', text)
82
- elif dataset.lower().endswith("fleurs"):
83
  text = re.sub('[áàâ]', 'a', text)
84
  text = re.sub('[ä]', 'æ', text)
85
  text = re.sub('[éèëê]', 'e', text)
@@ -88,7 +106,6 @@ def normalize_text(text: str, dataset: str) -> str:
88
  text = re.sub('[ö]', 'ø', text)
89
  text = re.sub('[ç]', 'c', text)
90
  text = re.sub('[úùüû]', 'u', text)
91
- text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text)
92
  text = re.sub('\s+', ' ', text)
93
  text = re.sub('<ee>', 'eee', text)
94
  text = re.sub('<qq>', 'qqq', text)
@@ -102,7 +119,7 @@ def normalize_text(text: str, dataset: str) -> str:
102
  # for t in token_sequences_to_ignore:
103
  # text = " ".join(text.split(t))
104
 
105
- return text
106
 
107
 
108
  def main(args):
 
50
  result.map(write_to_file, with_indices=True)
51
 
52
 
53
+ def normalize_text(original_text: str, dataset: str) -> str:
54
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
55
 
56
+ text = original_text.lower()
57
+ if dataset.lower().endswith("fleurs"):
58
+ replacements = (
59
+ ("e.kr", "etter kristus fødsel"),
60
+ ("f.kr", "før kristi fødsel"),
61
+ ("km/t", "kilometer i timen"),
62
+ ("km", "kilometer"),
63
+ ("cm", "centimeter"),
64
+ ("mm", "millimeter"),
65
+ ("kl.", "klokka"),
66
+ )
67
+ for abrev, expasion in replacements:
68
+ text = re.sub(f' {abrev}', f" {expasion}", text)
69
+ text = re.sub(':00', '', text)
70
+ text = re.sub(r"(\d{1,2})[ .](\d{3})", r"\1\2", text)
71
+ text = re.sub(r"(\d{2}):(\d{2})", r"\1 \2", text)
72
+ text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text.replace(".", ""))
73
+
74
+ chars_to_ignore_regex = '[«»\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
75
+ text = re.sub(chars_to_ignore_regex, " ", text) + " "
76
 
77
  if dataset.lower().endswith("nst"):
78
  text = text.lower()
 
97
  text = re.sub('[ç]', 'c', text)
98
  text = re.sub('[úùüû]', 'u', text)
99
  text = re.sub('\s+', ' ', text)
100
+ elif dataset.lower().endswith("fleurs"):
101
  text = re.sub('[áàâ]', 'a', text)
102
  text = re.sub('[ä]', 'æ', text)
103
  text = re.sub('[éèëê]', 'e', text)
 
106
  text = re.sub('[ö]', 'ø', text)
107
  text = re.sub('[ç]', 'c', text)
108
  text = re.sub('[úùüû]', 'u', text)
 
109
  text = re.sub('\s+', ' ', text)
110
  text = re.sub('<ee>', 'eee', text)
111
  text = re.sub('<qq>', 'qqq', text)
 
119
  # for t in token_sequences_to_ignore:
120
  # text = " ".join(text.split(t))
121
 
122
+ return text.strip()
123
 
124
 
125
  def main(args):