versae committed on
Commit
bb05670
1 Parent(s): a1e8de0

Update eval.py

Browse files
Files changed (1) hide show
  1. eval.py +23 -6
eval.py CHANGED
@@ -50,11 +50,29 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  result.map(write_to_file, with_indices=True)
51
 
52
 
53
- def normalize_text(text: str, dataset: str) -> str:
54
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
55
 
56
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
57
- text = re.sub(chars_to_ignore_regex, "", text.lower()) + " "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  if dataset.lower().endswith("nst"):
60
  text = text.lower()
@@ -79,7 +97,7 @@ def normalize_text(text: str, dataset: str) -> str:
79
  text = re.sub('[ç]', 'c', text)
80
  text = re.sub('[úùüû]', 'u', text)
81
  text = re.sub('\s+', ' ', text)
82
- elif dataset.lower().endswith("fleurs"):
83
  text = re.sub('[áàâ]', 'a', text)
84
  text = re.sub('[ä]', 'æ', text)
85
  text = re.sub('[éèëê]', 'e', text)
@@ -88,7 +106,6 @@ def normalize_text(text: str, dataset: str) -> str:
88
  text = re.sub('[ö]', 'ø', text)
89
  text = re.sub('[ç]', 'c', text)
90
  text = re.sub('[úùüû]', 'u', text)
91
- text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text)
92
  text = re.sub('\s+', ' ', text)
93
  text = re.sub('<ee>', 'eee', text)
94
  text = re.sub('<qq>', 'qqq', text)
@@ -102,7 +119,7 @@ def normalize_text(text: str, dataset: str) -> str:
102
  # for t in token_sequences_to_ignore:
103
  # text = " ".join(text.split(t))
104
 
105
- return text
106
 
107
 
108
  def main(args):
 
50
  result.map(write_to_file, with_indices=True)
51
 
52
 
53
+ def normalize_text(original_text: str, dataset: str) -> str:
54
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
55
 
56
+ text = original_text.lower()
57
+ if dataset.lower().endswith("fleurs"):
58
+ replacements = (
59
+ ("e.kr", "etter kristus fødsel"),
60
+ ("f.kr", "før kristi fødsel"),
61
+ ("km/t", "kilometer i timen"),
62
+ ("km", "kilometer"),
63
+ ("cm", "centimeter"),
64
+ ("mm", "millimeter"),
65
+ ("kl.", "klokka"),
66
+ )
67
+ for abrev, expasion in replacements:
68
+ text = re.sub(f' {abrev}', f" {expasion}", text)
69
+ text = re.sub(':00', '', text)
70
+ text = re.sub(r"(\d{1,2})[ .](\d{3})", r"\1\2", text)
71
+ text = re.sub(r"(\d{2}):(\d{2})", r"\1 \2", text)
72
+ text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text.replace(".", ""))
73
+
74
+ chars_to_ignore_regex = '[«»\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
75
+ text = re.sub(chars_to_ignore_regex, " ", text) + " "
76
 
77
  if dataset.lower().endswith("nst"):
78
  text = text.lower()
 
97
  text = re.sub('[ç]', 'c', text)
98
  text = re.sub('[úùüû]', 'u', text)
99
  text = re.sub('\s+', ' ', text)
100
+ elif dataset.lower().endswith("fleurs"):
101
  text = re.sub('[áàâ]', 'a', text)
102
  text = re.sub('[ä]', 'æ', text)
103
  text = re.sub('[éèëê]', 'e', text)
 
106
  text = re.sub('[ö]', 'ø', text)
107
  text = re.sub('[ç]', 'c', text)
108
  text = re.sub('[úùüû]', 'u', text)
 
109
  text = re.sub('\s+', ' ', text)
110
  text = re.sub('<ee>', 'eee', text)
111
  text = re.sub('<qq>', 'qqq', text)
 
119
  # for t in token_sequences_to_ignore:
120
  # text = " ".join(text.split(t))
121
 
122
+ return text.strip()
123
 
124
 
125
  def main(args):