Update eval.py
Browse files
eval.py
CHANGED
@@ -50,11 +50,29 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
50 |
result.map(write_to_file, with_indices=True)
|
51 |
|
52 |
|
53 |
-
def normalize_text(
|
54 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
55 |
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
if dataset.lower().endswith("nst"):
|
60 |
text = text.lower()
|
@@ -79,7 +97,7 @@ def normalize_text(text: str, dataset: str) -> str:
|
|
79 |
text = re.sub('[ç]', 'c', text)
|
80 |
text = re.sub('[úùüû]', 'u', text)
|
81 |
text = re.sub('\s+', ' ', text)
|
82 |
-
elif dataset.lower().endswith("fleurs"):
|
83 |
text = re.sub('[áàâ]', 'a', text)
|
84 |
text = re.sub('[ä]', 'æ', text)
|
85 |
text = re.sub('[éèëê]', 'e', text)
|
@@ -88,7 +106,6 @@ def normalize_text(text: str, dataset: str) -> str:
|
|
88 |
text = re.sub('[ö]', 'ø', text)
|
89 |
text = re.sub('[ç]', 'c', text)
|
90 |
text = re.sub('[úùüû]', 'u', text)
|
91 |
-
text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text)
|
92 |
text = re.sub('\s+', ' ', text)
|
93 |
text = re.sub('<ee>', 'eee', text)
|
94 |
text = re.sub('<qq>', 'qqq', text)
|
@@ -102,7 +119,7 @@ def normalize_text(text: str, dataset: str) -> str:
|
|
102 |
# for t in token_sequences_to_ignore:
|
103 |
# text = " ".join(text.split(t))
|
104 |
|
105 |
-
return text
|
106 |
|
107 |
|
108 |
def main(args):
|
|
|
50 |
result.map(write_to_file, with_indices=True)
|
51 |
|
52 |
|
53 |
+
def normalize_text(original_text: str, dataset: str) -> str:
|
54 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
55 |
|
56 |
+
text = original_text.lower()
|
57 |
+
if dataset.lower().endswith("fleurs"):
|
58 |
+
replacements = (
|
59 |
+
("e.kr", "etter kristus fødsel"),
|
60 |
+
("f.kr", "før kristi fødsel"),
|
61 |
+
("km/t", "kilometer i timen"),
|
62 |
+
("km", "kilometer"),
|
63 |
+
("cm", "centimeter"),
|
64 |
+
("mm", "millimeter"),
|
65 |
+
("kl.", "klokka"),
|
66 |
+
)
|
67 |
+
for abrev, expasion in replacements:
|
68 |
+
text = re.sub(f' {abrev}', f" {expasion}", text)
|
69 |
+
text = re.sub(':00', '', text)
|
70 |
+
text = re.sub(r"(\d{1,2})[ .](\d{3})", r"\1\2", text)
|
71 |
+
text = re.sub(r"(\d{2}):(\d{2})", r"\1 \2", text)
|
72 |
+
text = re.compile(r"-?[1-9][\d.]*").sub(lambda x: n2w(x.group(0), lang="no"), text.replace(".", ""))
|
73 |
+
|
74 |
+
chars_to_ignore_regex = '[«»\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\–\_\\\+\#\/]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
75 |
+
text = re.sub(chars_to_ignore_regex, " ", text) + " "
|
76 |
|
77 |
if dataset.lower().endswith("nst"):
|
78 |
text = text.lower()
|
|
|
97 |
text = re.sub('[ç]', 'c', text)
|
98 |
text = re.sub('[úùüû]', 'u', text)
|
99 |
text = re.sub('\s+', ' ', text)
|
100 |
+
elif dataset.lower().endswith("fleurs"):
|
101 |
text = re.sub('[áàâ]', 'a', text)
|
102 |
text = re.sub('[ä]', 'æ', text)
|
103 |
text = re.sub('[éèëê]', 'e', text)
|
|
|
106 |
text = re.sub('[ö]', 'ø', text)
|
107 |
text = re.sub('[ç]', 'c', text)
|
108 |
text = re.sub('[úùüû]', 'u', text)
|
|
|
109 |
text = re.sub('\s+', ' ', text)
|
110 |
text = re.sub('<ee>', 'eee', text)
|
111 |
text = re.sub('<qq>', 'qqq', text)
|
|
|
119 |
# for t in token_sequences_to_ignore:
|
120 |
# text = " ".join(text.split(t))
|
121 |
|
122 |
+
return text.strip()
|
123 |
|
124 |
|
125 |
def main(args):
|