Training in progress, step 60
Browse files- .ipynb_checkpoints/eval-checkpoint.py +20 -2
- eval.py +20 -2
- pytorch_model.bin +1 -1
.ipynb_checkpoints/eval-checkpoint.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
import argparse
|
3 |
import re
|
|
|
4 |
from typing import Dict
|
5 |
|
6 |
import torch
|
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
50 |
def normalize_text(text: str) -> str:
|
51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
56 |
|
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
|
|
59 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
60 |
|
61 |
for t in token_sequences_to_ignore:
|
62 |
-
text = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
return text
|
65 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
import argparse
|
3 |
import re
|
4 |
+
import string
|
5 |
from typing import Dict
|
6 |
|
7 |
import torch
|
|
|
51 |
def normalize_text(text: str) -> str:
|
52 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
53 |
|
54 |
+
chars_to_ignore = [
|
55 |
+
",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
|
56 |
+
"؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
|
57 |
+
"{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
|
58 |
+
"、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
|
59 |
+
"『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
|
60 |
+
] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
61 |
+
|
62 |
+
chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
|
63 |
|
64 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
65 |
|
|
|
68 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
69 |
|
70 |
for t in token_sequences_to_ignore:
|
71 |
+
text = "".join(text.split(t))
|
72 |
+
|
73 |
+
# convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
|
74 |
+
# hacky stuff, won't work when 'D'/'d' co-occur with normal English words
|
75 |
+
# won't work on multiple 'D's
|
76 |
+
if "d" in text:
|
77 |
+
if len([c for c in text if c in string.ascii_lowercase]) == 1:
|
78 |
+
text = text.replace("d", "啲")
|
79 |
+
|
80 |
+
text += ' '
|
81 |
|
82 |
return text
|
83 |
|
eval.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
import argparse
|
3 |
import re
|
|
|
4 |
from typing import Dict
|
5 |
|
6 |
import torch
|
@@ -50,7 +51,15 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
50 |
def normalize_text(text: str) -> str:
|
51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
56 |
|
@@ -59,7 +68,16 @@ def normalize_text(text: str) -> str:
|
|
59 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
60 |
|
61 |
for t in token_sequences_to_ignore:
|
62 |
-
text = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
return text
|
65 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
import argparse
|
3 |
import re
|
4 |
+
import string
|
5 |
from typing import Dict
|
6 |
|
7 |
import torch
|
|
|
51 |
def normalize_text(text: str) -> str:
|
52 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
53 |
|
54 |
+
chars_to_ignore = [
|
55 |
+
",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
|
56 |
+
"؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
|
57 |
+
"{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
|
58 |
+
"、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
|
59 |
+
"『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"
|
60 |
+
] # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
61 |
+
|
62 |
+
chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
|
63 |
|
64 |
text = re.sub(chars_to_ignore_regex, "", text.lower())
|
65 |
|
|
|
68 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
69 |
|
70 |
for t in token_sequences_to_ignore:
|
71 |
+
text = "".join(text.split(t))
|
72 |
+
|
73 |
+
# convert 'D' and 'd' to '啲' if there is a 'D' in the sentence
|
74 |
+
# hacky stuff, won't work when 'D'/'d' co-occur with normal English words
|
75 |
+
# won't work on multiple 'D's
|
76 |
+
if "d" in text:
|
77 |
+
if len([c for c in text if c in string.ascii_lowercase]) == 1:
|
78 |
+
text = text.replace("d", "啲")
|
79 |
+
|
80 |
+
text += ' '
|
81 |
|
82 |
return text
|
83 |
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1278024433
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2cf9654583b75dea424b875769f5c205aadeff4ef6f019a7717d32a2d023c8d6
|
3 |
size 1278024433
|