lsy641 committed on
Commit
00ff4e2
·
1 Parent(s): 1bb2612

Upload tokenizer_13a.py

Browse files
Files changed (1) hide show
  1. tokenizer_13a.py +100 -0
tokenizer_13a.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
2
+ # Copyright 2020 SacreBLEU Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import re
17
+ from functools import lru_cache
18
+
19
+
20
class BaseTokenizer:
    """No-op tokenizer that serves as the parent class for real tokenizers."""

    def signature(self):
        """
        Identify this tokenizer by a short string.

        :return: the signature string, ``"none"`` for this base class
        """
        return "none"

    def __call__(self, line):
        """
        Apply the tokenizer to one segment.

        The base implementation performs no tokenization at all and hands
        the input back unchanged.

        :param line: a segment to tokenize
        :return: the same ``line`` that was passed in
        """
        return line
37
+
38
+
39
class TokenizerRegexp(BaseTokenizer):
    """Common regex-based post-processing tokenizer for the `13a` and `zh` tokenizers.

    Unlike the tokenizer this code was derived from, calling an instance
    returns a list of individual tokens rather than a single space-joined
    string.
    """

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: signature string
        """
        return "re"

    def __init__(self):
        # Ordered (compiled pattern, replacement) pairs. Each substitution is
        # applied to the output of the previous one, so the order matters.
        self._re = [
            # language-dependent part (assuming Western languages):
            # surround selected ASCII punctuation/symbol characters with spaces
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # Runs of whitespace are not collapsed with a regex here:
            # str.split() in _tokenize() does the same job faster.
        ]

    # NOTE: the cache lives on the class and is keyed on (self, line), so it
    # keeps a reference to every instance that has been called.
    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Apply every substitution rule to ``line`` and split it into tokens.

        The result is an immutable tuple so that the object stored in the
        cache can never be modified by a caller.

        :param line: a segment to tokenize
        :return: tuple of tokens
        """
        for pattern, replacement in self._re:
            line = pattern.sub(replacement, line)

        # split() with no arguments drops leading/trailing whitespace and
        # treats any run of whitespace as a single separator.
        return tuple(line.split())

    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        :param line: a segment to tokenize
        :return: a new list of tokens; the caller may freely mutate it
        """
        # Copy out of the cache: returning the cached object itself would let
        # one caller's in-place edits leak into later results for the same line.
        return list(self._tokenize(line))
71
+
72
+
73
class Tokenizer13a(BaseTokenizer):
    """Minimal tokenizer equivalent to mteval-v13a, used by WMT.

    Calling an instance returns a list of individual tokens.
    """

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: signature string
        """
        return "13a"

    def __init__(self):
        # Regex-based tokenizer applied after the language-independent cleanup.
        self._post_tokenizer = TokenizerRegexp()

    # NOTE: the cache lives on the class and is keyed on (self, line), so it
    # keeps a reference to every instance that has been called.
    @lru_cache(maxsize=2**16)
    def _tokenize(self, line):
        """Clean up ``line`` and run it through the post-tokenizer.

        The result is an immutable tuple so that the object stored in the
        cache can never be modified by a caller.

        :param line: a segment to tokenize
        :return: tuple of tokens
        """
        # language-independent part:
        line = line.replace("<skipped>", "")
        # re-join words hyphenated across a line break, then flatten newlines
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        # Every entity handled below contains "&", so skip the four
        # replacements entirely when the character is absent.
        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        # Pad with spaces so the post-tokenizer's context-sensitive rules
        # also see a neighbouring character at the very start and end.
        return tuple(self._post_tokenizer(f" {line} "))

    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: a new list of tokens; the caller may freely mutate it
        """
        # Copy out of the cache: returning the cached object itself would let
        # one caller's in-place edits leak into later results for the same line.
        return list(self._tokenize(line))