import re
from functools import lru_cache
|
|
class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.

        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line
|
|
class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages):
            # pad common ASCII symbols with surrounding spaces
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
        ]

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # splitting on whitespace also collapses runs of spaces,
        # so no separate space-normalization pass is needed
        return line.split()
|
|
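# Illustrative behavior of TokenizerRegexp (hypothetical sample; not part of
# the original module): symbols are padded with spaces, while periods and
# commas are split off unless surrounded by digits, so version-like numbers
# survive intact:
#
#   TokenizerRegexp()("Hello, world! (v1.0)")
#   -> ['Hello', ',', 'world', '!', '(', 'v1.0', ')']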
|
|
class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        # language-independent part: strip skipped-content markers and
        # rejoin lines broken on hyphens
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        # unescape common HTML entities
        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        return self._post_tokenizer(f" {line} ")
|
|
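if __name__ == "__main__":
    # Minimal smoke test (illustrative only: the sample strings and the
    # expected outputs in the comments are assumptions, not part of the
    # original module).
    tokenizer = Tokenizer13a()
    print(tokenizer.signature())  # -> 13a

    # HTML entities are unescaped before punctuation is split off:
    print(tokenizer("Free &amp; open: &lt;source&gt;"))
    # -> ['Free', '&', 'open', ':', '<', 'source', '>']

    # Periods inside numbers are kept; sentence-final periods are split:
    print(tokenizer("Version 1.0 was released."))
    # -> ['Version', '1.0', 'was', 'released', '.']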