|
""" |
|
A module for interfacing with ``tokenizer.perl`` from Moses. |
|
|
|
Copyright © 2016-2017, Luís Gomes <[email protected]>
|
""" |
|
|
|
# Command-line interface definition, parsed by docopt in main().
# docopt is layout-sensitive: the usage patterns must be indented on the
# lines directly under "Usage:", and every option must be separated from
# its description by at least two spaces -- otherwise the description words
# are read as the option's argument.
usage = """
Usage:
    moses-tokenizer [options] <lang> [<inputfile> [<outputfile>]]
    moses-tokenizer --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.
    --old           Use older version (1.0) of the tokenizer.
                    If this option is not given, then version 1.1
                    will be used.

2016, Luís Gomes <[email protected]>
"""
|
|
|
|
|
from docopt import docopt |
|
from openfile import openfile |
|
from os import path |
|
from toolwrapper import ToolWrapper |
|
import sys |
|
|
|
|
|
class MosesTokenizer(ToolWrapper):
    """A class for interfacing with ``tokenizer.perl`` from Moses.

    This class communicates with tokenizer.perl process via pipes. When the
    MosesTokenizer object is no longer needed, the close() method should be
    called to free system resources. The class supports the context manager
    interface. If used in a with statement, the close() method is invoked
    automatically.

    >>> tokenize = MosesTokenizer('en')
    >>> tokenize('Hello World!')
    ['Hello', 'World', '!']
    """

    def __init__(self, lang="en"):
        """Build the ``tokenizer.perl`` command line and hand it to ToolWrapper.

        Args:
            lang: Language code forwarded to the script through ``-l``.
        """
        self.lang = lang
        # The Perl script is looked up relative to this source file.
        program = path.join(
            path.dirname(__file__),
            "../tokenizer.perl"
        )
        # NOTE(review): the meaning of -q, -b and -a is defined by
        # tokenizer.perl itself and is not visible here -- confirm against
        # the script before changing any of them.
        argv = ["perl", program, "-q", "-l", self.lang, "-b", "-a"]
        super().__init__(argv)

    def __str__(self):
        """Return a constructor-like description of this tokenizer."""
        return "MosesTokenizer(lang=\"{lang}\")".format(lang=self.lang)

    def __call__(self, sentence):
        """Tokenizes a single sentence.

        Newline characters are not allowed in the sentence to be tokenized;
        trailing newlines are stripped before the check, so lines read
        straight from a file are accepted.

        Args:
            sentence: The text to tokenize, as a ``str``.

        Returns:
            The list of tokens; an empty list for an empty sentence (nothing
            is sent to the subprocess in that case).

        Raises:
            TypeError: If ``sentence`` is not a ``str``.
            ValueError: If ``sentence`` still contains a newline after the
                trailing ones have been removed.
        """
        # Explicit raises instead of ``assert``: assertions are stripped
        # under ``python -O``, which would let invalid input reach the pipe.
        if not isinstance(sentence, str):
            raise TypeError(
                "sentence must be a str, not {}".format(
                    type(sentence).__name__
                )
            )
        sentence = sentence.rstrip("\n")
        if "\n" in sentence:
            # One line is written and exactly one line is read back below;
            # an embedded newline would presumably leave extra output in the
            # pipe and desynchronise every later call.
            raise ValueError(
                "newline characters are not allowed in the sentence "
                "to be tokenized"
            )
        if not sentence:
            return []
        self.writeline(sentence)
        return self.readline().split()
|
|
|
|
|
def main():
    """Command-line entry point: run the self-tests and/or tokenize a file.

    The arguments are parsed by docopt against the module-level ``usage``
    text. With ``--selftest`` the doctests of ``mosestokenizer.tokenizer``
    are run first, and the program exits with status 0 when no ``<lang>``
    was given. Otherwise each line of ``<inputfile>`` is tokenized and its
    tokens are printed, space-separated, as one line of ``<outputfile>``.
    """
    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.tokenizer
        # Forward the parsed flag explicitly: by default doctest only turns
        # verbose when the literal token "-v" is present in sys.argv, so
        # "--verbose" would otherwise be ignored.
        doctest.testmod(
            mosestokenizer.tokenizer,
            verbose=args["--verbose"],
        )
        if not args["<lang>"]:
            sys.exit(0)
    # NOTE(review): "--old" is accepted by the usage text but is never read
    # in this function -- confirm whether it is still meant to be supported.
    tokenize = MosesTokenizer(
        args["<lang>"]
    )
    # Use the tokenizer as a context manager so that it is closed even when
    # opening a file or tokenizing a line raises.
    with tokenize:
        inputfile = openfile(args["<inputfile>"])
        outputfile = openfile(args["<outputfile>"], "wt")
        with inputfile, outputfile:
            for line in inputfile:
                print(*tokenize(line), file=outputfile)
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|