|
""" |
|
A module for interfacing with ``normalize-punctuation.perl`` from Moses. |
|
|
|
Copyright © 2016-2017, Luís Gomes <[email protected]>
|
""" |
|
|
|
usage = """ |
|
Usage: |
|
moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]] |
|
moses-punct-normalizer --selftest [--verbose] |
|
|
|
Options: |
|
--selftest, -t Run selftests. |
|
--verbose, -v Be more verbose. |
|
|
|
2016, Luís Gomes <[email protected]> |
|
""" |
|
|
|
|
|
from docopt import docopt |
|
from os import path |
|
from toolwrapper import ToolWrapper |
|
import sys |
|
|
|
|
|
class MosesPunctuationNormalizer(ToolWrapper):
    """A module for interfacing with ``normalize-punctuation.perl`` from Moses.

    This class communicates with normalize-punctuation.perl process via pipes.
    When the MosesPunctuationNormalizer object is no longer needed, the close()
    method should be called to free system resources. The class supports the
    context manager interface. If used in a with statement, the close() method
    is invoked automatically.

    >>> normalize = MosesPunctuationNormalizer("en")
    >>> normalize("«Hello World» — she said…")
    '"Hello World" - she said...'
    """

    def __init__(self, lang="en"):
        """Start the Perl normalizer for the given language.

        Args:
            lang: Language code handed to the script through its ``-l``
                option. Defaults to ``"en"``.
        """
        self.lang = lang
        # The Perl script is expected to sit next to this Python module.
        program = path.join(
            path.dirname(__file__),
            "normalize-punctuation.perl",
        )
        # NOTE(review): "-b" presumably disables output buffering in the Perl
        # script so that each written line yields a readable line right away;
        # confirm against the script's own option handling.
        argv = ["perl", program, "-b", "-l", self.lang]
        super().__init__(argv)

    def __str__(self):
        """Return a constructor-like description of this normalizer."""
        return "MosesPunctuationNormalizer(lang=\"{lang}\")".format(
            lang=self.lang
        )

    def __call__(self, line):
        """Normalizes punctuation of a single line of text.

        Newline characters are not allowed in the text to be normalized.
        Leading and trailing whitespace is stripped before the text is sent
        to the Perl process.

        Args:
            line: The text to normalize; must be a ``str``.

        Returns:
            The line read back from the Perl process, or an empty string
            when ``line`` is empty or contains only whitespace.

        Raises:
            AssertionError: If ``line`` is not a ``str`` or still contains a
                newline after stripping (only when assertions are enabled).
        """
        assert isinstance(line, str)
        line = line.strip()
        assert "\n" not in line
        if not line:
            # Nothing to normalize: skip the round trip to the subprocess and
            # return a string, matching the type returned for non-empty
            # input (this previously returned an empty list).
            return ""
        self.writeline(line)
        return self.readline()
|
|
|
|
|
def main():
    """Command-line entry point: normalize punctuation line by line.

    Parses the command line with docopt according to the module-level
    ``usage`` text. With ``--selftest`` the doctests of
    ``mosestokenizer.punctnormalizer`` are run first, and the program exits
    with status 0 afterwards if no ``<lang>`` was given. Otherwise every line
    of the input (``<inputfile>`` or standard input) is normalized and
    printed to the output (``<outputfile>`` or standard output).
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import ExitStack

    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.punctnormalizer
        # Forward the parsed flag explicitly; otherwise doctest only looks
        # for a literal "-v" in sys.argv and ignores "--verbose".
        doctest.testmod(
            mosestokenizer.punctnormalizer,
            verbose=bool(args["--verbose"]),
        )
        if not args["<lang>"]:
            sys.exit(0)

    with ExitStack() as stack:
        # Entering the normalizer as a context manager makes sure close() is
        # called and the Perl subprocess is released, even on errors.
        normalize = stack.enter_context(
            MosesPunctuationNormalizer(args["<lang>"])
        )
        # Only files opened here are registered for closing; sys.stdin and
        # sys.stdout are left open for the rest of the process.
        if args["<inputfile>"]:
            inputfile = stack.enter_context(open(args["<inputfile>"]))
        else:
            inputfile = sys.stdin
        if args["<outputfile>"]:
            outputfile = stack.enter_context(open(args["<outputfile>"], "wt"))
        else:
            outputfile = sys.stdout

        for line in inputfile:
            print(normalize(line), file=outputfile)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
|