# Web-page residue (not part of the module): uploader "sakharamg", commit message "Uploading all files", revision 158b61b.
"""
A module for interfacing with ``normalize-punctuation.perl`` from Moses.
Copyright © 2016-2017, Luís Gomes <[email protected]>
"""
# Command-line help text parsed by docopt.  The layout is significant:
# usage patterns must be indented under "Usage:", and every option must be
# separated from its description by at least two spaces, otherwise docopt
# treats the first word of the description as the option's argument.
usage = """
Usage:
    moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]]
    moses-punct-normalizer --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.

2016, Luís Gomes <[email protected]>
"""
from docopt import docopt
from os import path
from toolwrapper import ToolWrapper
import sys
class MosesPunctuationNormalizer(ToolWrapper):
    """A module for interfacing with ``normalize-punctuation.perl`` from Moses.

    This class communicates with normalize-punctuation.perl process via pipes.
    When the MosesPunctuationNormalizer object is no longer needed, the close()
    method should be called to free system resources. The class supports the
    context manager interface. If used in a with statement, the close() method
    is invoked automatically.

    >>> normalize = MosesPunctuationNormalizer("en")
    >>> normalize("«Hello World» — she said…")
    '"Hello World" - she said...'
    >>> normalize("")
    ''
    """

    def __init__(self, lang="en"):
        """Start the Perl normalizer process for the language code *lang*.

        The ``normalize-punctuation.perl`` script is looked up in the same
        directory as this module.
        """
        self.lang = lang
        program = path.join(
            path.dirname(__file__),
            "normalize-punctuation.perl"
        )
        # NOTE(review): "-b" presumably disables output buffering in the Perl
        # script (needed for line-by-line pipe communication) and "-l" selects
        # the language -- confirm against the script's own option handling.
        argv = ["perl", program, "-b", "-l", self.lang]
        super().__init__(argv)

    def __str__(self):
        """Return a readable description that includes the language code."""
        return "MosesPunctuationNormalizer(lang=\"{lang}\")".format(
            lang=self.lang
        )

    def __call__(self, line):
        """Normalize the punctuation of a single line of text.

        Leading and trailing whitespace is stripped first. Newline characters
        are not allowed inside the text to be normalized, because one line is
        written to the Perl process and exactly one line is read back.

        Returns the normalized text as a string. An empty (or whitespace-only)
        line yields the empty string without contacting the Perl process.
        """
        assert isinstance(line, str), "line must be a str"
        line = line.strip()
        assert "\n" not in line, "line must not contain newline characters"
        if not line:
            # Return the same type as the normal path (str, not a list) so
            # callers can print or concatenate the result unconditionally.
            return ""
        self.writeline(line)
        return self.readline()
def main():
    """Command-line entry point: normalize punctuation line by line.

    Parses the command line with docopt. With ``--selftest`` the module's
    doctests are run (verbosely when ``--verbose`` is given); if no language
    was supplied the program then exits with status 0. Otherwise every line
    of the input file (or stdin) is normalized and written to the output
    file (or stdout).
    """
    from contextlib import ExitStack

    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.punctnormalizer
        # Honour --verbose, which the usage text advertises for selftests.
        doctest.testmod(
            mosestokenizer.punctnormalizer,
            verbose=args["--verbose"],
        )
        if not args["<lang>"]:
            sys.exit(0)

    with ExitStack() as stack:
        normalize = MosesPunctuationNormalizer(args["<lang>"])
        # Make sure the Perl subprocess is released even if I/O fails below.
        stack.callback(normalize.close)

        # Only files opened here are registered for closing; the process-wide
        # stdin/stdout streams are left open.
        if args["<inputfile>"]:
            inputfile = stack.enter_context(open(args["<inputfile>"]))
        else:
            inputfile = sys.stdin
        if args["<outputfile>"]:
            outputfile = stack.enter_context(open(args["<outputfile>"], "wt"))
        else:
            outputfile = sys.stdout

        for line in inputfile:
            # A blank input line must stay a blank output line; "or" guards
            # against a falsy non-string result being printed literally.
            print(normalize(line) or "", file=outputfile)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()