File size: 2,533 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
A module for interfacing with ``normalize-punctuation.perl`` from Moses.

Copyright ® 2016-2017, Luís Gomes <[email protected]>
"""

usage = """
Usage:
    moses-punct-normalizer [options] <lang> [<inputfile> [<outputfile>]]
    moses-punct-normalizer --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.

2016, Luís Gomes <[email protected]>
"""


from docopt import docopt
from os import path
from toolwrapper import ToolWrapper
import sys


class MosesPunctuationNormalizer(ToolWrapper):
    """A module for interfacing with ``normalize-punctuation.perl`` from Moses.

    This class communicates with normalize-punctuation.perl process via pipes.
    When the MosesPunctuationNormalizer object is no longer needed, the close()
    method should be called to free system resources. The class supports the
    context manager interface. If used in a with statement, the close() method
    is invoked automatically.

    >>> normalize = MosesPunctuationNormalizer("en")
    >>> normalize("«Hello World» — she said…")
    '"Hello World" - she said...'
    """

    def __init__(self, lang="en"):
        self.lang = lang
        program = path.join(
            path.dirname(__file__),
            "normalize-punctuation.perl"
        )
        argv = ["perl", program, "-b", "-l", self.lang]
        super().__init__(argv)

    def __str__(self):
        return "MosesPunctuationNormalizer(lang=\"{lang}\")".format(
            lang=self.lang
        )

    def __call__(self, line):
        """Normalizes punctuation of a single line of text.

        Newline characters are not allowed in the text to be normalized.
        """
        assert isinstance(line, str)
        line = line.strip()
        assert "\n" not in line
        if not line:
            return []
        self.writeline(line)
        return self.readline()


def main():
    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.punctnormalizer
        doctest.testmod(mosestokenizer.punctnormalizer)
        if not args["<lang>"]:
            sys.exit(0)
    normalize = MosesPunctuationNormalizer(args["<lang>"])
    inputfile = open(args["<inputfile>"]) if args["<inputfile>"] else sys.stdin
    outputfile = open(args["<outputfile>"], "wt") if args["<outputfile>"] else sys.stdout
    with inputfile, outputfile:
        for line in inputfile:
            print(normalize(line), file=outputfile)

if __name__ == "__main__":
    main()