File size: 4,448 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
A module for interfacing with ``split-sentences.perl`` from Moses toolkit.

Copyright © 2016-2017, Luís Gomes <[email protected]>
"""

# Command-line help text.  main() hands this string to docopt(), which
# derives the accepted arguments and options from it, so its wording and
# layout are part of the program's behaviour.
usage = """
Usage:
    moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]]
    moses-sentence-splitter --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.
    --unwrap, -u    Assume that the text is wrapped and try to unwrap it.
                    Note that this option will cause all consecutive non-empty
                    lines to be buffered in memory.  If you give this option
                    make sure that you have empty lines separating paragraphs.
                    When this option is not given, each line is assumed to be
                    an independent paragraph or sentence and thus will not be
                    joined with other lines.
    --more          Also split on colons and semi-colons.

2016, Luís Gomes <[email protected]>
"""


from docopt import docopt
from openfile import openfile
from os import path
from toolwrapper import ToolWrapper
import sys


class MosesSentenceSplitter(ToolWrapper):
    """
    Sentence splitter backed by ``split-sentences.perl`` from Moses toolkit.

    A split-sentences.perl process is driven through pipes.  Call close()
    once the MosesSentenceSplitter object is no longer needed so that system
    resources are released.  The class also supports the context manager
    interface: inside a with statement close() is invoked automatically.

    When ``more`` is True (the default), colons and semi-colons are also
    considered sentence separators.

    >>> split_sents = MosesSentenceSplitter('en')
    >>> split_sents(['Hello World! Hello', 'again.'])
    ['Hello World!', 'Hello again.']

    """

    def __init__(self, lang="en", more=True):
        """Build the perl command line for ``lang`` and pass it to ToolWrapper.

        ``-m`` is appended to the command only when ``more`` is true.
        """
        self.lang = lang
        # The perl script is expected to sit next to this module.
        script = path.join(path.dirname(__file__), "split-sentences.perl")
        command = ["perl", script, "-q", "-b", "-l", self.lang]
        if more:
            command += ["-m"]
        super().__init__(command)

    def __str__(self):
        """Return a short description that includes the language code."""
        return "MosesSentenceSplitter(lang=\"{lang}\")".format(lang=self.lang)

    def __call__(self, paragraph):
        """Split a paragraph into sentences.

        The paragraph is a list (or tuple) of non-empty lines.  XML-like tags
        are not allowed.  Returns the list of sentences; an empty paragraph
        yields an empty list without contacting the subprocess.
        """
        assert isinstance(paragraph, (list, tuple))
        if not paragraph:  # nothing to send for an empty paragraph
            return []
        assert all(isinstance(line, str) for line in paragraph)
        stripped = [line.strip() for line in paragraph]
        assert all(stripped), "blank lines are not allowed"
        # Send every line, then the "<P>" marker that closes the paragraph.
        for outgoing in stripped + ["<P>"]:
            self.writeline(outgoing)
        # Collect stripped output lines until the "<P>" marker comes back.
        return list(iter(lambda: self.readline().strip(), "<P>"))


def read_paragraphs(inputfile, wrapped=True):
    """Yield paragraphs, each a list of stripped lines, from ``inputfile``.

    ``inputfile`` is any iterable of strings.  When ``wrapped`` is true,
    consecutive non-blank lines are gathered into one paragraph and blank
    lines act only as separators (runs of blank lines produce no empty
    paragraphs).  Otherwise every line is its own paragraph: ``[line]`` for
    a non-blank line and ``[]`` for a blank one.
    """
    if not wrapped:
        for raw in inputfile:
            text = str.strip(raw)
            yield [text] if text else []
        return

    pending = []
    for raw in inputfile:
        text = str.strip(raw)
        if text:
            pending.append(text)
        elif pending:
            # A blank line closes the paragraph collected so far.
            yield pending
            pending = []
    # Flush the last paragraph when the input does not end with a blank line.
    if pending:
        yield pending


def write_paragraphs(paragraphs, outputfile, blank_sep=True):
    """Print each paragraph's sentences to ``outputfile``, one per line.

    A blank line is printed after every paragraph when ``blank_sep`` is
    true.  When it is false, a blank line is printed only for empty
    paragraphs.
    """
    for sentences in paragraphs:
        for text in sentences:
            print(text, file=outputfile)
        if not blank_sep and sentences:
            # Non-empty paragraph and separators are disabled.
            continue
        print(file=outputfile)  # paragraph separator


def main():
    """Command-line entry point: run the self-tests and/or split sentences.

    The command line is parsed by docopt according to ``usage``.

    With ``--selftest`` the module's doctests are run.  If no ``<lang>`` was
    given the process then exits, with status 1 when any doctest failed and
    0 otherwise.

    Otherwise paragraphs are read from ``<inputfile>``, split into sentences
    and written to ``<outputfile>``.  ``--unwrap`` selects both paragraph
    joining on input and blank-line separators on output.
    """
    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.sentsplitter
        # Pass the parsed flag explicitly so the documented ``--verbose``
        # option is honoured (doctest itself only looks for a literal "-v"
        # in sys.argv when ``verbose`` is left unset).
        results = doctest.testmod(
            mosestokenizer.sentsplitter,
            verbose=args["--verbose"],
        )
        if not args["<lang>"]:
            # Report failing self-tests through the exit status.
            sys.exit(1 if results.failed else 0)
    split_sents = MosesSentenceSplitter(args["<lang>"], more=args["--more"])
    # The splitter owns a subprocess; use it as a context manager so that it
    # is closed even if opening the files or processing the input fails.
    with split_sents:
        inputfile = openfile(args["<inputfile>"])
        outputfile = openfile(args["<outputfile>"], "wt")
        with inputfile, outputfile:
            paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"])
            # map() is lazy: splitting happens while write_paragraphs()
            # consumes the paragraphs, i.e. while everything is still open.
            paragraphs = map(split_sents, paragraphs)
            write_paragraphs(
                paragraphs, outputfile, blank_sep=args["--unwrap"]
            )


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()