|
"""
A module for interfacing with ``split-sentences.perl`` from the Moses toolkit.

Copyright © 2016-2017, Luís Gomes <[email protected]>
"""
|
|
|
# Command-line help text. It is passed to docopt() in main(), which derives
# the accepted arguments from it, so the layout is significant: every option
# name must be separated from its description by at least two spaces, and
# continuation lines of a description must be indented.
usage = """
Usage:
    moses-sentence-splitter [options] <lang> [<inputfile> [<outputfile>]]
    moses-sentence-splitter --selftest [--verbose]

Options:
    --selftest, -t  Run selftests.
    --verbose, -v   Be more verbose.
    --unwrap, -u    Assume that the text is wrapped and try to unwrap it.
                    Note that this option will cause all consecutive non-empty
                    lines to be buffered in memory. If you give this option
                    make sure that you have empty lines separating paragraphs.
                    When this option is not given, each line is assumed to be
                    an independent paragraph or sentence and thus will not be
                    joined with other lines.
    --more          Also split on colons and semi-colons.

2016, Luís Gomes <[email protected]>
"""
|
|
|
|
|
from docopt import docopt |
|
from openfile import openfile |
|
from os import path |
|
from toolwrapper import ToolWrapper |
|
import sys |
|
|
|
|
|
class MosesSentenceSplitter(ToolWrapper):
    """
    Wrapper around the ``split-sentences.perl`` script from the Moses toolkit.

    The perl script is run as a child process and spoken to through pipes.
    Call close() once the splitter is no longer needed so that system
    resources are released; alternatively use the object in a ``with``
    statement, in which case close() is invoked automatically.

    When the ``more`` constructor argument is true, the script is started
    with ``-m`` so that colons and semi-colons are also treated as sentence
    separators.

    >>> split_sents = MosesSentenceSplitter('en')
    >>> split_sents(['Hello World! Hello', 'again.'])
    ['Hello World!', 'Hello again.']

    """

    def __init__(self, lang="en", more=True):
        """Start the splitter process for language ``lang``.

        The perl script is looked up in the directory containing this module.
        """
        self.lang = lang
        script = path.join(path.dirname(__file__), "split-sentences.perl")
        extra_flags = ["-m"] if more else []
        super().__init__(
            ["perl", script, "-q", "-b", "-l", self.lang] + extra_flags
        )

    def __str__(self):
        """Return a short description that includes the language code."""
        return f'MosesSentenceSplitter(lang="{self.lang}")'

    def __call__(self, paragraph):
        """Split one paragraph into sentences.

        ``paragraph`` is a list (or tuple) of non-empty strings, one per
        line. XML-like tags are not allowed. An empty paragraph yields an
        empty list without contacting the child process.

        Returns the list of sentences read back from the script.
        """
        assert isinstance(paragraph, (list, tuple))
        if not paragraph:
            return []
        assert all(isinstance(line, str) for line in paragraph)
        stripped = [line.strip() for line in paragraph]
        assert all(stripped), "blank lines are not allowed"

        # Feed the lines followed by a "<P>" marker; the same marker coming
        # back on the output side signals the end of this paragraph.
        for text in stripped:
            self.writeline(text)
        self.writeline("<P>")

        # NOTE(review): if readline() keeps returning "" once the child
        # process has exited, this never sees "<P>" and will not terminate --
        # confirm against ToolWrapper.
        return list(iter(lambda: self.readline().strip(), "<P>"))
|
|
|
|
|
def read_paragraphs(inputfile, wrapped=True):
    """Yield paragraphs (lists of stripped lines) read from ``inputfile``.

    ``inputfile`` is any iterable of strings. Every line is stripped of
    surrounding whitespace before use.

    With ``wrapped`` true, runs of consecutive non-empty lines are grouped
    into one paragraph and empty lines only act as separators (they never
    produce empty paragraphs).

    With ``wrapped`` false, every input line becomes its own paragraph: a
    one-element list for a non-empty line, an empty list for an empty one.
    """
    stripped_lines = map(str.strip, inputfile)

    if not wrapped:
        for text in stripped_lines:
            if text:
                yield [text]
            else:
                yield []
        return

    pending = []
    for text in stripped_lines:
        if text:
            pending.append(text)
            continue
        # Empty line: flush whatever has been collected so far.
        if pending:
            yield pending
            pending = []
    # Input may end without a trailing empty line.
    if pending:
        yield pending
|
|
|
|
|
def write_paragraphs(paragraphs, outputfile, blank_sep=True):
    """Print every sentence of every paragraph to ``outputfile``.

    Each sentence goes on its own line. An empty line is printed after a
    paragraph when ``blank_sep`` is true, and always after an empty
    paragraph regardless of ``blank_sep``.
    """
    for sentences in paragraphs:
        for sentence in sentences:
            print(sentence, file=outputfile)
        if not blank_sep and sentences:
            # Non-empty paragraph and no separator requested.
            continue
        print(file=outputfile)
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the command line with docopt according to ``usage``. With
    ``--selftest`` the module's doctests are run first (verbosely when
    ``--verbose``/``-v`` is given), and the program exits with status 0 if
    no language was supplied. Otherwise the input is read as paragraphs,
    each paragraph is split into sentences, and the result is written to
    the output.
    """
    args = docopt(usage)
    if args["--selftest"]:
        import doctest
        import mosestokenizer.sentsplitter
        # Pass the flag explicitly: by default doctest only looks for a
        # literal "-v" in sys.argv, so "--verbose" (or a combined "-tv")
        # would otherwise be ignored.
        doctest.testmod(
            mosestokenizer.sentsplitter,
            verbose=args["--verbose"],
        )
        if not args["<lang>"]:
            sys.exit(0)

    split_sents = MosesSentenceSplitter(args["<lang>"], more=args["--more"])
    try:
        inputfile = openfile(args["<inputfile>"])
        outputfile = openfile(args["<outputfile>"], "wt")
        with inputfile, outputfile:
            paragraphs = read_paragraphs(inputfile, wrapped=args["--unwrap"])
            # map() is lazy: paragraphs are split one at a time while
            # write_paragraphs() consumes them.
            paragraphs = map(split_sents, paragraphs)
            write_paragraphs(
                paragraphs, outputfile, blank_sep=args["--unwrap"]
            )
    finally:
        # Release the child process even if reading or writing fails.
        split_sents.close()
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|