|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Convert trees in moses XML format to PTB-style bracketed format.""" |
|
|
|
from __future__ import print_function, unicode_literals |
|
import sys |
|
import codecs |
|
|
|
from lxml import etree as ET |
|
|
|
|
|
# Characters that carry special meaning in Moses input, paired with the
# entity each one is replaced by.  None of the replacement strings contains
# a character that is itself escaped, so the order of application is
# irrelevant.
_MOSES_ESCAPES = (
    ('|', '&#124;'),   # factor separator
    ('[', '&#91;'),    # syntax non-terminal (opening)
    (']', '&#93;'),    # syntax non-terminal (closing)
    ('\'', '&apos;'),
    ('"', '&quot;'),
)


def escape(word):
    """Return *word* with Moses-reserved characters replaced by entities.

    The characters ``|``, ``[``, ``]``, ``'`` and ``"`` are rewritten to
    ``&#124;``, ``&#91;``, ``&#93;``, ``&apos;`` and ``&quot;`` respectively.
    All other characters are left untouched.

    :param word: a terminal token (text of a leaf node).
    :return: the escaped token.
    """
    # Plain ``replace`` calls are used (rather than ``translate``) so that
    # both byte strings and unicode strings are accepted under Python 2.
    for char, entity in _MOSES_ESCAPES:
        word = word.replace(char, entity)
    return word
|
|
|
|
|
def make_brackets(xml):
    """Render an XML tree node as a bracketed string.

    The result has the form `` [LABEL word]`` for a node that carries
    non-whitespace text, and `` [LABEL`` + rendered children + ``]``
    otherwise.  Note the leading space: callers are expected to strip the
    final string.  When a node has non-whitespace text, its child elements
    are not visited.

    :param xml: an element exposing ``get('label')``, ``text`` and iteration
        over its children.
    :return: the bracketed representation of the subtree rooted at *xml*.
    """
    # Concatenate eagerly so a missing ``label`` attribute fails right here.
    opening = ' [' + xml.get('label')

    token = xml.text.strip() if xml.text else ''
    if token:
        # Leaf node: emit the escaped terminal and close the bracket.
        return opening + ' ' + escape(token) + ']'

    # Inner node: render every child in document order, then close.
    parts = [opening]
    for node in xml:
        parts.append(make_brackets(node))
    parts.append(']')
    return ''.join(parts)
|
|
|
|
|
if __name__ == '__main__':

    # Python 2 standard streams are byte-oriented; wrap them so that the
    # script reads and writes UTF-8 text.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # One tree per input line; one bracketed tree per output line.
    for line in sys.stdin:
        # Pass blank lines through so output stays line-aligned with input.
        # Whitespace-only lines (e.g. '\r\n') are treated as blank too,
        # instead of being handed to the XML parser, which would reject them.
        if not line.strip():
            sys.stdout.write('\n')
            continue
        out = make_brackets(ET.fromstring(line)).strip()
        sys.stdout.write(out + '\n')
|
|