File size: 1,440 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
"""Convert trees in moses XML format to PTB-style bracketed format."""
from __future__ import print_function, unicode_literals
import sys
import codecs
from lxml import etree as ET
def escape(word):
    """Return *word* with Moses-reserved characters replaced by entities.

    The characters handled are the factor separator ``|``, the syntax
    non-terminal brackets ``[`` and ``]``, and the single and double quote
    characters.  Every other character is returned unchanged.
    """
    replacements = (
        # Factor separator:
        ('|', '&#124;'),
        # Syntax non-terminal:
        ('[', '&#91;'),
        # Syntax non-terminal:
        (']', '&#93;'),
        ('\'', '&apos;'),
        ('\"', '&quot;'),
    )
    # None of the replacement strings contains a character that a later
    # pair matches, so applying the pairs in sequence never double-escapes.
    for char, entity in replacements:
        word = word.replace(char, entity)
    return word
def make_brackets(xml):
    """Return the bracketed string for the XML element *xml*.

    The result has the form `` [LABEL ...]`` (note the leading space), where
    LABEL is the element's ``label`` attribute.  An element whose text is
    non-empty after stripping is rendered as a leaf, `` [LABEL word]``, with
    the word passed through ``escape``; its children, if any, are not
    visited.  Otherwise the children are rendered recursively, in document
    order, between the label and the closing bracket.
    """
    label = xml.get('label')
    text = xml.text.strip() if xml.text else ''

    if text:
        # Leaf node: the stripped text is the terminal word.
        return ' [' + label + ' ' + escape(text) + ']'

    # Internal (or empty) node: concatenate the children's renderings.
    pieces = [' [' + label]
    pieces.extend(make_brackets(child) for child in xml)
    pieces.append(']')
    return ''.join(pieces)
if __name__ == '__main__':
    # Reads one XML tree per line from stdin and writes one bracketed tree
    # per line to stdout.

    # On Python 2 the standard streams are wrapped in UTF-8 codecs so that
    # they read and write unicode text.
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    for line in sys.stdin:
        # An empty input line is passed through unchanged so that output
        # lines stay aligned with input lines.
        if line == '\n':
            sys.stdout.write(line)
            continue
        # make_brackets() output starts with a space; strip it for the
        # top-level tree.
        out = make_brackets(ET.fromstring(line)).strip()
        sys.stdout.write(out + '\n')
|