|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on |
|
dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces |
|
Moses XML format. |
|
|
|
Note that the structure is built based on fields 9 and 10 (projective HEAD |
|
and RELATION), which not all parsers produce. |
|
|
|
Usage: conll2mosesxml.py [--brackets] [--no_preterminals] < input_file > output_file |
|
""" |
|
|
|
from __future__ import print_function, unicode_literals |
|
import sys |
|
import re |
|
import codecs |
|
from collections import ( |
|
namedtuple, |
|
defaultdict, |
|
) |
|
from lxml import etree as ET |
|
|
|
|
|
# One token of a sentence.  main() fills `pos`, `head` and `proj_head` with
# ints and stores the second part-of-speech column of the input in `tag`.
Word = namedtuple(
    'Word',
    'pos word lemma tag head func proj_head proj_func',
)
|
|
|
|
|
def main(output_format='xml'):
    """Convert CoNLL sentences read from stdin and print them to stdout.

    Token lines are collected until a blank line ends the sentence.  A
    projective sentence is handed to write(); for a non-projective one the
    words are echoed to stderr and an empty line is written to stdout.

    output_format -- 'xml' or 'brackets'; passed through to write().

    Raises ValueError if a token line does not have exactly ten fields or
    if one of its numeric columns cannot be converted to int.
    """
    sentence = []

    for line in sys.stdin:
        # Any whitespace-only line (including "\r\n") ends the sentence.
        if not line.strip():
            _finish_sentence(sentence, output_format)
            sentence = []
            continue
        sentence.append(_parse_token(line))

    # Input without a final blank line would otherwise lose its last
    # sentence.
    if sentence:
        _finish_sentence(sentence, output_format)


def _parse_token(line):
    """Turn one ten-column CoNLL token line into a Word."""
    fields = line.split()
    if len(fields) != 10:
        # str.split() also splits on Unicode whitespace, so (presumably) a
        # token that is itself such a character makes a column vanish.
        # Retry on plain spaces and tabs only.  The pattern must not match
        # the empty string: since Python 3.7 re.split() also splits on
        # empty matches, which would cut the line into single characters.
        fields = re.split(r'[ \t]+', line.strip())

    # Unpacking raises ValueError if there are still not exactly 10 fields.
    (pos, word, lemma, _tag, tag2, _morph, head, func,
     proj_head, proj_func) = fields

    # No projective columns given: fall back to the plain head/relation.
    if proj_head == '_':
        proj_head = head
        proj_func = func

    return Word(
        int(pos), escape_special_chars(word), escape_special_chars(lemma),
        tag2, int(head), func, int(proj_head), proj_func)


def _finish_sentence(sentence, output_format):
    """Write one completed sentence, or report it if non-projective."""
    # Index 0 stands for the artificial root, so the head columns can be
    # used directly as list indices (assumes token IDs run 1..n in order).
    sentence.insert(0, [])
    if is_projective(sentence):
        write(sentence, output_format)
    else:
        sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
        sys.stdout.write('\n')
|
|
|
|
|
|
|
|
|
|
|
def escape_special_chars(line):
    """Return `line` with quotes and square brackets replaced by entities.

    The apostrophe becomes &apos;, the double quote &quot;, and the square
    brackets become &#91; and &#93;.  The characters <, > and & are not
    touched by this function.
    """
    replacements = (
        ("'", '&apos;'),
        ('"', '&quot;'),
        ('[', '&#91;'),
        (']', '&#93;'),
    )
    for char, entity in replacements:
        line = line.replace(char, entity)
    return line
|
|
|
|
|
|
|
def is_projective(sentence):
    """Return True if every token dominates a contiguous span of positions.

    sentence -- list whose index 0 is a root placeholder and whose other
    items expose a `proj_head` attribute (anything int() accepts).

    For each position the set of positions it dominates (itself included)
    is collected by following `proj_head` upwards from every token.  The
    walk stops at head 0, or as soon as the token is already recorded for
    the current ancestor, which also ends the walk on a cyclic head chain.
    If any collected set has a gap, an error line is written to stderr and
    False is returned.
    """
    span = defaultdict(set)

    for index, token in enumerate(sentence):
        span[index].add(index)
        if index == 0:
            # Placeholder for the root; it has no head to follow.
            continue
        ancestor = int(token.proj_head)
        while ancestor != 0 and index not in span[ancestor]:
            span[ancestor].add(index)
            ancestor = int(sentence[ancestor].proj_head)

    for covered in span.values():
        # A gap-free set of integers has exactly max - min + 1 members.
        if max(covered) - min(covered) + 1 != len(covered):
            sys.stderr.write("error: non-projective structure.\n")
            return False

    return True
|
|
|
|
|
def write(sentence, output_format='xml'):
    """Serialise one sentence and print it to stdout.

    sentence -- list with a root placeholder at index 0, followed by the
    Word tuples of the sentence.
    output_format -- 'xml' builds the tree with create_subtree() and
    serialises it; 'brackets' uses create_brackets().

    Raises ValueError for any other output_format.
    """
    if output_format == 'xml':
        tree = create_subtree(0, sentence)
        out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
    elif output_format == 'brackets':
        out = create_brackets(0, sentence)
    else:
        raise ValueError('unknown output format: %r' % (output_format,))

    out = out.replace('|', '&#124;')

    # The words already carry entities from escape_special_chars(); XML
    # serialisation escapes their "&" a second time.  Undo that so each
    # entity appears exactly once.
    double_escaped = (
        ('&amp;apos;', '&apos;'),
        ('&amp;quot;', '&quot;'),
        ('&amp;#91;', '&#91;'),
        ('&amp;#93;', '&#93;'),
    )
    for escaped_twice, escaped_once in double_escaped:
        out = out.replace(escaped_twice, escaped_once)

    print(out)
|
|
|
|
|
def create_subtree(position, sentence):
    """Build the Moses XML <tree> element for the node at `position`.

    position -- index into `sentence`; 0 denotes the sentence root, which
    is labelled 'sent'.  Every other node is labelled with its proj_func.
    sentence -- list with a root placeholder at index 0, followed by Words.

    Dependents (tokens whose proj_head equals `position`) are added
    recursively, those before the head word first.  The head word goes
    into a child <tree> labelled with its tag when the module-level
    `preterminals` flag is true; otherwise it is stored as plain text of
    the element (or as tail text of the last left dependent).

    Returns the new element.
    """
    is_root = not position
    node = ET.Element('tree')
    node.set('label', 'sent' if is_root else sentence[position].proj_func)

    # Dependents that precede the head word.
    for left in range(1, position):
        if sentence[left].proj_head == position:
            node.append(create_subtree(left, sentence))

    # The head word itself; the root has none.
    if not is_root:
        token = sentence[position]
        if preterminals:
            leaf = ET.Element('tree')
            leaf.set('label', token.tag)
            leaf.text = token.word
            node.append(leaf)
        elif len(node):
            # Text placed after the last left dependent.
            node[-1].tail = token.word
        else:
            node.text = token.word

    # Remaining dependents; index 0 is the placeholder and is skipped.
    for right in range(position, len(sentence)):
        if right and sentence[right].proj_head == position:
            node.append(create_subtree(right, sentence))

    return node
|
|
|
|
|
|
|
def create_brackets(position, sentence):
    """Return the bracketed form of the subtree rooted at `position`.

    position -- index into `sentence`; 0 denotes the sentence root, which
    is labelled 'sent'.  Every other node is labelled with its proj_func.
    sentence -- list with a root placeholder at index 0, followed by Words.

    The result looks like '[ label left-dependents head right-dependents ] '.
    When the module-level `preterminals` flag is true the head word is
    wrapped as '[ tag word ] '; otherwise the bare word is used.  In both
    modes the dependents on either side of the head word stay inside the
    node's own bracket, matching the nesting built by create_subtree().
    """
    if position:
        token = sentence[position]
        parts = ['[ ' + token.proj_func + ' ']
    else:
        parts = ['[ sent ']

    # Dependents that precede the head word.
    for i in range(1, position):
        if sentence[i].proj_head == position:
            parts.append(create_brackets(i, sentence))

    # The head word itself; the root has none.
    if position:
        if preterminals:
            parts.append('[ ' + token.tag + ' ' + token.word + ' ] ')
        else:
            # Do not close the bracket here: dependents that follow the
            # word still belong to this node.
            parts.append(token.word + ' ')

    # Dependents that follow the head word.  Starting after `position`
    # keeps a token that names itself as head from recursing forever.
    for i in range(position + 1, len(sentence)):
        if sentence[i].proj_head == position:
            parts.append(create_brackets(i, sentence))

    parts.append('] ')
    return ''.join(parts)
|
|
|
if __name__ == '__main__':
    # Python 2 only: wrap the standard streams in UTF-8 readers/writers.
    if sys.version_info[0] < 3:
        utf8_writer = codecs.getwriter('UTF-8')
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
        sys.stdout = utf8_writer(sys.stdout)
        sys.stderr = utf8_writer(sys.stderr)

    # Module-level flag read by create_subtree() and create_brackets().
    preterminals = '--no_preterminals' not in sys.argv

    main('brackets' if '--brackets' in sys.argv else 'xml')
|
|