sakharamg
/

NMTKD

Model card Files Files and versions Community

File size: 6,259 Bytes

158b61b

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""
Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
Moses XML format.

Note that the structure is built based on fields 9 and 10 (projective HEAD
and RELATION), which not all parsers produce.

Usage: conll2mosesxml.py [--brackets] [--no_preterminals] < input_file > output_file
"""

from __future__ import print_function, unicode_literals
import sys
import re
import codecs
from collections import (
    namedtuple,
    defaultdict,
    )
from lxml import etree as ET


# One token of a CoNLL sentence as stored by main().  `tag` holds the fifth
# input column (the fourth is read but discarded); `proj_head`/`proj_func`
# fall back to `head`/`func` when the projective head column is '_'.
Word = namedtuple(
    'Word',
    ['pos', 'word', 'lemma', 'tag', 'head', 'func', 'proj_head', 'proj_func'])

def main(output_format='xml'):
    """Convert CoNLL-X sentences read from stdin into trees on stdout.

    Input lines are accumulated into a sentence until a blank line is seen;
    the sentence is then written in ``output_format`` ('xml' or 'brackets').
    A non-projective sentence is reported on stderr and replaced by an empty
    line on stdout, so output lines stay aligned with input sentences.

    A final sentence that is not followed by a blank line is still emitted.

    Raises:
        ValueError: if a token line does not have exactly ten fields, or a
            numeric field (position / head) is not an integer.
    """
    sentence = []

    for line in sys.stdin:
        # A blank (or whitespace-only, e.g. "\r\n") line ends the sentence.
        if not line.strip():
            _emit_sentence(sentence, output_format)
            sentence = []
            continue

        sentence.append(_parse_conll_line(line))

    # Input that lacks the terminating blank line must not lose its last
    # sentence.
    if sentence:
        _emit_sentence(sentence, output_format)


# Separator used when plain whitespace splitting yields the wrong field
# count: a run of spaces optionally followed by tabs, or a run of tabs.
# Unlike the pattern ' *\t*' it can never match the empty string, which
# Python >= 3.7 would treat as a split point between every character.
_FIELD_SEPARATOR = re.compile(r' +\t*|\t+')


def _emit_sentence(sentence, output_format):
    """Write one complete sentence, or an empty line if non-projective."""
    # Index 0 is a dummy root so that word positions match list indices.
    sentence.insert(0, [])
    if is_projective(sentence):
        write(sentence, output_format)
    else:
        sys.stderr.write(
            ' '.join(w.word for w in sentence[1:]) + '\n')
        sys.stdout.write('\n')


def _parse_conll_line(line):
    """Parse one ten-column CoNLL token line into a ``Word``."""
    fields = line.split()
    if len(fields) != 10:
        # The word itself may be Unicode whitespace, which str.split()
        # swallows; retry splitting on ASCII spaces and tabs only.
        fields = _FIELD_SEPARATOR.split(line.strip())
    if len(fields) != 10:
        raise ValueError(
            'expected 10 CoNLL fields, found %d: %r' % (len(fields), line))

    # Columns 4 and 6 are read but not used; the stored tag is column 5.
    (pos, word, lemma, _tag, tag2, _morph,
     head, func, proj_head, proj_func) = fields

    word = escape_special_chars(word)
    lemma = escape_special_chars(lemma)

    # Parsers that do not produce projective columns leave them as '_'.
    if proj_head == '_':
        proj_head = head
        proj_func = func

    return Word(
        int(pos), word, lemma, tag2, int(head), func, int(proj_head),
        proj_func)


# This script performs the same escaping as escape-special-chars.perl in
# Moses.  Most of it is done in function write(), but quotation marks need
# to be processed first.
def escape_special_chars(line):
    """Return ``line`` with quotes and square brackets turned into entities.

    Single and double quotes become the XML entities ``&apos;`` / ``&quot;``;
    ``[`` and ``]`` (syntax non-terminal markers) become ``&#91;`` /
    ``&#93;``.  All other characters are left untouched.
    """
    substitutions = (
        ('\'', '&apos;'),  # xml
        ('"', '&quot;'),  # xml
        ('[', '&#91;'),  # syntax non-terminal
        (']', '&#93;'),  # syntax non-terminal
    )
    for raw, entity in substitutions:
        line = line.replace(raw, entity)
    return line


# make a check if structure is projective
def is_projective(sentence):
    """Return True if the projective-head structure of ``sentence`` is projective.

    ``sentence[0]`` is the dummy root; every other entry must expose a
    ``proj_head`` attribute convertible to ``int``.  For each word the set of
    positions it dominates (itself plus everything reachable through head
    links) is collected; the structure is projective when every such set is
    a contiguous range of positions.  On failure an error line is written
    to stderr and False is returned.
    """
    covered = defaultdict(set)
    for index, token in enumerate(sentence):
        covered[index].add(index)
        if index == 0:
            continue
        # Climb towards the root, registering this word with each ancestor.
        # Stopping when the word is already registered also ends cycles.
        ancestor = int(token.proj_head)
        while ancestor != 0 and index not in covered[ancestor]:
            covered[ancestor].add(index)
            ancestor = int(sentence[ancestor].proj_head)

    for span in covered.values():
        # A gap-free span has exactly (max - min + 1) members.
        if max(span) - min(span) != len(span) - 1:
            sys.stderr.write("error: non-projective structure.\n")
            return False
    return True


def write(sentence, output_format='xml'):
    """Print the tree for ``sentence`` on stdout in the requested format.

    Args:
        sentence: list whose index 0 is the dummy root, followed by the
            words of the sentence.
        output_format: 'xml' for Moses XML (built with lxml) or 'brackets'
            for the bracketed notation.

    Raises:
        ValueError: if ``output_format`` is neither 'xml' nor 'brackets'.
    """
    if output_format == 'xml':
        tree = create_subtree(0, sentence)
        out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
    elif output_format == 'brackets':
        out = create_brackets(0, sentence)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "unknown output format %r (expected 'xml' or 'brackets')"
            % (output_format,))

    out = out.replace('|', '&#124;')  # factor separator

    # The words already contain entities produced by escape_special_chars();
    # lxml escapes their leading '&' a second time, so undo that here.
    for entity in ('apos;', 'quot;', '#91;', '#93;'):
        out = out.replace('&amp;' + entity, '&' + entity)

    print(out)


def create_subtree(position, sentence):
    """Build the Moses XML ``<tree>`` element for the word at ``position``.

    Position 0 is the dummy root and yields a node labelled ``sent``; any
    other position yields a node labelled with the word's projective
    relation.  Dependents (words whose ``proj_head`` equals ``position``)
    are added recursively: those before the head word first, then the head
    word itself, then those after it.

    With preterminals enabled the head word is wrapped in its own ``<tree>``
    child labelled with its tag; otherwise it is stored as plain text inside
    the node (as ``text``, or as the ``tail`` of the last left dependent).

    Returns the constructed element.
    """
    # `preterminals` is a module-level flag set only by the command-line
    # entry point; fall back to the command-line default (True) so the
    # function also works when the module is imported.
    use_preterminals = globals().get('preterminals', True)

    element = ET.Element('tree')
    element.set('label', sentence[position].proj_func if position else 'sent')

    # Dependents to the left of the head word.
    for i in range(1, position):
        if sentence[i].proj_head == position:
            element.append(create_subtree(i, sentence))

    # The head word itself (the dummy root has none).
    if position:
        token = sentence[position]
        if use_preterminals:
            head = ET.Element('tree')
            head.set('label', token.tag)
            head.text = token.word
            element.append(head)
        elif len(element):
            # Text following a child element is stored as that child's tail.
            element[-1].tail = token.word
        else:
            element.text = token.word

    # Dependents to the right; `i and` skips the dummy root at index 0.
    for i in range(position, len(sentence)):
        if i and sentence[i].proj_head == position:
            element.append(create_subtree(i, sentence))

    return element


# write node in bracket format (Penn treebank style)
def create_brackets(position, sentence):
    """Return the bracketed (Penn treebank style) string for ``position``.

    Position 0 is the dummy root and yields ``[ sent ... ] ``; any other
    position yields ``[ <relation> ... ] ``.  Inside the brackets come the
    left dependents, the head word, then the right dependents, each
    dependent being rendered recursively.

    With preterminals enabled the head word is rendered as
    ``[ <tag> <word> ] ``; otherwise as the bare word.  In both modes the
    node's bracket is closed only after its right dependents, so they stay
    inside their head's constituent (matching the XML output).
    """
    # `preterminals` is a module-level flag set only by the command-line
    # entry point; fall back to the command-line default (True) so the
    # function also works when the module is imported.
    use_preterminals = globals().get('preterminals', True)

    if position:
        token = sentence[position]
        parts = ['[ ' + token.proj_func + ' ']
    else:
        token = None
        parts = ['[ sent ']

    # Dependents to the left of the head word.
    parts.extend(
        create_brackets(i, sentence)
        for i in range(1, position)
        if sentence[i].proj_head == position)

    # The head word itself (the dummy root has none).
    if token is not None:
        if use_preterminals:
            parts.append('[ ' + token.tag + ' ' + token.word + ' ] ')
        else:
            parts.append(token.word + ' ')

    # Dependents to the right; `i and` skips the dummy root at index 0.
    parts.extend(
        create_brackets(i, sentence)
        for i in range(position, len(sentence))
        if i and sentence[i].proj_head == position)

    parts.append('] ')
    return ''.join(parts)

if __name__ == '__main__':
    # Python 2 only: wrap the standard streams so they read and write
    # UTF-8 text.
    if sys.version_info[0] < 3:
        utf8_reader = codecs.getreader('UTF-8')
        utf8_writer = codecs.getwriter('UTF-8')
        sys.stdin = utf8_reader(sys.stdin)
        sys.stdout = utf8_writer(sys.stdout)
        sys.stderr = utf8_writer(sys.stderr)

    # Module-level flag read by create_subtree() and create_brackets().
    preterminals = '--no_preterminals' not in sys.argv

    main('brackets' if '--brackets' in sys.argv else 'xml')