sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /scripts /training /wrappers /conll2mosesxml.py

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

6.26 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# Author: Rico Sennrich
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	"""
	Takes a file in the CoNLL dependency format (from the CoNLL-X shared task on
	dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) and produces
	Moses XML format.

	Note that the structure is built based on fields 9 and 10 (projective HEAD
	and RELATION), which not all parsers produce.

	Usage: conll2mosesxml.py [--brackets] [--no_preterminals] < input_file > output_file
	"""

	from __future__ import print_function, unicode_literals
	import sys
	import re
	import codecs
	from collections import (
	namedtuple,
	defaultdict,
	)
	from lxml import etree as ET


	# One token of a CoNLL sentence as stored by main(): `pos`, `head` and
	# `proj_head` are ints, `word` and `lemma` are already entity-escaped, and
	# `tag` holds the fifth input column.
	_WORD_FIELDS = [
	    'pos',
	    'word',
	    'lemma',
	    'tag',
	    'head',
	    'func',
	    'proj_head',
	    'proj_func',
	]
	Word = namedtuple('Word', _WORD_FIELDS)


	def main(output_format='xml'):
	    """Convert CoNLL sentences read from stdin into Moses trees on stdout.

	    Each non-blank input line must hold the ten CoNLL columns; a blank line
	    ends the current sentence.  A sentence still pending at end of input
	    (no trailing blank line) is converted as well instead of being dropped.

	    output_format: 'xml' or 'brackets', passed through to write().

	    Raises ValueError if a line cannot be split into exactly ten columns
	    or if a numeric column is not an integer.
	    """
	    words = []

	    for line in sys.stdin:

	        # A blank line (including "\r\n" or whitespace only) closes the
	        # sentence.
	        if not line.strip():
	            _emit_sentence(words, output_format)
	            words = []
	            continue

	        fields = line.split()
	        if len(fields) != 10:
	            # A column may itself be unicode whitespace, which str.split()
	            # swallows; fall back to splitting on ASCII spaces/tabs only.
	            fields = re.split('[ \t]+', line.strip())

	        (pos, word, lemma, _, tag2, _,
	         head, func, proj_head, proj_func) = fields

	        word = escape_special_chars(word)
	        lemma = escape_special_chars(lemma)

	        # Parsers that do not produce projective columns leave '_' there;
	        # use the plain HEAD/RELATION columns in that case.
	        if proj_head == '_':
	            proj_head = head
	            proj_func = func

	        words.append(
	            Word(
	                int(pos), word, lemma, tag2, int(head), func,
	                int(proj_head), proj_func))

	    if words:
	        _emit_sentence(words, output_format)


	def _emit_sentence(words, output_format):
	    """Write one sentence, or a blank line if it is not projective."""
	    # Index 0 is a dummy root entry so that word positions are 1-based.
	    sentence = [[]] + words
	    if is_projective(sentence):
	        write(sentence, output_format)
	    else:
	        sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
	        # Keep output line-aligned with the input sentences.
	        sys.stdout.write('\n')


	# This script performs the same escaping as escape-special-chars.perl in
	# Moses. Most of it is done in function write(), but quotation marks need
	# to be processed first.
	def escape_special_chars(line):
	    """Return `line` with quotes and square brackets entity-escaped.

	    Replaces ' with &apos;, " with &quot;, [ with &#91; and ] with &#93;.
	    Other characters are left unchanged.
	    """
	    line = line.replace('\'', '&apos;')  # xml
	    line = line.replace('"', '&quot;')  # xml
	    line = line.replace('[', '&#91;')  # syntax non-terminal
	    line = line.replace(']', '&#93;')  # syntax non-terminal

	    return line


	# make a check if structure is projective
	def is_projective(sentence):
	    """Return True if every word dominates a contiguous span of positions.

	    sentence: list whose element 0 is a dummy root entry; every other
	        element must expose `proj_head` (position of its head, 0 = root).

	    On failure a message is written to stderr and False is returned.  A
	    head position outside the sentence is reported the same way instead of
	    raising IndexError (or silently indexing from the end when negative).
	    """
	    size = len(sentence)
	    dominates = defaultdict(set)

	    for i, w in enumerate(sentence):
	        dominates[i].add(i)
	        if not i:
	            continue
	        # Walk up the head chain, recording i under each ancestor.
	        head = int(w.proj_head)
	        while head != 0:
	            if not 0 < head < size:
	                sys.stderr.write("error: head index out of range.\n")
	                return False
	            if i in dominates[head]:
	                # Already recorded: stops the walk on a cycle.
	                break
	            dominates[head].add(i)
	            head = int(sentence[head].proj_head)

	    for dependents in dominates.values():
	        # A contiguous set of positions spans exactly len - 1.
	        if max(dependents) - min(dependents) != len(dependents) - 1:
	            sys.stderr.write("error: non-projective structure.\n")
	            return False
	    return True


	def write(sentence, output_format='xml'):
	    """Serialise one sentence and print it to stdout.

	    sentence: list whose element 0 is a dummy root entry, followed by the
	        Word tuples of the sentence.
	    output_format: 'xml' (tree built by create_subtree) or 'brackets'
	        (string built by create_brackets).

	    Raises ValueError for any other output_format.
	    """
	    if output_format == 'xml':
	        tree = create_subtree(0, sentence)
	        out = ET.tostring(tree, encoding='UTF-8').decode('UTF-8')
	    elif output_format == 'brackets':
	        out = create_brackets(0, sentence)
	    else:
	        raise ValueError('unknown output format: %s' % output_format)

	    out = out.replace('|', '&#124;')  # factor separator

	    # Words were entity-escaped before the tree was built (see
	    # escape_special_chars), so serialisation escapes their '&' a second
	    # time; turn '&amp;apos;' etc. back into the single-escaped form.
	    for entity in ('apos', 'quot', '#91', '#93'):
	        out = out.replace('&amp;' + entity + ';', '&' + entity + ';')

	    print(out)


	def create_subtree(position, sentence):
	    """Build the Moses XML node for the word at `position`.

	    position: index into `sentence`; 0 builds the top node labelled 'sent'.
	    sentence: list whose element 0 is a dummy root entry, followed by Word
	        tuples.

	    Returns a 'tree' element labelled with the word's proj_func.  It
	    contains the subtrees of the dependents to the left of the word, the
	    word itself, then the subtrees of the dependents to its right.

	    Reads the module-level flag `preterminals`: when true the word is
	    wrapped in its own 'tree' element labelled with the tag, otherwise it
	    is attached as plain text.
	    """
	    element = ET.Element('tree')

	    if position:
	        element.set('label', sentence[position].proj_func)
	    else:
	        element.set('label', 'sent')

	    # Dependents that precede the word.
	    for i in range(1, position):
	        if sentence[i].proj_head == position:
	            element.append(create_subtree(i, sentence))

	    if position:
	        word = sentence[position].word

	        if preterminals:
	            head = ET.Element('tree')
	            head.set('label', sentence[position].tag)
	            head.text = word
	            element.append(head)
	        elif len(element):
	            # Text following the last left dependent.
	            element[-1].tail = word
	        else:
	            element.text = word

	    # Dependents that follow the word; index 0 is the dummy root entry.
	    for i in range(position, len(sentence)):
	        if i and sentence[i].proj_head == position:
	            element.append(create_subtree(i, sentence))

	    return element


	# write node in bracket format (Penn treebank style)
	def create_brackets(position, sentence):
	    """Return the bracketed string for the word at `position`.

	    position: index into `sentence`; 0 builds the top '[ sent ... ]' node.
	    sentence: list whose element 0 is a dummy root entry, followed by Word
	        tuples.

	    The node is '[ LABEL left-dependents WORD right-dependents ] ', with
	    LABEL taken from the word's proj_func.  Reads the module-level flag
	    `preterminals`: when true WORD is written as '[ TAG WORD ] ',
	    otherwise as the bare word.

	    The closing bracket is always written after the right dependents, so
	    they stay inside their head's constituent in both modes, matching the
	    nesting produced by create_subtree().
	    """
	    if position:
	        element = "[ " + sentence[position].proj_func + ' '
	    else:
	        element = "[ sent "

	    # Dependents that precede the word.
	    for i in range(1, position):
	        if sentence[i].proj_head == position:
	            element += create_brackets(i, sentence)

	    if position:
	        word = sentence[position].word
	        tag = sentence[position].tag

	        if preterminals:
	            element += '[ ' + tag + ' ' + word + ' ] '
	        else:
	            element += word + ' '

	    # Dependents that follow the word; index 0 is the dummy root entry.
	    for i in range(position, len(sentence)):
	        if i and sentence[i].proj_head == position:
	            element += create_brackets(i, sentence)

	    element += '] '

	    return element

	if __name__ == '__main__':
	    if sys.version_info < (3, 0, 0):
	        # Python 2: wrap the standard streams so they read/write UTF-8.
	        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
	        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
	        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)

	    # Module-level flag read by create_subtree() and create_brackets().
	    preterminals = '--no_preterminals' not in sys.argv

	    main('brackets' if '--brackets' in sys.argv else 'xml')