sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /scripts /training /wrappers /senna2brackets.py

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

3 kB

	#!/usr/bin/env python
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	"""
	Read SENNA output (from stdin), extract the parse trees, and write them in
	PTB-style bracketed format (to stdout).

	The SENNA output is assumed to contain tokens in the first column, POS tags
	in the second column, and PSG fragments in the final column.

	It is also assumed that SENNA was run through the parse-en-senna.perl wrapper,
	which:

	- Substitutes the special "SENTENCE_TOO_LONG" token for sentences that
	exceed SENNA's hardcoded limit.

	- Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")",
	etc.
	"""

	import optparse
	import os
	import sys


	# Maps the literal bracket tokens produced by the parse-en-senna.perl wrapper
	# back to their PTB escape names, so they cannot be confused with tree
	# structure in the bracketed output.
	_BRACKET_TOKENS = {
	    "(": "-LRB-",
	    ")": "-RRB-",
	    "[": "-LSB-",
	    "]": "-RSB-",
	    "{": "-LCB-",
	    "}": "-RCB-",
	}


	def _emit_tree(tree, line_num, berkeley):
	    """Validate, format and write one sentence's tree as one output line.

	    An unbalanced tree is replaced by an empty one (after a warning), so
	    that the output still has exactly one line per input sentence.
	    """
	    if not balanced(tree):
	        warn("unbalanced parentheses in tree ending at line %d: "
	             "discarding tree" % line_num)
	        tree = ""
	    tree = beautify(tree)
	    if berkeley:
	        tree = berkelify(tree)
	    # sys.stdout.write works under both Python 2 and Python 3, unlike the
	    # Python 2-only print statement.
	    sys.stdout.write(tree + "\n")


	def main():
	    """Convert SENNA output on stdin to bracketed trees on stdout.

	    Each sentence (a run of non-blank lines terminated by a blank line or
	    by end of input) yields exactly one output line. Sentences marked with
	    the special "SENTENCE_TOO_LONG" token, and sentences whose tree has
	    unbalanced parentheses, yield an empty tree.
	    """
	    usage = "usage: %prog [options]"
	    parser = optparse.OptionParser(usage=usage)
	    parser.add_option("--berkeley-style", action="store_true", default=False,
	                      dest="berkeley",
	                      help="mimic the Berkeley Parser's output format")
	    (options, args) = parser.parse_args()
	    if len(args) > 0:
	        parser.error("incorrect number of arguments")

	    fragments = []
	    # True once any non-blank line has been read for the current sentence;
	    # needed because a SENTENCE_TOO_LONG sentence contributes no fragments
	    # but must still produce an output line.
	    pending = False
	    line_num = 0
	    for line_num, line in enumerate(sys.stdin, 1):
	        # Check for a blank line (the sentence delimiter).
	        if line.strip() == "":
	            _emit_tree("".join(fragments), line_num, options.berkeley)
	            fragments = []
	            pending = False
	            continue
	        pending = True
	        tokens = line.split()
	        word = tokens[0]
	        # Check for the special "SENTENCE_TOO_LONG" token (see
	        # parse-en-senna.perl)
	        if word == "SENTENCE_TOO_LONG":
	            continue
	        pos, frag = tokens[1], tokens[-1]
	        # Restore -LRB-, -RRB-, etc.
	        word = _BRACKET_TOKENS.get(word, word)
	        fragments.append(frag.replace("*", "(%s %s)" % (pos, word)))

	    # Flush the final sentence if the input did not end with a blank line;
	    # otherwise it would be silently dropped.
	    if pending:
	        _emit_tree("".join(fragments), line_num, options.berkeley)


	def balanced(s):
	    """Return True if the parentheses in s are properly nested.

	    Every ")" must close an earlier "(" and every "(" must eventually be
	    closed. Characters other than "(" and ")" are ignored. Comparing the
	    two counts alone is not enough: a string such as ")(" has equal counts
	    but is not a well-formed bracketing.
	    """
	    depth = 0
	    for char in s:
	        if char == "(":
	            depth += 1
	        elif char == ")":
	            depth -= 1
	            if depth < 0:
	                # A closing parenthesis with nothing open to match it.
	                return False
	    return depth == 0


	def beautify(tree):
	    """Put a space before every "(" in tree and trim the ends.

	    Leading and trailing whitespace of the result is stripped, so the
	    space inserted before the first "(" does not survive.
	    """
	    pieces = tree.split("(")
	    spaced = " (".join(pieces)
	    return spaced.strip()


	def berkelify(tree):
	    """Return tree with its root label replaced by "TOP".

	    An empty tree is returned as "(())". Otherwise tree must start with
	    "(" and contain a " (" that ends the root label, i.e. it must look
	    like "(ROOT (CHILD ...) ...)".

	    Only the root label itself is rewritten. A global string replacement
	    would also corrupt any other label or token that happens to contain
	    the root label's text (e.g. root "S" inside "NNS").
	    """
	    if tree == "":
	        return "(())"
	    assert tree[0] == "("
	    pos = tree.find(" (", 1)
	    assert pos != -1
	    # Keep everything after the old root label unchanged.
	    return "(TOP" + tree[pos:]


	def warn(msg):
	    """Write msg to stderr as a warning prefixed with the script's name."""
	    script = os.path.basename(sys.argv[0])
	    text = "%s: warning: %s\n" % (script, msg)
	    sys.stderr.write(text)


	# Script entry point: run main() only when this file is executed directly.
	if __name__ == "__main__":
	main()