sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /scripts /training /wrappers /parse-en-egret.perl

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

5.97 kB

	#!/usr/bin/env perl
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	use warnings;
	use strict;
	use Getopt::Long "GetOptions";
	use FindBin qw($RealBin);

	my ($EGRET_DIR,$MOSES_DIR,$TREE_CONVERTER,$FOREST,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE,$UNPARSEABLE,$RAW_IN,$RAW_OUT,$EGRET_OPTIONS,$TREE_CONVERTER_OPTIONS);

	$UNPARSEABLE = 0;

	die("ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n")
	unless &GetOptions
	('egret-dir=s' => \$EGRET_DIR,
	'moses-dir=s' => \$MOSES_DIR,
	'tree-converter=s' => \$TREE_CONVERTER,
	'forest' => \$FOREST,
	'split-hyphen' => \$SPLIT_HYPHEN,
	'split-slash' => \$SPLIT_SLASH,
	'mark-split' => \$MARK_SPLIT,
	'binarize' => \$BINARIZE,
	'unparseable' => \$UNPARSEABLE,
	'raw-in=s' => \$RAW_IN,
	'raw-out=s' => \$RAW_OUT,
	'egret-options=s' => \$EGRET_OPTIONS,
	'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS
	)
	&& defined($EGRET_DIR) && defined($MOSES_DIR) && defined($TREE_CONVERTER);

	die("ERROR: could not find egret directory: '$EGRET_DIR'\n") unless -d $EGRET_DIR;
	die("ERROR: could not find moses directory: '$MOSES_DIR'\n") unless -d $MOSES_DIR;
	die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n") unless -x $TREE_CONVERTER;

	# Pre-processing.

	my $tmpEscaped = "/tmp/parse-en-egret.1.$$";
	my $tmpDeescaped = "/tmp/parse-en-egret.2.$$";
	my $tmpSplitPoints = "/tmp/parse-en-egret.3.$$";

	open(ESCAPED, ">>$tmpEscaped");
	open(DEESCAPED, "\| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmpDeescaped");
	open(SPLIT_POINTS, ">>$tmpSplitPoints");

	# Unsplit hyphen and slashes and write a file indicating where split points
	# are required in later post-processing.
	while(<STDIN>) {
	print ESCAPED $_;
	my @tokens = split;
	my $new_token = "";
	my $i = 0; # current token index in input sentence
	my $j = -1; # current token index in output sentence
	my $s = ""; # output sentence
	my $t = ""; # split point line
	while ($i <= $#tokens) {
	if (defined($SPLIT_HYPHEN) && $i <= $#tokens-1 &&
	$tokens[$i] eq "\@\-\@") {
	my $pos = length $new_token;
	$new_token .= "-$tokens[$i+1]";
	$t .= "$j,$pos,- ";
	$i += 2;
	} elsif (defined($SPLIT_SLASH) && $i <= $#tokens-1 &&
	$tokens[$i] eq "\@\/\@") {
	my $pos = length $new_token;
	$new_token .= "/$tokens[$i+1]";
	$t .= "$j,$pos,/ ";
	$i += 2;
	} else {
	$s .= "$new_token ";
	$new_token = $tokens[$i];
	$i++;
	$j++;
	}
	}
	$s .= "$new_token";
	$s =~ s/^\s+//;
	$t =~ s/^\s+//;
	print DEESCAPED "$s\n";
	print SPLIT_POINTS "$t\n";
	}

	close(SPLIT_POINTS);
	close(DEESCAPED);
	close(ESCAPED);

	# Construct the parsing / post-processing pipeline:

	# Stage 1: Parse (unless the user has provided Egret input via -raw-in option).
	my $pipeline = "";
	if (defined($RAW_IN)) {
	$pipeline .= "cat \"$RAW_IN\" \|";
	} else {
	$pipeline .= "$EGRET_DIR/egret";
	$pipeline .= " -lapcfg";
	$pipeline .= " -data=$EGRET_DIR/eng_grammar";
	$pipeline .= " -printForest" if $FOREST;
	$pipeline .= " -i=$tmpDeescaped";
	$pipeline .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS);
	$pipeline .= " \|";
	}
	if (defined($RAW_OUT)) {
	$pipeline .= "tee \"$RAW_OUT\" \|";
	}

	# Stage 2: Convert trees to forests (unless we already have forests)
	unless ($FOREST) {
	$pipeline .= 'sed \'s/^(//\' \|'; # Remove opening (
	$pipeline .= 'sed \'s/)$//\' \|'; # Remove closing )
	$pipeline .= "$TREE_CONVERTER";
	$pipeline .= " -input_format penn";
	$pipeline .= " -output_format egret";
	$pipeline .= " \|";
	}

	# Stage 3: Postprocess using Moses' postprocess-egret-forests
	# This performs some minor transformations to the forest: Moses-style escaping
	# of special characters; removal of Egret's "^g" suffixes from constituent
	# labels; and marking of slash/hyphen split points (using @ characters).
	$pipeline .= "$MOSES_DIR/bin/postprocess-egret-forests";
	$pipeline .= " --Escape" if $FOREST;
	$pipeline .= " --MarkSplitPoints $tmpSplitPoints";
	$pipeline .= " \|";

	# Stage 4: Postprocess using Travatar's tree-converter.
	# This normalizes the forest weights and performs hyphen / slash splitting (if
	# requested). The option -tree-converter-options can be used to enable
	# additional tree-converter transformations (such as binarization).
	#my $output_format = $FOREST ? "egret" : "mosesxml";
	my $output_format = $FOREST ? "egret" : "penn";
	$pipeline .= "$TREE_CONVERTER";
	$pipeline .= " -input_format egret";
	$pipeline .= " -output_format $output_format";
	# FIXME Single split option
	$pipeline .= " -split \@\-\@" if defined($SPLIT_HYPHEN);
	$pipeline .= " -split \@\/\@" if defined($SPLIT_SLASH);
	$pipeline .= " $TREE_CONVERTER_OPTIONS" if defined($TREE_CONVERTER_OPTIONS);
	$pipeline .= " \|";

	unless ($FOREST) {
	$pipeline .= 'sed \'s/^()$//\' \|'; # Remove empty trees (failed parses)
	$pipeline .= 'sed \'s/^(/( (/\' \|'; # Add Berkeley-style opening ( + blank
	$pipeline .= 'sed \'s/)$/))/\' \|'; # Add Berkeley-style closing )
	$pipeline .= 'sed \'s/^$/(())/\' \|'; # Restore empty trees (Berkeley-style)
	$pipeline .= "$RealBin/berkeleyparsed2mosesxml.perl \|";
	$pipeline .= 'sed \'s/^<tree label="TOP"/<tree label="ROOT"/\' \|';
	}

	# Run the parsing / post-processing pipeline.

	open(PARSE, $pipeline);

	if ($FOREST) {
	while (<PARSE>) {
	print $_;
	}
	} else {
	open(TMPESCAPED, $tmpEscaped);
	while (<PARSE>) {
	my $outLine = $_;
	my $unparsedLine = <TMPESCAPED>;
	if ($UNPARSEABLE == 1 && length($outLine) == 1) {
	print $unparsedLine;
	} else {
	print $outLine;
	}
	}
	}

	close(PARSE);

	`rm $tmpSplitPoints`;
	`rm $tmpDeescaped`;
	`rm $tmpEscaped`;