|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long "GetOptions"; |
|
use FindBin qw($RealBin); |
|
|
|
# Command-line settings.  Every variable stays undefined unless its option is
# given, except $UNPARSEABLE, which starts at 0 because it is later compared
# numerically.
# NOTE(review): $MARK_SPLIT and $BINARIZE are accepted on the command line but
# are not read anywhere else in the visible part of this script -- confirm
# whether they are meant to have an effect.
my ($EGRET_DIR, $MOSES_DIR, $TREE_CONVERTER, $FOREST, $SPLIT_HYPHEN,
    $SPLIT_SLASH, $MARK_SPLIT, $BINARIZE, $UNPARSEABLE, $RAW_IN, $RAW_OUT,
    $EGRET_OPTIONS, $TREE_CONVERTER_OPTIONS);

$UNPARSEABLE = 0;

# Getopt::Long specification: option name => variable that receives the value.
# "=s" options take a string argument; the others are plain flags.
my %option_spec = (
    'egret-dir=s'              => \$EGRET_DIR,
    'moses-dir=s'              => \$MOSES_DIR,
    'tree-converter=s'         => \$TREE_CONVERTER,
    'forest'                   => \$FOREST,
    'split-hyphen'             => \$SPLIT_HYPHEN,
    'split-slash'              => \$SPLIT_SLASH,
    'mark-split'               => \$MARK_SPLIT,
    'binarize'                 => \$BINARIZE,
    'unparseable'              => \$UNPARSEABLE,
    'raw-in=s'                 => \$RAW_IN,
    'raw-out=s'                => \$RAW_OUT,
    'egret-options=s'          => \$EGRET_OPTIONS,
    'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS,
);

my $usage_message = "ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n";

# Bail out with the usage text if parsing failed or a mandatory option
# (-egret-dir, -moses-dir, -tree-converter) is missing.
my $options_ok = GetOptions(%option_spec);
my $required_given = defined($EGRET_DIR)
    && defined($MOSES_DIR)
    && defined($TREE_CONVERTER);
if (!$options_ok || !$required_given) {
    die($usage_message);
}

# The two directories must exist and the tree converter must be executable.
if (!-d $EGRET_DIR) {
    die("ERROR: could not find egret directory: '$EGRET_DIR'\n");
}
if (!-d $MOSES_DIR) {
    die("ERROR: could not find moses directory: '$MOSES_DIR'\n");
}
if (!-x $TREE_CONVERTER) {
    die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n");
}
|
|
|
|
|
|
|
use File::Temp qw(tempfile);

# Stage 1: read STDIN once and fan it out into three temporary files.
#
#   $tmpEscaped     - verbatim copy of the input lines.
#   $tmpDeescaped   - the input with "@-@" / "@/@" separated tokens re-joined
#                     (when -split-hyphen / -split-slash are given), piped
#                     through tokenizer/deescape-special-chars.perl.
#   $tmpSplitPoints - one line per input line holding a "J,POS,CHAR " triple
#                     for every join performed on that line.
#
# The files are created with File::Temp so that the names are unpredictable
# and each file is freshly created (the previous fixed "/tmp/...$$" names were
# opened in append mode and could pick up stale or planted content).
# UNLINK => 1 removes them at exit even if the script dies early.
my ($escaped_fh, $tmpEscaped) =
    tempfile('parse-en-egret.1.XXXXXX', TMPDIR => 1, UNLINK => 1);
my ($deescaped_placeholder_fh, $tmpDeescaped) =
    tempfile('parse-en-egret.2.XXXXXX', TMPDIR => 1, UNLINK => 1);
my ($split_points_fh, $tmpSplitPoints) =
    tempfile('parse-en-egret.3.XXXXXX', TMPDIR => 1, UNLINK => 1);

# The de-escaped file is written by the child process through a shell
# redirect, so our own handle on it is not needed.
close($deescaped_placeholder_fh);

# Single-quote both paths for the shell so that directories containing spaces
# or metacharacters do not break the command.
my @deescape_words;
for my $path ("$RealBin/../../tokenizer/deescape-special-chars.perl",
              $tmpDeescaped) {
    (my $escaped_path = $path) =~ s/'/'\\''/g;
    push @deescape_words, "'$escaped_path'";
}
open(my $deescaped_fh, '|-', "$deescape_words[0] > $deescape_words[1]")
    or die("ERROR: could not run deescape-special-chars.perl: $!\n");

# Separator tokens that trigger a join, mapped to the character inserted in
# their place.  Only the separators enabled on the command line are listed.
my %joiner_for;
$joiner_for{'@-@'} = '-' if defined($SPLIT_HYPHEN);
$joiner_for{'@/@'} = '/' if defined($SPLIT_SLASH);

while (my $line = <STDIN>) {
    print {$escaped_fh} $line;

    my @tokens       = split ' ', $line;
    my $merged_token = "";    # token currently being assembled
    my $merged_index = -1;    # position of $merged_token in the output line
    my $sentence     = "";    # completed tokens, space separated
    my $split_points = "";    # "index,offset,char " triples for this line
    my $i            = 0;

    while ($i <= $#tokens) {
        # A separator only joins when another token follows it; a trailing
        # separator is emitted as an ordinary token.
        my $joiner = $i < $#tokens ? $joiner_for{ $tokens[$i] } : undef;

        if (defined($joiner)) {
            # Glue the following token onto the current one and remember the
            # character offset at which the join happened.
            my $offset = length $merged_token;
            $merged_token .= $joiner . $tokens[ $i + 1 ];
            $split_points .= "$merged_index,$offset,$joiner ";
            $i += 2;
        } else {
            # Flush the finished token and start a new one.
            $sentence .= "$merged_token ";
            $merged_token = $tokens[$i];
            $i++;
            $merged_index++;
        }
    }
    $sentence .= $merged_token;

    # The first flush emits an empty token, which leaves a leading space.
    $sentence =~ s/^\s+//;

    # $split_points never starts with whitespace, so it is written as built
    # (each triple keeps its trailing space).
    print {$deescaped_fh} "$sentence\n";
    print {$split_points_fh} "$split_points\n";
}

# Buffered write errors and a failing de-escape child only surface at close,
# so every close is checked.
close($split_points_fh)
    or die("ERROR: could not write '$tmpSplitPoints': $!\n");
unless (close($deescaped_fh)) {
    die($! ? "ERROR: could not write to deescape-special-chars.perl: $!\n"
           : "ERROR: deescape-special-chars.perl exited with status "
             . ($? >> 8) . "\n");
}
close($escaped_fh)
    or die("ERROR: could not write '$tmpEscaped': $!\n");
|
|
|
|
|
|
|
|
|
# Stage 2: assemble the shell pipeline that produces the parses.
#
# Each element of @stages is one command; they are joined with "|" at the end.
# The resulting $pipeline string deliberately ends in " |" so that it can be
# opened as a read pipe.
#
# User-supplied paths are single-quoted for the shell so that spaces or
# metacharacters in them cannot break (or alter) the command.  The free-form
# option strings ($EGRET_OPTIONS, $TREE_CONVERTER_OPTIONS) are passed through
# unquoted on purpose: they may contain several options.
my $quote_for_shell = sub {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
};

my @stages;

# Source of raw parser output: either a previously saved file (-raw-in) or a
# fresh Egret run over the de-escaped input.
if (defined($RAW_IN)) {
    push @stages, 'cat ' . $quote_for_shell->($RAW_IN);
} else {
    my $egret_command = $quote_for_shell->("$EGRET_DIR/egret");
    $egret_command .= ' -lapcfg';
    $egret_command .= ' -data=' . $quote_for_shell->("$EGRET_DIR/eng_grammar");
    $egret_command .= ' -printForest' if $FOREST;
    $egret_command .= ' -i=' . $quote_for_shell->($tmpDeescaped);
    $egret_command .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS);
    push @stages, $egret_command;
}

# Optionally keep a copy of the raw parser output (-raw-out).
if (defined($RAW_OUT)) {
    push @stages, 'tee ' . $quote_for_shell->($RAW_OUT);
}

# In 1-best (non-forest) mode, strip one leading "(" and one trailing ")"
# from every line and convert from Penn to Egret format, so that the next
# stage receives Egret format in both modes.
unless ($FOREST) {
    push @stages,
        q{sed 's/^(//'},
        q{sed 's/)$//'},
        $quote_for_shell->($TREE_CONVERTER)
            . ' -input_format penn'
            . ' -output_format egret';
}

# Post-process the Egret output, passing the split-point file written earlier.
{
    my $postprocess_command =
        $quote_for_shell->("$MOSES_DIR/bin/postprocess-egret-forests");
    $postprocess_command .= ' --Escape' if $FOREST;
    $postprocess_command .=
        ' --MarkSplitPoints ' . $quote_for_shell->($tmpSplitPoints);
    push @stages, $postprocess_command;
}

# Convert to the final tree format (Egret for forests, Penn otherwise) and
# let the converter split the joined tokens back apart.
my $output_format = $FOREST ? "egret" : "penn";
{
    my $convert_command = $quote_for_shell->($TREE_CONVERTER);
    $convert_command .= ' -input_format egret';
    $convert_command .= " -output_format $output_format";
    $convert_command .= ' -split @-@' if defined($SPLIT_HYPHEN);
    $convert_command .= ' -split @/@' if defined($SPLIT_SLASH);
    $convert_command .= " $TREE_CONVERTER_OPTIONS"
        if defined($TREE_CONVERTER_OPTIONS);
    push @stages, $convert_command;
}

# In 1-best mode, rewrap each tree the way berkeleyparsed2mosesxml.perl is fed
# here ("( (...))", with "(())" for empty lines), convert to Moses XML, and
# rename the TOP label to ROOT.
unless ($FOREST) {
    push @stages,
        q{sed 's/^()$//'},
        q{sed 's/^(/( (/'},
        q{sed 's/)$/))/'},
        q{sed 's/^$/(())/'},
        $quote_for_shell->("$RealBin/berkeleyparsed2mosesxml.perl"),
        q{sed 's/^<tree label="TOP"/<tree label="ROOT"/'};
}

my $pipeline = join(' | ', @stages) . ' |';
|
|
|
|
|
|
|
# Stage 3: run the pipeline, write its output to STDOUT, and clean up.
#
# $pipeline ends with a "|" marker (two-argument read-pipe style); strip it so
# the command can be opened with the explicit three-argument '-|' mode.
(my $parse_command = $pipeline) =~ s/\s*\|\s*\z//;
open(my $parse_fh, '-|', $parse_command)
    or die("ERROR: could not start parse pipeline: $!\n");

if ($FOREST) {
    # Forest output is passed through unchanged.
    while (my $forest_line = <$parse_fh>) {
        print $forest_line;
    }
} else {
    # Walk the parser output and the saved copy of the input in lockstep so
    # that, with -unparseable, an empty parse (a line holding only a newline)
    # can be replaced by the corresponding original input line.
    open(my $original_fh, '<', $tmpEscaped)
        or die("ERROR: could not read '$tmpEscaped': $!\n");

    while (my $parsed_line = <$parse_fh>) {
        my $original_line = <$original_fh>;

        # Guard against the input copy running out before the parser output
        # does; in that case fall back to the parser's own line instead of
        # printing an undefined value.
        my $use_original = $UNPARSEABLE == 1
            && length($parsed_line) == 1
            && defined($original_line);

        print $use_original ? $original_line : $parsed_line;
    }

    close($original_fh);
}

# close() on a pipe reports the exit status of the command; surface a failure
# instead of dropping it silently.  This only warns, so output already written
# is kept and the cleanup below still runs.
unless (close($parse_fh)) {
    warn($! ? "WARNING: error closing parse pipeline: $!\n"
            : "WARNING: parse pipeline exited with status "
              . ($? >> 8) . "\n");
}

# Remove the temporary files directly rather than shelling out to rm.
unlink($tmpSplitPoints, $tmpDeescaped, $tmpEscaped);
|
|