|
|
|
|
|
|
|
|
|
|
|
use strict;
use warnings;

use autodie;
use File::Temp qw(tempfile);
use FindBin qw($RealBin);
use Getopt::Long "GetOptions";
|
|
|
# Command-line settings, filled in by GetOptions below.  They are file-scoped
# lexicals because the rest of the script reads them directly.
my ($SENNA,
    $SENNA_DIR,
    $SENNA_OPTIONS,
    $SPLIT_HYPHEN,
    $SPLIT_SLASH,
    $MARK_SPLIT,
    $BINARIZE,
    $UNPARSEABLE,
    $RAW_IN,
    $RAW_OUT);

# -unparseable is compared numerically further down, so it needs a defined
# default even when the switch is not given.
$UNPARSEABLE = 0;

my $options_ok = GetOptions(
    'senna=s'         => \$SENNA,
    'senna-dir=s'     => \$SENNA_DIR,
    'senna-options=s' => \$SENNA_OPTIONS,
    'split-hyphen'    => \$SPLIT_HYPHEN,
    'split-slash'     => \$SPLIT_SLASH,
    'mark-split'      => \$MARK_SPLIT,
    'binarize'        => \$BINARIZE,
    'unparseable'     => \$UNPARSEABLE,
    'raw-in=s'        => \$RAW_IN,
    'raw-out=s'       => \$RAW_OUT,
);

# Both -senna and -senna-dir are mandatory according to the usage text.
# Checking -senna-dir here as well avoids running the -d test below on an
# undefined value, which only produced "uninitialized" warnings and an error
# message about a directory named ''.
die("ERROR: syntax is: parse-en-senna.perl [-senna-options OPTIONS] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] -senna PATH -senna-dir PATH < in > out\n")
    unless $options_ok && defined($SENNA) && defined($SENNA_DIR);

die("ERROR: file not found or not executable: '$SENNA'\n") unless -x $SENNA;
die("ERROR: could not find SENNA directory: '$SENNA_DIR'\n") unless -d $SENNA_DIR;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stage the input in two temporary files: a verbatim copy of STDIN (read back
# later as the fallback for unparseable sentences) and a copy prepared for
# SENNA, written through the tokenizer's de-escaping script.
#
# File::Temp creates the files atomically with unpredictable names; the former
# "/tmp/parse-en-senna.N.$$" names could be pre-created or symlinked by another
# local user.  The files stay under /tmp so that their names remain free of
# shell metacharacters, and UNLINK => 1 removes them at exit even if the
# script dies part-way through.
my ($tmp_original_out, $tmpOriginal) =
    tempfile("parse-en-senna.1.XXXXXXXX", DIR => "/tmp", UNLINK => 1);
my ($tmp_processed_placeholder, $tmpProcessed) =
    tempfile("parse-en-senna.2.XXXXXXXX", DIR => "/tmp", UNLINK => 1);

# Only the name of the second file is needed here: the shell redirection of
# the de-escaping command below writes its contents.
close($tmp_processed_placeholder);

# Single-quote the helper's path so an installation directory containing
# spaces or other shell metacharacters still works.
my $deescape_script = "$RealBin/../../tokenizer/deescape-special-chars.perl";
$deescape_script =~ s/'/'\\''/g;

open(my $deescape_pipe, '|-', "'$deescape_script' > $tmpProcessed");

while (my $line = <STDIN>) {
    print {$tmp_original_out} $line;

    # Measure the line in bytes, not characters, whatever I/O layers are
    # active on STDIN.
    my $num_bytes = do { use bytes; length($line) };

    # Over-long lines are replaced by a one-token placeholder so the line
    # count stays aligned with the original input.
    # NOTE(review): 1023 presumably reflects SENNA's input line limit -- confirm.
    if ($num_bytes > 1023) {
        print {$deescape_pipe} "SENTENCE_TOO_LONG\n";
        next;
    }

    # Turn the Penn-Treebank-style bracket tokens back into plain brackets.
    $line =~ s/-LRB-/(/g;
    $line =~ s/-RRB-/)/g;
    $line =~ s/-LSB-/[/g;
    $line =~ s/-RSB-/]/g;
    $line =~ s/-LCB-/{/g;
    $line =~ s/-RCB-/}/g;

    # Unsplit hyphens.
    $line =~ s/ \@-\@ /-/g if $SPLIT_HYPHEN;
    # Unsplit slashes.
    $line =~ s/ \@\/\@ /\//g if $SPLIT_SLASH;

    print {$deescape_pipe} $line;
}

close($tmp_original_out);
close($deescape_pipe);
|
|
|
|
|
|
|
|
|
# Quote a string so that /bin/sh passes it on as exactly one literal word:
# wrap it in single quotes and spell every embedded single quote as '\''.
sub shell_quote {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
}

# Assemble the shell pipeline that produces the final parse.  Every stage is
# appended with a trailing " |" so that the finished string can be opened as
# a command to read from.
#
# All file names are single-quoted: previously they were interpolated bare
# (or inside double quotes, which still expand $ and backticks), so paths
# with spaces or shell metacharacters broke the command.
my $pipeline = "";

# Stage 1: obtain SENNA output, either from a saved file or by running SENNA
# on the preprocessed input.
if (defined($RAW_IN)) {
    $pipeline .= "cat " . shell_quote($RAW_IN) . " |";
}
else {
    $pipeline .= "cat " . shell_quote($tmpProcessed) . " |";

    # SENNA's -path argument is given with a trailing slash.
    my $path = $SENNA_DIR;
    $path .= "/" if $path !~ m{/$};

    $pipeline .= " " . shell_quote($SENNA)
               . " -path " . shell_quote($path)
               . " -usrtokens";
    # Deliberately left unquoted: this may hold several separate options.
    $pipeline .= " $SENNA_OPTIONS" if defined($SENNA_OPTIONS);
    $pipeline .= " |";
}

# Optionally keep a copy of the raw SENNA output.
if (defined($RAW_OUT)) {
    $pipeline .= " tee " . shell_quote($RAW_OUT) . " |";
}

# Stage 2: convert SENNA output to bracketed trees, then to Moses XML.
$pipeline .= " " . shell_quote("$RealBin/senna2brackets.py") . " --berkeley-style |";
$pipeline .= " " . shell_quote("$RealBin/berkeleyparsed2mosesxml.perl") . " |";

# Stage 3: re-split hyphens and/or slashes inside the trees.  Both passes use
# the same script; the slash pass only adds the -slash switch.
my @split_passes;
push @split_passes, ""       if $SPLIT_HYPHEN;
push @split_passes, " -slash" if $SPLIT_SLASH;

for my $extra_switch (@split_passes) {
    $pipeline .= " " . shell_quote("$RealBin/syntax-hyphen-splitting.perl") . $extra_switch;
    $pipeline .= " -binarize"   if $BINARIZE;
    $pipeline .= " -mark-split" if $MARK_SPLIT;
    $pipeline .= " |";
}
|
|
|
|
|
# Run the pipeline and copy its output to STDOUT, reading the saved original
# input in lock-step so that an unparseable sentence (empty parser output)
# can be replaced by its original line when -unparseable is given.

# $pipeline ends in " |" (the two-argument open convention).  Strip that and
# use the explicit three-argument read-from-command form instead.
(my $parse_command = $pipeline) =~ s/\s*\|\s*\z//;

open(my $parse_fh, '-|', $parse_command);
open(my $original_in, '<', $tmpOriginal);

while (my $parsedLine = <$parse_fh>) {
    my $originalLine = <$original_in>;

    # A parsed line holding nothing but the newline means the sentence could
    # not be parsed.  Fall back to the original text only if there is one:
    # should the parser emit more lines than the input had, $originalLine is
    # undef and must not be printed.
    if ($UNPARSEABLE == 1 && length($parsedLine) == 1 && defined($originalLine)) {
        print $originalLine;
    }
    else {
        print $parsedLine;
    }
}

close($original_in);

# Under autodie, close() on a pipe throws when the pipeline exits non-zero.
# Capture that so the temporary files are removed first, then re-raise it.
my $closed_ok   = eval { close($parse_fh); 1 };
my $close_error = $@;

# Best-effort cleanup without spawning "rm"; CORE::unlink bypasses autodie so
# that an already-removed file is not treated as a fatal error.
CORE::unlink($tmpOriginal, $tmpProcessed);

die $close_error unless $closed_ok;
|
|