use warnings;
use strict;
use File::Basename;
use File::Path qw(make_path);
use File::Temp qw/tempfile/;
use Getopt::Long "GetOptions";
# Default installation directories of the external tools; both can be
# overridden on the command line with --collins and --mxpost.
my $COLLINS = "/exports/home/s0565741/work/bin/COLLINS-PARSER";
my $MXPOST = "/exports/home/s0565741/work/bin/mxpost";

# Directory that receives all intermediate files (--tmpdir).
my $TMPDIR = "tmp";

# When true (--keep-tmp), the temporary file created below is not unlinked.
my $KEEP_TMP = 0;

# Value of --raw. It is accepted on the command line but no code visible in
# this script reads it.
my $RAW = undef;

# Not bound to any option and not read by any code visible in this script.
my $BASIC = 0;

GetOptions(
    "collins=s" => \$COLLINS,
    "mxpost=s"  => \$MXPOST,
    "tmpdir=s"  => \$TMPDIR,
    "keep-tmp"  => \$KEEP_TMP,
    "raw=s"     => \$RAW
) or die("ERROR: unknown options");

# Create the temp directory together with any missing parents. make_path()
# does not go through a shell, so unusual characters in the path are safe,
# and it croaks if the directory cannot be created.
make_path($TMPDIR);

# Limits applied by check_length() before a sentence is handed to the parser.
my $MaxChar = 10000;
my $MaxWord = 120;

# Parser executable and model files inside the Collins installation.
my $ParserBin = "$COLLINS/code/parser";
my $ParserEvn = "$COLLINS/models/model2/events.gz";
my $ParserGrm = "$COLLINS/models/model2/grammar";

# The temp file name doubles as the common prefix of the per-chunk parser
# input and output files. $TMP keeps the handle returned by tempfile() alive.
my ($scriptname, $directories) = fileparse($0);
my ($TMP, $tmpfile) = tempfile(
    "$scriptname-XXXXXXXXXX",
    DIR    => $TMPDIR,
    UNLINK => !$KEEP_TMP
);
# Shell pipeline that reads this script's standard input and ends in the
# MXPOST tagger:
#   1. a perl filter that calls encode()/decode() but does not use the
#      result, then prints each line as it was read;
#   2. a perl filter that deletes every character outside \x20-\x7f
#      (including the line terminator) and appends a fresh newline;
#   3. the tagger itself.
my $pipeline = join(' | ',
    q{perl -ne 'use Encode; encode("iso-8859-1", decode("utf8", $_)); print $_;'},
    q{perl -ne 'tr/\x20-\x7f//cd; print $_."\n";'},
    "$MXPOST/mxpost $MXPOST/tagger.project",
);
open(my $tag_fh, '-|', $pipeline)
    or die("ERROR: cannot start tagger pipeline: $!");

# Convert the tagger output to parser input format and write it out in
# chunks of 2000 sentences, one file "<tmpfile>.NNNNN" per chunk.
my $sentence_count = 0;
my $parser_in;
while (my $tagged = <$tag_fh>) {
    if ($sentence_count % 2000 == 0) {
        if (defined $parser_in) {
            close($parser_in)
                or die("ERROR: cannot finish writing parser input: $!");
        }
        my $chunk_file = sprintf("%s.%05d", $tmpfile, $sentence_count / 2000);
        open($parser_in, '>', $chunk_file)
            or die("ERROR: cannot write $chunk_file: $!");
    }
    $sentence_count++;

    # chomp rather than chop: a final line without a newline must not lose
    # its last character.
    chomp($tagged);
    my $line = conv_posfmt($tagged);

    # Sentences rejected by check_length() are replaced by a one-word
    # placeholder so that the line count of input and output stays the same.
    $line = "1 SentenceTooLong NN" if !check_length($line);
    print {$parser_in} "$line\n";
}

# A non-zero exit status of the pipeline shows up as a failed close.
close($tag_fh)
    or warn("WARNING: tagger pipeline did not finish cleanly (status $?)\n");

# No chunk file was ever opened when the input was empty.
if (defined $parser_in) {
    close($parser_in)
        or die("ERROR: cannot finish writing parser input: $!");
}
# Run the parser once per chunk of 2000 sentences. Each run reads
# "<tmpfile>.NNNNN" and writes "<tmpfile>.NNNNN.out"; the gunzipped events
# file is fed to the parser on its standard input.
for (my $chunk = 0; $chunk * 2000 < $sentence_count; $chunk++) {
    my $chunk_file = sprintf("%s.%05d", $tmpfile, $chunk);
    my $command = "gunzip -c $ParserEvn | $ParserBin $chunk_file $ParserGrm 10000 1 1 1 1 > $chunk_file.out";

    # Report a failed run instead of silently continuing with a missing or
    # truncated output file.
    system($command) == 0
        or warn("WARNING: parser command failed (status $?): $command\n");
}
# Set to a true value to trace the tree traversal on STDERR.
my $DEBUG = 0;
# The trace indentation is cut from this string, one character per open
# label. NOTE(review): with a one-character string the indent never exceeds
# one space -- confirm whether a longer run of blanks was intended.
my $DEBUG_SPACE = " ";

# Read the parser output of all chunks; the shell expands the pattern to the
# per-chunk ".out" files.
open(my $parser_fh, '-|', "cat $tmpfile.?????.out")
    or die("ERROR: cannot read parser output $tmpfile.?????.out: $!");

while (my $line = <$parser_fh>) {
    # Only lines that start with "(" are treated as parse trees.
    next unless $line =~ /^\(/;

    # The placeholder written for rejected sentences becomes an empty line.
    if ($line =~ /SentenceTooLong/) {
        print "\n";
        next;
    }
    chomp($line);

    my @LABEL = ();    # stack of the labels that are currently open
    my @OUT   = ();    # XML fragments of this sentence, in output order

    for (my $i = 0; $i < length($line); $i++) {
        my $char = substr($line, $i, 1);

        if ($char eq "(") {
            # The label is everything up to the next "(" or blank; the match
            # always succeeds, possibly with an empty label.
            my ($label) = substr($line, $i + 1) =~ /^([^( ]*)/;
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."BEGINNING of $label\n" if $DEBUG;
            $i += length($label);
            $label =~ s/\$/PUNC/g;
            $label =~ s/\|/:/g;
            $label =~ s/\~.+//;    # drop everything from the first "~" on
            push @OUT, "<tree label=\"$label\">";
            push @LABEL, $label;
            # Skip up to two blanks that follow the label.
            $i++ if substr($line, $i + 1, 1) eq " ";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        elsif ($char eq ")") {
            die("ERROR: NO LABEL ON STACK") unless @LABEL;
            my $label = pop @LABEL;
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."END of $label\n" if $DEBUG;
            push @OUT, "</tree>";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        else {
            # A leaf token runs up to the next blank ...
            my ($word) = substr($line, $i) =~ /^([^ ]*)/;
            # ... unless the text starts with a backslash followed by ")",
            # in which case exactly those two characters form the token.
            if (substr($line, $i, 2) eq "\\)") {
                $word = substr($line, $i, 2);
            }
            $i += length($word);
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."WORD $word\n" if $DEBUG;

            # Split "word/TAG" at the last slash. A token of any other shape
            # still yields an (empty) leaf, but is reported explicitly
            # instead of through uninitialized-value warnings.
            my ($w, $p) = $word =~ m{^(.+)/([^/]+)$};
            if (!defined $p) {
                warn("WARNING: token '$word' is not of the form word/TAG\n");
                ($w, $p) = ("", "");
            }
            $w = "(" if $w eq "-LRB-";
            $w = ")" if $w eq "-RRB-";
            $w = escape($w);
            $p =~ s/^-//;
            $p =~ s/-$//;
            push @OUT, "<tree label=\"$p\"> $w </tree>";
        }
    }
    print join(" ", @OUT), "\n";
}
close($parser_fh);

# The per-chunk parser input and output files are not covered by the UNLINK
# setting of the temp file they are named after, so remove them here unless
# the user asked to keep temporary files.
unless ($KEEP_TMP) {
    for (my $chunk = 0; $chunk * 2000 < $sentence_count; $chunk++) {
        my $chunk_file = sprintf("%s.%05d", $tmpfile, $chunk);
        unlink($chunk_file, "$chunk_file.out");
    }
}
# Escape the characters that are special in XML text content.
#
# Parameters:
#   $text - the raw string.
# Returns:
#   a copy of $text with "&", "<" and ">" replaced by their XML entities.
sub escape {
    my ($text) = @_;
    # "&" has to be handled first; otherwise the ampersands introduced by
    # the other two replacements would be escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
# Decide whether a sentence in parser input format ("N w1 t1 w2 t2 ...")
# may be handed to the parser.
#
# Parameters:
#   $line - the sentence as produced by conv_posfmt().
# Returns:
#   a true value if the sentence is acceptable, a false value otherwise.
# Reads the file-level limits $MaxChar and $MaxWord.
sub check_length {
    my ($line) = @_;

    # Reject empty sentences ("0" / "0 ") and sentences in which nothing
    # after the leading count contains a letter or a digit.
    return 0 if $line =~ /^\d+ [^a-z0-9]+$/i || $line eq "0" || $line eq "0 ";

    my $numc  = length($line);
    my @words = split(" ", $line);

    # The token list holds the leading count plus a word and a tag per word,
    # so for N words this evaluates to N + 0.5.
    my $numw = ($#words + 1) / 2;

    return ($numc <= $MaxChar) && ($numw <= $MaxWord);
}
# Convert one line of tagger output ("word_TAG word_TAG ...") into the
# parser input format "N word TAG word TAG ...", where N is the number of
# tokens on the line.
#
# Parameters:
#   $line - whitespace-separated tokens, each split at its LAST underscore.
# Returns:
#   the converted line, with every "(" replaced by "-LRB-" and every ")" by
#   "-RRB-". An empty input yields "0 ".
sub conv_posfmt {
    my ($line) = @_;

    my @pairs;
    for my $token (split(" ", $line)) {
        # NOTE(review): for a token without any underscore rindex() returns
        # -1, which makes the word the token minus its last character and
        # the tag the whole token -- confirm that the tagger never emits one.
        my $cut = rindex($token, "_");
        push @pairs, substr($token, 0, $cut) . " " . substr($token, $cut + 1);
    }

    my $ret = scalar(@pairs) . " " . join(" ", @pairs);
    $ret =~ s/\(/-LRB-/g;
    $ret =~ s/\)/-RRB-/g;
    return $ret;
}