|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use File::Basename; |
|
use File::Temp qw/tempfile/; |
|
use Getopt::Long "GetOptions"; |
|
|
|
# ---------------------------------------------------------------------------
# Configuration and command-line options.
#
#   --collins=DIR   root of the Collins parser installation
#   --mxpost=DIR    root of the MXPOST tagger installation
#   --tmpdir=DIR    directory for intermediate files (created if missing)
#   --keep-tmp      do not unlink the base temporary file at exit
#   --raw=FILE      accepted, but not referenced in the code visible here
# ---------------------------------------------------------------------------
my $COLLINS = "/exports/home/s0565741/work/bin/COLLINS-PARSER";
my $MXPOST = "/exports/home/s0565741/work/bin/mxpost";
my $TMPDIR = "tmp";
my $KEEP_TMP = 0;
my $RAW = undef;

# Not referenced in the code visible here; kept for compatibility.
my $BASIC = 0;

GetOptions(
    "collins=s" => \$COLLINS,
    "mxpost=s"  => \$MXPOST,
    "tmpdir=s"  => \$TMPDIR,
    "keep-tmp"  => \$KEEP_TMP,
    "raw=s"     => \$RAW
) or die("ERROR: unknown options");

# Create the temporary directory. The list form of system() bypasses the
# shell, so directory names containing spaces or shell metacharacters are
# passed through verbatim, and a failure is reported instead of ignored.
system("mkdir", "-p", "--", $TMPDIR) == 0
    or die("ERROR: could not create temporary directory '$TMPDIR'\n");

# Parser settings.
my $MaxChar = 10000;    # maximum characters per parser input line
my $MaxWord = 120;      # maximum words per parser input line
my $ParserBin = "$COLLINS/code/parser";
my $ParserEvn = "$COLLINS/models/model2/events.gz";
my $ParserGrm = "$COLLINS/models/model2/grammar";

# $tmpfile serves as the common prefix for all intermediate files.
# NOTE(review): UNLINK only covers the base file created here; files later
# derived from the "$tmpfile." prefix are not removed by this setting.
my ($scriptname, $directories) = fileparse($0);
my ($TMP, $tmpfile) = tempfile(
    "$scriptname-XXXXXXXXXX",
    DIR    => $TMPDIR,
    UNLINK => !$KEEP_TMP
);
|
|
|
|
|
# ---------------------------------------------------------------------------
# Stage 1: POS-tag STDIN with MXPOST and write parser input in batches.
# Stage 2: run the Collins parser once per batch file.
# ---------------------------------------------------------------------------

# Number of sentences written to each parser input file.
my $BATCH_SIZE = 2000;

# Shell pipeline that reads the script's STDIN and emits tagged sentences.
# NOTE(review): the return value of encode() in the first stage is
# discarded, so that stage appears to pass its input through unchanged.
# The second stage deletes every character outside \x20-\x7f (including the
# newline) and then re-appends a newline.
my $pipeline = "perl -ne 'use Encode; encode(\"iso-8859-1\", decode(\"utf8\", \$_)); print \$_;' |";
$pipeline .= "perl -ne 'tr/\\x20-\\x7f//cd; print \$_.\"\\n\";' | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project";

# Three-argument pipe open with a lexical handle; fail loudly if the
# pipeline cannot be started.
open(my $tag_fh, '-|', $pipeline)
    or die("ERROR: cannot start tagging pipeline: $!\n");

my $sentence_count = 0;
my $parser_in;    # handle of the batch file currently being written
while (my $tagged = <$tag_fh>) {
    # Start a new batch file every $BATCH_SIZE sentences.
    if ($sentence_count % $BATCH_SIZE == 0) {
        if (defined $parser_in) {
            close($parser_in)
                or die("ERROR: cannot close parser input file: $!\n");
        }
        my $batch_file = sprintf("%s.%05d", $tmpfile, $sentence_count / $BATCH_SIZE);
        open($parser_in, '>', $batch_file)
            or die("ERROR: cannot write '$batch_file': $!\n");
    }
    $sentence_count++;

    # chomp (not chop) so a final line without a newline is not truncated.
    chomp($tagged);

    # Convert word_TAG tokens to the "N word tag word tag ..." input format.
    my $line = conv_posfmt($tagged);

    # Replace sentences the parser should not see by a one-word placeholder
    # so that output lines stay aligned with input lines.
    $line = "1 SentenceTooLong NN" if !check_length($line);

    print {$parser_in} "$line\n";
}
close($tag_fh)
    or warn("WARNING: tagging pipeline did not exit cleanly (status $?)\n");
if (defined $parser_in) {
    close($parser_in)
        or die("ERROR: cannot close parser input file: $!\n");
}

# Parse each batch. The command needs the shell for the pipe and the
# redirection; a non-zero exit status is reported instead of being dropped.
my $batch_count = int(($sentence_count + $BATCH_SIZE - 1) / $BATCH_SIZE);
for my $batch (0 .. $batch_count - 1) {
    my $batch_file = sprintf("%s.%05d", $tmpfile, $batch);
    my $status = system("gunzip -c $ParserEvn | $ParserBin $batch_file $ParserGrm 10000 1 1 1 1 > $batch_file.out");
    warn("WARNING: parser returned status $status for '$batch_file'\n")
        if $status != 0;
}
|
|
|
|
|
# ---------------------------------------------------------------------------
# Stage 3: convert the bracketed parser output into <tree label="..."> XML,
# printing one line on STDOUT per parsed sentence.
# ---------------------------------------------------------------------------
my $DEBUG = 0;
my $DEBUG_SPACE = " ";    # indentation unit for debug traces, repeated per nesting level

# The shell expands the ????? glob over all batch output files.
open(my $parser_fh, '-|', "cat $tmpfile.?????.out")
    or die("ERROR: cannot read parser output: $!\n");

while (my $line = <$parser_fh>) {
    # Only lines that start with an opening bracket carry a parse tree.
    next unless $line =~ /^\(/;

    # Placeholder sentences produce an empty output line.
    if ($line =~ /SentenceTooLong/) {
        print "\n";
        next;
    }

    # chomp (not chop) so a final line without a newline keeps its last ")".
    chomp($line);

    my @LABEL = ();    # stack of currently open constituent labels
    my @OUT = ();      # XML fragments, joined with single spaces at the end

    # Character-level scan; each branch advances $i past what it consumed.
    for (my $i = 0; $i < length($line); $i++) {

        # Opening bracket: "(LABEL" starts a constituent.
        if (substr($line, $i, 1) eq "(") {
            my ($label) = split(/[\( ]/, substr($line, $i + 1), 2);
            print STDERR $DEBUG_SPACE x scalar(@LABEL), "BEGINNING of $label\n" if $DEBUG;
            $i += length($label);
            $label =~ s/\$/PUNC/g;    # "$" in labels becomes PUNC
            $label =~ s/\|/:/g;       # "|" in labels becomes ":"
            $label =~ s/\~.+//;       # drop everything from the first "~"
            push @OUT, "<tree label=\"$label\">";
            push @LABEL, $label;
            # Skip up to two spaces following the label.
            $i++ if substr($line, $i + 1, 1) eq " ";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }

        # Closing bracket: ends the innermost open constituent.
        elsif (substr($line, $i, 1) eq ")") {
            die("ERROR: NO LABEL ON STACK") unless @LABEL;
            my $label = pop @LABEL;
            print STDERR $DEBUG_SPACE x scalar(@LABEL), "END of $label\n" if $DEBUG;
            push @OUT, "</tree>";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }

        # Anything else is a "word/TAG" token terminated by a space.
        else {
            my ($word) = split(/ /, substr($line, $i), 2);
            if (substr($line, $i, 2) eq "\\)") {
                $word = substr($line, $i, 2);
            }
            $i += length($word);
            print STDERR $DEBUG_SPACE x scalar(@LABEL), "WORD $word\n" if $DEBUG;

            # Split at the last "/". A failed match must not fall through:
            # $1/$2 would still hold the captures of the previous token.
            my ($w, $p) = $word =~ /^(.+)\/([^\/]+)$/
                or die("ERROR: MALFORMED TOKEN '$word'\n");

            # Restore brackets that were encoded for the parser.
            $w = "(" if $w eq "-LRB-";
            $w = ")" if $w eq "-RRB-";
            $w = escape($w);

            # Strip one leading and one trailing "-" from the tag.
            $p =~ s/^-//;
            $p =~ s/-$//;
            push @OUT, "<tree label=\"$p\"> $w </tree>";
        }
    }
    die("ERROR: STACK NOT EMPTY $#LABEL\n") if @LABEL;

    print join(" ", @OUT), "\n";
}
close($parser_fh);
|
|
|
# Escape the characters that are special in XML character data.
#
#   $text    - raw word text
#   returns  - a copy of $text with "&", "<" and ">" replaced by the
#              entities "&amp;", "&lt;" and "&gt;"
#
# The replacement is done in a single pass, so an ampersand introduced by
# one replacement is never escaped a second time.
sub escape {
    my ($text) = @_;
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
    );
    $text =~ s/([&<>])/$entity_for{$1}/g;
    return $text;
}
|
|
|
# Decide whether a converted sentence line may be handed to the parser.
#
#   $line    - a line in "N word tag word tag ..." form
#   returns  - false if the line is "0" or "0 ", if everything after the
#              leading count is non-alphanumeric, or if a size limit is
#              exceeded; true otherwise
#
# Limits come from the file-level $MaxChar (characters in the line) and
# $MaxWord (half the number of whitespace-separated fields, where the
# leading count field is included in that number).
sub check_length {
    my ($line) = @_;

    # Empty sentences and sentences without any letter or digit.
    return 0 if $line eq "0";
    return 0 if $line eq "0 ";
    return 0 if $line =~ /^\d+ [^a-z0-9]+$/i;

    my @fields = split(" ", $line);
    my $half_field_count = scalar(@fields) / 2;

    return (length($line) <= $MaxChar) && ($half_field_count <= $MaxWord);
}
|
|
|
# Convert one line of word_TAG tokens into the form "N word tag word tag ...".
#
#   $line    - whitespace-separated tokens; each is split at its last "_"
#   returns  - the token count, a space, then the word/tag pairs separated
#              by single spaces; every "(" becomes "-LRB-" and every ")"
#              becomes "-RRB-" in the result
sub conv_posfmt {
    my ($line) = @_;

    # Only the last underscore separates word and tag, so words may
    # themselves contain underscores.
    my @pairs = map {
        my $cut = rindex($_, "_");
        substr($_, 0, $cut) . " " . substr($_, $cut + 1);
    } split(" ", $line);

    my $converted = scalar(@pairs) . " " . join(" ", @pairs);

    # Encode brackets so they cannot be confused with tree brackets.
    $converted =~ s/\(/-LRB-/g;
    $converted =~ s/\)/-RRB-/g;

    return $converted;
}
|
|