|
|
|
|
|
|
|
|
|
|
|
use warnings;
use strict;

use Cwd 'abs_path';
use File::Basename;
use File::Path qw(make_path);
use File::Temp qw/tempfile/;
use FindBin qw($RealBin);
use Getopt::Long "GetOptions";
|
|
|
# Forward declaration; GetFactors is defined at the bottom of the file.
sub GetFactors;

# Command-line settings and their defaults.
my $TMPDIR = "tmp";    # --tmpdir: parent of the per-run working directory
my $KEEP_TMP = 0;      # --keep-tmp: flag
my $MADA_DIR;          # --mada-dir: directory containing MADAMIRA.jar
my $CONFIG;            # --config: MADAMIRA configuration file

my $FACTORS_STR;       # --factors: comma-separated list of factor names
my @FACTORS;           # the same list, split into individual names

GetOptions(
    "tmpdir=s"   => \$TMPDIR,
    "keep-tmp"   => \$KEEP_TMP,
    "mada-dir=s" => \$MADA_DIR,
    "factors=s"  => \$FACTORS_STR,
    "config=s"   => \$CONFIG
) or die("ERROR: unknown options");

# --mada-dir is interpolated into the default config path below and into
# the MADAMIRA command line later on, so it must be supplied.
if (!defined($MADA_DIR)) {
    die("ERROR: --mada-dir is required\n");
}

# Fall back to the sample configuration inside the MADAMIRA directory.
if (!defined($CONFIG)) {
    $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml";
}

# abs_path() returns undef when the path cannot be resolved; stop here
# rather than building a working directory from an undefined value.
my $resolved_tmpdir = abs_path($TMPDIR);
if (!defined($resolved_tmpdir)) {
    die("ERROR: cannot resolve temporary directory '$TMPDIR'\n");
}
$TMPDIR = $resolved_tmpdir;
print STDERR "TMPDIR=$TMPDIR \n";

if (defined($FACTORS_STR)) {
    @FACTORS = split(",", $FACTORS_STR);
}
|
|
|
|
|
|
|
|
|
# Per-run working directory, made unique with the process id.
$TMPDIR = "$TMPDIR/madamira.$$";

for my $dir ($TMPDIR, "$TMPDIR/split", "$TMPDIR/out") {
    make_path($dir);
    die("ERROR: cannot create directory $dir\n") unless -d $dir;
}

my $infile = "$TMPDIR/input";
print STDERR $infile."\n";

# Copy STDIN to a file so it can be split into chunks.
open(my $input_fh, ">", $infile)
    or die("ERROR: cannot write $infile: $!\n");
while (my $input_line = <STDIN>) {
    print $input_fh $input_line;
}
close($input_fh)
    or die("ERROR: cannot finish writing $infile: $!\n");

# With no input there are no chunks to process and nothing to print;
# stop before handing MADAMIRA an unexpanded glob pattern.
if (-z $infile) {
    print STDERR "Input is empty; nothing to process.\n";
    exit(0);
}

my $cmd;

# Use gsplit when it is available, otherwise plain split.
# NOTE(review): presumably gsplit is GNU split on systems whose native
# split lacks the -d option -- confirm.
my $SPLIT_EXEC = `gsplit --help 2>/dev/null` ? 'gsplit' : 'split';

# Chunks of 10000 lines with 7-character numeric suffixes.
# Backticks are kept for all external commands so that anything they print
# to stdout is captured and not mixed into this script's own output.
$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
`$cmd`;
die("ERROR: command failed (wait status $?): $cmd\n") if $? != 0;

# Run MADAMIRA on every chunk, four jobs at a time.
$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
print STDERR "Executing: $cmd\n";
`$cmd`;
die("ERROR: command failed (wait status $?): $cmd\n") if $? != 0;

# Concatenate the per-chunk results; the fixed-width numeric suffixes make
# the shell's glob order match the chunk order.
$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada";
print STDERR "Executing: $cmd\n";
`$cmd`;
die("ERROR: command failed (wait status $?): $cmd\n") if $? != 0;
|
|
|
|
|
# Read the concatenated MADAMIRA output and print one line per sentence,
# each word followed by its requested factors and a space.
open(my $mada_fh, "<", "$infile.mada")
    or die("ERROR: cannot read $infile.mada: $!\n");

while (my $line = <$mada_fh>) {
    chomp($line);

    if (index($line, "SENTENCE BREAK") == 0) {
        # End of sentence.
        print "\n";
    }
    elsif (index($line, ";;WORD") == 0) {
        # Take the text after the 7-character prefix, minus the last
        # character of the line.
        # NOTE(review): presumably the line ends in a character that is
        # not part of the word -- confirm against real MADAMIRA output.
        my $word = substr($line, 7, length($line) - 8);

        # The analysis used is the fourth line after the ;;WORD line.
        for (my $i = 0; $i < 4; ++$i) {
            $line = <$mada_fh>;
        }
        if (!defined($line)) {
            # Truncated output: emit the word with empty factors.
            warn("WARNING: $infile.mada ended before the analysis of '$word'\n");
            $line = "";
        }

        my $factors = GetFactors($line, \@FACTORS);
        $word .= $factors;

        print "$word ";
    }
    # Every other line is ignored.
}
close($mada_fh);

if ($KEEP_TMP == 0) {
    # NOTE(review): --keep-tmp currently has no effect; the working
    # directory is left in place either way.
}
|
|
|
|
|
|
|
# Build the factor suffix for one word from a MADAMIRA analysis line.
#
# Arguments:
#   $line       - whitespace-separated tokens; the first token is skipped
#                 and each remaining token is read as "key:value".
#   $factorsRef - reference to an array of keys to extract, in output order.
#
# Returns the concatenation of "|value" for every requested key. A key
# that is absent from the line, or has no value, contributes a bare "|".
sub GetFactors
{
    my $line = shift;
    my $factorsRef = shift;

    $line = "" unless defined($line);
    my @factors = defined($factorsRef) ? @{$factorsRef} : ();

    my %allFactors;
    my (undef, @pairs) = split(" ", $line);
    for my $pair (@pairs) {
        # Split on the first colon only, so a value that contains (or is)
        # a colon survives intact.
        my ($key, $value) = split(":", $pair, 2);
        $allFactors{$key} = $value;
    }

    my $ret = "";
    foreach my $factorType (@factors) {
        my $value = $allFactors{$factorType};
        $ret .= "|" . (defined($value) ? $value : "");
    }

    return $ret;
}
|
|
|
|