|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use File::Basename; |
|
|
|
# Forward declarations (with prototypes) of the helper subs defined at the
# bottom of this file, so they can be called from the main script body.
sub RunFork($);
sub systemCheck($);
sub GetSourcePhrase($);
sub NumStr($);
sub CutContextFile($$$);

# Compressor used for everything this script writes: pigz when `which`
# reports it on the PATH, plain gzip otherwise.
my $GZIP_EXEC = `which pigz` ? 'pigz' : 'gzip';
print STDERR "using $GZIP_EXEC \n";

# Line-count threshold used when splitting the extract file: once a chunk
# has more lines than this, it is cut at the next change of source phrase.
my $EXTRACT_SPLIT_LINES = 50000000;

print STDERR "Started ".localtime() ."\n";
|
|
|
# ---- Command-line arguments ------------------------------------------------
# Positional layout:
#   0  number of parallel score processes (values < 1 are treated as 1)
#   1  sort command          2  score command
#   3  extract file          4  lexical table file
#   5  output half phrase table
#   6 .. N-1  extra options (mostly passed through to the score command)
#   N  (last) doSort flag
if (@ARGV < 7) {
    die "Usage: $0 num-parallel sort-cmd score-cmd extract-file lex-file "
      . "phrase-table-half [score-options ...] do-sort\n";
}

my ($numParallel, $sortCmd, $scoreCmd, $extractFile, $lexFile, $ptHalf) = @ARGV;
$numParallel = 1 if $numParallel < 1;

my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $targetSyntacticPreferencesLabelsFile;

# Options that take a file name. The file name is kept by this script (it is
# where the merged label set is written later) and is NOT forwarded; the
# second element is what gets forwarded to the score command instead.
my %fileOption = (
    '--SourceLabels'               => [ \$sourceLabelsFile,                     "--SourceLabels --SourceLabelCountsLHS " ],
    '--PartsOfSpeech'              => [ \$partsOfSpeechFile,                    "--PartsOfSpeech " ],
    '--TargetSyntacticPreferences' => [ \$targetSyntacticPreferencesLabelsFile, "--TargetSyntacticPreferences " ],
);

my $otherExtractArgs= "";
# The last argument is the doSort flag, hence "< $#ARGV".
for (my $i = 6; $i < $#ARGV; ++$i)
{
    my $arg = $ARGV[$i];

    if (exists $fileOption{$arg}) {
        # Without this check the option would swallow the trailing doSort flag.
        die "$arg requires a file name argument\n" if $i + 1 >= $#ARGV;

        my ($fileRef, $forwarded) = @{ $fileOption{$arg} };
        $$fileRef = $ARGV[++$i];
        $otherExtractArgs .= $forwarded;
        next;
    }

    # --Inverse is both remembered here and forwarded like any other option.
    $inverse = 1 if $arg eq '--Inverse';
    $otherExtractArgs .= $arg ." ";
}

# --FlexibilityScore=<cmd> is consumed by this script: the flag is stripped
# from the forwarded options and <cmd> is run as a post-processing step.
my $FlexibilityScore = $otherExtractArgs =~ /--FlexibilityScore/;
my $FlexibilityCmd = $otherExtractArgs;
$otherExtractArgs =~ s/--FlexibilityScore=\S+//;
if ($FlexibilityCmd =~ /--FlexibilityScore=(\S+)/) {
    $FlexibilityCmd = $1;
}

my $doSort = $ARGV[$#ARGV];
|
|
|
# Scratch directory next to the output table; the PID in the name keeps
# concurrent runs apart. Fail right away if it cannot be created, since
# every later step writes into it.
my $TMPDIR=dirname($ptHalf) ."/tmp.$$";
mkdir($TMPDIR) or die "cannot create temporary directory $TMPDIR: $!";

# Shell command buffer shared by the steps of the main script.
my $cmd;

# With flexibility scoring, a context file named like the extract file but
# with "extract." replaced by "extract.context." is read as well. The dot is
# escaped so that only a literal "extract." is rewritten, not e.g. an
# "extraction/" directory earlier in the path.
my $extractFileContext;
if ($FlexibilityScore) {
    $extractFileContext = $extractFile;
    $extractFileContext =~ s/extract\./extract.context./;
}
|
|
|
# ---- Split the extract file into chunks for the parallel score runs --------
# Afterwards $fileCount is the number of chunks in $TMPDIR, named
# extract.0.gz .. extract.<fileCount-1>.gz (plus extract.context.<n>.gz when
# flexibility scoring is on).
my $fileCount = 0;
if ($numParallel <= 1)
{
    # Single process: nothing to split, just link the input(s) into $TMPDIR.
    $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz";
    if ($FlexibilityScore) {
        $cmd .= " && ln -s $extractFileContext $TMPDIR/extract.context.0.gz";
    }
    print STDERR "$cmd \n";
    systemCheck($cmd);

    $fileCount = 1;
}
else
{
    # Open the (possibly gzipped) extract file for reading.
    if ($extractFile =~ /\.gz$/) {
        open(IN, "gunzip -c $extractFile |") || die "can't open pipe to $extractFile: $!";
    }
    else {
        open(IN, '<', $extractFile) || die "can't open $extractFile: $!";
    }

    # IN_CONTEXT is consumed by CutContextFile(); $lastlineContext carries the
    # line that call read past the end of one chunk over to the next call.
    my $lastlineContext;
    if ($FlexibilityScore) {
        $lastlineContext = "";
        if ($extractFileContext =~ /\.gz$/) {
            open(IN_CONTEXT, "gunzip -c $extractFileContext |") || die "can't open pipe to $extractFileContext: $!";
        }
        else {
            open(IN_CONTEXT, '<', $extractFileContext) || die "can't open $extractFileContext: $!";
        }
    }

    my $filePath = "$TMPDIR/extract.$fileCount.gz";
    open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

    my $lineCount = 0;
    my $prevSourcePhrase = "";
    while (my $line = <IN>)
    {
        chomp($line);
        ++$lineCount;

        if ($lineCount > $EXTRACT_SPLIT_LINES)
        {
            # Past the size threshold: remember the source phrase we are in and
            # start a new chunk as soon as it changes, so that all lines of
            # one source phrase stay in the same chunk.
            my $sourcePhrase = GetSourcePhrase($line);

            if ($prevSourcePhrase eq "")
            {
                $prevSourcePhrase = $sourcePhrase;
            }
            elsif ($sourcePhrase ne $prevSourcePhrase)
            {
                # A failed close means the compressor did not finish cleanly
                # and the chunk may be truncated -- do not score it.
                close(OUT) or die "error closing $GZIP_EXEC pipe for $filePath: $! (status $?)";

                if ($FlexibilityScore) {
                    $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
                }
                $prevSourcePhrase = "";
                $lineCount = 0;
                ++$fileCount;
                $filePath = "$TMPDIR/extract.$fileCount.gz";
                open (OUT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";
            }
        }

        # The current line always goes to whichever chunk is open now.
        print OUT "$line\n";
    }

    close(OUT) or die "error closing $GZIP_EXEC pipe for $filePath: $! (status $?)";
    if ($FlexibilityScore) {
        $lastlineContext = CutContextFile($prevSourcePhrase, $fileCount, $lastlineContext);
        # May not have been read to its end, so the close status is not checked.
        close(IN_CONTEXT);
    }
    close(IN) or warn "problem closing $extractFile: $! (status $?)\n";
    ++$fileCount;
}
|
|
|
|
|
|
|
# ---- Generate the per-process run scripts ----------------------------------
# One shell script per parallel slot ($TMPDIR/run.<slot>.sh); chunk <n> is
# assigned to slot <n modulo numParallel>.
my @runFiles;
for (my $slot = 0; $slot < $numParallel; ++$slot)
{
    my $path = "$TMPDIR/run.$slot.sh";
    open(my $fh, ">", $path) or die "cannot open $path: $!";
    push(@runFiles, $fh);
}

# Write the commands for every chunk into the script of its slot.
for (my $chunk = 0; $chunk < $fileCount; ++$chunk)
{
    my $numStr = NumStr($chunk);
    my $half   = "$TMPDIR/phrase-table.half.$numStr";
    my $script = $runFiles[$chunk % $numParallel];

    # Score command for this chunk; only this part is echoed to the log.
    my $cmd = join(" ",
                   $scoreCmd,
                   "$TMPDIR/extract.$chunk.gz",
                   $lexFile,
                   "$half.gz",
                   $otherExtractArgs,
                   "2>> /dev/stderr",
                   "\n");
    print STDERR $cmd;

    if ($FlexibilityScore) {
        # Pipe the fresh half table through the flexibility scorer, then move
        # the result over the original half table.
        my @flexOptions;
        push(@flexOptions, " --Inverse")      if ($otherExtractArgs =~ /--Inverse/);
        push(@flexOptions, " --Hierarchical") if ($otherExtractArgs =~ /--Hierarchical/);

        $cmd .= "gzip -cd $half.gz | $FlexibilityCmd $TMPDIR/extract.context.$chunk.gz"
              . join("", @flexOptions)
              . " | $GZIP_EXEC -c > $half.flex.gz\n"
              . "mv $half.flex.gz $half.gz\n";
    }

    print $script $cmd;
}

# Finish the scripts and make them executable.
for (my $slot = 0; $slot < $numParallel; ++$slot)
{
    close($runFiles[$slot]);
    systemCheck("chmod +x $TMPDIR/run.$slot.sh");
}
|
|
|
|
|
# ---- Run the scripts in parallel -------------------------------------------
# One forked child per run script.
my @children;
for (my $i = 0; $i < $numParallel; ++$i)
{
    my $cmd = "$TMPDIR/run.$i.sh";
    my $pid = RunFork($cmd);
    push(@children, $pid);
}

# Wait for ALL children before deciding anything, so none is left running,
# and check each exit status: merging after a failed score run would
# silently produce an incomplete phrase table.
my $failedChildren = 0;
foreach my $pid (@children) {
    my $reaped = waitpid($pid, 0);
    if ($reaped != $pid || $? != 0) {
        print STDERR "ERROR: child process $pid failed (wait status $?)\n";
        ++$failedChildren;
    }
}
if ($failedChildren > 0) {
    print STDERR "ERROR: $failedChildren of ".scalar(@children)." score processes failed, intermediate files kept in $TMPDIR\n";
    exit(1);
}
|
|
|
|
|
# ---- Merge the per-chunk half tables into $ptHalf --------------------------
if ($fileCount == 1 && !$doSort && !$FlexibilityScore)
{
    # A single chunk that needs no sorting: just move it into place.
    my $numStr = NumStr(0);
    $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf";
}
else
{
    # Concatenate all chunks (the shell expands the glob), optionally sort
    # them, and recompress the result.
    $cmd = "gunzip -c $TMPDIR/phrase-table.half.*.gz 2>> /dev/stderr";

    if ($doSort) {
        $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
    }

    $cmd .= " | $GZIP_EXEC -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR "$cmd\n";
systemCheck($cmd);
|
|
|
|
|
# ---- Merge the .coc files --------------------------------------------------
# If chunk 0 has a "<half table>.coc" file next to it, every chunk is expected
# to have one. Each line holds one integer; line k of the merged file
# ($ptHalf.coc) is the sum of line k over all chunks.
my $numStr = NumStr(0);
my $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";

if (-e $cocPath)
{
    my @arrayCOC;

    for (my $i = 0; $i < $fileCount; ++$i)
    {
        $numStr = NumStr($i);
        $cocPath = "$TMPDIR/phrase-table.half.$numStr.gz.coc";
        open(my $cocIn, '<', $cocPath) or die "can't open $cocPath: $!";

        my $arrayInd = 0;
        while (my $line = <$cocIn>)
        {
            # += on a not-yet-existing element starts it at 0, so the first
            # file needs no special treatment.
            $arrayCOC[$arrayInd] += int($line);
            ++$arrayInd;
        }

        close($cocIn);
    }

    # Write the summed counts, one per line.
    $cocPath = "$ptHalf.coc";
    open(my $cocOut, '>', $cocPath) or die "cannot open $cocPath: $!";
    foreach my $coc (@arrayCOC)
    {
        print $cocOut $coc."\n";
    }
    close($cocOut) or die "cannot write $cocPath: $!";
}
|
|
|
|
|
# ---- Merge the per-chunk label files ---------------------------------------
# Only done when --Inverse was not given. For each label file requested on the
# command line: emit the reserved labels with ids 0..k, then append the
# sorted, de-duplicated labels found by all chunks, numbered from k+1 on
# (the embedded perl one-liner appends "line number + k" to every line).
if (!$inverse)
{
    # [ name used in the log, output file, per-chunk file suffix, reserved labels ]
    my @labelSets = (
        [ "source labels",
          $sourceLabelsFile,
          "syntaxLabels.src",
          [ "GlueTop", "GlueX", "SSTART", "SEND" ] ],
        [ "parts-of-speech",
          $partsOfSpeechFile,
          "partsOfSpeech",
          [ "SSTART", "SEND" ] ],
        [ "target syntactic preferences labels",
          $targetSyntacticPreferencesLabelsFile,
          "syntaxLabels.tgtpref",
          [ "GlueTop", "GlueX" ] ],
    );

    foreach my $labelSet (@labelSets)
    {
        my ($what, $outFile, $suffix, $reserved) = @$labelSet;
        next unless defined($outFile);

        # Id of the last reserved label == offset added to the line numbers.
        my $lastId = $#$reserved;

        my $echoes = "";
        foreach my $id (0 .. $lastId) {
            $echoes .= "echo \"$reserved->[$id] $id\"; ";
        }

        my $cmd = "(" . $echoes
                . "cat $TMPDIR/phrase-table.half.*.gz.$suffix | LC_ALL=C sort | uniq"
                . " | perl -pe \"s/\$/ \@{[\$.+${lastId}]}/\") > $outFile";
        print STDERR "Merging $what files: $cmd \n";
        `$cmd`;
    }
}
|
|
|
# ---- Clean up --------------------------------------------------------------
# Everything has been merged: remove the scratch directory with all chunks
# and run scripts in it.
$cmd = join(" ", "rm", "-rf", $TMPDIR, "\n");
print STDERR $cmd;
systemCheck($cmd);

print STDERR "Finished ", scalar(localtime()), "\n";
|
|
|
|
|
|
|
|
|
# Run a shell command in a forked child process.
#
#   $cmd     shell command line to run
#   returns  (in the parent) the PID of the child
#
# The child echoes the command to STDERR, runs it through systemCheck() and
# then exits; it never returns to the caller. Dies if the fork itself fails.
sub RunFork($)
{
    my $cmd = shift;

    my $pid = fork();

    # fork() returns undef on failure. Without this check "undef == 0" would
    # send the PARENT down the child branch below and make it exit.
    die "cannot fork to run '$cmd': $!" unless defined($pid);

    if ($pid == 0)
    {
        # child
        print STDERR $cmd;
        systemCheck($cmd);
        exit();
    }

    # parent
    return $pid;
}
|
# Run a shell command and terminate the whole process if it fails.
#
#   $cmd  command line handed to system()
#
# Returns normally only when system() reports 0. Otherwise a diagnostic
# naming the command and the kind of failure is written to STDERR and the
# process exits with status 1.
sub systemCheck($)
{
    my $cmd = shift;
    my $retVal = system($cmd);
    if ($retVal != 0)
    {
        # Several callers pass commands that end in a newline; trim a copy
        # so the message stays on one line.
        (my $shown = $cmd) =~ s/\s+$//;

        if ($retVal == -1) {
            print STDERR "ERROR: could not execute '$shown': $!\n";
        }
        elsif ($retVal & 127) {
            printf STDERR "ERROR: '%s' died with signal %d\n", $shown, ($retVal & 127);
        }
        else {
            printf STDERR "ERROR: '%s' exited with status %d\n", $shown, ($retVal >> 8);
        }
        exit(1);
    }
}
|
|
|
# Return the source-phrase field of an extract line: everything before the
# first "|||" delimiter (including any space in front of the delimiter).
#
#   $line    one line of the extract file, without its newline
#   returns  the text before the first "|||"; the whole line if it contains
#            no "|||" at all
sub GetSourcePhrase($)
{
    my $line = shift;
    my $pos = index($line, "|||");

    # index() returns -1 when the delimiter is missing; substr(..., 0, -1)
    # would then silently drop the last character of the line.
    return $line if $pos < 0;

    return substr($line, 0, $pos);
}
|
|
|
|
|
# Format a chunk index as a decimal string zero-padded to at least 7 digits
# (e.g. 42 -> "0000042"), so that the file names built from it sort in
# numeric order. Values with more than 7 digits are returned unpadded.
#
#   $i       non-negative integer
#   returns  the padded string
sub NumStr($)
{
    my $i = shift;
    return sprintf("%07d", $i);
}
|
|
|
|
|
# Write the next chunk of the context extract file.
#
# Reads from the already-open global handle IN_CONTEXT and writes a
# compressed copy to $TMPDIR/extract.context.$fileCount.gz. The chunk holds
# $lastline (if any), then every line up to and including the first one whose
# source phrase equals $lastsourcePhrase, then all directly following lines
# with that same source phrase.
#
#   $lastsourcePhrase  source phrase that ends the chunk
#   $fileCount         chunk index, used in the output file name
#   $lastline          line read past the end of the previous chunk
#                      ("" or undef if there is none)
#   returns            the first line that belongs to the NEXT chunk (already
#                      read from IN_CONTEXT, without newline), or "" at end
#                      of input
#
# Dies if the compressor cannot be started or does not finish cleanly.
sub CutContextFile($$$)
{
    my($lastsourcePhrase, $fileCount, $lastline) = @_;
    my $line;
    my $sourcePhrase;

    my $filePath = "$TMPDIR/extract.context.$fileCount.gz";
    open (OUT_CONTEXT, "| $GZIP_EXEC -c > $filePath") or die "error starting $GZIP_EXEC $!";

    # Carry-over from the previous call.
    if (defined($lastline) && $lastline ne "") {
        print OUT_CONTEXT "$lastline\n";
    }

    # Phase 1: copy everything up to and including the first line of the
    # closing source phrase.
    while ($line=<IN_CONTEXT>)
    {
        chomp($line);
        $sourcePhrase = GetSourcePhrase($line);
        print OUT_CONTEXT "$line\n";
        if ($sourcePhrase eq $lastsourcePhrase) {last;}
    }

    # Phase 2: copy the rest of that source phrase. The first line with a
    # different phrase is NOT written; it is handed back to the caller.
    while ($line=<IN_CONTEXT>)
    {
        chomp($line);
        $sourcePhrase = GetSourcePhrase($line);
        if ($sourcePhrase ne $lastsourcePhrase) {last;}
        print OUT_CONTEXT "$line\n";
    }

    close(OUT_CONTEXT) or die "error closing $GZIP_EXEC pipe for $filePath: $! (status $?)";

    # At end of input $line is undef; return "" so the next call can compare
    # it as a string without an "uninitialized value" warning.
    return defined($line) ? $line : "";
}
|
|