|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use FindBin qw($RealBin); |
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Script-wide configuration and state, shared by every subroutine below.
# ---------------------------------------------------------------------------

# Extra arguments inserted into every qsub command line (-queue-parameters).
my $queueparameters="";

# Command that prints the current directory (see getPwdCmd).
my $pwdcmd = &getPwdCmd();

my $workingdir = `$pwdcmd`; chomp $workingdir;

# Per-run names; $$ (the process id) keeps concurrent runs apart.
my $tmpdir="$workingdir/tmp$$";
my $splitpfx="split$$";

# Ctrl-C deletes the submitted jobs before exiting.
$SIG{'INT'} = \&kill_all_and_quit;

# Prefixes of the generated job scripts and of the qsub stdout/stderr files;
# the split index is appended to each of them.
my $jobscript="$workingdir/job$$";
my $qsubout="$workingdir/out.job$$";
my $qsuberr="$workingdir/err.job$$";

# Arguments handed to the decoder, and whether its input comes via stdin.
my $mosesparameters="";
my $feed_moses_via_stdin = 0;

my $cfgfile="";

my $version=undef;
my $help=0;
my $dbg=0;
my $jobs=4;
my $cache_model=undef;

# Default decoder location.  Written as a conditional expression because
# "my $x = ... if COND" has undefined behaviour in Perl.
my $mosescmd = defined $ENV{"MOSESBIN"} ? "$ENV{MOSESBIN}/moses" : undef;

my $inputlist=undef;
my $inputfile=undef;
my $inputtype=0;

# N-best list: raw option string, its split fields, output basename, flag.
my @nbestlist=();
my $nbestlist=undef;
my $nbestfile=undef;
my $oldnbestfile=undef;
my $oldnbest=undef;
my $nbestflag=0;

# Unknown-word (OOV) output.
my $oovlist=undef;
my $oovfile=undef;
my $oovflag=0;

# Word graph output.
my @wordgraphlist=();
my $wordgraphlist=undef;
my $wordgraphfile=undef;
my $wordgraphflag=0;

# Maximum number of submission rounds (decremented once per round).
my $robust=5;

my $alifile=undef;
my $detailsfile=undef;

my $logfile="";
my $logflag="";

# Search graph output.
my $searchgraphlist="";
my $searchgraphfile="";
my $searchgraphflag=0;

# Prefix of the qsub job names.
my $qsubname="MOSES";

# True when -old-sge is given; selects the polling-based wait for the jobs.
my $old_sge = 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parse the command line into the script-wide option variables and derive
# the secondary settings (n-best, search graph, word graph, OOV, log).
# Options not recognised here stay in @ARGV (pass_through) and are appended
# to the decoder parameters.
sub init(){
    use Getopt::Long qw(:config pass_through no_ignore_case permute);

    GetOptions('version'=>\$version,
               'help'=>\$help,
               'debug'=>\$dbg,
               'jobs=i'=>\$jobs,
               'cache-model=s'=>\$cache_model,
               'decoder=s'=> \$mosescmd,
               'robust=i' => \$robust,
               'decoder-parameters=s'=> \$mosesparameters,
               'feed-decoder-via-stdin'=> \$feed_moses_via_stdin,
               'logfile=s'=> \$logfile,
               'i|inputfile|input-file=s'=> \$inputlist,
               'n-best-list=s'=> \$nbestlist,
               'n-best-file=s'=> \$oldnbestfile,
               'n-best-size=i'=> \$oldnbest,
               'output-search-graph|osg=s'=> \$searchgraphlist,
               'output-word-graph|owg=s'=> \$wordgraphlist,
               'output-unknowns=s'=> \$oovlist,
               'alignment-output-file=s'=> \$alifile,
               'translation-details|T=s'=> \$detailsfile,
               'qsub-prefix=s'=> \$qsubname,
               'queue-parameters=s'=> \$queueparameters,
               'inputtype=i'=> \$inputtype,
               'config|f=s'=>\$cfgfile,
               'old-sge' => \$old_sge,
    ) or exit(1);

    # Answer -version / -help before the configuration file is touched:
    # the parameter readers below die when it cannot be opened, which used
    # to make "-help" without "-config" fail with "Can't read ''".
    # The "&name()" form is used for subs that are defined further down
    # with a () prototype, so perl does not warn that they are
    # "called too early to check prototype".
    &version() if $version;
    &usage() if $help;

    &getNbestParameters();
    &getSearchGraphParameters();
    &getWordGraphParameters();
    getOOVParameters();
    &getLogParameters();

    print STDERR "wordgraphflag:$wordgraphflag\n";

    chomp($inputfile=`basename $inputlist`) if defined($inputlist);

    # Keep -decoder-parameters apart from the pass-through arguments;
    # without the separator "-v 0" and "-threads 2" became "-v 0-threads 2".
    $mosesparameters.=" " if length $mosesparameters;
    $mosesparameters.="@ARGV -inputtype $inputtype";
}
|
|
|
|
|
|
|
|
|
# Report the script version on STDERR and terminate with exit status 1.
sub version(){
    my $banner = "version 1.13 (29-12-2006)\n";
    print STDERR $banner;
    exit(1);
}
|
|
|
|
|
# Print the command-line help on STDERR and terminate with exit status 1.
# The text matches the options registered in init(): the short form of
# -output-word-graph is "owg" ("osg" belongs to -output-search-graph).
sub usage(){
    print STDERR
        "moses-parallel.pl [parallel-options] [moses-options]\n",
        "Options marked (*) are required.\n",
        "Parallel options:\n",
        "* -decoder <file> Moses decoder to use\n",
        "* -i|inputfile|input-file <file> the input text to translate\n",
        "* -jobs <N> number of required jobs\n",
        " -cache-model <dir> local directory for copying model files\n",
        " -logfile <file> file where storing log files of all jobs\n",
        " -qsub-prefix <string> name for submitted jobs\n",
        " -queue-parameters <string> specific requirements for queue\n",
        " -old-sge Assume Sun Grid Engine < 6.0\n",
        " -debug debug\n",
        " -version print version of the script\n",
        " -help this help\n",
        "Moses options:\n",
        " -inputtype <0|1|2> 0 for text, 1 for confusion networks, 2 for lattices\n",
        " -output-search-graph (osg) <file>: Output connected hypotheses of search into specified filename\n",
        " -output-word-graph (owg) '<file> <0|1>': Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos\n",
        " IMPORTANT NOTE: use single quote to group parameters of -output-word-graph\n",
        " This is different from standard moses\n",
        " -n-best-list '<file> <N> [distinct]' where\n",
        " <file>: file where storing nbest lists\n",
        " <N>: size of nbest lists\n",
        " distinct: (optional) to activate generation of distinct nbest alternatives\n",
        " IMPORTANT NOTE: use single quote to group parameters of -n-best-list\n",
        " This is different from standard moses\n",
        " IMPORTANT NOTE: The following two parameters are now OBSOLETE, and they are no more supported\n",
        " -n-best-file <file> file where storing nbest lists\n",
        " -n-best-size <N> size of nbest lists\n",
        " NOTE: -n-best-file and -n-best-size are passed to the decoder as \"-n-best-list <file> <N>\"\n",
        "* -config (f) <cfgfile> configuration file\n",
        " -decoder-parameters <string> specific parameters for the decoder\n",
        "All other options are passed to Moses\n",
        " (This way to pass parameters is maintained for back compatibility\n",
        " but preferably use -decoder-parameters)\n";
    exit(1);
}
|
|
|
|
|
# Dump the effective settings on STDERR, one per line.  Optional settings
# are reported only when their flag (or value) is set.
sub print_parameters(){
    my @report = ();

    push @report, "Inputfile: $inputlist\n";
    push @report, "Configuration file: $cfgfile\n";
    push @report, "Decoder in use: $mosescmd\n";
    push @report, "Number of jobs:$jobs\n";
    if ($cache_model)     { push @report, "Model cache directory: $cache_model\n"; }
    if ($nbestflag)       { push @report, "Nbest list: $nbestlist\n"; }
    if ($searchgraphflag) { push @report, "Output Search Graph: $searchgraphlist\n"; }
    if ($wordgraphflag)   { push @report, "Output Word Graph: $wordgraphlist\n"; }
    if ($oovflag)         { push @report, "Output OOV: $oovlist\n"; }
    if ($logflag)         { push @report, "LogFile:$logfile\n"; }
    push @report, "Qsub name: $qsubname\n";
    push @report, "Queue parameters: $queueparameters\n";

    # Human-readable name of the numeric input type.
    foreach my $known ([0, "text"], [1, "confusion network"], [2, "lattices"]) {
        my ($number, $label) = @$known;
        push @report, "Inputtype: $label\n" if $inputtype == $number;
    }

    push @report, "parameters directly passed to Moses: $mosesparameters -config $cfgfile\n";

    foreach my $line (@report) {
        print STDERR $line;
    }
}
|
|
|
|
|
# Turn log concatenation on when a log file name was supplied.
sub getLogParameters(){
    $logflag = 1 if $logfile;
}
|
|
|
|
|
# Work out the n-best settings.  Sources, in order of precedence:
#   1. -n-best-list '<file> <N> [distinct]' on the command line,
#   2. the [n-best-list] section of the configuration file,
#   3. the obsolete -n-best-file / -n-best-size pair.
# On success sets @nbestlist (the split fields), $nbestfile (basename of the
# output file, or "nbest" when the target is '-') and $nbestflag.
# Exits with status 1 when the new and the obsolete options are mixed.
sub getNbestParameters(){
    # An unreadable configuration file is not an error here: the n-best
    # list is optional and the file is validated by the main program.
    if (!$nbestlist && open(my $cfg, '<', $cfgfile)){
        # defined() keeps a last line without a trailing newline and avoids
        # chomp-ing undef at end of file.
        while (defined(my $line = <$cfg>)){
            chomp $line;
            next unless $line =~ /^\[n-best-list\]/;
            # The section body runs until a blank line or the next header.
            while (defined(my $value = <$cfg>)){
                chomp $value;
                last if $value eq "" || $value =~ /^\[/;
                $nbestlist .= "$value ";
            }
            last;
        }
        close($cfg);
    }

    if ($oldnbestfile){
        if ($nbestlist){
            print STDERR "There is a conflict between NEW parameter -n-best-list and OBSOLETE parameter -n-best-file\n";
            print STDERR "Please use only -n-best-list '<file> <N> [distinct]'\n";
            exit(1);
        }
        print STDERR "You are using the OBSOLETE parameter -n-best-file\n";
        print STDERR "Next time please use only -n-best-list '<file> <N> [distinct]'\n";
        $nbestlist = "$oldnbestfile";
        $nbestlist .= $oldnbest ? " $oldnbest" : " 1";
    }

    if ($nbestlist){
        # split ' ' ignores leading whitespace, so the file name is always
        # the first field.
        @nbestlist = split(' ', $nbestlist);
        if (@nbestlist){
            if ($nbestlist[0] eq '-'){ $nbestfile="nbest"; }
            else{ chomp($nbestfile=`basename $nbestlist[0]`); }
            $nbestflag=1;
        }
    }
}
|
|
|
|
|
# Work out the search-graph output.  When no -output-search-graph option was
# given, the [output-search-graph] / [osg] section of the configuration file
# is consulted (its last line wins).  Sets $searchgraphfile (basename of the
# target, or "searchgraph" when the target is '-') and $searchgraphflag.
# Dies when the configuration file has to be read but cannot be opened.
sub getSearchGraphParameters(){
    if (!$searchgraphlist){
        open (my $cfg, '<', $cfgfile) or die "Can't read '$cfgfile'";
        # defined() keeps a last line without a trailing newline and avoids
        # chomp-ing undef at end of file.
        while (defined(my $line = <$cfg>)){
            chomp $line;
            next unless $line =~ /^\[output-search-graph\]/ || $line =~ /^\[osg\]/;
            # The section body runs until a blank line or the next header.
            while (defined(my $value = <$cfg>)){
                chomp $value;
                last if $value eq "" || $value =~ /^\[/;
                $searchgraphlist = "$value";
            }
            last;
        }
        close($cfg);
    }

    if ($searchgraphlist){
        if ($searchgraphlist eq '-'){ $searchgraphfile="searchgraph"; }
        else{ chomp($searchgraphfile=`basename $searchgraphlist`); }
        $searchgraphflag=1;
    }
}
|
|
|
|
|
# Work out the word-graph output.  When no -output-word-graph option was
# given, the [output-word-graph] / [owg] section of the configuration file
# is consulted.  Sets @wordgraphlist (the split fields), $wordgraphfile
# (basename of the target, or "wordgraph" when the target is '-') and
# $wordgraphflag.
# Dies when the configuration file has to be read but cannot be opened.
sub getWordGraphParameters(){
    if (!$wordgraphlist){
        open (my $cfg, '<', $cfgfile) or die "Can't read '$cfgfile'";
        # defined() keeps a last line without a trailing newline and avoids
        # chomp-ing undef at end of file.
        while (defined(my $line = <$cfg>)){
            chomp $line;
            next unless $line =~ /^\[output-word-graph\]/ || $line =~ /^\[owg\]/;
            # The section body runs until a blank line or the next header.
            while (defined(my $value = <$cfg>)){
                chomp $value;
                last if $value eq "" || $value =~ /^\[/;
                $wordgraphlist .= "$value ";
            }
            last;
        }
        close($cfg);
    }

    if ($wordgraphlist){
        # split ' ' ignores leading whitespace, so the file name is always
        # the first field.
        @wordgraphlist = split(' ', $wordgraphlist);
        if (@wordgraphlist){
            if ($wordgraphlist[0] eq '-'){ $wordgraphfile="wordgraph"; }
            else{ chomp($wordgraphfile=`basename $wordgraphlist[0]`); }
            $wordgraphflag=1;
        }
    }
}
|
|
|
# Work out the unknown-word (OOV) output from -output-unknowns: sets
# $oovfile to "oov" for the '-' target, otherwise to the basename of the
# given path, and raises $oovflag.  Does nothing when the option is unset.
sub getOOVParameters {
    return unless $oovlist;

    if ($oovlist ne "-") {
        chomp($oovfile = `basename $oovlist`);
    }
    else {
        $oovfile = "oov";
    }
    $oovflag = 1;
}
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main program, part 1: read the options and validate the mandatory ones.
# ---------------------------------------------------------------------------
init();

version() if $version;
usage()   if $help;

# The three mandatory settings must all be present ...
unless (defined $inputlist && defined $mosescmd && defined $cfgfile) {
    print STDERR "Please specify -input-file, -decoder and -config\n";
    usage();
}

# ... and each must name something that exists (usage() does not return).
foreach my $required (["Inputfile",          $inputlist],
                      ["Decoder",            $mosescmd],
                      ["Configuration file", $cfgfile]) {
    my ($label, $path) = @$required;
    next if -e $path;
    print STDERR "$label ($path) does not exists\n";
    usage();
}

print_parameters();

# In debug mode the run stops after reporting the parameters.
if ($dbg) {
    exit(1);
}
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main program, part 2: split the input into one chunk per job, create the
# scratch directory and write the job scripts.
# ---------------------------------------------------------------------------
my $decimal="";

my $cmd;
my $sentenceN;
my $splitN;

# Suffixes ("-aa", "-ab", ...) of the generated input chunks.
my @idxlist=();

# A non-positive job count would lead to a modulus by zero below.
die "Number of jobs (-jobs) must be at least 1, got $jobs\n" if $jobs < 1;

if ($inputtype==0 || $inputtype==2){
    # Plain text and lattices: one input unit per line.
    my $unit = $inputtype==0 ? "sentences" : "lattices";

    chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `);
    die "Input file ($inputlist) contains no $unit: nothing to translate\n"
        unless $sentenceN;

    # Never start more jobs than there are units to translate.
    if ($jobs>$sentenceN){ $jobs=$sentenceN; }

    # Chunk size, rounded up so that at most $jobs chunks are produced.
    $splitN=int($sentenceN / $jobs);
    $splitN++ if $sentenceN % $jobs;

    if ($dbg){
        print STDERR "There are $sentenceN $unit to translate\n";
        print STDERR "There are at most $splitN $unit per job\n";
    }

    $cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-";
    safesystem("$cmd") or die "Failed to split the input: $cmd\n";
}
elsif ($inputtype==1){
    # Confusion networks span several lines: join each network onto a single
    # line, split by line count, then restore the line breaks per chunk.
    my $tmpfile="/tmp/cnsplit$$";
    $cmd="cat $inputlist | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' > $tmpfile";
    safesystem("$cmd") or die "Failed to flatten the confusion networks: $cmd\n";

    chomp($sentenceN=`wc -l $tmpfile | awk '{print \$1}' `);
    unless ($sentenceN){
        unlink($tmpfile);
        die "Input file ($inputlist) contains no confusion networks: nothing to translate\n";
    }

    # Never start more jobs than there are units to translate.
    if ($jobs>$sentenceN){ $jobs=$sentenceN; }

    # Chunk size, rounded up so that at most $jobs chunks are produced.
    $splitN=int($sentenceN / $jobs);
    $splitN++ if $sentenceN % $jobs;

    if ($dbg){
        print STDERR "There are $sentenceN confusion networks to translate\n";
        print STDERR "There are at most $splitN sentences per job\n";
    }

    $cmd="split $decimal -a 2 -l $splitN $tmpfile $tmpfile-";
    safesystem("$cmd") or die "Failed to split the input: $cmd\n";

    # Suffixes of the flattened chunks (local to this branch).
    my @cnidxlist=();
    chomp(@cnidxlist=`ls $tmpfile-*`);
    s/.+(\-\S+)$/$1/ for @cnidxlist;

    foreach my $idx (@cnidxlist){
        $cmd="perl -pe 's/ _CNendline_ /\\n/g;s/ _CNendline_/\\n/g;'";
        safesystem("cat $tmpfile$idx | $cmd > ${inputfile}.$splitpfx$idx ; \\rm -f $tmpfile$idx;");
    }

    # The flattened copy is no longer needed once the chunks exist.
    unlink($tmpfile);
}
else{
    die "INPUTTYPE:$inputtype is unknown!\n";
}

# Collect the suffixes of the chunks that were actually produced.
chomp(@idxlist=`ls ${inputfile}.$splitpfx-*`);
s/.+(\-\S+)$/$1/ for @idxlist;

safesystem("mkdir -p $tmpdir") or die "Failed to create $tmpdir\n";

&preparing_script();
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main program, part 3: submit one job per chunk, wait for all of them and
# resubmit the chunks that were not fully translated, for at most $robust
# rounds.
# ---------------------------------------------------------------------------

# Ids of every job submitted so far (used for -hold_jid and for qdel).
my @sgepids =();

# Chunks that still have to be translated.
my @idx_todo = @idxlist;

while ($robust && scalar @idx_todo) {
    $robust--;

    my $failure=0;
    foreach my $idx (@idx_todo){
        # Old SGE versions do not know the -b option.
        my $batch_and_join = $old_sge ? "-j y" : "-b no -j yes";

        $cmd="qsub $queueparameters $batch_and_join -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash > ${jobscript}${idx}.log 2>&1";
        print STDERR "$cmd\n" if $dbg;

        safesystem($cmd) or die;

        # The job id is the third word of the first line qsub printed.
        open (my $log, '<', "${jobscript}${idx}.log")
            or die "Can't read id of job ${jobscript}${idx}.log";
        my $res = <$log>;
        close($log);
        $res = "" unless defined $res;
        chomp($res);
        my @arrayStr = split(/\s+/,$res);
        my $id = $arrayStr[2];
        die "Failed to guess job id from $jobscript$idx.log, got: $res"
            unless defined $id && $id =~ /^[0-9]+$/;

        push @sgepids, $id;
    }

    # Make the waiting job depend on every job submitted so far.
    my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids);

    if ($old_sge) {
        # No "-sync y" here: submit a trivial job held by all the others and
        # poll for the file it writes when it finally runs.
        my $syncscript = "${jobscript}.sync_workaround_script.sh";
        safesystem("echo 'date' > $syncscript") or &kill_all_and_quit();

        my $checkpointfile = "${jobscript}.sync_workaround_checkpoint";

        safesystem("\\rm -f $checkpointfile") or &kill_all_and_quit();

        $cmd="qsub -cwd $queueparameters $hj -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript 2> $qsubname.W.log";
        safesystem($cmd) or &kill_all_and_quit();

        my $nr=0;
        while (!-e $checkpointfile) {
            sleep(10);
            $nr++;
            print STDERR "w" if $nr % 3 == 0;
        }
        print STDERR "End of waiting.\n";
        safesystem("\\rm -f $checkpointfile $syncscript") or &kill_all_and_quit();

        # Output files may show up late: re-check for up to 60 rounds.
        # This updates the round's $failure declared above (a second "my"
        # here used to hide the result from the check after this branch).
        $failure = 1;
        $nr = 0;
        while ($nr < 60 && $failure) {
            $nr ++;
            $failure=&check_exit_status();
            if (!$failure) {
                $failure = &check_translation_old_sge();
            }
            last if !$failure;
            print STDERR "Extra wait ($nr) for possibly unfinished processes.\n";
            sleep 10;
        }
    } else {
        # "-sync y" blocks until the held job (a plain /bin/ls) has run.
        $cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log";
        safesystem($cmd) or &kill_all_and_quit();

        $failure=&check_exit_status();
    }

    # Out of retries and something failed: give up.
    &kill_all_and_quit() if $failure && !$robust;

    my @idx_still_todo = &check_translation();
    if ($robust) {
        # Retries are left, but they are pointless if nothing worked at all.
        if ((scalar @idx_still_todo) == (scalar @idxlist)) {
            print STDERR "everything crashed, not trying to resubmit jobs\n";
            $robust = 0;
            &kill_all_and_quit();
        }
        @idx_todo = @idx_still_todo;
    }
    else {
        if (scalar (@idx_still_todo)) {
            print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n";
            &kill_all_and_quit();
        }
    }
}
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main program, part 4: merge the per-chunk outputs and clean up.
# Each '-' target is first written to a temporary file and then appended to
# standard output.  Every test is guarded by the corresponding flag so that
# an unused option is never compared while undefined.
# ---------------------------------------------------------------------------
&concatenate_1best();
&concatenate_logs() if $logflag;
&concatenate_ali() if defined $alifile;
&concatenate_details() if defined $detailsfile;

&concatenate_nbest() if $nbestflag;
safesystem("cat nbest$$ >> /dev/stdout") if $nbestflag && $nbestlist[0] eq '-';

&concatenate_searchgraph() if $searchgraphflag;
safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphflag && $searchgraphlist eq '-';

&concatenate_wordgraph() if $wordgraphflag;
safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphflag && $wordgraphlist[0] eq '-';

&concatenate_oov() if $oovflag;
safesystem("cat oov$$ >> /dev/stdout") if $oovflag && $oovlist eq '-';

&remove_temporary_files();
|
|
|
|
|
|
|
# Write one executable bash script per input chunk ("${jobscript}<idx>.bash").
# Each script changes to the working directory, optionally caches the model,
# runs the decoder on its chunk with every output directed to $tmpdir, and
# finally moves the outputs into the working directory.  After each step the
# script echoes "exit status <code>", which check_exit_status() scans for.
# Dies when a script cannot be written.
sub preparing_script(){
    my $possibly_modified_cfgfile = $cfgfile;
    my $cache_model_cmd = "";
    if ($cache_model) {
        # The helper prints the name of the configuration file to use.
        $cache_model_cmd = "MOSES_INI=`$RealBin/../ems/support/cache-model.perl $cfgfile $cache_model`\n";
        $possibly_modified_cfgfile = "\$MOSES_INI";
    }

    my $inputmethod = $feed_moses_via_stdin ? "<" : "-input-file";

    foreach my $idx (@idxlist){
        my $scriptfile = "${jobscript}${idx}.bash";
        my $chunk = "${inputfile}.$splitpfx$idx";

        # Decoder options for the optional outputs, plus the names of the
        # files that have to be moved back, in the order they are moved.
        my @outputoptions = ();
        my @tomove = ();

        if (defined $alifile){
            push @outputoptions, "-alignment-output-file $tmpdir/$alifile.$splitpfx$idx";
            push @tomove, "${alifile}.$splitpfx$idx";
        }
        if (defined $detailsfile){
            push @outputoptions, "-translation-details $tmpdir/$detailsfile.$splitpfx$idx";
            push @tomove, "${detailsfile}.$splitpfx$idx";
        }
        if ($wordgraphflag){
            # The second field may be absent; do not interpolate undef.
            my $mode = defined $wordgraphlist[1] ? $wordgraphlist[1] : "";
            push @outputoptions, "-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $mode";
        }
        if ($searchgraphflag){
            push @outputoptions, "-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx";
        }
        if ($oovflag){
            push @outputoptions, "-output-unknowns $tmpdir/$oovfile.$splitpfx$idx";
        }
        if ($nbestflag){
            my $size = defined $nbestlist[1] ? $nbestlist[1] : "";
            my $tmpnbestlist = "$tmpdir/$nbestfile.$splitpfx$idx $size";
            $tmpnbestlist = "$tmpnbestlist $nbestlist[2]" if scalar(@nbestlist)==3;
            push @outputoptions, "-n-best-list $tmpnbestlist";
        }

        push @tomove, "${nbestfile}.$splitpfx$idx" if $nbestflag;
        push @tomove, "${searchgraphfile}.$splitpfx$idx" if $searchgraphflag;
        push @tomove, "${wordgraphfile}.$splitpfx$idx" if $wordgraphflag;
        push @tomove, "${oovfile}.$splitpfx$idx" if $oovflag;
        push @tomove, "$chunk.trans";

        open (my $out, '>', $scriptfile)
            or die "Cannot write job script '$scriptfile': $!";

        print $out "#! /bin/bash\n\n";
        print $out "uname -a\n\n";
        print $out "ulimit -c 0\n\n";
        print $out "cd $workingdir\n\n";
        print $out $cache_model_cmd;

        print $out join(" ", $mosescmd, $mosesparameters,
                             "-config", $possibly_modified_cfgfile,
                             @outputoptions,
                             $inputmethod, $chunk,
                             ">", "$tmpdir/$chunk.trans"), "\n\n";
        print $out "echo exit status \$?\n\n";

        foreach my $file (@tomove){
            print $out "\\mv -f $tmpdir/$file .\n\n";
            print $out "echo exit status \$?\n\n";
        }

        close($out) or die "Cannot finish job script '$scriptfile': $!";

        # qsub is given the script itself, so it has to be executable.
        chmod(oct(755), $scriptfile);
    }
}
|
|
|
# Merge the per-chunk word graphs into the requested file (or into
# "wordgraph$$" when the target is '-').  The UTTERANCE ids of each chunk
# are shifted by the number of translations in the preceding chunks, and an
# "_EMPTYWORDGRAPH_" entry is written for every id that has no graph.
# Dies when the output file or a chunk's translation file cannot be opened.
sub concatenate_wordgraph(){
    # Last id written.  Starting from -1 (rather than "") makes the gap
    # filling cover a missing utterance 0 and keeps the arithmetic numeric.
    my $oldcode = -1;
    my $offset = 0;

    my $outwordgraph=$wordgraphlist[0];
    if ($wordgraphlist[0] eq '-'){ $outwordgraph="wordgraph$$"; }

    open (my $out, '>', $outwordgraph)
        or die "Failed to open '$outwordgraph' for writing: $!";

    foreach my $idx (@idxlist){
        # The number of translated lines tells how many ids this chunk owns.
        my $transfile = "${inputfile}.${splitpfx}${idx}.trans";
        open (my $trans, '<', $transfile) or die "Failed to open '$transfile': $!";
        my @translations = <$trans>;
        close($trans);
        my $inplength = scalar(@translations);

        my $graphfile = "${wordgraphfile}.${splitpfx}${idx}";
        if (open (my $in, '<', $graphfile)){
            while (my $line = <$in>){
                if ($line =~ /^UTTERANCE=/){
                    my ($code) = ($line =~ /^UTTERANCE=(\d+)/);
                    $code = 0 unless defined $code;

                    print STDERR "code:$code offset:$offset\n";
                    $code += $offset;

                    # Fill every id skipped since the last one written.
                    while ($code - $oldcode > 1){
                        $oldcode++;
                        print $out "UTTERANCE=$oldcode\n";
                        print STDERR " to OUT -> code:$oldcode\n";
                        print $out "_EMPTYWORDGRAPH_\n";
                    }

                    $oldcode=$code;
                    print $out "UTTERANCE=$oldcode\n";
                    next;
                }
                print $out $line;
            }
            close($in);
        }
        else{
            # A chunk without a graph file is padded with empty entries below.
            print STDERR "Cannot read '$graphfile': $!\n";
        }

        $offset += $inplength;

        # Pad the ids at the end of this chunk that have no graph.
        while ($offset - $oldcode > 1){
            $oldcode++;
            print $out "UTTERANCE=$oldcode\n";
            print $out "_EMPTYWORDGRAPH_\n";
        }
    }
    close($out);
}
|
|
|
|
|
# Merge the per-chunk search graphs into the requested file (or into
# "searchgraph$$" when the target is '-').  The leading sentence id of each
# line is shifted by the number of translations in the preceding chunks, and
# an "<id> _EMPTYSEARCHGRAPH_" line is written for every id without a graph.
# Dies when the output file or a chunk's translation file cannot be opened.
sub concatenate_searchgraph(){
    # Last id written.  Starting from -1 (rather than "") makes the gap
    # filling cover a missing sentence 0 and keeps the arithmetic numeric.
    my $oldcode = -1;
    my $offset = 0;

    my $outsearchgraph=$searchgraphlist;
    if ($searchgraphlist eq '-'){ $outsearchgraph="searchgraph$$"; }

    open (my $out, '>', $outsearchgraph)
        or die "Failed to open '$outsearchgraph' for writing: $!";

    foreach my $idx (@idxlist){
        # The number of translated lines tells how many ids this chunk owns.
        my $transfile = "${inputfile}.${splitpfx}${idx}.trans";
        open (my $trans, '<', $transfile) or die "Failed to open '$transfile': $!";
        my @translations = <$trans>;
        close($trans);
        my $inplength = scalar(@translations);

        my $graphfile = "${searchgraphfile}.${splitpfx}${idx}";
        if (open (my $in, '<', $graphfile)){
            while (my $line = <$in>){
                # The last element of @extra keeps the line's newline.
                my ($code,@extra)=split(/[ \t]+/,$line);
                $code += $offset;

                # Fill every id skipped since the last one written.
                while ($code - $oldcode > 1){
                    $oldcode++;
                    print $out "$oldcode _EMPTYSEARCHGRAPH_\n";
                }

                $oldcode=$code;
                print $out join(" ",($oldcode,@extra));
            }
            close($in);
        }
        else{
            # A chunk without a graph file is padded with empty entries below.
            print STDERR "Cannot read '$graphfile': $!\n";
        }

        $offset += $inplength;

        # Pad the ids at the end of this chunk that have no graph.
        while ($offset - $oldcode > 1){
            $oldcode++;
            print $out "$oldcode _EMPTYSEARCHGRAPH_\n";
        }
    }
    close($out);
}
|
|
|
# Merge the per-chunk n-best lists into the requested file (or into
# "nbest$$" when the target is '-').  The sentence id of each entry is
# shifted by the number of translations in the preceding chunks.  For every
# id without entries a placeholder is written, whose feature-score field is
# the one of the very first entry with all the numbers replaced by 0.
# Dies when the output file or one of a chunk's files cannot be opened.
sub concatenate_nbest(){
    # Last id written.  Starting from -1 (rather than "") makes the gap
    # filling cover a missing sentence 0 and keeps the arithmetic numeric.
    my $oldcode = -1;
    my $offset = 0;

    # Take the layout of the feature scores from the first available entry;
    # an empty or unreadable first file leaves the field blank.
    my $featurescores;
    if (@idxlist && open (my $first, '<', "${nbestfile}.${splitpfx}$idxlist[0]")){
        my $str = <$first>;
        close($first);
        if (defined $str){
            chomp($str);
            (undef,undef,$featurescores)=split(/\|\|\|/,$str);
        }
    }

    my $emptytrans = " ";
    my $emptyglobalscore = " 0.0";
    my $emptyfeaturescores = defined $featurescores ? $featurescores : " ";
    $emptyfeaturescores =~ s/[-0-9\.]+/0/g;

    my $outnbest=$nbestlist[0];
    if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; }

    open (my $out, '>', $outnbest)
        or die "Failed to open '$outnbest' for writing: $!";

    foreach my $idx (@idxlist){
        # The number of translated lines tells how many ids this chunk owns.
        my $transfile = "${inputfile}.${splitpfx}${idx}.trans";
        open (my $trans, '<', $transfile)
            or die "Failed to open '$transfile'";
        my @translations = <$trans>;
        close($trans);
        my $inplength = scalar(@translations);

        my $chunknbest = "${nbestfile}.${splitpfx}${idx}";
        open (my $in, '<', $chunknbest)
            or die "Failed to open '$chunknbest'";
        while (my $line = <$in>){
            # The last element of @extra keeps the line's newline.
            my ($code,@extra)=split(/\|\|\|/,$line);
            $code += $offset;

            # Fill every id skipped since the last one written.
            while ($code - $oldcode > 1){
                $oldcode++;
                print $out join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
            }

            $oldcode=$code;
            print $out join("\|\|\|",($oldcode,@extra));
        }
        close($in);

        $offset += $inplength;

        # Pad the ids at the end of this chunk that have no entries.
        while ($offset - $oldcode > 1){
            $oldcode++;
            print $out join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
        }
    }
    close($out);
}
|
|
|
# Copy the translations of all chunks, in chunk order, to standard output.
# Lines are copied verbatim: printing the slurped array inside a string
# ("@in") would join the lines with the list separator and so put a blank
# in front of every line but the first of each chunk.
# Dies when a translation file cannot be opened.
sub concatenate_1best(){
    foreach my $idx (@idxlist){
        my $transfile = "${inputfile}.${splitpfx}${idx}.trans";
        open (my $in, '<', $transfile) or die "Failed to open '$transfile': $!";
        while (my $line = <$in>){
            print STDOUT $line;
        }
        close($in);
    }
}
|
|
|
# Concatenate the per-chunk unknown-word files, in chunk order, into the
# requested file (or into "oov$$" when the target is '-').
# Lines are copied verbatim: printing the slurped array inside a string
# ("@in") would put a blank in front of every line but the first.
# Dies when the output file cannot be opened; an unreadable chunk file is
# reported on STDERR and skipped.
sub concatenate_oov(){
    my $outoov=$oovlist;
    if ($oovlist eq '-'){ $outoov="oov$$"; }

    open (my $out, '>', $outoov) or die "Failed to open '$outoov' for writing: $!";
    foreach my $idx (@idxlist){
        my $part = "${oovfile}.${splitpfx}${idx}";
        my $in;
        unless (open ($in, '<', $part)){
            print STDERR "Cannot read '$part': $!\n";
            next;
        }
        while (my $line = <$in>){
            print $out $line;
        }
        close($in);
    }
    close($out);
}
|
|
|
# Concatenate the qsub output files of all chunks, in chunk order, into the
# file given with -logfile.
# Lines are copied verbatim: printing the slurped array inside a string
# ("@in") would put a blank in front of every line but the first.
# Dies when the log file cannot be opened; an unreadable job output is
# reported on STDERR and skipped.
sub concatenate_logs(){
    open (my $out, '>', $logfile) or die "Failed to open '$logfile' for writing: $!";
    foreach my $idx (@idxlist){
        my $part = "$qsubout$idx";
        my $in;
        unless (open ($in, '<', $part)){
            print STDERR "Cannot read '$part': $!\n";
            next;
        }
        while (my $line = <$in>){
            print $out $line;
        }
        close($in);
    }
    close($out);
}
|
|
|
# Concatenate the per-chunk alignment files, in chunk order, into the file
# given with -alignment-output-file.
# Lines are copied verbatim: printing the slurped array inside a string
# ("@in") would put a blank in front of every line but the first.
# Dies when the output file cannot be opened; an unreadable chunk file is
# reported on STDERR and skipped.
sub concatenate_ali(){
    open (my $out, '>', $alifile) or die "Failed to open '$alifile' for writing: $!";
    foreach my $idx (@idxlist){
        my $part = "$alifile.$splitpfx$idx";
        my $in;
        unless (open ($in, '<', $part)){
            print STDERR "Cannot read '$part': $!\n";
            next;
        }
        while (my $line = <$in>){
            print $out $line;
        }
        close($in);
    }
    close($out);
}
|
|
|
# Concatenate the per-chunk translation-details files, in chunk order, into
# the file given with -translation-details.
# Lines are copied verbatim: printing the slurped array inside a string
# ("@in") would put a blank in front of every line but the first.
# Dies when the output file cannot be opened; an unreadable chunk file is
# reported on STDERR and skipped.
sub concatenate_details(){
    open (my $out, '>', $detailsfile) or die "Failed to open '$detailsfile' for writing: $!";
    foreach my $idx (@idxlist){
        my $part = "$detailsfile.$splitpfx$idx";
        my $in;
        unless (open ($in, '<', $part)){
            print STDERR "Cannot read '$part': $!\n";
            next;
        }
        while (my $line = <$in>){
            print $out $line;
        }
        close($in);
    }
    close($out);
}
|
|
|
|
|
# Scan the qsub output file of every chunk for the "exit status <code>"
# lines echoed by the job scripts.
# Returns 1 when any of them reports a non-zero code, 0 otherwise.  (Looking
# only for the text "exit status 1" missed codes such as 2 or 255.)
# A job whose output file cannot be read contributes nothing here.
sub check_exit_status(){
    print STDERR "check_exit_status\n";
    my $failure=0;
    foreach my $idx (@idxlist){
        print STDERR "check_exit_status of job $idx\n";
        open (my $in, '<', "$qsubout$idx") or next;
        while (my $line = <$in>){
            $failure=1 if $line =~ /exit status (\d+)/ && $1 != 0;
        }
        close($in);
    }
    return $failure;
}
|
|
|
# Abort the run: qdel every job submitted so far, explain what happened on
# STDERR and terminate with exit status 1.  Also installed as the SIGINT
# handler.
sub kill_all_and_quit(){
    print STDERR "Got interrupt or something failed.\n";
    print STDERR "kill_all_and_quit\n";

    for my $jobid (@sgepids){
        my $qdel = "qdel $jobid";
        print STDERR "$qdel\n";
        safesystem($qdel);
    }

    for my $message ("Translation was not performed correctly\n",
                     "or some of the submitted jobs died.\n",
                     "qdel function was called for all submitted jobs\n"){
        print STDERR $message;
    }

    exit(1);
}
|
|
|
|
|
# Compare, for every chunk still to be done, the number of input units with
# the number of lines in its translation file.
# Returns the list of chunk suffixes whose counts differ.
# The counts are taken from the leading number of the wc output, whatever
# blank padding wc puts in front of it; a missing count is taken as 0.
sub check_translation(){
    my @failed = ();

    foreach my $idx (@idx_todo){
        my $chunk = "${inputfile}.$splitpfx$idx";

        my $countcmd;
        if ($inputtype==0 || $inputtype==2){
            # Plain text and lattices: one unit per line.
            $countcmd = "wc -l $chunk";
        }
        elsif ($inputtype==1){
            # Confusion networks: same flattening as used for the split.
            $countcmd = "cat $chunk | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l";
        }
        else{
            die "INPUTTYPE:$inputtype is unknown!\n";
        }

        my ($inputN) = `$countcmd` =~ /^\s*(\d+)/;
        $inputN = 0 unless defined $inputN;

        my ($outputN) = `wc -l $chunk.trans` =~ /^\s*(\d+)/;
        $outputN = 0 unless defined $outputN;

        if ($inputN != $outputN){
            print STDERR "Split ($idx) were not entirely translated\n";
            print STDERR "outputN=$outputN inputN=$inputN\n";
            print STDERR "outputfile=$chunk.trans inputfile=$chunk\n";
            push @failed,$idx;
        }
    }
    return @failed;
}
|
|
|
# Variant of the translation check used while polling under old SGE:
# returns 1 as soon as one chunk still to be done has a translation file
# whose line count differs from its number of input units, 0 when all match.
# The counts are taken from the leading number of the wc output, whatever
# blank padding wc puts in front of it; a missing count is taken as 0.
sub check_translation_old_sge(){
    foreach my $idx (@idx_todo){
        my $chunk = "${inputfile}.$splitpfx$idx";

        my $countcmd;
        if ($inputtype==0 || $inputtype==2){
            # Plain text and lattices: one unit per line.
            $countcmd = "wc -l $chunk";
        }
        elsif ($inputtype==1){
            # Confusion networks: same flattening as used for the split.
            $countcmd = "cat $chunk | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l";
        }
        else{
            die "INPUTTYPE:$inputtype is unknown!\n";
        }

        my ($inputN) = `$countcmd` =~ /^\s*(\d+)/;
        $inputN = 0 unless defined $inputN;

        my ($outputN) = `wc -l $chunk.trans` =~ /^\s*(\d+)/;
        $outputN = 0 unless defined $outputN;

        if ($inputN != $outputN){
            print STDERR "Split ($idx) were not entirely translated\n";
            print STDERR "outputN=$outputN inputN=$inputN\n";
            print STDERR "outputfile=$chunk.trans inputfile=$chunk\n";
            return 1;
        }
    }
    return 0;
}
|
|
|
# Delete everything the run created: per-chunk inputs, outputs, job scripts
# and qsub logs, the scratch directory, and the temporary files used when an
# output target was '-'.  Failures to delete are ignored.
sub remove_temporary_files(){
    foreach my $idx (@idxlist){
        unlink("${inputfile}.${splitpfx}${idx}.trans");
        unlink("${inputfile}.${splitpfx}${idx}");
        if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
        if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); }
        if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
        if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
        if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
        if ($oovfile){ unlink("${oovfile}.${splitpfx}${idx}"); }
        unlink("${jobscript}${idx}.bash");
        unlink("${jobscript}${idx}.log");
        unlink("$qsubout$idx");
        unlink("$qsuberr$idx");
    }

    # Not chunk specific, so done once rather than on every iteration.
    unlink("$qsubname.W.log");
    rmdir("$tmpdir");

    if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); }
    if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); }
    # The target is the first field of the option; the raw option string
    # also carries the mode and therefore never equals '-'.
    if ($wordgraphflag && $wordgraphlist[0] eq '-'){ unlink("${wordgraphfile}$$"); }
    if ($oovflag && $oovlist eq '-'){ unlink("oov$$"); }
}
|
|
|
# Run a command through system(), echoing it on STDERR first.
# Returns true when the command exited with status 0, false otherwise (the
# non-zero code is reported on STDERR).  Terminates the script with exit
# status 1 when the command could not be started or was killed by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    my $status = $?;

    # -1: the command could not be started at all.
    if ($status == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }

    # The low seven bits hold the signal number, bit 128 the core-dump flag.
    my $signal = $status & 127;
    if ($signal) {
        my $dumped = ($status & 128) ? 'with' : 'without';
        printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
            $signal, $dumped;
        exit 1;
    }

    my $exitcode = $status >> 8;
    if ($exitcode) {
        print STDERR "Exit code: $exitcode\n";
    }
    return ! $exitcode;
}
|
|
|
|
|
|
|
|
|
# Choose the command used to print the current directory: the path that
# "which pawd" reports, when that path exists, otherwise plain "pwd".
sub getPwdCmd(){
    chomp(my $pawd = `which pawd 2> /dev/null | head -1 | awk '{print \$1}'`);
    return $pawd if $pawd && -e $pawd;
    return "pwd";
}
|
|
|
|