|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long "GetOptions"; |
|
use FindBin qw($RealBin); |
|
|
|
# Return a copy of the argument with leading and trailing whitespace removed.
# The caller's variable is not modified.
sub trim($)
{
    my ($text) = @_;
    for ($text) {
        s/^\s+//;    # leading whitespace
        s/\s+$//;    # trailing whitespace
    }
    return $text;
}
|
|
|
# Announce start-up on STDERR: process id, host name and current date.
# chomp (rather than chop) only removes a trailing newline, so the host name
# is not truncated if the command output happens to lack one.
my $host = `hostname`;
chomp($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
|
|
|
# Command-line switches. They remain undefined unless GetOptions sets them.
my $CONFIG_FILE;
my $EXECUTE;
my $NO_GRAPH;
my $CONTINUE;
my $FINAL_STEP;
my $FINAL_OUT;
my $VERBOSE;
my $IGNORE_TIME;
my $DELETE_CRASHED;
my $DELETE_VERSION;

# Settings with defaults; -sleep and -meta override them.
my $SLEEP = 2;
my $META = "$RealBin/experiment.meta";

# Multi-core defaults; detect_if_multicore() may raise them for hosts listed
# in experiment.machines, and -multicore / -max-active override them again.
my ($MULTICORE,$MAX_ACTIVE) = (0,2);
&detect_if_multicore();

# Cluster flag; set by detect_if_cluster() for listed hosts or by -cluster.
my $CLUSTER;
&detect_if_cluster();

# Parse the command line; print the usage line and stop on a parsing error.
&GetOptions('config=s' => \$CONFIG_FILE,
            'continue=i' => \$CONTINUE,
            'delete-crashed=i' => \$DELETE_CRASHED,
            'delete-run=i' => \$DELETE_VERSION,
            'delete-version=i' => \$DELETE_VERSION,
            'ignore-time' => \$IGNORE_TIME,
            'exec' => \$EXECUTE,
            'cluster' => \$CLUSTER,
            'multicore' => \$MULTICORE,
            'final-step=s' => \$FINAL_STEP,
            'final-out=s' => \$FINAL_OUT,
            'meta=s' => \$META,
            'verbose' => \$VERBOSE,
            'sleep=i' => \$SLEEP,
            'max-active=i' => \$MAX_ACTIVE,
            'no-graph' => \$NO_GRAPH)
    or die("experiment.perl -config config-file [-exec] [-no-graph]");

# Make sure a "steps" directory exists in the current directory.
`mkdir -p steps` unless -e "steps";

# A configuration must be reachable: either the file given with -config, or
# the stored configuration of the run named by -continue / -delete-crashed /
# -delete-version.
if (   !($CONFIG_FILE && -e $CONFIG_FILE)
    && !($CONTINUE && -e &steps_file("config.$CONTINUE",$CONTINUE))
    && !($DELETE_CRASHED && -e &steps_file("config.$DELETE_CRASHED",$DELETE_CRASHED))
    && !($DELETE_VERSION && -e &steps_file("config.$DELETE_VERSION",$DELETE_VERSION))) {
    die("error: could not find config file");
}

# -continue falls back to the stored configuration only when -config was not
# given; the two delete modes always use the stored configuration.
if ($CONTINUE && !$CONFIG_FILE) {
    $CONFIG_FILE = &steps_file("config.$CONTINUE",$CONTINUE);
}
if ($DELETE_CRASHED) {
    $CONFIG_FILE = &steps_file("config.$DELETE_CRASHED",$DELETE_CRASHED);
}
if ($DELETE_VERSION) {
    $CONFIG_FILE = &steps_file("config.$DELETE_VERSION",$DELETE_VERSION);
}
|
|
|
# Tables describing the available modules and steps. They are declared here
# at file scope and filled by read_meta() from the meta file.
my (@MODULE,
    %MODULE_TYPE,
    %MODULE_STEP,
    %STEP_IN,
    %STEP_OUT,
    %STEP_OUTNAME,
    %STEP_TMPNAME,
    %STEP_FINAL,
    %STEP_PASS,
    %STEP_PASS_IF,
    %STEP_IGNORE,
    %STEP_IGNORE_IF,
    %QSUB_SCRIPT,
    %QSUB_STEP,
    %RERUN_ON_CHANGE,
    %ONLY_EXISTENCE_MATTERS,
    %MULTIREF,
    %TEMPLATE,
    %TEMPLATE_IF,
    %ONLY_FACTOR_0,
    %PARALLELIZE,
    %ERROR,
    %NOT_ERROR);
&read_meta();

# Load the experiment configuration: read_config() fills @MODULE_LIST (the
# non-ignored section names) and %CONFIG ("SECTION:parameter" => value list).
print "LOAD CONFIG...\n";
my (@MODULE_LIST,
    %CONFIG);
&read_config();
print "working directory is ".&check_and_get("GENERAL:working-dir")."\n";
# All relative paths used from here on are relative to the working directory.
chdir(&check_and_get("GENERAL:working-dir"));

# Run number: taken from -continue / -delete-crashed / -delete-version when
# given (later assignments win), otherwise 0 until computed below.
my $VERSION = 0;
$VERSION = $CONTINUE if $CONTINUE;
$VERSION = $DELETE_CRASHED if $DELETE_CRASHED;
$VERSION = $DELETE_VERSION if $DELETE_VERSION;

# Only a fresh -exec run (no continue/delete mode) gets a newly computed number.
&compute_version_number() if $EXECUTE && !$CONTINUE && !$DELETE_CRASHED && !$DELETE_VERSION;
`mkdir -p steps/$VERSION` unless -d "steps/$VERSION";

# The delete modes do not record configuration files for the run.
&log_config() unless $DELETE_CRASHED || $DELETE_VERSION;
print "running experimental run number $VERSION\n";

# Work out which steps are needed; find_steps() fills these tables.
print "\nESTABLISH WHICH STEPS NEED TO BE RUN\n";
my (%NEEDED,
    %USES_INPUT,
    @DO_STEP,
    %STEP_LOOKUP,
    %PASS,
    %GIVEN);
&find_steps();

# For every step, record which other steps it depends on.
print "\nFIND DEPENDENCIES BETWEEN STEPS\n";
my @DEPENDENCY;
&find_dependencies();

# Delete modes: perform the deletion and stop.
# NOTE(review): these tests use defined(), while the code above tests the same
# options for truth, so a value of 0 is handled differently in the two places.
if (defined($DELETE_CRASHED)) {
    &delete_crashed($DELETE_CRASHED);
    exit;
}

if (defined($DELETE_VERSION)) {
    &delete_version($DELETE_VERSION);
    exit;
}

# Decide which steps can take their results from earlier runs.
print "\nCHECKING IF OLD STEPS ARE RE-USABLE\n";
my @RE_USE;
my %RECURSIVE_RE_USE;
&find_re_use();

# Without -exec / -continue the steps are only defined (a dry run).
print "\nDEFINE STEPS (run with -exec if everything ok)\n" unless $EXECUTE || $CONTINUE;
&define_step("all") unless $EXECUTE || $CONTINUE;
&init_agenda_graph();
&draw_agenda_graph();

# Execute the steps when requested, then redraw the graph to show the outcome.
print "\nEXECUTE STEPS\n" if $EXECUTE;
my (%DO,%DONE,%CRASHED);
&execute_steps() if $EXECUTE;
&draw_agenda_graph();

exit();
|
|
|
|
|
|
|
|
|
# Write a placeholder graph for the current run (a PostScript page carrying a
# short text, converted to PNG) and, unless -no-graph was given, fork a child
# process that displays it with "gv" when available, otherwise "display".
#
# The parent returns once the child is forked; the child exits when its viewer
# terminates. If the fork fails, a warning is printed and no viewer is started.
sub init_agenda_graph() {
    # Return value is unused here; the call is kept as in the original
    # (presumably it validates the setting -- TODO confirm).
    my $dir = &check_and_get("GENERAL:working-dir");

    my $graph_file = &steps_file("graph.$VERSION",$VERSION);
    open(my $ps, '>', $graph_file.".ps") or die "Cannot open: $!";
    print $ps "%!\n"
        ."/Helvetica findfont 36 scalefont setfont\n"
        ."72 72 moveto\n"
        ."(its all gone blank...) show\n"
        ."showpage\n";
    close($ps);

    `convert -alpha off $graph_file.ps $graph_file.png`;

    return if $NO_GRAPH;

    # fork returns undef on failure; that must not be mistaken for the child
    # (0), otherwise the main process would run the viewer and then exit.
    my $pid = fork;
    if (!defined($pid)) {
        warn "could not fork graph viewer: $!\n";
        return;
    }
    return if $pid;    # parent process carries on with the experiment

    # child process: block on the viewer, then terminate
    if (`which gv 2> /dev/null`) {
        `gv -watch $graph_file.ps`;
    }
    else {
        `display -update 10 $graph_file.png`;
    }
    exit;
}
|
|
|
|
|
|
|
# Check whether a host name matches any entry of a machine list.
#
#   $hostname  name of the current host
#   $list      whitespace-separated machine names; each entry is used as an
#              (unanchored) regular expression against $hostname
#
# Returns 1 on the first match, 0 otherwise.
sub detect_machine {
    my ($hostname,$list) = @_;
    # split ' ' skips leading whitespace and splits on any run of whitespace,
    # so tabs and repeated blanks never glue entries together or produce empty
    # entries (an empty pattern would re-use the last successful regex).
    foreach my $machine (split(' ',$list)) {
        return 1 if $hostname =~ /$machine/;
    }
    return 0;
}
|
|
|
# Set $CLUSTER when the current host is named on a "cluster: ..." line of
# $RealBin/experiment.machines, and report that on STDOUT.
#
# The file is read directly instead of through "cat", so a script path that
# contains blanks or shell metacharacters cannot break the lookup. A missing
# or unreadable file only produces a warning.
sub detect_if_cluster {
    my $hostname = `hostname`;
    chomp($hostname);

    my $machine_file = "$RealBin/experiment.machines";
    my $machines;
    unless (open($machines, '<', $machine_file)) {
        warn "cannot read $machine_file: $!\n";
        return;
    }
    while (my $line = <$machines>) {
        next unless $line =~ /^cluster: (.+)$/;
        if (&detect_machine($hostname,$1)) {
            $CLUSTER = 1;
            print "running on a cluster\n";
        }
    }
    close($machines);
}
|
|
|
# Set $MULTICORE and $MAX_ACTIVE when the current host is named on a
# "multicore-N: ..." line of $RealBin/experiment.machines; N becomes the
# value of $MAX_ACTIVE. If several lines match, the last one wins.
#
# The file is read directly instead of through "cat", so a script path that
# contains blanks or shell metacharacters cannot break the lookup. A missing
# or unreadable file only produces a warning.
sub detect_if_multicore {
    my $hostname = `hostname`;
    chomp($hostname);

    my $machine_file = "$RealBin/experiment.machines";
    my $machines;
    unless (open($machines, '<', $machine_file)) {
        warn "cannot read $machine_file: $!\n";
        return;
    }
    while (my $line = <$machines>) {
        next unless $line =~ /^multicore-(\d+): (.+)$/;
        my ($cores,$list) = ($1,$2);
        if (&detect_machine($hostname,$list)) {
            $MAX_ACTIVE = $cores;
            $MULTICORE = 1;
        }
    }
    close($machines);
}
|
|
|
|
|
|
|
# Parse the meta file ($META) into the file-scope module and step tables.
#
# Line types, as recognised by the patterns below:
#   [MODULE] type       starts a module and records its type
#   step-name           (no leading whitespace) starts a step of that module
#     name: value       (indented) sets a property of the current step
# "#" starts a comment on the first physical line of an entry; a trailing
# backslash joins the following line onto the current one.
#
# Dies when the file cannot be opened, on an unknown property name and on any
# line that matches none of the three forms.
sub read_meta {
    # three-argument open: $META may come from the command line (-meta) and
    # must not be interpreted as an open mode or a pipe
    open(my $meta_fh, '<', $META) || die("ERROR: no meta file at $META");

    # properties stored as the raw value string
    my %scalar_table = ("out"           => \%STEP_OUT,
                        "default-name"  => \%STEP_OUTNAME,
                        "tmp-name"      => \%STEP_TMPNAME,
                        "final-model"   => \%STEP_FINAL,
                        "ignore-unless" => \%STEP_IGNORE,
                        "ignore-if"     => \%STEP_IGNORE_IF,
                        "multiref"      => \%MULTIREF);
    # properties that are only counted; their value is not stored
    my %flag_table = ("qsub-script"    => \%QSUB_SCRIPT,
                      "parallelizable" => \%PARALLELIZE,
                      "only-factor-0"  => \%ONLY_FACTOR_0);
    # properties collected as a list of raw value strings
    my %list_table = ("error"     => \%ERROR,
                      "not-error" => \%NOT_ERROR);

    # rewrite the IN / INn / OUT / TMP placeholders of a template into their
    # EMS_..._EMS form
    my $escape_template = sub {
        my ($template) = @_;
        $template =~ s/^IN/EMS_IN_EMS/;
        $template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
        $template =~ s/ OUT/ EMS_OUT_EMS/g;
        $template =~ s/TMP/EMS_TMP_EMS/g;
        return $template;
    };

    my ($module,$step);
    while(<$meta_fh>) {
        s/\#.*$//;
        next if /^\s*$/;

        # join continuation lines; stop at end of file so that a dangling
        # backslash on the last line cannot loop forever
        while (/\\\s*$/) {
            my $continuation = <$meta_fh>;
            $_ .= $continuation if defined($continuation);
            s/\s*\\\s*[\n\r]*\s+/ /;
            last unless defined($continuation);
        }

        if (/^\[(.+)\]\s+(\S+)/) {
            $module = $1;
            push @MODULE,$module;
            $MODULE_TYPE{$module} = $2;
        }
        elsif (/^(\S+)/) {
            $step = $1;
            push @{$MODULE_STEP{$module}},$step;
        }
        elsif (/^\s+(\S+): (.+\S)\s*$/) {
            my ($name,$value) = ($1,$2);
            my $key = "$module:$step";

            if ($name eq "in") {
                @{$STEP_IN{$key}} = split(/\s+/,$value);
            }
            elsif (exists($scalar_table{$name})) {
                $scalar_table{$name}{$key} = $value;
            }
            elsif ($name eq "pass-unless") {
                # settings named here also trigger a re-run when they change
                @{$STEP_PASS{$key}} = split(/\s+/,$value);
                push @{$RERUN_ON_CHANGE{$key}}, split(/\s+/,$value);
            }
            elsif ($name eq "pass-if") {
                @{$STEP_PASS_IF{$key}} = split(/\s+/,$value);
                push @{$RERUN_ON_CHANGE{$key}}, split(/\s+/,$value);
            }
            elsif ($name eq "rerun-on-change") {
                push @{$RERUN_ON_CHANGE{$key}}, split(/\s+/,$value);
            }
            elsif (exists($flag_table{$name})) {
                $flag_table{$name}{$key}++;
            }
            elsif ($name eq "only-existence-matters") {
                $ONLY_EXISTENCE_MATTERS{$key}{$value}++;
            }
            elsif ($name eq "template") {
                $TEMPLATE{$key} = $escape_template->($value);
            }
            elsif ($name eq "template-if") {
                # each template-if entry is kept as its own word list
                my @IF = split(/\s+/,$escape_template->($value));
                push @{$TEMPLATE_IF{$key}}, \@IF;
            }
            elsif (exists($list_table{$name})) {
                push @{$list_table{$name}{$key}}, $value;
            }
            else {
                die("META ERROR unknown parameter: $name");
            }
        }
        else {
            die("META ERROR buggy line $_");
        }
    }
    close($meta_fh);
}
|
|
|
|
|
|
|
# Read the experiment configuration ($CONFIG_FILE) into %CONFIG and
# @MODULE_LIST, resolve $variable references between parameters, and verify
# that absolute paths mentioned in values exist.
#
#   [SECTION]            starts a section; sections whose header line contains
#                        "ignore" (any case) are skipped entirely
#   parameter = value    stored as $CONFIG{"SECTION:parameter"}; a value in
#                        double quotes is kept as one element, any other value
#                        is split on blanks
# "#" starts a comment; a trailing backslash joins the next line. Parameters
# seen before the first section header belong to "GENERAL".
#
# Dies on an unreadable file, malformed lines, bad or unknown $references,
# references still unresolved after 100 passes, and missing files.
sub read_config {
    my $module = "GENERAL";
    my $error = 0;
    my $ignore = 0;
    my $line_count=0;
    # three-argument open: the file name comes from the command line
    open(my $ini, '<', $CONFIG_FILE) || die("ERROR: CONFIG FILE NOT FOUND: $CONFIG_FILE");
    while(<$ini>) {
        $line_count++;
        s/\#.*$//;
        next if /^\s*$/;    # skip blank (or comment-only) lines
        while (/\\\s*$/) {
            s/\s*\\\s*$/ /;
            $_ .= <$ini>;
        }
        if (/^\[(.+)\]/) {
            $module = $1;
            $ignore = /ignore/i;
            push @MODULE_LIST,$1 unless $ignore;
        }
        elsif (! $ignore) {
            if (/^(\S+) = (.+)$/) {
                my $parameter = $1;
                my $value = $2;
                $value =~ s/\s+/ /g;
                $value =~ s/^ //;
                $value =~ s/ $//;
                my @VALUE;
                if ($value =~ /^\"(.*)\"$/) {
                    @VALUE = ($1);
                }
                else {
                    @VALUE = split(/ /,$value);
                }
                $CONFIG{"$module:$parameter"} = \@VALUE;
            }
            else {
                print STDERR "BUGGY CONFIG LINE ($line_count): $_";
                $error++;
            }
        }
    }
    close($ini);
    die("$error ERROR".(($error>1)?"s":"")." IN CONFIG FILE") if $error;

    # Resolve $name and ${name} references, one per value per pass, until no
    # value contains a "$" any more (at most 100 passes).
    my $resolve = 1;
    my $loop_count = 0;
    while($resolve && $loop_count++ < 100) {
        $resolve = 0;
        foreach my $parameter (keys %CONFIG) {
            foreach (@{$CONFIG{$parameter}}) {
                next unless /\$[a-z\{]/i;
                my $escaped = 0;
                die ("BAD USE OF \$ IN VALUE used in parameter $parameter")
                    if ! ( /^(.*)\$([a-z][a-z\-\:\d]*)(.*)$/i ||
                           (/^(.*)\$\{([a-z][a-z\-\:\d]*)\}(.*)$/i && ($escaped = 1)));
                my ($pre,$substitution,$post) = ($1,$2,$3);
                my $pattern = $substitution;
                # an unqualified name is first looked up in the section of
                # the parameter that uses it
                if ($substitution !~ /\:/) {
                    $parameter =~ /^(.+)\:/;
                    $substitution = $1.":".$substitution;
                }

                # NOTE(review): the fallbacks below rely on which match last
                # set $1/$2; statement order is kept exactly as it was.
                my $orig = $substitution;
                $substitution =~ s/^(.+):.+:(.+)$/$1:$2/
                    unless defined($CONFIG{$substitution});
                $substitution = "GENERAL:$2"
                    unless defined($CONFIG{$substitution});
                die ("UNKNOWN PARAMETER $orig used in parameter $parameter")
                    unless defined($CONFIG{$substitution});

                # only the first element of the referenced value is inserted
                my $o = $CONFIG{$substitution}[0];
                print "changing $_ to " if $VERBOSE;
                s/\$\{$pattern\}/$o/ if $escaped;
                s/\$$pattern/$o/ unless $escaped;
                print "$_\n" if $VERBOSE;
                if (/\$/) {
                    print "more resolving needed\n" if $VERBOSE;
                    $resolve = 1;
                }
            }
        }
    }
    die("ERROR: CIRCULAR PARAMETER DEFINITION") if $resolve;

    # Every value that starts with "/" is treated as a file name and must
    # exist, either as given, without a ".gz" suffix, or as a prefix of an
    # existing name. Parameters whose name contains "temp-dir" are exempt.
    $error = 0;
    foreach my $parameter (keys %CONFIG) {
        foreach (@{$CONFIG{$parameter}}) {
            next if $parameter =~ /temp-dir/;
            next if (!/^\// || -e);
            my $file = $_;
            $file =~ s/ .+$//;
            my $gz = $file; $gz =~ s/\.gz$//;
            next if -e $gz;
            next if `find $file* -maxdepth 0 -follow`;
            print STDERR "$parameter: file $_ does not exist!\n";
            $error++;
        }
    }
    die if $error;
}
|
|
|
|
|
|
|
# Record the configuration of the current run in its steps directory:
#   config.$VERSION     copy of the configuration file (not when continuing)
#   parameter.$VERSION  all resolved parameters, one "name = values" per line
sub log_config {
    my $working_dir = &check_and_get("GENERAL:working-dir");
    `mkdir -p $working_dir/steps`;

    my $config_copy = &steps_file("config.$VERSION",$VERSION);
    if (!$CONTINUE) {
        `cp $CONFIG_FILE $config_copy`;
    }

    my $parameter_file = &steps_file("parameter.$VERSION",$VERSION);
    open(PARAMETER,">".$parameter_file) or die "Cannot open: $!";
    foreach my $name (sort keys %CONFIG) {
        my $values = join("", map { " ".$_ } @{$CONFIG{$name}});
        print PARAMETER "$name =".$values."\n";
    }
    close(PARAMETER);
}
|
|
|
|
|
|
|
# Determine all steps that have to be considered for this run.
#
# The search is seeded with the requested final output (-final-out) or, when
# neither -final-out nor -final-step is given, with "REPORTING:report". The
# modules are then scanned from last to first, again and again, until a full
# scan adds no further step to @DO_STEP.
sub find_steps {
    if (defined($FINAL_OUT)) {
        push @{$NEEDED{$FINAL_OUT}}, "final";
    }
    elsif (!defined($FINAL_STEP)) {
        push @{$NEEDED{"REPORTING:report"}}, "final";
    }

    my $steps_before_scan;
    do {
        $steps_before_scan = scalar(@DO_STEP);
        foreach my $m (reverse 0 .. $#MODULE) {
            my $module = $MODULE[$m];

            if ($MODULE_TYPE{$module} eq "multiple") {
                # one pass per set defined for this module
                foreach my $set (&get_sets($module)) {
                    &find_steps_for_module($module,$set);
                }
            }
            elsif ($MODULE_TYPE{$module} eq "synchronous") {
                # uses the sets of the module listed just before this one
                foreach my $set (&get_sets($MODULE[$m-1])) {
                    &find_steps_for_module($module,$set);
                }
            }
            else {
                # any other module type has no sets
                &find_steps_for_module($module,"");
            }
        }
    } while ($steps_before_scan != scalar(@DO_STEP));
}
|
|
|
# Examine the steps of one module (for one set) and register those whose
# output is needed.
#
#   $module        module name
#   $set           set name; find_steps passes "" for modules without sets
#   $final_module  accepted but not used in this subroutine
#
# A registered step is appended to @DO_STEP and indexed in %STEP_LOOKUP; its
# inputs are recorded in %USES_INPUT and added to %NEEDED so that later scans
# pick up the steps producing them. Outputs given directly in the
# configuration are recorded in %GIVEN instead of being scheduled.
sub find_steps_for_module {
    my ($module,$set,$final_module) = @_;

    print "processing module $module:$set\n" if $VERBOSE;

    # walk through the module's steps in reverse order of definition
    foreach my $stepname (reverse @{$MODULE_STEP{$module}}) {

        my $step = &construct_name($module,$set,$stepname);
        my $defined_step = &defined_step($step);
        # already registered during an earlier scan
        next if defined($STEP_LOOKUP{$step});

        print "\tchecking step: $step\n" if $VERBOSE;

        # only steps whose output something needs (or the requested final
        # step itself) are of interest
        my $out = &construct_name($module,$set,$STEP_OUT{$defined_step});
        print "\t\tproduces $out\n" if $VERBOSE;
        next unless defined($NEEDED{$out}) || (defined($FINAL_STEP) && $FINAL_STEP eq $step);
        print "\t\tneeded\n" if $VERBOSE;

        # the output is specified in the configuration: remember which step
        # would have produced it and do not schedule the step
        if(defined($CONFIG{$out})) {
            $GIVEN{$out} = $step;
            next;
        }
        print "\t\toutput not specified in config\n" if $VERBOSE;

        # "ignore-unless": count the listed settings for which
        # backoff_and_get returns a false value. Without a leading "AND" the
        # step is skipped when that holds for all of them; with "AND" it is
        # skipped when it holds for any of them.
        if (defined($STEP_IGNORE{$defined_step})) {
            my $next = 0;
            my $and = 0;
            my @IGNORE = split(/ /,$STEP_IGNORE{$defined_step});
            if ($IGNORE[0] eq "AND") {
                $and = 1;
                shift @IGNORE;
            }
            foreach my $ignore (@IGNORE) {
                my $extended_name = &extend_local_name($module,$set,$ignore);
                if (! &backoff_and_get($extended_name)) {
                    print "\t\tignored because of non-existance of ".$extended_name."\n" if $VERBOSE;
                    $next++;
                }
            }
            next if !$and && ($next == scalar @IGNORE);
            next if $and && $next;
            print "\t\t=> not all non-existant, not ignored" if $next && $VERBOSE;
        }

        # "ignore-if": the step is skipped when backoff_and_get returns a
        # true value for any of the listed settings
        if (defined($STEP_IGNORE_IF{$defined_step})) {
            my $next = 0;
            foreach my $ignore (split(/ /,$STEP_IGNORE_IF{$defined_step})) {
                my $extended_name = &extend_local_name($module,$set,$ignore);
                if (&backoff_and_get($extended_name)) {
                    print "\t\tignored because of existance of ".$extended_name."\n" if $VERBOSE;
                    $next++;
                }
            }
            next if $next;
        }

        # register the step; from here on $#DO_STEP is this step's index
        push @DO_STEP,$step;
        $STEP_LOOKUP{$step} = $#DO_STEP;
        print "\tdo-step: $step\n" if $VERBOSE;

        # "pass-unless": mark the step in %PASS unless at least one of the
        # listed settings yields a true value
        if (defined($STEP_PASS{$defined_step})) {
            my $flag = 1;
            foreach my $pass (@{$STEP_PASS{$defined_step}}) {
                $flag = 0
                    if &backoff_and_get(&extend_local_name($module,$set,$pass));
            }
            $PASS{$#DO_STEP}++ if $flag;
        }

        # "pass-if": mark the step in %PASS when at least one of the listed
        # settings yields a true value
        if (defined($STEP_PASS_IF{$defined_step})) {
            my $flag = 0;
            foreach my $pass (@{$STEP_PASS_IF{$defined_step}}) {
                $flag = 1
                    if &backoff_and_get(&extend_local_name($module,$set,$pass));
            }
            $PASS{$#DO_STEP}++ if $flag;
        }

        # "only-factor-0": when "LM:<set>:factors" is available, mark the
        # step in %PASS unless one of its entries is "word"
        if (defined($ONLY_FACTOR_0{$defined_step})) {
            my $FACTOR = &backoff_and_get_array("LM:$set:factors");
            if (defined($FACTOR)) {
                my $ok = 0;
                foreach my $factor (@{$FACTOR}) {
                    $ok++ if ($factor eq "word");
                }
                $PASS{$#DO_STEP}++ unless $ok;
            }
        }

        # record the inputs of the step
        foreach (@{$STEP_IN{$defined_step}}) {
            my $in = $_;

            # alternatives joined by "=OR=": use the first one that
            # check_producability accepts, otherwise the last one listed
            if ($in =~ /=OR=/) {
                my @POTENTIAL_IN = split(/=OR=/,$in);
                foreach my $potential_in (@POTENTIAL_IN) {
                    if (&check_producability($module,$set,$potential_in)) {
                        $in = $potential_in;
                        last;
                    }

                }

                $in = $POTENTIAL_IN[$#POTENTIAL_IN] if $in =~ /=OR=/;
            }

            # expand the input into fully qualified names
            my @IN = &construct_input($module,$set,$in);
            foreach my $in (@IN) {
                print "\t\tneeds input $in: " if $VERBOSE;
                # a configuration value of the form [..] redirects the input
                # to the output named inside the brackets
                if(defined($CONFIG{$in}) && $CONFIG{$in}[0] =~ /^\[(.+)\]$/) {

                    # [MODULE:{set1,set2,...}:name] redirects to one output
                    # per listed set
                    if ($CONFIG{$in}[0] =~ /^\[([^:]+):[{](\S+)[}]:(\S+)\]$/) {
                        my @SETS = split(',', $2);
                        foreach my $set (@SETS) {
                            $in = &construct_name($1,$set,$3);
                            print $in if $VERBOSE;
                            push @{$NEEDED{$in}}, $#DO_STEP;
                            push @{$USES_INPUT{$#DO_STEP}},$in;
                            print "\n\t\tcross-directed to $in\n" if $VERBOSE;
                        }
                        # NOTE(review): the empty string is still pushed onto
                        # %USES_INPUT by the statement after this if/else.
                        $in = "";
                    }
                    else {
                        $in = $1;
                        print $in if $VERBOSE;
                        push @{$NEEDED{$in}}, $#DO_STEP;
                        print "\n\t\tcross-directed to $in\n" if $VERBOSE;
                    }
                }
                # specified directly in the configuration: nothing to produce
                elsif(defined($CONFIG{$in})) {
                    print "\n\t\t... but that is specified\n" if $VERBOSE;
                }
                # otherwise some step has to produce it
                else {
                    push @{$NEEDED{$in}}, $#DO_STEP;
                    print "\n" if $VERBOSE;
                }
                push @{$USES_INPUT{$#DO_STEP}},$in;
            }
        }
    }
}
|
|
|
# Decide whether an output can be obtained for the given module and set.
#
# The output is expanded with construct_input. Returns 1 as soon as one
# expanded name is given in the configuration, or is produced by a step that
# either has no "ignore-unless" condition or has one of the names listed in
# that condition present in the configuration. Returns 0 when no expanded name
# qualifies. Dies when no step at all declares a matching output.
sub check_producability {
    my ($module,$set,$output) = @_;

    my @OUT = &construct_input($module,$set,$output);

    foreach my $out (@OUT) {
        if ($VERBOSE) {
            print "producable? $out\n";
        }

        # given directly in the configuration
        if (defined($CONFIG{$out})) {
            return 1;
        }

        # drop the set part of the name, then look for a step declaring
        # this output
        $out =~ s/:.+:/:/g;
        my $producer;
        foreach my $candidate (keys %STEP_OUT) {
            my ($candidate_module) = &deconstruct_name($candidate);
            my $candidate_out = &construct_name($candidate_module,"",$STEP_OUT{$candidate});
            if ($VERBOSE) {
                print "checking $candidate -> $candidate_out\n";
            }
            if ($out eq $candidate_out) {
                $producer = $candidate;
            }
        }
        if (!$producer) {
            die("ERROR: cannot possibly produce output $out");
        }

        # a producing step without "ignore-unless" condition always qualifies
        if (!defined($STEP_IGNORE{$producer})) {
            return 1;
        }

        # otherwise one of the names in its condition has to be configured
        foreach my $ignore (split(/ /,$STEP_IGNORE{$producer})) {
            my ($producer_module) = &deconstruct_name($producer);
            my $producer_set = $set;
            if ($MODULE_TYPE{$producer_module} eq "single") {
                $producer_set = "";
            }
            my $req = &construct_name($producer_module,$producer_set,$ignore);
            if ($VERBOSE) {
                print "producable req $req\n";
            }
            if (defined($CONFIG{$req})) {
                return 1;
            }
        }
    }
    if ($VERBOSE) {
        print "not producable: ($module,$set,$output)\n";
    }
    return 0;
}
|
|
|
|
|
|
|
# Expand an input specification into a list of fully qualified names.
#
#   no "MODULE:name" part            -> one name within $module / $set
#   MODULE is of type "multiple"     -> one name per set of MODULE
#   EVALUATION, used by REPORTING    -> one name per EVALUATION set
#   anything else                    -> the specification itself
sub construct_input {
    my ($module,$set,$in) = @_;

    my @IN;
    if ($in =~ /([^:]+):(\S+)/) {
        my ($in_module,$in_name) = ($1,$2);
        if ($MODULE_TYPE{$in_module} eq "multiple"
            || ($in_module eq "EVALUATION" && $module eq "REPORTING")) {
            foreach my $in_set (&get_sets($in_module)) {
                push @IN, &construct_name($in_module,$in_set,$in_name);
            }
        }
        else {
            push @IN,$in;
        }
    }
    else {
        # local name: qualify it with the current module and set
        push @IN, &construct_name($module,$set,$in);
    }

    return @IN;
}
|
|
|
|
|
|
|
# List the set names configured for a module: for every entry of @MODULE_LIST
# of the form "<module>:<set>...", the <set> part (up to the next colon), in
# the order of @MODULE_LIST.
sub get_sets {
    my ($config) = @_;
    my @SET = map { /^$config:([^:]+)/ ? ($1) : () } @MODULE_LIST;
    return @SET;
}
|
|
|
|
|
|
|
# Delete every step of run $DELETE_CRASHED that has a step file but either no
# ".DONE" marker or is reported as crashed by check_if_crashed. Prints a hint
# when steps were found but -exec was not given, and a note when there is
# nothing to delete.
sub delete_crashed {
    my $crashed = 0;
    foreach my $i (0 .. $#DO_STEP) {
        my $step_file = &versionize(&step_file($i),$DELETE_CRASHED);
        next if ! -e $step_file;

        # the crash check is only consulted for steps that did finish
        my $unfinished = ! -e "$step_file.DONE";
        if ($unfinished || &check_if_crashed($i,$DELETE_CRASHED,"no wait")) {
            &delete_step($DO_STEP[$i],$DELETE_CRASHED);
            $crashed++;
        }
    }

    if (!$crashed) {
        print "nothing to do\n";
    }
    elsif (!$EXECUTE) {
        print "run with -exec to delete steps\n";
    }
}
|
|
|
|
|
# Delete the steps of run $DELETE_VERSION that no other live run re-uses, and
# clean up steps of already deleted runs that were only kept alive because
# this run re-used them. With -exec the run is finally flagged as deleted.
#
# Changes over the previous implementation: captures are only used after a
# successful match, a missing re-use file is skipped instead of being read
# from an unopened handle, trailing newlines are removed with chomp, and all
# pipe handles are closed.
sub delete_version {
    # runs that carry a "deleted.<n>" flag file
    my %ALREADY_DELETED;
    my $dir = &check_and_get("GENERAL:working-dir");
    if (open(my $deleted_ls, '-|', "ls $dir/steps/*/deleted.* 2>/dev/null")) {
        while(my $line = <$deleted_ls>) {
            next unless $line =~ /deleted\.(\d+)/;
            $ALREADY_DELETED{$1}++;
        }
        close($deleted_ls);
    }

    # collect the re-use relations recorded by all runs
    my (%USED_BY_OTHERS,%DELETABLE,%NOT_DELETABLE);
    if (open(my $version_ls, '-|', "ls $dir/steps")) {
        while(my $version = <$version_ls>) {
            chomp($version);
            next if $version !~ /^\d+/ || $version == 0;
            # runs without a re-use file contribute nothing
            open(my $re_use, '<', "steps/$version/re-use.$version") or next;
            while(<$re_use>) {
                next unless /^(.+) (\d+)$/;
                my ($step,$re_use_version) = ($1,$2);

                # a live run re-uses a step of the run to be deleted
                $USED_BY_OTHERS{$step}++ if $re_use_version == $DELETE_VERSION && !defined($ALREADY_DELETED{$version});

                # the run to be deleted re-uses a step of an already
                # deleted run
                push @{$DELETABLE{$re_use_version}}, $step if $version == $DELETE_VERSION && defined($ALREADY_DELETED{$re_use_version});

                # some other live run re-uses the step as well
                $NOT_DELETABLE{$re_use_version}{$step}++ if $version != $DELETE_VERSION && !defined($ALREADY_DELETED{$version});
            }
            close($re_use);
        }
        close($version_ls);
    }

    # delete the run's own steps unless another live run depends on them
    if (open(my $steps_ls, '-|', "ls $dir/steps/$DELETE_VERSION/[A-Z]*.$DELETE_VERSION")) {
        while(my $step_file = <$steps_ls>) {
            chomp($step_file);
            my $step = &get_step_from_step_file($step_file);
            next if $USED_BY_OTHERS{$step};
            &delete_step($step,$DELETE_VERSION);
        }
        close($steps_ls);
    }

    # delete steps of already deleted runs that nothing else uses any more
    foreach my $version (keys %DELETABLE) {
        foreach my $step (@{$DELETABLE{$version}}) {
            next if defined($NOT_DELETABLE{$version}) && defined($NOT_DELETABLE{$version}{$step});
            &delete_step($step,$version);
        }
    }
    my $deleted_flag_file = &steps_file("deleted.$DELETE_VERSION",$DELETE_VERSION);
    `touch $deleted_flag_file` if $EXECUTE;
}
|
|
|
# Turn a step file name into a step name: strip the directory part and the
# trailing ".<digits>" suffix, then replace every underscore by a colon.
sub get_step_from_step_file {
    my ($step_file) = @_;
    (my $name = $step_file) =~ s/^.+\///;    # directory part
    $name =~ s/\.\d+$//;                     # numeric suffix
    $name =~ tr/_/:/;
    return $name;
}
|
|
|
# Delete everything belonging to one step of one run: the step file and its
# companions (only with -exec), the step's output and, when the step declares
# a temporary name, its temporary file.
#
#   $step_name  full step name, split with deconstruct_name
#   $version    number of the run the step belongs to
sub delete_step {
    my ($step_name,$version) = @_;
    my ($module,$set,$step) = &deconstruct_name($step_name);
    my $meta_key = "$module:$step";

    # step file plus everything named "<step file>.*"
    my $step_file = &versionize(&step_file2($module,$set,$step),$version);
    print "delete step $step_file\n";
    if ($EXECUTE) {
        `rm $step_file $step_file.*`;
    }

    # output: for steps of a set, the set name is prefixed to the file name
    my $out_file = $STEP_OUTNAME{$meta_key};
    if ($set) {
        $out_file =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g;
    }
    my $versioned_out = &versionize(&long_file_name($out_file,$module,$set), $version);
    &delete_output($versioned_out);

    # temporary output, if the step declares one
    if (defined($STEP_TMPNAME{$meta_key})) {
        &delete_output(&get_tmp_file($module,$set,$step,$version));
    }
}
|
|
|
|
|
# Delete an output file or directory and every entry of the same directory
# whose name starts with the output's base name. The deletions are announced
# on STDOUT and only carried out with -exec.
#
#   $file  path of the output
#
# Changes over the previous implementation: a path without directory part no
# longer falls through to stale or undefined captures (which listed the
# current directory and matched every entry), the message for a sibling
# directory names that directory, the unused "ls $file.*" call is gone, and
# chomp replaces chop.
sub delete_output {
    my ($file) = @_;

    if (-d $file) {
        print "\tdelete directory $file\n";
        `rm -r $file` if $EXECUTE;
    }
    elsif (-e $file) {
        print "\tdelete file $file\n";
        `rm $file` if $EXECUTE;
    }

    # siblings can only be located when the path has a directory part
    return unless $file =~ /^(.+)\/([^\/]+)$/;
    my ($dir,$f) = ($1,$2);
    foreach (`ls $dir`) {
        chomp;
        next unless substr($_,0,length($f)) eq $f;
        if (-d "$dir/$_") {
            print "\tdelete directory $dir/$_\n";
            `rm -r $dir/$_` if $EXECUTE;
        }
        elsif (-e "$dir/$_") {
            print "\tdelete file $dir/$_\n";
            `rm $dir/$_` if $EXECUTE;
        }
    }
}
|
|
|
|
|
|
|
# Determine for every step in @DO_STEP whether its result can be taken from an
# earlier run.
#
# Phase 1 collects, per step, the runs whose files for that step pass all
# checks. Phase 2 repeatedly discards candidates whose dependencies are not
# re-usable from the matching run, until nothing changes. Phase 3 prints a
# summary, writes the re-use file of the current run, and replaces each
# $RE_USE[$i] hash by a single run number (0 meaning "run the step").
#
# Returns early, without touching @RE_USE, when the working directory has no
# "steps" directory.
sub find_re_use {
    my $dir = &check_and_get("GENERAL:working-dir");
    return unless -e "$dir/steps";

    # start with an empty candidate set for every step
    for(my $i=0;$i<=$#DO_STEP;$i++) {
        %{$RE_USE[$i]} = ();
    }

    # Phase 1: go through all *INFO files below steps/, in reverse sort order
    open(LS,"find $dir/steps/* -maxdepth 1 -follow | sort -r |");
    while(my $info_file = <LS>) {
        next unless $info_file =~ /INFO$/;
        # keep only the file name
        $info_file =~ s/.+\/([^\/]+)$/$1/;
        for(my $i=0;$i<=$#DO_STEP;$i++) {

            # build a pattern "<step file>.<run>.INFO" and reduce it to its
            # file-name part as well
            # NOTE(review): only the first "+" is escaped, and the "." in the
            # pattern matches any character.
            my $pattern = &step_file($i);
            $pattern =~ s/\+/\\+/;
            $pattern = "^$pattern.(\\d+).INFO\$";
            $pattern =~ s/.+\/([^\/]+)$/$1/;
            next unless $info_file =~ /$pattern/;
            my $old_version = $1;
            print "re_use $i $DO_STEP[$i] (v$old_version) ".join(" ",keys %{$RE_USE[$i]})." ?\n" if $VERBOSE;
            print "\tno info file ".&versionize(&step_file($i),$old_version).".INFO\n" if ! -e &versionize(&step_file($i),$old_version).".INFO" && $VERBOSE;
            print "\tno done file " if ! -e &versionize(&step_file($i),$old_version).".DONE" && $VERBOSE;
            # a run qualifies only if its INFO and DONE files exist,
            # check_info accepts it and check_if_crashed does not flag it
            if (! -e &versionize(&step_file($i),$old_version).".INFO") {
                print "\tinfo file does not exist\n" if $VERBOSE;
                print "\tnot re-usable\n" if $VERBOSE;
            }
            elsif (! -e &versionize(&step_file($i),$old_version).".DONE") {
                print "\tstep not done (done file does not exist)\n" if $VERBOSE;
                print "\tnot re-usable\n" if $VERBOSE;
            }
            elsif (! &check_info($i,$old_version) ) {
                print "\tparameters from info file do not match\n" if $VERBOSE;
                print "\tnot re-usable\n" if $VERBOSE;
            }
            elsif (&check_if_crashed($i,$old_version)) {
                print "\tstep crashed\n" if $VERBOSE;
                print "\tnot re-usable\n" if $VERBOSE;
            }
            else {
                $RE_USE[$i]{$old_version}++;
                print "\tre-usable\n" if $VERBOSE;
            }
        }
    }
    close(LS);

    # Phase 2: a candidate run survives only if every dependency of the step
    # is re-usable from the corresponding run; repeat until stable, because
    # removing one candidate can invalidate others
    my $change = 1;
    while($change) {
        $change = 0;

        for(my $i=0;$i<=$#DO_STEP;$i++) {
            next unless $RE_USE[$i];
            foreach my $run (keys %{$RE_USE[$i]}) {
                print "check on dependencies for $i ($run) $DO_STEP[$i]\n" if $VERBOSE;
                foreach (@{$DEPENDENCY[$i]}) {
                    my $parent = $_;
                    print "\tchecking on $parent $DO_STEP[$parent]\n" if $VERBOSE;
                    my @PASSING;

                    # skip over passed steps: move up to their first
                    # dependency, remembering the passed steps on the way
                    # NOTE(review): 0 marks "nothing left to check" here, but
                    # 0 is also a valid index into @DO_STEP -- confirm that
                    # step 0 can never be a parent.
                    while (defined($PASS{$parent})) {
                        if (scalar (@{$DEPENDENCY[$parent]}) == 0) {
                            $parent = 0;
                            print "\tprevious step's output is specified\n" if $VERBOSE;
                        }
                        else {
                            push @PASSING, $parent;
                            $parent = $DEPENDENCY[$parent][0];
                            print "\tmoving up to $parent $DO_STEP[$parent]\n" if $VERBOSE;
                        }
                    }

                    if ($parent) {
                        my $reuse_run = $run;

                        # the candidate run may itself have re-used the
                        # parent from a different run
                        if (defined($RECURSIVE_RE_USE{$i,$run,$DO_STEP[$parent]})) {
                            print "\trecursive re-use run $reuse_run\n" if $VERBOSE;
                            $reuse_run = $RECURSIVE_RE_USE{$i,$run,$DO_STEP[$parent]};
                        }

                        else {

                            # a step that is passed now but has a step file in
                            # the candidate run rules the candidate out
                            foreach (@PASSING) {
                                my $passed = $DO_STEP[$_];
                                $passed =~ s/:/_/g;
                                if (-e &steps_file("$passed.$run",$run)) {
                                    delete($RE_USE[$i]{$run});
                                    $change = 1;
                                    print "\tpassed step $DO_STEP[$_] used in re-use run $run -> fail\n" if $VERBOSE;
                                }
                            }
                        }

                        # the parent has to be re-usable from the same run
                        if (! defined($RE_USE[$parent]{$reuse_run})) {
                            print "\tno previous step -> fail\n" if $VERBOSE;
                            delete($RE_USE[$i]{$run});
                            $change = 1;
                        }
                    }
                }
            }
        }
    }

    # Phase 3: summary; among the remaining candidates the numerically
    # smallest run is chosen and, unless it is the current run, written to
    # the re-use file
    print "\nSTEP SUMMARY:\n";
    open(RE_USE,">".&steps_file("re-use.$VERSION",$VERSION)) or die "Cannot open: $!";
    for(my $i=$#DO_STEP;$i>=0;$i--) {
        # passed steps are never re-used and not listed
        if ($PASS{$i}) {
            $RE_USE[$i] = 0;
            next;
        }
        print "$i $DO_STEP[$i] ->\t";
        if (scalar(keys %{$RE_USE[$i]})) {
            my @ALL = sort { $a <=> $b} keys %{$RE_USE[$i]};
            print "re-using (".join(" ",@ALL).")\n";
            $RE_USE[$i] = $ALL[0];
            if ($ALL[0] != $VERSION) {
                print RE_USE "$DO_STEP[$i] $ALL[0]\n";
            }
        }
        else {
            print "run\n";
            $RE_USE[$i] = 0;
        }
    }
    close(RE_USE);
}
|
|
|
# Fill @DEPENDENCY: for every step index, the indices of the steps whose
# output it needs. The information comes from %NEEDED, which maps an output
# name to the steps (or the marker 'final') that need it.
sub find_dependencies {
    foreach my $i (0 .. $#DO_STEP) {
        @{$DEPENDENCY[$i]} = ();
    }

    foreach my $producer (0 .. $#DO_STEP) {
        my $step = $DO_STEP[$producer];
        # "<module>:" or "<module>:<set>:" prefix of the step name
        $step =~ /^(.+:)[^:]+$/;
        my $module_set = $1;
        my $output = $module_set.$STEP_OUT{&defined_step($step)};
        foreach my $consumer (@{$NEEDED{$output}}) {
            if ($VERBOSE) {
                print "$consumer needed by $producer\n";
            }
            # 'final' marks the requested end result, not a step
            unless ($consumer eq 'final') {
                push @{$DEPENDENCY[$consumer]},$producer;
            }
        }
    }
}
|
|
|
# Write the agenda graph of the current run as a dot file and render it to
# PostScript and PNG.
#
# Steps are grouped into one cluster per module (and set). Outputs given in
# the configuration appear as extra nodes "g<n>". The fill colour of a step
# node reflects its state; later rules override earlier ones:
#   green        default
#   #0000ff      marked in %DO with a value of at least 1
#   #8080ff      marked in %DONE, or re-used from the current run
#   lightblue    re-used from another run
#   red          marked in %CRASHED
#   lightyellow  marked in %PASS
sub draw_agenda_graph {
    my %M;
    my $dir = &check_and_get("GENERAL:working-dir");
    open(DOT,">".&steps_file("graph.$VERSION.dot",$VERSION)) or die "Cannot open: $!";
    print DOT "digraph Experiment$VERSION {\n"
             ." ranksep=0;\n";

    # group the step indices by module (and set)
    foreach my $index (0 .. $#DO_STEP) {
        my $step = $DO_STEP[$index];
        $step =~ /^(.+):[^:]+$/;
        my $module_set = $1;
        push @{$M{$module_set}},$index;
    }

    # number the steps whose output is given and add them as "g<n>" nodes
    my (@G,%GIVEN_NUMBER);
    foreach (values %GIVEN) {
        push @G,$_;
        $GIVEN_NUMBER{$_} = $#G;
        /^(.+):[^:]+$/;
        my $module_set = $1;
        push @{$M{$module_set}},"g".($#G);
    }

    # one cluster per group
    my $m = 0;
    foreach my $module (keys %M) {
        print DOT " subgraph cluster_".($m++)." {\n"
                 ." fillcolor=\"lightyellow\";\n"
                 ." shape=box;\n"
                 ." style=filled;\n"
                 ." fontsize=10;\n"
                 ." label=\"$module\";\n";
        foreach my $i (@{$M{$module}}) {
            if ($i =~ /g(\d+)/) {
                # node for a given output, labelled with the bare step name
                my $step = $G[$1];
                $step =~ /^.+:([^:]+)$/;
                print DOT " $i [label=\"$1\",shape=box,fontsize=10,height=0,style=filled,fillcolor=\"#c0b060\"];\n";
            }
            else {
                # node for a step; a re-used run number is shown in brackets
                my $step = $DO_STEP[$i];
                $step =~ s/^.+:([^:]+)$/$1/;
                if ($RE_USE[$i]) {
                    $step .= " (".$RE_USE[$i].")";
                }

                my $color = "green";
                if (defined($DO{$i}) && $DO{$i} >= 1) {
                    $color = "#0000ff";
                }
                if (defined($DONE{$i}) || ($RE_USE[$i] && $RE_USE[$i] == $VERSION)) {
                    $color = "#8080ff";
                }
                if ($RE_USE[$i] && $RE_USE[$i] != $VERSION) {
                    $color = "lightblue";
                }
                if (defined($CRASHED{$i})) {
                    $color = "red";
                }
                if (defined($PASS{$i})) {
                    $color = "lightyellow";
                }

                print DOT " $i [label=\"$step\",shape=box,fontsize=10,height=0,style=filled,fillcolor=\"$color\"];\n";
            }
        }
        print DOT " }\n";
    }

    # edges between steps
    foreach my $index (0 .. $#DO_STEP) {
        foreach (@{$DEPENDENCY[$index]}) {
            print DOT " $_ -> $index;\n";
        }
    }

    # edges from given outputs to whatever needs them
    foreach my $out (keys %GIVEN) {
        foreach my $needed_by (@{$NEEDED{$out}}) {
            print DOT " g".$GIVEN_NUMBER{$GIVEN{$out}}." -> $needed_by;\n";
        }
    }

    print DOT "}\n";
    close(DOT);

    # render the dot file
    my $graph_file = &steps_file("graph.$VERSION",$VERSION);
    `dot -Tps $graph_file.dot >$graph_file.ps 2>/dev/null`;
    `convert -alpha off $graph_file.ps $graph_file.png`;
}
|
|
|
# Generate the step script for one step of the agenda, or for all of them.
#
#   $step - an index into @DO_STEP, or the string "all"
#
# Steps with a true $RE_USE entry, steps marked in %PASS, and steps for which
# &define_template returns true are skipped. Every other step is dispatched
# on its name to the matching define_* routine. An unknown step name is
# reported on STDERR and terminates the program with a non-zero exit status.
sub define_step {
    my ($step) = @_;
    my $dir = &check_and_get("GENERAL:working-dir");
    `mkdir -p $dir` if ! -e $dir;

    my @STEP = ($step eq "all") ? (0 .. $#DO_STEP) : ($step);

    foreach my $i (@STEP) {
        next if $RE_USE[$i];
        next if defined($PASS{$i});
        next if &define_template($i);

        my $name = $DO_STEP[$i];
        if ($name =~ /^CORPUS:(.+):(post-split-)?factorize$/) {
            &define_corpus_factorize($i);
        }
        elsif ($name eq 'SPLITTER:train') {
            &define_splitter_train($i);
        }
        elsif ($name =~ /^LM:(.+):(post-split-)?factorize$/) {
            &define_lm_factorize($i,$1);
        }
        elsif ($name =~ /^LM:(.+):randomize$/) {
            &define_lm_randomize($i,$1);
        }
        elsif ($name eq 'INTERPOLATED-LM:randomize') {
            # no set name was captured here; define_lm_randomize derives
            # module and set from the step name itself
            &define_lm_randomize($i,undef);
        }
        elsif ($name =~ /^LM:(.+):train-randomized$/) {
            &define_lm_train_randomized($i,$1);
        }
        elsif ($name =~ /^LM:(.+):train-bilingual-lm$/) {
            &define_lm_train_bilingual_lm($i,$1);
        }
        elsif ($name =~ /^LM:(.+):prepare-bilingual-lm$/) {
            &define_lm_prepare_bilingual_lm($i,$1);
        }
        elsif ($name =~ /^LM:(.+):train-nplm$/) {
            &define_lm_train_nplm($i,$1);
        }
        elsif ($name eq 'TRAINING:prepare-data') {
            &define_training_prepare_data($i);
        }
        elsif ($name eq 'TRAINING:prepare-data-fast-align') {
            &define_training_prepare_data_fast_align($i);
        }
        elsif ($name eq 'TRAINING:run-giza') {
            &define_training_run_giza($i);
        }
        elsif ($name eq 'TRAINING:run-giza-inverse') {
            &define_training_run_giza_inverse($i);
        }
        elsif ($name eq 'TRAINING:symmetrize-giza') {
            &define_training_symmetrize_giza($i);
        }
        elsif ($name eq 'TRAINING:build-biconcor') {
            &define_training_build_biconcor($i);
        }
        elsif ($name eq 'TRAINING:build-suffix-array') {
            &define_training_build_suffix_array($i);
        }
        elsif ($name eq 'TRAINING:build-lex-trans') {
            &define_training_build_lex_trans($i);
        }
        elsif ($name eq 'TRAINING:extract-phrases') {
            &define_training_extract_phrases($i);
        }
        elsif ($name eq 'TRAINING:build-reordering') {
            &define_training_build_reordering($i);
        }
        elsif ($name eq 'TRAINING:build-ttable') {
            &define_training_build_ttable($i);
        }
        elsif ($name eq 'TRAINING:build-transliteration-model') {
            &define_training_build_transliteration_model($i);
        }
        elsif ($name eq 'TRAINING:build-generation') {
            &define_training_build_generation($i);
        }
        elsif ($name eq 'TRAINING:build-generation-custom') {
            &define_training_build_custom_generation($i);
        }
        elsif ($name eq 'TRAINING:sigtest-filter-ttable' ||
               $name eq 'TRAINING:sigtest-filter-reordering') {
            &define_training_sigtest_filter($i);
        }
        elsif ($name eq 'TRAINING:create-config' ||
               $name eq 'TRAINING:create-config-interpolated-lm') {
            &define_training_create_config($i);
        }
        elsif ($name =~ /^INTERPOLATED-LM:(post-split-)?factorize-tuning$/) {
            &define_interpolated_lm_factorize_tuning($i);
        }
        elsif ($name eq 'INTERPOLATED-LM:interpolate') {
            &define_interpolated_lm_interpolate($i);
        }
        elsif ($name eq 'INTERPOLATED-LM:binarize' ||
               $name eq 'INTERPOLATED-LM:quantize' ||
               # NOTE(review): never reached for 'randomize', which is
               # already handled by an earlier branch -- confirm intent
               $name eq 'INTERPOLATED-LM:randomize') {
            &define_interpolated_lm_process($i);
        }
        elsif ($name eq 'TUNING:factorize-input' ||
               $name eq 'TUNING:factorize-input-devtest') {
            &define_tuningevaluation_factorize($i);
        }
        elsif ($name eq 'TUNING:filter' ||
               $name eq 'TUNING:filter-devtest') {
            &define_tuningevaluation_filter(undef,$i);
        }
        elsif ($name eq 'TUNING:tune') {
            &define_tuning_tune($i);
        }
        elsif ($name =~ /^EVALUATION:(.+):factorize-input$/) {
            &define_tuningevaluation_factorize($i);
        }
        elsif ($name =~ /^EVALUATION:(.+):filter$/) {
            &define_tuningevaluation_filter($1,$i);
        }
        elsif ($name =~ /^EVALUATION:(.+):decode$/) {
            &define_evaluation_decode($1,$i);
        }
        elsif ($name =~ /^EVALUATION:(.+):analysis$/) {
            &define_evaluation_analysis($1,$i);
        }
        elsif ($name =~ /^EVALUATION:(.+):analysis-precision$/) {
            &define_evaluation_analysis_precision($1,$i);
        }
        elsif ($name =~ /^EVALUATION:(.+):analysis-coverage$/) {
            &define_evaluation_analysis_coverage($1,$i);
        }
        elsif ($name =~ /^EVALUATION:(.+):meteor$/) {
            # recognised step name; nothing is defined for it here
        }
        elsif ($name =~ /^EVALUATION:(.+):ter$/) {
            # recognised step name; nothing is defined for it here
        }
        elsif ($name eq 'REPORTING:report') {
            &define_reporting_report($i);
        }
        else {
            print STDERR "ERROR: unknown step $name\n";
            exit(1);   # signal failure to the calling shell
        }
    }
}
|
|
|
|
|
|
|
|
|
# Scheduling loop of the experiment: repeatedly work out which steps are
# doable, start them (through qsub or as a forked "sh" process) and wait for
# their ".DONE" marker files. Returns once no step is doable or running.
#
# State kept in globals:
#   %DO      - 1 = doable, 2 = started (see the status print below)
#   %DONE    - finished steps (steps with a true $RE_USE entry count as done)
#   %CRASHED - steps for which &check_if_crashed reported a problem
sub execute_steps {
    my $running_file = &steps_file("running.$VERSION",$VERSION);
    `touch $running_file`;

    for my $i (0 .. $#DO_STEP) {
        $DONE{$i}++ if $RE_USE[$i];
    }

    my $active = 0;   # number of forked steps currently running
    my %FORKED;       # step ids that were counted in $active
    while (1) {

        # find steps whose dependencies are all done; repeat the scan as
        # long as a pass step was resolved, since that may unlock others
        my $repeat_if_passed = 1;
        while ($repeat_if_passed) {
            $repeat_if_passed = 0;
            for my $i (0 .. $#DO_STEP) {
                next if defined($DONE{$i});
                next if defined($DO{$i});
                next if defined($CRASHED{$i});
                my $doable = 1;
                foreach my $prev_step (@{$DEPENDENCY[$i]}) {
                    $doable = 0 if !defined($DONE{$prev_step});
                }
                next unless $doable;
                $DO{$i} = 1;

                # pass steps are labelled done immediately
                next unless defined($PASS{$i});
                $DONE{$i} = 1;
                delete($DO{$i});
                $repeat_if_passed = 1;
            }
        }

        print "number of steps doable or running: ".(scalar keys %DO)." at ".`date`;
        foreach my $step (keys %DO) { print "\t".($DO{$step}==2?"running: ":"doable: ").$DO_STEP[$step]."\n"; }
        return unless scalar keys %DO;

        # start doable steps
        my $done = 0;
        foreach my $i (keys %DO) {
            next unless $DO{$i} == 1;
            if (defined($PASS{$i})) {
                $DONE{$i}++;
                delete($DO{$i});
                $done++;
            }
            elsif (! -e &versionize(&step_file($i)).".DONE") {
                my $step = &versionize(&step_file($i));
                &define_step($i);
                &write_info($i);

                # on a cluster, submit through qsub unless the step is itself
                # a qsub script (those are only submitted when ":jobs" is 1)
                my $via_qsub = 0;
                if ($CLUSTER) {
                    if (!&is_qsub_script($i)) {
                        $via_qsub = 1;
                    }
                    else {
                        my $jobs = &backoff_and_get($DO_STEP[$i].":jobs");
                        $via_qsub = 1 if $jobs && $jobs == 1;
                    }
                }

                if ($via_qsub) {
                    $DO{$i}++;
                    my $qsub_args = &get_qsub_args($DO_STEP[$i]);
                    print "\texecuting $step via qsub $qsub_args ($active active)\n";
                    my $qsub_command="qsub $qsub_args -S /bin/bash -e $step.STDERR -o $step.STDOUT $step";
                    print "\t$qsub_command\n" if $VERBOSE;
                    `$qsub_command`;
                }

                # otherwise run the step in a forked child process
                elsif ($CLUSTER || $active < $MAX_ACTIVE) {
                    $active++;
                    $FORKED{$i} = 1;
                    $DO{$i}++;
                    print "\texecuting $step via sh ($active active)\n";
                    sleep(5);
                    my $pid = fork;
                    # fork returns undef on failure; that must not be
                    # mistaken for the child (which sees 0), or the scheduler
                    # itself would run the step and then exit
                    die("ERROR: could not fork to execute $step: $!")
                        unless defined($pid);
                    if ($pid == 0) {
                        `sh $step >$step.STDOUT 2> $step.STDERR`;
                        exit;
                    }
                    # NOTE(review): finished children are not waited for here
                }
            }
        }

        &draw_agenda_graph() unless $done;

        # sleep until at least one more step has finished
        while (! $done) {
            sleep($SLEEP);
            my $dir = &check_and_get("GENERAL:working-dir");
            `ls $dir/steps > /dev/null`;
            foreach my $i (keys %DO) {
                if (-e &versionize(&step_file($i)).".DONE") {
                    delete($DO{$i});
                    if (&check_if_crashed($i)) {
                        $CRASHED{$i}++;
                        print "step $DO_STEP[$i] crashed\n";
                    }
                    else {
                        $DONE{$i}++;
                    }
                    $done++;
                    # only steps that were counted as active free a slot
                    $active-- if delete($FORKED{$i});
                }
            }
            `touch $running_file`;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
# Assemble the qsub command line arguments for a step from its settings:
# "qsub-settings" (verbatim), "qsub-memory" (-pe memory), "qsub-hours"
# (-l h_rt) and "qsub-project" (-P). Returns the argument string with one
# leading blank removed.
sub get_qsub_args {
    my ($step) = @_;
    my $settings = &backoff_and_get("$step:qsub-settings");
    my $memory   = &get("$step:qsub-memory");
    my $hours    = &get("$step:qsub-hours");
    my $project  = &backoff_and_get("$step:qsub-project");

    my @PART = (defined($settings) ? $settings : "");
    push @PART, " -pe memory $memory"  if defined($memory);
    push @PART, " -l h_rt=$hours:0:0"  if defined($hours);
    push @PART, " -P $project"         if defined($project);

    my $qsub_args = join("",@PART);
    $qsub_args =~ s/^ //;
    print "qsub args: $qsub_args\n" if $VERBOSE;
    return $qsub_args;
}
|
|
|
|
|
|
|
|
|
# True if step $i is registered in %QSUB_STEP, or if its step type (the
# name without the set part) is registered in %QSUB_SCRIPT.
sub is_qsub_script {
    my ($i) = @_;
    return 1 if defined($QSUB_STEP{$i});
    return defined($QSUB_SCRIPT{&defined_step($DO_STEP[$i])});
}
|
|
|
|
|
|
|
# Write the ".INFO" file of step $i for the current version. It holds one
# "parameter = value" line per entry returned by
# &get_parameters_relevant_for_re_use, followed by one "# reuse run N for
# STEP" line for each parent step that has a true $RE_USE entry.
sub write_info {
    my ($i) = @_;
    my $info_file = &versionize(&step_file($i)).".INFO";

    open(my $info, ">", $info_file) or die "Cannot open: $!";
    my %VALUE = &get_parameters_relevant_for_re_use($i);
    # sorted so that the file content does not depend on hash order
    foreach my $parameter (sort keys %VALUE) {
        print $info "$parameter = $VALUE{$parameter}\n";
    }

    foreach my $parent (@{$DEPENDENCY[$i]}) {
        # look through pass steps by following their first dependency
        my $p = $parent;
        while (defined($PASS{$p}) && scalar @{$DEPENDENCY[$p]}) {
            $p = $DEPENDENCY[$p][0];
        }
        if ($RE_USE[$p]) {
            print $info "# reuse run $RE_USE[$p] for $DO_STEP[$p]\n";
        }
    }

    # buffered write errors only surface when the handle is closed
    close($info) or die "Cannot close $info_file: $!";
}
|
|
|
|
|
# Compare the ".INFO" file written for step $i in run $version (default:
# the current $VERSION) with the parameters that are relevant now.
#
# Returns 1 if the number of parameters is the same and every current
# parameter matches its stored value (or merely exists, for parameters
# listed in %ONLY_EXISTENCE_MATTERS); returns 0 otherwise. "# reuse run"
# lines are recorded in %RECURSIVE_RE_USE; such a line with a run number of 0
# or a step missing from %STEP_LOOKUP makes the old run unusable.
sub check_info {
    my ($i,$version) = @_;
    $version = $VERSION unless $version;
    my %VALUE = &get_parameters_relevant_for_re_use($i);
    my ($module,undef,$step) = &deconstruct_name($DO_STEP[$i]);

    my %INFO;
    my $info_file = &versionize(&step_file($i),$version).".INFO";
    open(my $info, "<", $info_file) or die "Cannot open: $!";
    while (my $line = <$info>) {
        # chomp only removes a line ending, so a final line without a
        # newline does not lose its last character
        chomp($line);
        if ($line =~ / = /) {
            my ($parameter,$value) = split(/ = /,$line,2);
            $INFO{$parameter} = $value;
        }
        elsif ($line =~ /^\# reuse run (\d+) for (\S+)/) {
            my ($run,$reused_step) = ($1,$2);
            if ($run>0 && defined($STEP_LOOKUP{$reused_step})) {
                print "\tRECURSIVE_RE_USE{$i,$version,$reused_step} = $run\n" if $VERBOSE;
                $RECURSIVE_RE_USE{$i,$version,$reused_step} = $run;
            }
            else {
                print "\tnot using '$line', step $reused_step not required\n" if $VERBOSE;
                close($info);
                return 0;
            }
        }
    }
    close($info);

    print "\tcheck parameter count current: ".(scalar keys %VALUE).", old: ".(scalar keys %INFO)."\n" if $VERBOSE;
    return 0 unless scalar keys %INFO == scalar keys %VALUE;
    foreach my $parameter (keys %VALUE) {
        if (! defined($INFO{$parameter})) {
            print "\told has no '$parameter' -> not re-usable\n" if $VERBOSE;
            return 0;
        }
        print "\tcheck '$VALUE{$parameter}' eq '$INFO{$parameter}' -> " if $VERBOSE;
        if (defined($ONLY_EXISTENCE_MATTERS{"$module:$step"}{$parameter})) {
            print "existence ok\n" if $VERBOSE;
        }
        elsif (&match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
            print "ok\n" if $VERBOSE;
        }
        else {
            print "mismatch\n" if $VERBOSE;
            return 0;
        }
    }
    print "\tall parameters match\n" if $VERBOSE;
    return 1;
}
|
|
|
# Decide whether a current parameter value matches a stored one.
# Returns 1 on a match and 0 otherwise. The values match if they are equal
# after dropping one trailing blank, if they are equal after removing
# "[10 digits]" stamps (only when $IGNORE_TIME is defined), or if each "*" in
# the current value corresponds to a run of digits in the old value.
sub match_info_strings {
    my ($current,$old) = @_;
    s/ $// for ($current,$old);
    return 1 if $current eq $old;

    if (defined($IGNORE_TIME)) {
        s/\[\d{10}\]//g for ($current,$old);
    }
    return 1 if $current eq $old;

    # wildcard matching: consume "prefix*" from $current and
    # "prefix<digits>" from $old, then compare what is left
    while ($current =~ /^([^\*]+)\*(.*)$/) {
        my ($prefix,$rest) = ($1,$2);
        return 0 unless $prefix eq substr($old,0,length($prefix));
        return 0 unless substr($old,length($prefix)) =~ /^\d+(.*)$/;
        $current = $rest;
        $old = $1;
        return 1 if $old eq $current;
    }
    return 0;
}
|
|
|
# Collect, as a hash, the settings that decide whether step $i can be
# re-used: the parameters listed in %RERUN_ON_CHANGE for the step type, the
# input files (key "INPUT", value "USED file ..."), and the settings named in
# $USES_INPUT{$i}. Values that start with "/" and name an existing file get
# " [mtime]" appended.
sub get_parameters_relevant_for_re_use {
    my ($i) = @_;
    my $step = $DO_STEP[$i];
    my ($module,$set) = &deconstruct_name($step);
    my %VALUE;

    foreach my $parameter (@{$RERUN_ON_CHANGE{&defined_step($step)}}) {
        my $value = &backoff_and_get_array(&extend_local_name($module,$set,$parameter));
        if (ref($value) eq 'ARRAY') {
            $value = join(" ",@{$value});
        }
        next unless $value;
        $VALUE{$parameter} = $value;
    }

    # the first returned element (the output) is not needed here
    my (undef,@INPUT) = &get_output_and_input($i);
    $VALUE{"INPUT"} = join(" ","USED",@INPUT);

    foreach my $in_file (@{$USES_INPUT{$i}}) {
        my $value = &backoff_and_get($in_file);
        $VALUE{$in_file} = $value if $value;
    }

    # add the modification time of files given with an absolute path;
    # only the part of the value before the first blank is the file name
    foreach my $key (keys %VALUE) {
        next unless $VALUE{$key} =~ /^\//;
        (my $file = $VALUE{$key}) =~ s/ .+//;
        next unless -e $file;
        $VALUE{$key} .= " [".(stat($file))[9]."]";
    }

    return %VALUE;
}
|
|
|
# Check the STDERR file of step $i in run $version (default: $VERSION) for
# signs of a crash.
#
#   $no_wait - if true, do not wait for the STDERR file to appear
#
# Returns 1 if there is no STDERR file, otherwise the number of problems
# found (0 = not crashed). The findings are cached in "<STDERR>.digest";
# when that file already exists, its line count is returned without
# re-scanning STDERR.
sub check_if_crashed {
    my ($i,$version,$no_wait) = @_;
    $version = $VERSION unless $version;
    my $file = &versionize(&step_file($i),$version).".STDERR";

    # for the current run, give the file up to 100 x 5 seconds to show up
    if ($version == $VERSION && !$no_wait) {
        my $j = 0;
        while (! -e $file && $j < 100) {
            sleep(5);
            $j++;
        }
    }

    return 1 if ! -e $file;

    # use the cached digest if there is one
    if (-e $file.".digest") {
        my $error = 0;
        open(my $digest, "<", $file.".digest") or die "Cannot open: $!";
        while (my $line = <$digest>) {
            print "\t$DO_STEP[$i]($version) crashed: $line" if $VERBOSE;
            $error++;
        }
        close($digest);
        return $error;
    }

    # scan STDERR for step-specific and generic error patterns
    my $step_type = &defined_step_id($i);
    my @DIGEST;
    open(my $stderr, "<", $file) or die "Cannot open: $!";
    while (my $line = <$stderr>) {
        foreach my $pattern (@{$ERROR{$step_type}},
                             'error','killed','core dumped','can\'t read',
                             'no such file or directory','unknown option',
                             'died at','exit code','permission denied',
                             'segmentation fault','abort',
                             'no space left on device', ': not found',
                             'can\'t locate', 'unrecognized option', 'Exception') {
            next unless $line =~ /$pattern/i;
            # a line matching a %NOT_ERROR pattern is not counted
            my $not_error = 0;
            if (defined($NOT_ERROR{$step_type})) {
                foreach my $override (@{$NOT_ERROR{$step_type}}) {
                    $not_error++ if $line =~ /$override/i;
                }
            }
            if (!$not_error) {
                push @DIGEST,$pattern;
                print "\t$DO_STEP[$i]($version) crashed: $pattern\n" if $VERBOSE;
            }
        }
        last if scalar(@DIGEST)>10;
    }
    close($stderr);

    # an existing but empty default output file also counts as a problem
    my $output = &get_default_file(&deconstruct_name($DO_STEP[$i]));
    if (-e $output && -z $output) {
        push @DIGEST,"output file $output is empty";
    }

    # cache the result
    open(my $digest_out, ">", "$file.digest") or die "Cannot open: $!";
    foreach my $finding (@DIGEST) {
        print $digest_out $finding."\n";
    }
    close($digest_out);
    return scalar(@DIGEST);
}
|
|
|
|
|
# Path of the (unversioned) step file of step $i: the step name with every
# ":" turned into "_", inside "steps/" below the working directory.
sub step_file {
    my ($i) = @_;
    (my $name = $DO_STEP[$i]) =~ tr/:/_/;
    my $working_dir = &check_and_get("GENERAL:working-dir");
    return $working_dir."/steps/".$name;
}
|
|
|
# Path of the (unversioned) step file built from module, optional set and
# step name: "<working-dir>/steps/<module>[_<set>]_<step>". Creates the
# "steps" directory if it does not exist yet.
sub step_file2 {
    my ($module,$set,$step) = @_;
    my $steps_dir = &check_and_get("GENERAL:working-dir")."/steps";
    `mkdir -p $steps_dir` unless -e $steps_dir;

    my $file = "$steps_dir/$module";
    $file .= "_".$set if $set;
    return $file."_$step";
}
|
|
|
# Turn a step file path into its versioned form: the version number becomes
# a sub-directory after the first "steps/" and is appended as a suffix.
# $version defaults to the current $VERSION.
sub versionize {
    my ($file,$version) = @_;
    $version = $VERSION unless $version;
    $file =~ s{steps/}{steps/$version/};
    return "$file.$version";
}
|
|
|
# Step type (name without the set part) of the step with index $i.
sub defined_step_id {
    my ($i) = @_;
    my $step_name = $DO_STEP[$i];
    return &defined_step($step_name);
}
|
|
|
# Reduce a step name to its step type by dropping everything between the
# first and the last colon: "MODULE:set:step" becomes "MODULE:step".
# Names with a single colon are returned unchanged.
sub defined_step {
    my ($step) = @_;
    (my $defined_step = $step) =~ s/:.+:/:/;
    return $defined_step;
}
|
|
|
# Join module, set and step into a name: "module:set:step", or
# "module:step" when the set is undefined or empty.
sub construct_name {
    my ($module,$set,$step) = @_;
    my $has_set = defined($set) && $set ne "";
    return $has_set ? "$module:$set:$step" : "$module:$step";
}
|
|
|
# Split a name into (module, set, step). A name without a set part, i.e.
# without two colons that have something between them, yields an empty
# string as set.
sub deconstruct_name {
    my ($name) = @_;
    if ($name =~ /:.+:/) {
        my ($module,$set,$step) = split(/:/,$name);
        return ($module,$set,$step);
    }
    my ($module,$step) = split(/:/,$name);
    return ($module,"",$step);
}
|
|
|
# Resolve a possibly module-qualified parameter name. If $name has the form
# "MODULE:name", that module replaces the given one; the set is passed
# through unchanged. Returns (module, set, name).
sub deconstruct_local_name {
    my ($module,$set,$name) = @_;
    ($module,$name) = ($1,$2) if $name =~ /^(.+):(.+)$/;
    return ($module,$set,$name);
}
|
|
|
# Expand a parameter name that is local to a module and set into a full
# name, honouring an explicit "MODULE:" prefix in $name.
sub extend_local_name {
    my ($module,$set,$name) = @_;
    my @PART = &deconstruct_local_name($module,$set,$name);
    return &construct_name(@PART);
}
|
|
|
|
|
|
|
# Define the step that factorizes both sides of a parallel corpus: the
# input-language file with the "TRAINING:input-factors", the output-language
# file with the "TRAINING:output-factors".
sub define_corpus_factorize {
    my ($step_id) = @_;
    # $scripts and $dir are not used below; the lookups are kept as they are
    # (presumably for their checking side effect -- confirm)
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");

    my ($output,$input) = &get_output_and_input($step_id);
    my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
    my $output_extension = &check_backoff_and_get("TRAINING:output-extension");

    my $dir = &check_and_get("GENERAL:working-dir");
    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";

    my $input_side = &factorize_one_language(
        "INPUT-FACTOR",
        "$input.$input_extension",
        "$output.$input_extension",
        &check_backoff_and_get_array("TRAINING:input-factors"),
        $step_id);
    my $output_side = &factorize_one_language(
        "OUTPUT-FACTOR",
        "$input.$output_extension",
        "$output.$output_extension",
        &check_backoff_and_get_array("TRAINING:output-factors"),
        $step_id);

    &create_step($step_id,"mkdir -p $temp_dir\n".$input_side.$output_side);
}
|
|
|
# Define the step that factorizes a tuning or evaluation input file with the
# "TRAINING:input-factors".
sub define_tuningevaluation_factorize {
    my ($step_id) = @_;
    # $scripts and $dir are not used below; the lookups are kept as they are
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
    my $dir = &check_and_get("GENERAL:working-dir");

    my ($output,$input) = &get_output_and_input($step_id);
    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";

    my $factorize = &factorize_one_language(
        "INPUT-FACTOR",$input,$output,
        &check_backoff_and_get_array("TRAINING:input-factors"),
        $step_id);

    &create_step($step_id,"mkdir -p $temp_dir\n".$factorize);
}
|
|
|
# Define the step that factorizes the corpus of language model $set with the
# factors configured as "LM:$set:factors".
sub define_lm_factorize {
    my ($step_id,$set) = @_;
    # $scripts and $dir are not used below; the lookups are kept as they are
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");

    my ($output,$input) = &get_output_and_input($step_id);
    print "LM:$set:factors\n" if $VERBOSE;
    my $factor = &check_backoff_and_get_array("LM:$set:factors");

    my $dir = &check_and_get("GENERAL:working-dir");
    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";

    my $factorize = &factorize_one_language("OUTPUT-FACTOR",$input,$output,$factor,$step_id);
    &create_step($step_id,"mkdir -p $temp_dir\n".$factorize);
}
|
|
|
# Define the step that factorizes the tuning set of the interpolated
# language model with the "TRAINING:output-factors".
sub define_interpolated_lm_factorize_tuning {
    my ($step_id) = @_;
    # $scripts and $dir are not used below; the lookups are kept as they are
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");

    my ($output,$input) = &get_output_and_input($step_id);
    my $factor = &check_backoff_and_get_array("TRAINING:output-factors");

    my $dir = &check_and_get("GENERAL:working-dir");
    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";

    my $factorize = &factorize_one_language("OUTPUT-FACTOR",$input,$output,$factor,$step_id);
    &create_step($step_id,"mkdir -p $temp_dir\n".$factorize);
}
|
|
|
# Define the step that trains the splitter models: one "-train" command for
# the input splitter and one for the output splitter, each only if the
# corresponding "GENERAL:*-splitter" setting is set.
sub define_splitter_train {
    my ($step_id,$set) = @_;

    my ($output,$input) = &get_output_and_input($step_id);
    my $input_splitter = &get("GENERAL:input-splitter");
    my $output_splitter = &get("GENERAL:output-splitter");
    my $input_extension = &check_backoff_and_get("SPLITTER:input-extension");
    my $output_extension = &check_backoff_and_get("SPLITTER:output-extension");

    my @COMMAND;
    push @COMMAND, "$input_splitter -train -model $output.$input_extension -corpus $input.$input_extension\n"
        if $input_splitter;
    push @COMMAND, "$output_splitter -train -model $output.$output_extension -corpus $input.$output_extension\n"
        if $output_splitter;

    &create_step($step_id,join("",@COMMAND));
}
|
|
|
# Define the step that trains a randomized (BloomMap) language model for
# $set directly from the corpus: the corpus is gzipped for training,
# unzipped again afterwards, and the resulting model is moved to $output.
sub define_lm_train_randomized {
    my ($step_id,$set) = @_;
    my $training = &check_backoff_and_get("LM:$set:rlm-training");
    my $order = &check_backoff_and_get("LM:$set:order");
    my ($output,$input) = &get_output_and_input($step_id);

    # split the output path into directory and file name
    $output =~ m{^(.+)/([^/]+)$};
    my ($output_dir,$output_prefix) = ($1,$2);

    my @LINE = (
        "gzip $input",
        "$training -struct BloomMap -order $order -output-prefix $output_prefix -output-dir $output_dir -input-type corpus -input-path $input",
        "gunzip $input",
        "mv $output.BloomMap $output",
    );

    &create_step($step_id,join("\n",@LINE)."\n");
}
|
|
|
# Define the step that trains a bilingual language model for $set: a call
# of train_nplm.py followed by create_blm_ini.py.
sub define_lm_train_bilingual_lm {
    my ($step_id,$set) = @_;
    # the second returned element is not used here
    my ($working_dir, undef, $corpus) = &get_output_and_input($step_id);
    my $scripts = &check_backoff_and_get("LM:moses-script-dir");
    my $cmd = "$scripts/training/bilingual-lm/train_nplm.py -w $working_dir -c $corpus -r $working_dir";
    my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
    $cmd .= " -l $nplm_dir";

    my ($n, $m, $total_order) = &get_bilingual_lm_order($set);
    $cmd .= " -n $total_order";

    # get_bilingual_lm_epochs always returns a defined value
    my $epochs = &get_bilingual_lm_epochs($set);
    $cmd .= " -e $epochs";

    # optional settings are only appended when configured, so that an unset
    # value does not trigger an "uninitialized" warning
    my $nnjm_settings = backoff_and_get("LM:$set:nnjm-settings");
    $cmd .= " $nnjm_settings" if defined($nnjm_settings);

    my $nplm_settings = backoff_and_get("LM:$set:nplm-settings");
    $cmd .= " --extra-settings \"$nplm_settings\"" if defined($nplm_settings);

    $cmd .= "\n";
    $cmd .= "$scripts/training/bilingual-lm/create_blm_ini.py -w $working_dir -n $n -m $m -x $set -e $epochs";

    &create_step($step_id,$cmd);
}
|
|
|
# Define the step that extracts the training data for the bilingual
# language model of $set with extract_training.py.
sub define_lm_prepare_bilingual_lm {
    my ($step_id,$set) = @_;
    my ($working_dir, $corpus, $align) = &get_output_and_input($step_id);
    my $scripts = &check_backoff_and_get("LM:moses-script-dir");
    my @ARG = ("$scripts/training/bilingual-lm/extract_training.py -w $working_dir -c $corpus");

    my $input_extension = &check_backoff_and_get("GENERAL:input-extension");
    my $output_extension = &check_backoff_and_get("GENERAL:output-extension");
    push @ARG, "-e $output_extension -f $input_extension";

    my $align_method = &check_backoff_and_get("TRAINING:alignment-symmetrization-method");
    push @ARG, "-a $align.$align_method";

    # only the target order and the source window are needed here
    my ($n, $m) = &get_bilingual_lm_order($set);
    push @ARG, "-n $n -m $m";

    my $bilingual_settings = backoff_and_get("LM:$set:bilingual-lm-settings");
    push @ARG, $bilingual_settings if defined($bilingual_settings);

    &create_step($step_id,join(" ",@ARG));
}
|
|
|
# Define the step that trains a neural language model for $set: a call of
# train-neurallm.py followed by create_nplm_ini.py. "epochs" defaults to 2
# and "order" to 5 when not configured.
sub define_lm_train_nplm {
    my ($step_id,$set) = @_;
    my ($working_dir, $corpus) = &get_output_and_input($step_id);
    my $scripts = &check_backoff_and_get("LM:moses-script-dir");
    my @ARG = ("$scripts/training/train-neurallm.py --mmap --working-dir $working_dir --corpus $corpus");

    my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir");
    push @ARG, "--nplm-home $nplm_dir";

    my $epochs = &backoff_and_get("LM:$set:epochs");
    $epochs = 2 unless defined($epochs);
    push @ARG, "--epochs $epochs";

    my $nplm_settings = backoff_and_get("LM:$set:nplm-settings");
    push @ARG, $nplm_settings if defined($nplm_settings);

    my $order = &backoff_and_get("LM:$set:order");
    $order = 5 unless defined($order);
    push @ARG, "--order $order";

    my $train = join(" ",@ARG);
    my $create_ini = "$scripts/training/create_nplm_ini.py -w $working_dir -e $epochs -x $set -n $order";

    &create_step($step_id,$train."\n".$create_ini);
}
|
|
|
# Look up the n-gram settings of the bilingual language model $set.
# Returns (order, source window, total order), where "order" defaults to 5,
# "source-window" defaults to 4 and the total order is
# order + 2 * source window + 1.
sub get_bilingual_lm_order {
    my ($set) = @_;
    my $order = &backoff_and_get("LM:$set:order");
    $order = 5 unless defined($order);
    my $source_window = &backoff_and_get("LM:$set:source-window");
    # the default must depend on the source window itself, not on $order
    # (which is always defined at this point)
    $source_window = 4 unless defined($source_window);
    return ($order, $source_window, $order + 2*$source_window+1);
}
|
|
|
# Number of training epochs for the bilingual language model $set
# ("LM:$set:epochs"), 10 if not configured.
sub get_bilingual_lm_epochs {
    my ($set) = @_;
    my $configured = &backoff_and_get("LM:$set:epochs");
    return defined($configured) ? $configured : 10;
}
|
|
|
# Define the step that converts an ARPA language model into a randomized
# (BloomMap) model. Module and set are taken from the step name; the second
# argument is ignored.
sub define_lm_randomize {
    my ($step_id,$set_dummy) = @_;

    my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
    my $randomizer = &check_backoff_and_get("$module:$set:lm-randomizer");
    my $order = &check_backoff_and_get("$module:$set:order");
    my ($output,$input) = &get_output_and_input($step_id);

    # split the output path into directory and file name
    $output =~ m{^(.+)/([^/]+)$};
    my ($output_dir,$output_prefix) = ($1,$2);

    my @LINE = (
        "$randomizer -struct BloomMap -order $order -output-prefix $output_prefix -output-dir $output_dir -input-type arpa -input-path $input",
        "mv $output.BloomMap $output",
    );

    &create_step($step_id,join("\n",@LINE)."\n");
}
|
|
|
# Build the shell commands that produce a factored file for one language.
#
#   $type     - "INPUT-FACTOR" or "OUTPUT-FACTOR" (prefix of the settings)
#   $infile   - surface-form input file
#   $outfile  - factored output file
#   $FACTOR   - array reference with the factor names
#   $step_id  - index of the step the commands belong to
#
# For each factor other than "word", the configured factor script is run
# (through the generic parallelizer if the step is parallelizable and jobs
# or cores are configured); the last command combines all factor files with
# combine_factors.pl. Returns the commands as one string.
sub factorize_one_language {
    my ($type,$infile,$outfile,$FACTOR,$step_id) = @_;
    my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";
    my $parallelizer = &get("GENERAL:generic-parallelizer");
    my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);

    # both start out empty, so that an empty factor list does not lead to
    # an undefined $list in the combine command
    my ($cmd,$list) = ("","");
    foreach my $factor (@{$FACTOR}) {
        if ($factor eq "word") {
            $list .= " $infile";
        }
        else {
            my $script = &check_and_get("$type:$factor:factor-script");
            my $out = "$outfile.$factor";
            if ($parallelizer && defined($PARALLELIZE{&defined_step($DO_STEP[$step_id])})
                && (   (&get("$module:jobs") && $CLUSTER)
                    || (&get("$module:cores") && $MULTICORE))) {
                my $subdir = $module;
                $subdir =~ tr/A-Z/a-z/;
                $subdir .= "/tmp.$set.$stepname.$type.$factor.$VERSION";
                if ($CLUSTER) {
                    my $qflags = "";
                    my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
                    $qflags="--queue-flags \"$qsub_args\"" if ($CLUSTER && $qsub_args);
                    $cmd .= "$parallelizer $qflags -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -jobs ".&get("$module:jobs")." -tmpdir $temp_dir/$subdir\n";
                    # the step submits its own jobs (see is_qsub_script)
                    $QSUB_STEP{$step_id}++;
                }
                elsif ($MULTICORE) {
                    $cmd .= "$parallelizer -in $infile -out $out -cmd '$script %s %s $temp_dir/$subdir' -cores ".&get("$module:cores")." -tmpdir $temp_dir/$subdir\n";
                }
            }
            else {
                $cmd .= "$script $infile $out $temp_dir\n";
            }
            $list .= " $out";
        }
    }
    return $cmd . "$scripts/training/combine_factors.pl $list > $outfile\n";
}
|
|
|
# Define the tuning step. With "TUNING:use-mira" set, a MIRA configuration
# file and a weight selection script are written right away and the step
# runs both; otherwise the step calls the configured tuning script.
# In both cases the step ends by copying "moses.ini" from the tuning
# directory to the tuned configuration file.
sub define_tuning_tune {
    my ($step_id) = @_;
    # $dir, $hierarchical and $word_alignment are looked up but not used
    # below; the lookups are kept as they are
    my $dir = &check_and_get("GENERAL:working-dir");
    my $hierarchical = &get("TRAINING:hierarchical-rule-set");
    my $tuning_script = &check_and_get("TUNING:tuning-script");
    my $use_mira = &backoff_and_get("TUNING:use-mira", 0);
    my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");
    my $tmp_dir = &get_tmp_file("TUNING","","tune");

    my ($tuned_config,$config,$input,$reference,
        $config_devtest,$input_devtest,$reference_devtest,
        $filtered_config) = &get_output_and_input($step_id);
    $config = $filtered_config if $filtered_config;

    my @LINE;   # lines of the step script
    if ($use_mira) {
        # tag the tuning input now (not inside the step script), unless
        # jackknifing is used
        my $addTags = &backoff_and_get("TUNING:add-tags");
        my $use_jackknife = &backoff_and_get("TUNING:use-jackknife");
        if ($addTags && !$use_jackknife) {
            my $tagged = "$input.$VERSION.tags";
            `$addTags < $input > $tagged`;
            $input = $tagged;
        }

        my $addTagsDevtest = &backoff_and_get("TUNING:add-tags-devtest");
        if ($addTagsDevtest) {
            my $tagged_devtest = "$input_devtest.$VERSION.tags";
            `$addTagsDevtest < $input_devtest > $tagged_devtest`;
            $input_devtest = $tagged_devtest;
        }

        system("mkdir -p $tmp_dir");

        my $mira_config = "$tmp_dir/mira-config.$VERSION.cfg";
        write_mira_config($mira_config,$tmp_dir,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest);
        push @LINE, "$tuning_script -config $mira_config -exec ";

        # script that picks the best weights and writes them to moses.ini
        my $script_filename = "$tmp_dir/selectBestWeights.perl";
        my $script_filename_log = "$tmp_dir/selectBestWeights.log";
        my $weight_output_file = "$tmp_dir/moses.ini";
        write_selectBestMiraWeights($tmp_dir, $script_filename, $weight_output_file);
        push @LINE, "$script_filename >& $script_filename_log";
    }
    else {
        my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
        my $nbest_size = &check_and_get("TUNING:nbest");
        my $lambda = &backoff_and_get("TUNING:lambda");
        my $tune_continue = &backoff_and_get("TUNING:continue");
        my $skip_decoder = &backoff_and_get("TUNING:skip-decoder");
        my $tune_inputtype = &backoff_and_get("TUNING:inputtype");
        my $jobs = &backoff_and_get("TUNING:jobs");
        my $decoder = &check_backoff_and_get("TUNING:decoder");
        my $cache_model = &backoff_and_get("GENERAL:cache-model");

        # true when tuning is spread over several cluster jobs
        my $parallel_jobs = ($CLUSTER && $jobs && $jobs>1) ? 1 : 0;

        if (defined($cache_model) && !$parallel_jobs) {
            push @LINE, "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`";
            $config = "\$MOSES_INI";
        }

        my $decoder_settings = &backoff_and_get("TUNING:decoder-settings") || "";
        $decoder_settings .= " -v 0 " unless $parallel_jobs;

        my $tuning_settings = &backoff_and_get("TUNING:tuning-settings") || "";

        my $tune = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table";
        $tune .= " --lambdas \"$lambda\"" if $lambda;
        $tune .= " --continue" if $tune_continue;
        $tune .= " --skip-decoder" if $skip_decoder;
        $tune .= " --inputtype $tune_inputtype" if defined($tune_inputtype);

        my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
        $tune .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
        $tune .= " --jobs $jobs" if $parallel_jobs;
        $tune .= " --cache-model $cache_model" if $cache_model && $parallel_jobs;
        push @LINE, $tune;

        # directory part of the tuned configuration file
        (my $tuning_dir = $tuned_config) =~ s/\/[^\/]+$//;
        push @LINE, "mkdir -p $tuning_dir";
    }

    push @LINE, "cp $tmp_dir/moses.ini $tuned_config";

    &create_step($step_id,join("\n",@LINE));
}
|
|
|
# Write the configuration file for MIRA tuning.
#
#   $config_filename       - file to write
#   $expt_dir              - working directory of the tuning experiment
#   $tune_filtered_ini     - moses.ini used for the tuning set
#   $input, $reference     - tuning input and reference
#   $devtest_filtered_ini  - moses.ini used for the devtest set
#   $input_devtest, $reference_devtest - devtest input and reference
#
# Side effects besides writing the file: with "TUNING:start-weight-config"
# set (and no jackknifing), substitute-weights.perl is run to create the
# ".start" ini file; with jackknifing and "TUNING:add-tags" set, the tag
# command is run for each of the ten folds. Dies if the file cannot be
# written.
sub write_mira_config {
    my ($config_filename,$expt_dir,$tune_filtered_ini,$input,$reference,$devtest_filtered_ini,$input_devtest,$reference_devtest) = @_;
    my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
    my $mira_src_dir = &backoff_and_get("GENERAL:mira-src-dir");
    my $tuning_decoder_settings = &check_and_get("TUNING:decoder-settings");
    my $start_weights = &backoff_and_get("TUNING:start-weight-config");
    my $tuning_settings = &check_and_get("TUNING:tuning-settings");

    my $parallel_settings = &backoff_and_get("TUNING:parallel-settings");
    my $use_jackknife = &backoff_and_get("TUNING:use-jackknife");

    my $tune_meta_feature = &backoff_and_get("TUNING:tune-meta-feature");

    # ini file with the start weights substituted: same file name as the
    # filtered ini, placed in the experiment directory with suffix ".start"
    my $tune_filtered_ini_start;
    if (!$use_jackknife) {
        $tune_filtered_ini =~ /.*\/([A-Za-z0-9\.\-\_]*)$/;
        $tune_filtered_ini_start = $1;
        $tune_filtered_ini_start = $expt_dir."/".$tune_filtered_ini_start.".start";
        if ($start_weights) {
            print "DEBUG: $RealBin/support/substitute-weights.perl $start_weights $tune_filtered_ini $tune_filtered_ini_start \n";
            system("$RealBin/support/substitute-weights.perl $start_weights $tune_filtered_ini $tune_filtered_ini_start");
        }
    }

    my $continue_expt = &backoff_and_get("TUNING:continue-expt");
    my $continue_epoch = &backoff_and_get("TUNING:continue-epoch");
    my $continue_weights = &backoff_and_get("TUNING:continue-weights");

    # without this check a failed open would silently produce no config
    open(my $cfg, ">", $config_filename)
        or die "Cannot open $config_filename: $!";

    # section [general]
    print $cfg "[general] \n";
    print $cfg "name=expt \n";
    print $cfg "fold=0 \n";
    print $cfg "mpienv=openmpi_fillup_mark2 \n";
    if ($mira_src_dir) {
        print $cfg "moses-home=".$mira_src_dir."\n";
    }
    else {
        print $cfg "moses-home=".$moses_src_dir."\n";
    }
    print $cfg "working-dir=".$expt_dir."\n";
    if ($continue_expt && $continue_expt > 0) {
        print $cfg "continue-expt=".$continue_expt."\n";
        print $cfg "continue-epoch=".$continue_epoch."\n";
        print $cfg "continue-weights=".$continue_weights."\n";
    }
    print $cfg "tune-meta-feature=1 \n" if ($tune_meta_feature);
    print $cfg "jackknife=1 \n" if ($use_jackknife);
    print $cfg "wait-for-bleu=1 \n\n";

    # section [train]
    print $cfg "[train] \n";
    print $cfg "trainer=\${moses-home}/bin/mira \n";
    if ($use_jackknife) {
        # ten folds, with suffixes ".only0" .. ".only9" / ".wo0" .. ".wo9"
        my $addTags = &backoff_and_get("TUNING:add-tags");
        my (@INPUT_FOLD,@REFERENCE_FOLD,@INI_FOLD);
        for my $i (0..9) {
            if ($addTags) {
                my $input_with_tags = $input.".".$VERSION.".tags";
                `$addTags.only$i < $input.only$i > $input_with_tags.only$i`;
                push @INPUT_FOLD, $input_with_tags.".only$i";
            }
            else {
                push @INPUT_FOLD, $input.".only$i";
            }
            push @REFERENCE_FOLD, $reference.".only$i";
            push @INI_FOLD, $start_weights.".wo$i";
        }
        print $cfg "input-files-folds=".join(", ",@INPUT_FOLD)."\n";
        print $cfg "reference-files-folds=".join(", ",@REFERENCE_FOLD)."\n";
        print $cfg "moses-ini-files-folds=".join(", ",@INI_FOLD)."\n";
    }
    else {
        print $cfg "input-file=".$input."\n";
        print $cfg "reference-files=".$reference."\n";
        if ($start_weights) {
            print $cfg "moses-ini-file=".$tune_filtered_ini_start."\n";
        }
        else {
            print $cfg "moses-ini-file=".$tune_filtered_ini."\n";
        }
    }
    print $cfg "decoder-settings=".$tuning_decoder_settings." -text-type \"dev\"\n";
    print $cfg "hours=48 \n";
    if ($parallel_settings) {
        # one setting per line
        foreach my $setting (split(" ", $parallel_settings)) {
            print $cfg $setting."\n";
        }
    }
    print $cfg "extra-args=".$tuning_settings."\n\n";

    # section [devtest]
    print $cfg "[devtest] \n";
    if (&get("TRAINING:hierarchical-rule-set")) {
        print $cfg "moses=\${moses-home}/bin/moses_chart \n";
    }
    else {
        print $cfg "moses=\${moses-home}/bin/moses \n";
    }

    print $cfg "bleu=\${moses-home}/scripts/generic/multi-bleu.perl \n";
    print $cfg "input-file=".$input_devtest."\n";
    print $cfg "reference-file=".$reference_devtest."\n";
    print $cfg "moses-ini-file=".$devtest_filtered_ini."\n";
    print $cfg "decoder-settings=".$tuning_decoder_settings." -text-type \"devtest\"\n";
    print $cfg "hours=12 \nextra-args= \nskip-dev=1 \nskip-devtest=0 \nskip-submit=0 \n";

    # buffered write errors only surface when the handle is closed
    close($cfg) or die "Cannot close $config_filename: $!";
}
|
|
|
# Generate a stand-alone Perl script that selects the best MIRA weights.
#
# The generated script globs "$expt_dir/*_devtest.bleu", reads the first line
# of each file and keeps the run with the highest BLEU score (ties are broken
# by the length ratio closest to 1).  It then copies the matching
# "*devtest.<id>.ini" file to $weight_out_file.
#
# Parameters:
#   $expt_dir        - directory searched by the generated script
#   $script_filename - file the generated script is written to
#   $weight_out_file - destination the generated script copies the best ini to
#
# Dies if the script file cannot be created or completely written.
# Returns the status of the chmod call that marks the script executable.
sub write_selectBestMiraWeights {
  my ($expt_dir, $script_filename, $weight_out_file) = @_;

  # three-argument open on a lexical handle: the file name is never parsed
  # for a mode, and a failure is reported instead of silently losing output
  open(my $scr, '>', $script_filename)
    or die("ERROR: could not write script $script_filename: $!");

  print $scr "#!/usr/bin/perl -w \nuse strict; \n\n";
  print $scr "my \@devtest_bleu = glob(\"$expt_dir/*_devtest.bleu\"); \# expt_00_0_devtest.bleu \n";
  print $scr "if (scalar(\@devtest_bleu) == 0) { \n";
  print $scr "\tprint STDERR \"ERROR: no bleu files globbed, cannot find best weights.\\n\"; \n";
  print $scr "\texit(1); \n";
  print $scr "} \n\n";
  print $scr "my (\$best_weights, \$best_id); \n";
  print $scr "my \$best_bleu = -1; \n";
  print $scr "my \$best_ratio = 0; \n";
  print $scr "foreach my \$bleu_file (\@devtest_bleu) { \n";
  print $scr "\t\$bleu_file =~ /_([\\d_]+)_devtest.bleu/; \n";
  print $scr "\tmy \$id = \$1; \n";
  print $scr "\topen(BLEU, \$bleu_file); \n";
  print $scr "\tmy \$bleu = <BLEU>; \n";
  print $scr "\t\$bleu =~ /BLEU = ([\\d\\.]+), .*ratio=([\\d\\.]+), /; \n";
  print $scr "\tif (\$1 > \$best_bleu || (\$1 == \$best_bleu && (abs(1-\$2) < abs(1-\$best_ratio)))) { \n";
  print $scr "\t\t\$best_bleu = \$1; \n";
  print $scr "\t\t\$best_ratio = \$2; \n";
  print $scr "\t\t# expt1-devtest.00_0.ini (incl. path to sparse weights) \n";
  print $scr "\t\t(\$best_weights) = glob(\"$expt_dir/*devtest.\$id.ini\"); \n";
  print $scr "\t} \n";
  print $scr "} \n\n";
  print $scr "print STDERR \"Best weights according to BLEU on devtest set: \$best_weights \\n\"; \n";
  print $scr "system(\"cp \$best_weights $weight_out_file\"); \n\n";

  # buffered write errors only surface when the handle is closed
  close($scr)
    or die("ERROR: could not finish writing script $script_filename: $!");

  # list form bypasses the shell, so unusual characters in the path are safe
  return system("chmod", "u+x", $script_filename);
}
|
|
|
# Define the step that runs ems/support/prepare-fast-align.perl on both sides
# of the corpus and redirects its output to the step's output file.
# If TRAINING:input-factors is set, the encoded "alignment-factors"
# definition is passed as an extra argument.
sub define_training_prepare_data_fast_align {
  my ($step_id) = @_;

  my ($prepared, $corpus) = &get_output_and_input($step_id);
  my $script_dir = &check_and_get("GENERAL:moses-script-dir");
  my $in_ext = &check_backoff_and_get("TRAINING:input-extension");
  my $out_ext = &check_backoff_and_get("TRAINING:output-extension");

  # stays empty (leaving a double space in the command) without factors
  my $factor_spec = "";
  if (&backoff_and_get("TRAINING:input-factors")) {
    my %input_factor_id = &get_factor_id("input");
    my %output_factor_id = &get_factor_id("output");
    $factor_spec = &encode_factor_definition("alignment-factors",\%input_factor_id,\%output_factor_id);
  }

  my @command = ("$script_dir/ems/support/prepare-fast-align.perl",
                 "$corpus.$in_ext",
                 "$corpus.$out_ext",
                 $factor_spec,
                 ">",
                 $prepared);
  &create_step($step_id, join(" ", @command));
}
|
|
|
# Define training step 1: the training script is given the corpus stem and
# the directory that receives the prepared data (the step's output).
sub define_training_prepare_data {
  my ($step_id) = @_;

  my ($prepared_dir, $corpus_stem) = &get_output_and_input($step_id);
  my $command = &get_training_setting(1)
              . "-corpus $corpus_stem "
              . "-corpus-dir $prepared_dir ";

  &create_step($step_id, $command);
}
|
|
|
# Define training step 2 with "-direction 2": reads the prepared corpus
# directory and passes the step's output as -giza-e2f.
sub define_training_run_giza {
  my ($step_id) = @_;

  my ($giza_dir, $prepared_dir) = &get_output_and_input($step_id);
  my @options = ("-corpus-dir $prepared_dir",
                 "-giza-e2f $giza_dir",
                 "-direction 2");

  # every option is followed by a single space, including the last one
  my $command = &get_training_setting(2) . join("", map { "$_ " } @options);
  &create_step($step_id, $command);
}
|
|
|
# Define training step 2 with "-direction 1": reads the prepared corpus
# directory and passes the step's output as -giza-f2e.
sub define_training_run_giza_inverse {
  my ($step_id) = @_;

  my ($giza_dir, $prepared_dir) = &get_output_and_input($step_id);
  my @options = ("-corpus-dir $prepared_dir",
                 "-giza-f2e $giza_dir",
                 "-direction 1");

  # every option is followed by a single space, including the last one
  my $command = &get_training_setting(2) . join("", map { "$_ " } @options);
  &create_step($step_id, $command);
}
|
|
|
# Define training step 3: combine the two GIZA outputs into one alignment
# using the configured TRAINING:alignment-symmetrization-method.
sub define_training_symmetrize_giza {
  my ($step_id) = @_;

  my ($aligned, $giza_dir, $giza_inverse_dir) = &get_output_and_input($step_id);
  my $symmetrization = &check_and_get("TRAINING:alignment-symmetrization-method");
  my $command = &get_training_setting(3);
  my $stem = &versionize(&long_file_name("aligned","model",""));

  foreach my $option ("-giza-e2f $giza_dir -giza-f2e $giza_inverse_dir",
                      "-alignment-file $aligned",
                      "-alignment-stem $stem",
                      "-alignment $symmetrization") {
    $command .= "$option ";
  }

  &create_step($step_id, $command);
}
|
|
|
# Define the step that calls the adam-suffix-array wrapper script
# (suffix-array-create.sh) with the suffix array executable directory, both
# corpus sides, the symmetrized alignment, the output model and the
# versioned glue grammar file name.
sub define_training_build_suffix_array {
  my ($step_id) = @_;

  my $script_dir = &check_and_get("GENERAL:moses-script-dir");

  my ($model, $aligned, $corpus) = &get_output_and_input($step_id);
  my $sa_exec_dir = &check_and_get("TRAINING:suffix-array");
  my $in_ext = &check_backoff_and_get("TRAINING:input-extension");
  my $out_ext = &check_backoff_and_get("TRAINING:output-extension");
  my $symmetrization = &check_and_get("TRAINING:alignment-symmetrization-method");

  my $glue_grammar = &versionize(&long_file_name("glue-grammar","model",""));

  my @command = ("$script_dir/training/wrappers/adam-suffix-array/suffix-array-create.sh",
                 $sa_exec_dir,
                 "$corpus.$in_ext",
                 "$corpus.$out_ext",
                 "$aligned.$symmetrization",
                 $model,
                 $glue_grammar);
  &create_step($step_id, join(" ", @command));
}
|
|
|
# Define the step that runs the TRAINING:biconcor tool on both corpus sides
# and the symmetrized alignment, writing to the step's output model.
sub define_training_build_biconcor {
  my ($step_id) = @_;

  my ($model, $aligned, $corpus) = &get_output_and_input($step_id);
  my $tool = &check_and_get("TRAINING:biconcor");
  my $in_ext = &check_backoff_and_get("TRAINING:input-extension");
  my $out_ext = &check_backoff_and_get("TRAINING:output-extension");
  my $symmetrization = &check_and_get("TRAINING:alignment-symmetrization-method");

  my @command = ($tool,
                 "-c", "$corpus.$in_ext",
                 "-t", "$corpus.$out_ext",
                 "-a", "$aligned.$symmetrization",
                 "-s", $model);
  &create_step($step_id, join(" ", @command));
}
|
|
|
# Define training step 4: build the lexical translation file (the step's
# output) from the alignment and the corpus.
# The baseline corpus/alignment options are added only when both
# TRAINING:baseline-corpus and TRAINING:baseline-alignment are defined.
sub define_training_build_lex_trans {
  my ($step_id) = @_;

  my ($lex, $aligned, $corpus) = &get_output_and_input($step_id);
  my $baseline_alignment = &get("TRAINING:baseline-alignment");
  my $baseline_corpus = &get("TRAINING:baseline-corpus");

  # a configured TRAINING:word-alignment overrides the default stem
  my $stem = &versionize(&long_file_name("aligned","model",""));
  if (defined($CONFIG{"TRAINING:word-alignment"})) {
    $stem = $CONFIG{"TRAINING:word-alignment"}[0];
  }

  my $command = &get_training_setting(4);
  $command .= "-lexical-file $lex -alignment-file $aligned -alignment-stem $stem -corpus $corpus ";
  if (defined($baseline_corpus) && defined($baseline_alignment)) {
    $command .= "-baseline-corpus $baseline_corpus -baseline-alignment $baseline_alignment ";
  }

  &create_step($step_id, $command);
}
|
|
|
# Define the step that calls Transliteration/train-transliteration-module.pl
# on the corpus and its symmetrized alignment; the step's output is passed
# as --out-dir.  --decoder is added only if TRAINING:transliteration-decoder
# is defined; the syntax switches follow GENERAL:input-parser/output-parser.
sub define_training_build_transliteration_model {
  my ($step_id) = @_;

  my ($model, $corpus, $alignment) = &get_output_and_input($step_id);

  my $script_dir = &check_and_get("GENERAL:moses-script-dir");
  my $in_ext = &check_backoff_and_get("TRAINING:input-extension");
  my $out_ext = &check_backoff_and_get("TRAINING:output-extension");
  my $symmetrization = &check_and_get("TRAINING:alignment-symmetrization-method");
  my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
  my $external_bin_dir = &check_and_get("GENERAL:external-bin-dir");
  my $srilm_dir = &check_backoff_and_get("TRAINING:srilm-dir");
  my $decoder = &get("TRAINING:transliteration-decoder");

  my @arguments = ("--corpus-f $corpus.$in_ext",
                   "--corpus-e $corpus.$out_ext",
                   "--alignment $alignment.$symmetrization",
                   "--out-dir $model",
                   "--moses-src-dir $moses_src_dir");
  push @arguments, "--decoder $decoder" if defined($decoder);
  push @arguments, ("--external-bin-dir $external_bin_dir",
                    "--srilm-dir $srilm_dir",
                    "--input-extension $in_ext",
                    "--output-extension $out_ext",
                    "--factor 0-0");

  my $command = join(" ", "$script_dir/Transliteration/train-transliteration-module.pl", @arguments);

  # the syntax switches carry their own surrounding spaces
  $command .= " --source-syntax " if &get("GENERAL:input-parser");
  $command .= " --target-syntax " if &get("GENERAL:output-parser");

  &create_step($step_id, $command);
}
|
|
|
# Define training step 5 (phrase/rule extraction).
#
# The command always receives the alignment, the extract file (the step's
# output) and the corpus.  For hierarchical rule sets it additionally gets
# the glue grammar, unknown-word and GHKM related options; otherwise only
# -target-constituent-boundaries is considered.  Extraction options and a
# baseline extract are appended last.
sub define_training_extract_phrases {
  my ($step_id) = @_;

  my ($extract, $aligned, $corpus) = &get_output_and_input($step_id);
  my $cmd = &get_training_setting(5);

  # a configured TRAINING:word-alignment overrides the default stem
  my $alignment_stem = &versionize(&long_file_name("aligned","model",""));
  if (defined($CONFIG{"TRAINING:word-alignment"})) {
    $alignment_stem = $CONFIG{"TRAINING:word-alignment"}[0];
  }

  $cmd .= "-alignment-file $aligned -alignment-stem $alignment_stem -extract-file $extract -corpus $corpus ";

  if (!&get("TRAINING:hierarchical-rule-set")) {
    $cmd .= "-target-constituent-boundaries " if &get("TRAINING:target-constituent-boundaries");
  }
  else {
    # glue grammar is used unless TRAINING:no-glue-grammar is set to
    # something other than "false"
    my $no_glue_grammar = &get("TRAINING:no-glue-grammar");
    if (!defined($no_glue_grammar) || $no_glue_grammar eq "false") {
      my $glue_grammar_file = &get("TRAINING:glue-grammar")
        || &versionize(&long_file_name("glue-grammar","model",""));
      $cmd .= "-glue-grammar-file $glue_grammar_file ";
    }

    if (&get("GENERAL:output-parser") && (&get("TRAINING:use-unknown-word-labels") || &get("TRAINING:use-unknown-word-soft-matches"))) {
      my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
      $cmd .= "-unknown-word-label $unknown_word_label ";
    }

    if (&get("GENERAL:output-parser") && &get("TRAINING:use-unknown-word-soft-matches")) {
      my $unknown_word_soft_matches = &versionize(&long_file_name("unknown-word-soft-matches","model",""));
      $cmd .= "-unknown-word-soft-matches $unknown_word_soft_matches ";
    }

    # [ config setting, switch, file option, file name stem ];
    # entries without a file option only emit the switch
    my @SWITCHES = (
      ["TRAINING:phrase-orientation", "-phrase-orientation",
       "-phrase-orientation-priors-file", "phrase-orientation-priors"],
      ["TRAINING:use-ghkm", "-ghkm"],
      ["TRAINING:target-syntactic-preferences", "-target-syntactic-preferences",
       "-target-syntactic-preferences-labels-file", "target-syntactic-preferences-labels"],
      ["TRAINING:ghkm-tree-fragments", "-ghkm-tree-fragments"],
      ["TRAINING:ghkm-source-labels", "-ghkm-source-labels",
       "-ghkm-source-labels-file", "source-labels"],
      ["TRAINING:ghkm-parts-of-speech", "-ghkm-parts-of-speech",
       "-ghkm-parts-of-speech-file", "parts-of-speech"],
      ["TRAINING:ghkm-parts-of-speech-factor", "-ghkm-parts-of-speech-factor"],
      ["TRAINING:ghkm-strip-bitpar-nonterminal-labels", "-ghkm-strip-bitpar-nonterminal-labels"],
    );
    foreach my $entry (@SWITCHES) {
      my ($setting, $switch, $file_option, $file_stem) = @{$entry};
      next unless &get($setting);
      $cmd .= "$switch ";
      next unless defined($file_option);
      my $file = &versionize(&long_file_name($file_stem,"model",""));
      $cmd .= "$file_option $file ";
    }
  }

  # domain features need sentence ids; the append also defines the settings
  # string when TRAINING:extract-settings itself is not set
  my $extract_settings = &get("TRAINING:extract-settings");
  if (&get("TRAINING:domain-features")) {
    $extract_settings .= " --IncludeSentenceId ";
  }
  if (defined($extract_settings)) {
    $cmd .= "-extract-options '".$extract_settings."' ";
  }

  my $baseline_extract = &get("TRAINING:baseline-extract");
  if (defined($baseline_extract)) {
    $cmd .= "-baseline-extract $baseline_extract";
  }

  &create_step($step_id, $cmd);
}
|
|
|
# Define training step 6 (build the phrase translation table) from the
# extract file and the lexical translation file.
#
# -no-word-alignment is added when TRAINING:include-word-alignment-in-rules
# is "no"; domain feature score options are added when
# TRAINING:domain-features is set.  Hierarchical rule sets get the phrase
# orientation / syntactic preference / GHKM options; other models only
# consider -target-constituent-boundaries.
sub define_training_build_ttable {
  my ($step_id) = @_;

  my ($phrase_table, $extract, $lex, $domains) = &get_output_and_input($step_id);
  # looked up as in the original code; the value is not used below
  my $word_report = &backoff_and_get("EVALUATION:report-precision-by-coverage");
  my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");

  my $cmd = &get_training_setting(6);
  $cmd .= "-extract-file $extract -lexical-file $lex ";
  $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$phrase_table);

  if (defined($word_alignment) && $word_alignment eq "no") {
    $cmd .= "-no-word-alignment ";
  }

  if (&get("TRAINING:domain-features")) {
    $cmd .= &define_domain_feature_score_option($domains);
  }

  if (!&get("TRAINING:hierarchical-rule-set")) {
    $cmd .= "-target-constituent-boundaries " if &get("TRAINING:target-constituent-boundaries");
  }
  else {
    # [ config setting, switch, file option, file name stem ];
    # entries without a file option only emit the switch
    my @SWITCHES = (
      ["TRAINING:phrase-orientation", "-phrase-orientation",
       "-phrase-orientation-priors-file", "phrase-orientation-priors"],
      ["TRAINING:target-syntactic-preferences", "-target-syntactic-preferences",
       "-target-syntactic-preferences-labels-file", "target-syntactic-preferences-labels"],
      ["TRAINING:ghkm-tree-fragments", "-ghkm-tree-fragments"],
      ["TRAINING:ghkm-source-labels", "-ghkm-source-labels",
       "-ghkm-source-labels-file", "source-labels"],
      ["TRAINING:ghkm-parts-of-speech", "-ghkm-parts-of-speech",
       "-ghkm-parts-of-speech-file", "parts-of-speech"],
    );
    foreach my $entry (@SWITCHES) {
      my ($setting, $switch, $file_option, $file_stem) = @{$entry};
      next unless &get($setting);
      $cmd .= "$switch ";
      next unless defined($file_option);
      my $file = &versionize(&long_file_name($file_stem,"model",""));
      $cmd .= "$file_option $file ";
    }
  }

  &create_step($step_id, $cmd);
}
|
|
|
# Translate the TRAINING:domain-features specification into a
# "-score-options '...'" argument for the training script.
#
# The specification must mention one of "indicator", "ratio" or "subset"
# (when several occur, the last of these wins).  With "sparse" the
# --SparseDomain<Method> variant is used, and an optional " table <name>"
# restriction found in the specification is appended.
#
# Parameters:
#   $domains - domain file inserted into the score options
# Returns the option string (with trailing space).
# Dies if the specification names no method.
sub define_domain_feature_score_option {
  my ($domains) = @_;
  my $spec = &backoff_and_get("TRAINING:domain-features");
  my ($method,$restricted_to_table) = ("","");
  $method = "Indicator" if $spec =~ /indicator/;
  $method = "Ratio" if $spec =~ /ratio/;
  $method = "Subset" if $spec =~ /subset/;
  $restricted_to_table = $1 if $spec =~ /( table \S+)/;
  # $method starts out as the empty string, so it has to be compared with ""
  # (a defined() test could never detect a missing method)
  die("ERROR: faulty TRAINING:domain-features spec (no method): $spec\n") if $method eq "";
  if ($spec =~ /sparse/) {
    return "-score-options '--SparseDomain$method $domains$restricted_to_table' ";
  }
  else {
    return "-score-options '--Domain$method $domains' ";
  }
}
|
|
|
# Define training step 7: build the reordering table (the step's output)
# from the extract file.
sub define_training_build_reordering {
  my ($step_id) = @_;

  my ($reordering_table, $extract) = &get_output_and_input($step_id);
  my $command = &get_training_setting(7) . "-extract-file $extract ";
  $command .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table);

  &create_step($step_id, $command);
}
|
|
|
# Define training step 8: build the generation table (the step's output)
# from the corpus.
sub define_training_build_generation {
  my ($step_id) = @_;

  my ($generation_table, $corpus) = &get_output_and_input($step_id);
  my $command = &get_training_setting(8) . "-corpus $corpus ";
  $command .= &get_table_name_settings("generation-factors","generation-table",$generation_table);

  &create_step($step_id, $command);
}
|
|
|
# Define training step 8 for a custom generation model: it builds the same
# command as define_training_build_generation; only the local name given to
# the step's corpus input differs.
sub define_training_build_custom_generation {
  my ($step_id) = @_;

  my ($generation_table, $generation_corpus) = &get_output_and_input($step_id);
  my $command = &get_training_setting(8) . "-corpus $generation_corpus ";
  $command .= &get_table_name_settings("generation-factors","generation-table",$generation_table);

  &create_step($step_id, $command);
}
|
|
|
# Define the significance-test filtering step: the gzipped raw table is piped
# through contrib/sigtest-filter/filter-pt and the result is gzipped again.
#
# Whether a reordering table or a phrase translation table is filtered is
# decided by the step name ($DO_STEP entry matching /reordering/).  The file
# names are derived from get_table_name_settings(): the last character of its
# result is chopped and the first "-option" word is stripped.
sub define_training_sigtest_filter {
  my ($step_id) = @_;
  my ($filtered_table, $raw_table, $suffix_array) = &get_output_and_input($step_id);

  my $hierarchical_flag = "";
  $hierarchical_flag = "-h" if &get("TRAINING:hierarchical-rule-set");
  my $sigtest_filter = &get("TRAINING:sigtest-filter");
  my $in_ext = &check_backoff_and_get("TRAINING:input-extension");
  my $out_ext = &check_backoff_and_get("TRAINING:output-extension");
  my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");

  my $is_reordering = ($DO_STEP[$step_id] =~ /reordering/);
  my ($factor_setting, $table_setting) = $is_reordering
    ? ("reordering-factors", "reordering-table")
    : ("translation-factors", "phrase-translation-table");

  # the loop variable aliases the two table variables, so they are
  # modified in place (raw table first, as before)
  foreach my $table ($raw_table, $filtered_table) {
    $table = &get_table_name_settings($factor_setting, $table_setting, $table);
  }
  chop($raw_table, $filtered_table);
  if ($is_reordering) {
    foreach my $table ($raw_table, $filtered_table) {
      $table .= ".wbe-".&get("TRAINING:lexicalized-reordering");
    }
  }
  foreach my $table ($raw_table, $filtered_table) {
    $table =~ s/\s*\-\S+\s*//;
  }

  my $cmd = "zcat $raw_table.gz | $moses_src_dir/contrib/sigtest-filter/filter-pt -e $suffix_array.$out_ext -f $suffix_array.$in_ext $sigtest_filter $hierarchical_flag | gzip - > $filtered_table.gz\n";
  &create_step($step_id, $cmd);
}
|
|
|
# Build the part of the training step 9 command (create configuration) that
# names the model tables.
#
# Parameters:
#   $config                   - configuration file to be written (-config)
#   $reordering_table         - reordering table, skipped if false or if
#                               TRAINING:mmsapt is defined
#   $phrase_translation_table - phrase translation table
#   $generation_table         - generation table, skipped if false
#   $domains                  - domain file for the domain feature options
# Returns the command string (not yet turned into a step).
sub get_config_tables {
  my ($config,$reordering_table,$phrase_translation_table,$generation_table,$domains) = @_;

  # not used below, but check_and_get insists on the setting being present
  my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
  my $cmd = &get_training_setting(9);

  my $hierarchical = &get("TRAINING:hierarchical-rule-set");
  $cmd .= "-hierarchical " if $hierarchical;

  # phrase table implementation id and number of feature functions; they
  # are appended to the table specification as ":impl" and ":numFF"
  my $sa_exec_dir = &get("TRAINING:suffix-array");
  my ($ptImpl, $numFF) = (0);
  if ($hierarchical) {
    if ($sa_exec_dir) {
      $ptImpl = 10;
      $numFF = 7;
    }
    else {
      $ptImpl = 6;
    }
  }

  my $mmsapt = &get("TRAINING:mmsapt");
  if (defined($mmsapt)) {
    $ptImpl = 11;
    # The previous code assigned $1 here without any pattern match in this
    # sub, so the value depended on whatever match happened to be visible.
    # NOTE(review): assumes the feature count is given as "num-features=N"
    # inside the TRAINING:mmsapt setting - confirm against the EMS config
    # documentation.  Without such a value no feature count is appended.
    $numFF = ($mmsapt =~ /num-features=(\d+)/) ? $1 : undef;
    $cmd .= "-mmsapt ";
  }

  $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $phrase_translation_table);
  # trim so that the ":impl:numFF" suffix attaches directly to the table name
  $cmd = trim($cmd);
  $cmd .= ":$ptImpl" if $ptImpl>0;
  $cmd .= ":$numFF" if defined($numFF);
  $cmd .= " ";

  $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table && !defined($mmsapt);
  $cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
  $cmd .= "-config $config ";

  my $decoding_graph_backoff = &get("TRAINING:decoding-graph-backoff");
  if ($decoding_graph_backoff) {
    $cmd .= "-decoding-graph-backoff \"$decoding_graph_backoff\" ";
  }

  # files written by the extract step carry that step's version, which is
  # the re-used version if TRAINING:extract-phrases is a known step
  my $extract_version = $VERSION;
  if (&get("TRAINING:hierarchical-rule-set")) {
    $extract_version = $RE_USE[$STEP_LOOKUP{"TRAINING:extract-phrases"}]
      if defined($STEP_LOOKUP{"TRAINING:extract-phrases"});
    my $no_glue_grammar = &get("TRAINING:no-glue-grammar");
    if (!defined($no_glue_grammar) || $no_glue_grammar eq "false") {
      my $glue_grammar_file = &get("TRAINING:glue-grammar");
      $glue_grammar_file = &versionize(&long_file_name("glue-grammar","model",""),$extract_version)
        unless $glue_grammar_file;
      $cmd .= "-glue-grammar-file $glue_grammar_file ";
    }
    if (&get("TRAINING:dont-tune-glue-grammar")) {
      $cmd .= "-dont-tune-glue-grammar ";
    }
    if (&get("TRAINING:use-syntax-input-weight-feature")) {
      $cmd .= "-use-syntax-input-weight-feature ";
    }
  }

  if (&get("GENERAL:output-parser") && (&get("TRAINING:use-unknown-word-labels") || &get("TRAINING:use-unknown-word-soft-matches"))) {
    my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""),$extract_version);
    $cmd .= "-unknown-word-label $unknown_word_label ";
  }
  if (&get("GENERAL:output-parser") && &get("TRAINING:use-unknown-word-soft-matches")) {
    my $unknown_word_soft_matches = &versionize(&long_file_name("unknown-word-soft-matches","model",""),$extract_version);
    $cmd .= "-unknown-word-soft-matches $unknown_word_soft_matches ";
  }

  $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");

  my $additional_ini = &get("TRAINING:additional-ini");
  $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);

  return $cmd;
}
|
|
|
# Define the step that creates the decoder configuration file.
#
# Starts from the table options of get_config_tables() and adds, in order:
# the transliteration phrase table, the operation sequence model, syntax
# related switches, the interpolated language model(s), every language model
# that was not interpolated away, extra feature/weight lines and additional
# ini files.
#
# Dies if the number of LM sets differs from the number of LM input files.
sub define_training_create_config {
  my ($step_id) = @_;

  my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
    = &get_output_and_input($step_id);

  my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);

  if ($transliteration_pt) {
    $cmd .= "-transliteration-phrase-table $transliteration_pt ";
  }

  if ($osm) {
    my $osm_settings = &get("TRAINING:operation-sequence-model-settings");
    # the settings are optional: guard the match so that an unset value
    # does not trigger an "uninitialized value" warning
    if (defined($osm_settings) && $osm_settings =~ /-factor *(\S+)/) {
      $cmd .= "-osm-model $osm/ -osm-setting $1 ";
    }
    else {
      $cmd .= "-osm-model $osm/operationLM.bin ";
    }
    my $osm_load_method = &get("TRAINING:operation-sequence-model-load-method");
    if (defined($osm_load_method)) {
      $cmd .= "-osm-load-method $osm_load_method ";
    }
  }

  if (&get("TRAINING:phrase-orientation")) {
    $cmd .= "-phrase-orientation ";
  }

  if (&get("TRAINING:target-syntactic-preferences")) {
    $cmd .= "-target-syntactic-preferences ";
    my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
    $cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
  }

  if (&get("TRAINING:ghkm-source-labels")) {
    $cmd .= "-ghkm-source-labels ";
    my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
    $cmd .= "-ghkm-source-labels-file $source_labels_file ";
  }

  if (&get("TRAINING:ghkm-parts-of-speech")) {
    $cmd .= "-ghkm-parts-of-speech ";
    my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
    $cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
  }

  if (&get("TRAINING:target-constituent-boundaries")) {
    $cmd .= "-target-constituent-boundaries ";
  }

  # ini files merged into the configuration via -additional-ini-file
  my @additional_ini_files;
  push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;

  my @LM_SETS = &get_sets("LM");
  my %INTERPOLATED_AWAY;
  my %OUTPUT_FACTORS;
  %OUTPUT_FACTORS = &get_factor_id("output") if &backoff_and_get("TRAINING:output-factors");

  if (&get("INTERPOLATED-LM:script")) {
    # LM type passed to the training script: 0 by default, 1 with a
    # binarizer, 5 with a randomizer, or an explicitly configured type
    my $type = 0;
    $type = 1 if (&get("INTERPOLATED-LM:binlm") ||
                  &backoff_and_get("INTERPOLATED-LM:lm-binarizer"));
    $type = 5 if (&get("INTERPOLATED-LM:rlm") ||
                  &backoff_and_get("INTERPOLATED-LM:lm-randomizer"));
    $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");

    # one -lm option per interpolated (factor, order) group; the sets of
    # such a group are not added individually further below
    my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
    my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
    foreach my $factor (keys %{$ILM_SETS}) {
      foreach my $order (keys %{$$ILM_SETS{$factor}}) {
        next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
        my $suffix = "";
        $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
        $suffix .= ".order$order" if $icount > 1;
        $cmd .= "-lm $factor:$order:$LM[0]$suffix:$type ";
        foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
          my ($id,$set) = split(/ /,$id_set,2);
          $INTERPOLATED_AWAY{$set} = 1;
        }
      }
    }
  }
  # the first LM input is the one used for the interpolated LM above;
  # the remaining entries correspond one-to-one to the LM sets
  shift @LM;

  my $feature_lines = "";
  my $weight_lines = "";

  die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
    unless scalar @LM == scalar @LM_SETS;
  foreach my $lm (@LM) {
    my $set = shift @LM_SETS;
    next if defined($INTERPOLATED_AWAY{$set});

    if (&get("LM:$set:config-feature-line") && &get("LM:$set:config-weight-line")) {
      # the LM brings its own feature and weight lines
      $feature_lines .= &get("LM:$set:config-feature-line") . ";";
      $weight_lines .= &get("LM:$set:config-weight-line") . ";";
    } elsif (&get("LM:$set:nplm")) {
      push(@additional_ini_files, "$lm/nplm.ini");
    } elsif (&get("LM:$set:bilingual-lm")) {
      push(@additional_ini_files, "$lm/blm.ini");
    } else {
      my $order = &check_backoff_and_get("LM:$set:order");

      my $lm_file = "$lm";
      # same type codes as for the interpolated LM
      my $type = 0;
      $type = 1 if (&get("LM:$set:binlm") ||
                    &backoff_and_get("LM:$set:lm-binarizer"));
      $type = 5 if (&get("LM:$set:rlm") ||
                    &backoff_and_get("LM:$set:rlm-training") ||
                    &backoff_and_get("LM:$set:lm-randomizer"));
      $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));

      # output factor the LM is applied to (0 unless factors are configured)
      my $factor = 0;
      if (&backoff_and_get("TRAINING:output-factors") &&
          &backoff_and_get("LM:$set:factors")) {
        $factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
      }

      $cmd .= "-lm $factor:$order:$lm_file:$type ";
    }
  }

  if ($feature_lines) {
    $cmd .= "-config-add-feature-lines \"$feature_lines\" ";
  }
  if ($weight_lines) {
    $cmd .= "-config-add-weight-lines \"$weight_lines\" ";
  }

  if (@additional_ini_files) {
    $cmd .= "-additional-ini-file " . join(":", @additional_ini_files);
  }

  &create_step($step_id,$cmd);
}
|
|
|
# Define the step that interpolates language models.
#
# One call of the interpolation script is generated for every
# (factor, order) combination that has more than one LM set.  Optional
# settings: INTERPOLATED-LM:weights ("factor:order:set = weight" entries
# separated by commas) and INTERPOLATED-LM:group (set names, rewritten below
# into positions within the combination).
#
# Dies on a malformed or missing weight and if nothing is to be interpolated.
sub define_interpolated_lm_interpolate {
  my ($step_id) = @_;

  my ($interpolated_lm, $interpolation_script, $tuning, @LM)
    = &get_output_and_input($step_id);
  my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
  my $group = &get("INTERPOLATED-LM:group");
  my $weights = &get("INTERPOLATED-LM:weights");
  my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");

  my $cmd = "";

  # parse the optional fixed weights
  my %WEIGHT;
  if (defined($weights)) {
    foreach my $weight_spec (split(/ *, */,$weights)) {
      $weight_spec =~ /^ *(\S+) *= *(\S+)/
        || die("ERROR: wrong interpolation weight specification $weight_spec ($weights)");
      $WEIGHT{$1} = $2;
    }
  }

  my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
  foreach my $factor (keys %{$ILM_SETS}) {
    foreach my $order (keys %{$$ILM_SETS{$factor}}) {
      my @id_sets = @{$$ILM_SETS{$factor}{$order}};
      next unless scalar(@id_sets) > 1;

      # comma-separated LM files and (if given) their weights
      my (@lm_files, @set_weights);
      foreach my $id_set (@id_sets) {
        my ($id,$set) = split(/ /,$id_set,2);
        push @lm_files, $LM[$id] if $LM[$id];
        next unless defined($weights);
        die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)")
          unless defined($WEIGHT{"$factor:$order:$set"});
        push @set_weights, $WEIGHT{"$factor:$order:$set"};
      }
      my $lm_list = join(",", @lm_files);
      my $weight_list = join(",", @set_weights);

      # replace the set names of the group definition by their position
      # in this combination; names unknown here are dropped
      my $numbered_string = "";
      if (defined($group)) {
        my %POSITION;
        foreach my $id_set (@id_sets) {
          my ($id,$set) = split(/ /,$id_set,2);
          $POSITION{$set} = scalar keys %POSITION;
        }
        my $group_string = $group;
        $group_string =~ s/\s+/ /g;
        $group_string =~ s/ *, */,/g;
        $group_string =~ s/^ //;
        $group_string =~ s/ $//;
        $group_string .= " ";
        while (my ($set_name, $separator, $rest) = $group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
          if (defined($POSITION{$set_name})) {
            $numbered_string .= $POSITION{$set_name}.$separator;
          }
          $group_string = $rest;
        }
        # drop the final separator
        chop($numbered_string);
      }

      my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
      my $name = $interpolated_lm;
      if ($icount > 1) {
        $name .= ".$$FACTOR[$factor]" if defined($FACTOR);
        $name .= ".order$order";
      }

      # with output factors, tune on a corpus reduced to the LM's factor
      my $factored_tuning = $tuning;
      if (&backoff_and_get("TRAINING:output-factors")) {
        $factored_tuning = "$tuning.factor$factor";
        $cmd .= "$scripts/training/reduce-factors.perl --corpus $tuning --reduced $factored_tuning --factor $factor\n";
      }

      $cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
      $cmd .= " --group \"$numbered_string\"" if defined($group);
      $cmd .= " --weights \"$weight_list\"" if defined($weights);
      $cmd .= "\n";
    }
  }

  die("ERROR: Nothing to interpolate, remove interpolation step!") if $cmd eq "";
  &create_step($step_id,$cmd);
}
|
|
|
# Define a post-processing step for the interpolated language model(s).
#
# The tool is taken from the setting "INTERPOLATED-LM:lm-<stepname>r".  One
# "tool input output" line is generated per (factor, order) combination:
# combinations with several LM sets use the interpolated LM names, while a
# combination with a single set processes that set's own trained LM.
sub define_interpolated_lm_process {
  my ($step_id) = @_;

  my ($processed_lm, $interpolated_lm) = &get_output_and_input($step_id);
  my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
  my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r");
  my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");

  my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
  my $cmd = "";
  foreach my $factor (keys %{$ILM_SETS}) {
    foreach my $order (keys %{$$ILM_SETS{$factor}}) {
      my ($name,$name_processed);
      if (scalar(@{$$ILM_SETS{$factor}{$order}}) != 1) {
        # same suffix scheme as used when the LMs were interpolated
        my $suffix = "";
        if ($icount > 1) {
          $suffix = ".$$FACTOR[$factor]" if defined($FACTOR);
          $suffix .= ".order$order";
        }
        $name = "$interpolated_lm$suffix";
        $name_processed = "$processed_lm$suffix";
      }
      else {
        # only one set: nothing was interpolated, use the set's own LM and
        # put the set name in front of the file name of the LM step output
        my($id,$set) = split(/ /,$$ILM_SETS{$factor}{$order}[0]);
        $name = &get_default_file("LM",$set,"train");
        $name_processed = $STEP_OUTNAME{"LM:$stepname"};
        $name_processed =~ s/^(.+\/)([^\/]+)$/$1$set.$2/;
        $name_processed = &versionize(&long_file_name($name_processed,"lm",""));
      }
      $cmd .= "$tool $name $name_processed\n";
    }
  }

  &create_step($step_id,$cmd);
}
|
|
|
# List the file names of the processed interpolated language models, one per
# (factor, order) combination returned by get_interpolated_lm_sets().
#
# Combinations with several LM sets get a ".<factor name>.order<N>" suffix
# only if there is more than one such combination.  A combination with a
# single set always gets ".order<N>", preceded by ".<factor name>" when
# output factors are configured.
#
# The single-set case used to test $FACTOR the wrong way round (it looked up
# the factor name exactly when no factors were configured) and produced
# names with doubled dots; it now follows the suffix scheme of the first case.
#
# Parameters:
#   $processed_lm - base name of the processed LM
# Returns the list of names.
sub get_interpolated_lm_processed_names {
  my ($processed_lm) = @_;
  my @ILM_NAME;
  my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
  my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
  foreach my $factor (keys %{$ILM_SETS}) {
    foreach my $order (keys %{$$ILM_SETS{$factor}}) {
      my $suffix = "";
      if (scalar(@{$$ILM_SETS{$factor}{$order}}) > 1) {
        $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
        $suffix .= ".order$order" if $icount > 1;
      }
      else {
        # only dereference the factor list when it exists
        $suffix = ".$$FACTOR[$factor]" if defined($FACTOR);
        $suffix .= ".order$order";
      }
      push @ILM_NAME,"$processed_lm$suffix";
    }
  }
  return @ILM_NAME;
}
|
|
|
# Group the LM sets that take part in interpolation by output factor and
# n-gram order.
#
# Sets flagged exclude-from-interpolation, bilingual-lm or nplm are skipped.
# Returns ($icount, \%ILM_SETS): %ILM_SETS maps factor -> order -> list of
# "<index> <set name>" strings, where the index counts the sets that were
# not skipped; $icount is the number of (factor, order) groups that contain
# at least two sets.
sub get_interpolated_lm_sets {
  my %ILM_SETS;

  my @LM_SETS = &get_sets("LM");
  my %OUTPUT_FACTORS;
  if (&backoff_and_get("TRAINING:output-factors")) {
    %OUTPUT_FACTORS = &get_factor_id("output");
  }

  my ($count, $icount) = (0, 0);
  foreach my $set (@LM_SETS) {
    next if &get("LM:$set:exclude-from-interpolation");
    next if &get("LM:$set:bilingual-lm");
    next if &get("LM:$set:nplm");
    my $order = &check_backoff_and_get("LM:$set:order");

    # factor 0 unless both output factors and LM factors are configured
    my $factor = 0;
    if (&backoff_and_get("TRAINING:output-factors") &&
        &backoff_and_get("LM:$set:factors")) {
      $factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
    }

    my $members = ($ILM_SETS{$factor}{$order} ||= []);
    push @{$members}, ($count++)." ".$set;
    # a group counts once, at the moment its second member arrives
    $icount++ if scalar(@{$members}) == 2;
  }
  return ($icount,\%ILM_SETS);
}
|
|
|
# Assemble the command line (TRAINING:script plus options) for one
# training step.
#
# Parameter:
#   $step - training step number; "-first-step" is emitted when it is
#           greater than 1 and "-last-step" when it is smaller than 9.
# Returns the command string, ending in a space.
# Exits (via the check_* accessors) when a required setting is missing and
# dies when reordering factors and lexicalized reordering are not
# configured together in a factored setup.
sub get_training_setting {
    my ($step) = @_;

    # The values of these two settings are not used here; the calls are kept
    # because they abort the run when the setting is missing.
    &check_and_get("GENERAL:working-dir");
    my $training_script = &check_and_get("TRAINING:script");
    my $external_bin_dir = &check_backoff_and_get("TRAINING:external-bin-dir");
    &check_backoff_and_get("TUNING:moses-script-dir");

    my $reordering = &get("TRAINING:lexicalized-reordering");
    my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
    my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
    my $alignment = &check_and_get("TRAINING:alignment-symmetrization-method");
    my $parts = &get("TRAINING:run-giza-in-parts");
    my $options = &get("TRAINING:training-options");
    my $phrase_length = &get("TRAINING:max-phrase-length");
    my $hierarchical = &get("TRAINING:hierarchical-rule-set");
    my $source_syntax = &get("GENERAL:input-parser");
    my $target_syntax = &get("GENERAL:output-parser");
    my $score_settings = &get("TRAINING:score-settings");
    my $parallel = &get("TRAINING:parallel");
    my $pcfg = &get("TRAINING:use-pcfg-feature");
    my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
    my $no_glue_grammar = &get("TRAINING:no-glue-grammar");
    my $mmsapt = &get("TRAINING:mmsapt");

    my $xml = $source_syntax || $target_syntax;

    my $cmd = "$training_script ";
    $cmd .= "$options " if defined($options);
    $cmd .= "-dont-zip ";
    $cmd .= "-first-step $step " if $step>1;
    $cmd .= "-last-step $step " if $step<9;
    $cmd .= "-external-bin-dir $external_bin_dir " if defined($external_bin_dir);
    $cmd .= "-f $input_extension -e $output_extension ";
    $cmd .= "-alignment $alignment ";
    $cmd .= "-max-phrase-length $phrase_length " if $phrase_length;
    $cmd .= "-parts $parts " if $parts;
    $cmd .= "-reordering $reordering " if $reordering;
    # site-specific: use a scratch disk when running on a "townhill" host
    $cmd .= "-temp-dir /disk/scratch2 " if `hostname` =~ /townhill/;
    $cmd .= "-hierarchical " if $hierarchical;
    $cmd .= "-xml " if $xml;
    $cmd .= "-target-syntax " if $target_syntax;
    $cmd .= "-source-syntax " if $source_syntax;
    $cmd .= "-glue-grammar " if $hierarchical && (!defined($no_glue_grammar) || $no_glue_grammar eq "false");
    $cmd .= "-score-options '".$score_settings."' " if $score_settings;
    $cmd .= "-parallel " if $parallel;
    $cmd .= "-pcfg " if $pcfg;
    $cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
    $cmd .= "-mmsapt " if defined($mmsapt);

    # factored translation model settings
    if (&backoff_and_get("TRAINING:input-factors")) {
        my %IN = &get_factor_id("input");
        my %OUT = &get_factor_id("output");
        $cmd .= "-input-factor-max ".((scalar keys %IN)-1)." ";
        $cmd .= "-alignment-factors ".
            &encode_factor_definition("alignment-factors",\%IN,\%OUT)." ";
        $cmd .= "-translation-factors ".
            &encode_factor_definition("translation-factors",\%IN,\%OUT)." ";
        $cmd .= "-reordering-factors ".
            &encode_factor_definition("reordering-factors",\%IN,\%OUT)." "
            if &get("TRAINING:reordering-factors");
        $cmd .= "-generation-factors ".
            &encode_factor_definition("generation-factors",\%OUT,\%OUT)." "
            if &get("TRAINING:generation-factors");

        # The reordering model type is read from TRAINING:lexicalized-reordering,
        # so that is the setting the message has to name.
        die("ERROR: define either both TRAINING:reordering-factors and TRAINING:lexicalized-reordering or neither")
            if (( &get("TRAINING:reordering-factors") && ! $reordering) ||
                (! &get("TRAINING:reordering-factors") && $reordering));

        my $decoding_steps = &check_and_get("TRAINING:decoding-steps");
        $decoding_steps =~ s/\s+//g;
        $cmd .= "-decoding-steps $decoding_steps ";
        my $generation_type = &get("TRAINING:generation-type");
        $cmd .= "-generation-type $generation_type " if $generation_type;
    }

    return $cmd;
}
|
|
|
# Build the "-<table> <file> " switches for a model table.
#
# Parameters:
#   $factor  - name of the factor definition setting (e.g. the string
#              compared against "generation-factors" below)
#   $table   - switch / setting name of the table
#   $default - default table file name
# Without TRAINING:input-factors a single switch with $default is returned.
# Otherwise one switch per factor mapping is returned, named
# "$default.<mapping>" unless TRAINING:$table lists explicit names.
sub get_table_name_settings {
    my ($factor,$table,$default) = @_;
    # value unused; aborts when the working directory is not configured
    &check_and_get("GENERAL:working-dir");

    return "-$table $default " unless &backoff_and_get("TRAINING:input-factors");

    # default names, one per factor mapping
    my %IN = &get_factor_id("input");
    my %OUT = &get_factor_id("output");
    %IN = %OUT if $factor eq "generation-factors";
    my $factors = &encode_factor_definition($factor,\%IN,\%OUT);
    my @NAME = map { "$default.$_" } split(/\+/,$factors);

    # explicitly specified names override the leading defaults
    if (&get("TRAINING:$table")) {
        my @SPECIFIED_NAME = @{$CONFIG{"TRAINING:$table"}};
        die("ERROR: specified more ${table}s than $factor")
            if (scalar @SPECIFIED_NAME) > (scalar @NAME);
        @NAME[0 .. $#SPECIFIED_NAME] = @SPECIFIED_NAME;
    }

    # undef (not "") when there is no name at all, as before
    return @NAME ? join("", map { "-$table $_ " } @NAME) : undef;
}
|
|
|
# Map the factor names listed in TRAINING:<type>-factors to numeric ids.
#
# Parameter: $type - prefix of the "-factors" setting (callers in this
#            file pass "input" or "output").
# Returns a hash (as a list) from factor name to id; each id is the
# number of distinct names seen before the assignment.
sub get_factor_id {
    my ($type) = @_;
    my $FACTOR = &check_backoff_and_get_array("TRAINING:$type-factors");
    my %ID = ();
    for my $name (@$FACTOR) {
        my $next_id = keys %ID;
        $ID{$name} = $next_id;
    }
    return %ID;
}
|
|
|
# Translate a factor mapping definition from names to ids.
#
# Parameters:
#   $parameter - name of the TRAINING setting holding the definition
#   $IN, $OUT  - hash refs mapping factor names to ids for the left and
#                right side of each "->" mapping
# The definition is a comma-separated list of "in -> out" mappings; each
# side is encoded by encode_factor_list().  Returns the mappings as
# "in-out" joined by "+", or undef when the definition has no mapping.
sub encode_factor_definition {
    my ($parameter,$IN,$OUT) = @_;
    my $definition = &check_and_get("TRAINING:$parameter");

    my @encoded;
    for my $mapping (split(/,\s*/,$definition)) {
        my ($in,$out) = split(/\s*->\s*/,$mapping);
        my $left  = &encode_factor_list($IN,$in);
        my $right = &encode_factor_list($OUT,$out);
        push @encoded, $left."-".$right;
    }
    return @encoded ? join("+",@encoded) : undef;
}
|
|
|
# Encode a "+"-separated list of factor names as comma-separated ids.
#
# Parameters:
#   $ID   - hash ref mapping factor name to id
#   $list - factor names separated by "+" (surrounding whitespace allowed)
# Returns the ids joined by ",", or undef for an empty list.
# Dies when a name has no entry in $ID.
sub encode_factor_list {
    my ($ID,$list) = @_;
    my @ids;
    for my $name (split(/\s*\+\s*/,$list)) {
        defined($ID->{$name})
            or die("ERROR: unknown factor type '$name'\n");
        push @ids, $ID->{$name};
    }
    return @ids ? join(",",@ids) : undef;
}
|
|
|
# Create the step script that filters the translation model for the input
# of tuning or of one evaluation set.
#
# Parameters:
#   $set     - evaluation set name; undef selects the tuning settings
#   $step_id - index of the step in @DO_STEP
# The script is written through create_step().
sub define_tuningevaluation_filter {
    my ($set,$step_id) = @_;
    my $scripts = &check_and_get("GENERAL:moses-script-dir");
    my $dir = &check_and_get("GENERAL:working-dir");
    my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");
    my $tuning_flag = !defined($set);
    my $hierarchical = &get("TRAINING:hierarchical-rule-set");

    my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains) = &get_output_and_input($step_id);

    # Settings come from TUNING or from EVALUATION:$set.  The variables are
    # declared unconditionally: a "my" under a statement modifier
    # ("my \$x = ... unless ...") has undefined behaviour in Perl.
    my ($binarizer,$input_filter,$settings);
    if ($tuning_flag) {
        $binarizer    = &backoff_and_get("TUNING:ttable-binarizer");
        $input_filter = &get("TUNING:input-filter");
        $settings     = &backoff_and_get("TUNING:filter-settings");
    }
    else {
        $binarizer    = &backoff_and_get("EVALUATION:$set:ttable-binarizer");
        $input_filter = &get("EVALUATION:$set:input-filter");
        $settings     = &backoff_and_get("EVALUATION:$set:filter-settings");
    }
    # filter against the step's own input unless a separate file is configured
    $input_filter = $input unless $input_filter;
    $settings = "" unless $settings;

    $binarizer .= " -no-alignment-info" if defined ($binarizer) && !$hierarchical && defined $word_alignment && $word_alignment eq "no";

    $settings .= " -Binarizer \"$binarizer\"" if $binarizer;
    $settings .= " --Hierarchical" if $hierarchical;

    # suffix array extraction settings
    my $sa_exec_dir = &get("TRAINING:suffix-array");
    my $sa_extractors = &get("GENERAL:sa_extractors");
    $sa_extractors = 1 unless $sa_extractors;

    # pick the configuration file to filter, or build a temporary one
    my ($config,$cmd,$delete_config);
    if (&get("TUNING:config-with-reused-weights")) {
        $config = &get("TUNING:config-with-reused-weights");
    }
    elsif (&get("TRAINING:config")) {
        $config = &get("TRAINING:config");
    }
    else {
        $config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
        $cmd = "touch $config\n";
        $delete_config = 1;

        $cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains);

        if (&get("TRAINING:in-decoding-transliteration")) {
            $cmd .= "-transliteration-phrase-table $dir/model/transliteration-phrase-table.$VERSION ";
        }

        $cmd .= "-lm 0:3:$config:8\n";
    }

    if ($sa_exec_dir) {
        # suffix array: extract rules for the input, then point the
        # configuration at the extraction directory
        $cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir $sa_extractors \n";

        my $escaped_filter_dir = $filter_dir;
        $escaped_filter_dir =~ s/\//\\\\\//g;
        $cmd .= "cat $config | sed s/10\\ 0\\ 0\\ 7.*/10\\ 0\\ 0\\ 7\\ $escaped_filter_dir/g > $filter_dir/moses.ini \n";

        $cmd .= "sed -i 's%path=$phrase_translation_table%path=$filter_dir%' $filter_dir/moses.ini\n";
    }
    else {
        # regular case: filter the model tables given the input
        $cmd .= "$scripts/training/filter-model-given-input.pl";
        $cmd .= " $filter_dir $config $input_filter $settings\n";
    }

    # remove the temporary configuration file again
    $cmd .= "rm $config" if $delete_config;

    &create_step($step_id,$cmd);
}
|
|
|
# Create the step script that decodes one evaluation set.
#
# Parameters:
#   $set     - evaluation set name
#   $step_id - index of the step in @DO_STEP
# With EVALUATION:$set:jobs > 1 on a cluster the decoder is run through
# moses-parallel; otherwise it is called directly, optionally on a model
# copy produced by cache-model.perl (GENERAL:cache-model).
# Side effect: when EVALUATION:$set:add-tags is set, that command is run
# immediately (not in the step script) to create the tagged input file.
sub define_evaluation_decode {
    my ($set,$step_id) = @_;
    my $scripts = &check_and_get("GENERAL:moses-script-dir");
    my $dir = &check_and_get("GENERAL:working-dir");

    my ($system_output,
        $config,$input,$filtered_config) = &get_output_and_input($step_id);
    $config = $filtered_config if $filtered_config;

    my $jobs = &backoff_and_get("EVALUATION:$set:jobs");
    my $decoder = &check_backoff_and_get("EVALUATION:$set:decoder");
    my $settings = &backoff_and_get("EVALUATION:$set:decoder-settings");
    $settings = "" unless $settings;
    my $nbest = &backoff_and_get("EVALUATION:$set:nbest");
    my $moses_parallel = &backoff_and_get("EVALUATION:$set:moses-parallel");
    my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
    my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");
    my $report_precision_by_coverage = &backoff_and_get("EVALUATION:$set:report-precision-by-coverage");
    my $use_wade = &backoff_and_get("EVALUATION:$set:wade");
    my $hierarchical = &get("TRAINING:hierarchical-rule-set");
    my $post_decoding_transliteration = &get("TRAINING:post-decoding-transliteration");

    # extra decoder switches needed by the analysis options
    if (defined($report_precision_by_coverage) && $report_precision_by_coverage eq "yes") {
        $settings .= " -alignment-output-file $system_output.wa";
        $report_segmentation = "yes";
    }
    if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
        $settings .= " -unpruned-search-graph -include-lhs-in-search-graph -osg $system_output.graph";
    }
    if (defined($report_segmentation) && $report_segmentation eq "yes") {
        if ($hierarchical) {
            $settings .= " -T $system_output.trace";
        }
        else {
            $settings .= " -t";
        }
    }
    if ($use_wade) {
        $settings .= " -T $system_output.details";
    }
    $settings .= " -text-type \"test\"";

    # optionally add tags to the input (executed right now)
    my $addTags = &backoff_and_get("EVALUATION:$set:add-tags");
    if ($addTags) {
        my $input_with_tags = $input.".".$VERSION.".tags";
        `$addTags < $input > $input_with_tags`;
        $input = $input_with_tags;
    }

    my $on_cluster = $jobs && $jobs>1 && $CLUSTER;

    # cache the model on local disk (only when decoding directly)
    my $cmd;
    my $cache_model = &backoff_and_get("GENERAL:cache-model");
    if (defined($cache_model) && !$on_cluster) {
        $cmd = "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`\n";
        $config = "\$MOSES_INI";
    }

    # n-best list size with everything but digits removed, used in the file name
    my $nbest_size;
    if ($nbest) {
        ($nbest_size = $nbest) =~ s/[^\d]//g;
    }

    if ($on_cluster) {
        $cmd .= "mkdir -p $dir/evaluation/tmp.$set.$VERSION\n";
        $cmd .= "cd $dir/evaluation/tmp.$set.$VERSION\n";
        if (defined $moses_parallel) {
            $cmd .= $moses_parallel;
        } else {
            $cmd .= "$scripts/generic/moses-parallel.pl";
        }
        my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
        $cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args);
        $cmd .= " -decoder $decoder";
        $cmd .= " -config $config";
        $cmd .= " -cache-model $cache_model" if defined($cache_model);
        $cmd .= " -input-file $input";
        $cmd .= " --jobs $jobs";
        $cmd .= " -decoder-parameters \"$settings\" > $system_output";
        $cmd .= " -n-best-file $system_output.best$nbest_size -n-best-size $nbest" if $nbest;
    }
    else {
        # Append, do not assign: $cmd may already hold the MOSES_INI=...
        # line that defines the $MOSES_INI used as $config.
        $cmd .= "$decoder $settings -v 0 -f $config < $input > $system_output";
        $cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest;
    }

    # list of unknown words for post-decoding transliteration
    $cmd .= " -output-unknowns $system_output.oov"
        if defined($post_decoding_transliteration) && $post_decoding_transliteration eq "yes";

    &create_step($step_id,$cmd);
}
|
|
|
# Create the step script that runs the analysis script
# (EVALUATION:$set:analysis) on system output, reference and input.
#
# Parameters:
#   $set     - evaluation set name
#   $step_id - index of the step in @DO_STEP
sub define_evaluation_analysis {
    my ($set,$step_id) = @_;

    my ($analysis,$output,$reference,$input) = &get_output_and_input($step_id);
    my $script = &backoff_and_get("EVALUATION:$set:analysis");
    my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
    my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");

    my @parts = ("$script -system $output -reference $reference -input $input -dir $analysis");

    # both optional inputs are derived from the decoder output file name
    if (defined($report_segmentation) && $report_segmentation eq "yes") {
        my $segmentation_file = &get_default_file("EVALUATION",$set,"decode");
        push @parts, "-segmentation $segmentation_file";
    }
    if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
        my $search_graph_file = &get_default_file("EVALUATION",$set,"decode");
        push @parts, "-search-graph $search_graph_file.graph";
    }
    push @parts, "-hierarchical" if &get("TRAINING:hierarchical-rule-set");

    &create_step($step_id,join(" ",@parts));
}
|
|
|
# Create the step script for the precision-by-coverage analysis of one
# evaluation set.
#
# Parameters:
#   $set     - evaluation set name
#   $step_id - index of the step in @DO_STEP
# In a factored setup the translation table is the one whose mapping
# starts at input factor 0: either the explicitly specified table at that
# position or the default table name extended by the mapping.
sub define_evaluation_analysis_precision {
    my ($set,$step_id) = @_;

    my ($analysis,
        $output,$reference,$input,$corpus,$ttable,$coverage) = &get_output_and_input($step_id);
    my $script = &backoff_and_get("EVALUATION:$set:analysis");
    my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
    my $coverage_base = &backoff_and_get("EVALUATION:$set:precision-by-coverage-base");
    my $cmd = "$script -system $output -reference $reference -input $input -dir $analysis -precision-by-coverage";

    my $segmentation_file = &get_default_file("EVALUATION",$set,"decode");
    $cmd .= " -segmentation $segmentation_file";
    $cmd .= " -system-alignment $segmentation_file.wa";
    # an explicitly configured coverage base replaces the step input
    $coverage = $coverage_base if defined($coverage_base);
    $cmd .= " -coverage $coverage";

    # get table with surface factors
    if (&backoff_and_get("TRAINING:input-factors")) {
        my %IN = &get_factor_id("input");
        my %OUT = &get_factor_id("output");
        my $factors = &encode_factor_definition("translation-factors",\%IN,\%OUT);
        my @FACTOR = split(/\+/,$factors);
        my @SPECIFIED_NAME;
        if (&backoff_and_get("TRAINING:sigtest-filter-phrase-translation-table")) {
            @SPECIFIED_NAME = @{$CONFIG{"TRAINING:sigtest-filter-phrase-translation-table"}};
        }
        elsif (&backoff_and_get("TRAINING:phrase-translation-table")) {
            @SPECIFIED_NAME = @{$CONFIG{"TRAINING:phrase-translation-table"}};
        }
        # iterate over the already split mappings instead of re-splitting
        # $factors in scalar context for every loop test
        for my $i (0 .. $#FACTOR) {
            next unless $FACTOR[$i] =~ /^0-/;
            if (scalar(@SPECIFIED_NAME) > $i) {
                $ttable = $SPECIFIED_NAME[$i];
            }
            else {
                $ttable .= ".".$FACTOR[$i];
            }
            last;
        }
        my $subreport = &backoff_and_get("EVALUATION:precision-by-coverage-factor");
        if (defined($subreport)) {
            die("unknown factor $subreport specified in EVALUATION:precision-by-coverage-factor") unless defined($IN{$subreport});
            $cmd .= " -precision-by-coverage-factor ".$IN{$subreport};
        }
    }
    $cmd .= " -ttable $ttable -input-corpus $corpus.$input_extension";

    &create_step($step_id,$cmd);
}
|
|
|
# Create the step script for the coverage analysis of one evaluation set.
#
# Parameters:
#   $set     - evaluation set name
#   $step_id - index of the step in @DO_STEP
# Without input factors a single "-ttable" is passed.  With factors, the
# factor names, one "-factored-ttable" per translation mapping and a
# "-ttable" for the first mapping whose input side is factor 0 are passed.
sub define_evaluation_analysis_coverage {
    my ($set,$step_id) = @_;

    my ($analysis,
        $input,$corpus,$ttable) = &get_output_and_input($step_id);
    my $script = &backoff_and_get("EVALUATION:$set:analysis");
    my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
    my $score_settings = &get("TRAINING:score-settings");

    my $ttable_config;

    # translation tables for un-factored
    if (!&backoff_and_get("TRAINING:input-factors")) {
        $ttable_config = "-ttable $ttable";
    }
    # translation tables for factored
    else {
        my %IN = &get_factor_id("input");
        my %OUT = &get_factor_id("output");

        # List the names in factor id order: "keys" returns them in an
        # arbitrary order that can differ between runs.
        my @in_names  = sort { $IN{$a}  <=> $IN{$b}  } keys %IN;
        my @out_names = sort { $OUT{$a} <=> $OUT{$b} } keys %OUT;

        $ttable_config  = "-input-factors ".(scalar(@in_names));
        $ttable_config .= " -input-factor-names '".join(",",@in_names)."'";
        $ttable_config .= " -output-factor-names '".join(",",@out_names)."'";

        my $factors = &encode_factor_definition("translation-factors",\%IN,\%OUT);
        my @FACTOR = split(/\+/,$factors);
        my @SPECIFIED_NAME;
        if (&backoff_and_get("TRAINING:sigtest-filter-phrase-translation-table")) {
            @SPECIFIED_NAME = @{$CONFIG{"TRAINING:sigtest-filter-phrase-translation-table"}};
        }
        elsif (&backoff_and_get("TRAINING:phrase-translation-table")) {
            @SPECIFIED_NAME = @{$CONFIG{"TRAINING:phrase-translation-table"}};
        }
        my $surface_ttable;
        for my $i (0 .. $#FACTOR) {
            # capture directly from the match so a stale $1 is never used
            my ($input_factors) = $FACTOR[$i] =~ /^([\d\,]+)/;

            my $ttable_name = $ttable.".".$FACTOR[$i];
            if (scalar(@SPECIFIED_NAME) > $i) {
                $ttable_name = $SPECIFIED_NAME[$i];
            }

            $ttable_config .= " -factored-ttable $input_factors:".$ttable_name;
            if ($input_factors eq "0" && !defined($surface_ttable)) {
                $surface_ttable = $ttable_name;
                $ttable_config .= " -ttable $surface_ttable";
            }
        }
    }

    my $cmd = "$script -input $input -input-corpus $corpus.$input_extension $ttable_config -dir $analysis";
    $cmd .= " -score-options '$score_settings'" if $score_settings;
    &create_step($step_id,$cmd);
}
|
|
|
# Create the step script that collects the scores of all parent steps
# into the report file using report-experiment-scores.perl.
#
# Parameter: $step_id - index of the step in @DO_STEP
sub define_reporting_report {
    my ($step_id) = @_;

    my $score_file = &get_default_file("REPORTING","","report");
    my $scripts = &check_and_get("GENERAL:moses-script-dir");

    my @words = ("$scripts/ems/support/report-experiment-scores.perl");

    # one "set=...,type=...,file=..." argument per parent step
    for my $parent (@{$DEPENDENCY[$step_id]}) {
        my ($parent_module,$parent_set,$parent_step)
            = &deconstruct_name($DO_STEP[$parent]);
        my $file = &get_default_file($parent_module,$parent_set,$parent_step);
        push @words, "set=$parent_set,type=$parent_step,file=$file";
    }

    # optional address passed on to the reporting script
    my $email = &get("REPORTING:email");
    push @words, "email='$email'" if $email;

    push @words, ">$score_file";

    &create_step($step_id,join(" ",@words));
}
|
|
|
|
|
|
|
# Determine the output file and the input files of a step.
#
# Parameter: $step_id - index of the step in @DO_STEP
# Returns ($output, @INPUT): the default output file of the step followed
# by one entry per name in $USES_INPUT{$step_id}.  An input that is
# neither produced by a parent step nor present in %CONFIG yields "".
sub get_output_and_input {
    my ($step_id) = @_;

    my $step = $DO_STEP[$step_id];
    my $output = &get_default_file(&deconstruct_name($step));

    my @INPUT;
    return ($output,@INPUT) unless defined($USES_INPUT{$step_id});

    for my $wanted (@{$USES_INPUT{$step_id}}) {
        my $in_file = $wanted;

        # find the parent step whose output carries this name
        # (the last matching parent wins)
        my $prev_step = "";
        for my $parent (@{$DEPENDENCY[$step_id]}) {
            my ($parent_module,$parent_set,$parent_step)
                = &deconstruct_name($DO_STEP[$parent]);
            my $parent_file
                = &construct_name($parent_module,$parent_set,
                                  $STEP_OUT{&defined_step($DO_STEP[$parent])});
            $prev_step = $DO_STEP[$parent] if $in_file eq $parent_file;
        }

        # neither produced by a parent nor specified: keep an empty slot
        unless ($prev_step ne "" || defined($CONFIG{$in_file})) {
            push @INPUT,"";
            next;
        }

        # specified file if configured, otherwise the parent's default output
        push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
                                                   &deconstruct_name($prev_step));
    }
    return ($output,@INPUT);
}
|
|
|
# Create the step script for a step that is defined by a command template
# (%TEMPLATE) or by a list of conditional templates (%TEMPLATE_IF).
#
# Parameter: $step_id - index of the step in @DO_STEP
# Returns 0 when the step has neither kind of template, 1 after the step
# script has been written through create_step().
# Dies when a parallelized command lacks the EMS_IN_EMS / EMS_OUT_EMS
# placeholders or when the template needs inputs but the step has none.
sub define_template {
my ($step_id) = @_;

my $step = $DO_STEP[$step_id];
print "building sh file for $step\n" if $VERBOSE;
my $defined_step = &defined_step($step);
return 0 unless (defined($TEMPLATE {$defined_step}) ||
defined($TEMPLATE_IF{$defined_step}));

my $parallelizer = &get("GENERAL:generic-parallelizer");
my $dir = &check_and_get("GENERAL:working-dir");

my ($module,$set,$stepname) = &deconstruct_name($step);

# the multi-reference wrapper is used only when the step declares one
# and the local "multiref" setting is switched on
my $multiref = undef;
if ($MULTIREF{$defined_step} &&
&backoff_and_get(&extend_local_name($module,$set,"multiref"))) {
$multiref = $MULTIREF{$defined_step};
}

my ($output,@INPUT) = &get_output_and_input($step_id);

# build the raw command: either the plain template, or one line per
# conditional template entry (command,in,out,extras...): the command is
# used when the setting of that name is configured, otherwise the input
# is just symlinked to the output
my $cmd;
if (defined($TEMPLATE{$defined_step})) {
$cmd = $TEMPLATE{$defined_step};
}
else {
foreach my $template_if (@{$TEMPLATE_IF{$defined_step}}) {
my ($command,$in,$out,@EXTRA) = @{$template_if};
my $extra = join(" ",@EXTRA);

if (&backoff_and_get(&extend_local_name($module,$set,$command))) {
$cmd .= "\$$command < $in > $out $extra\n";
}
else {
$cmd .= "ln -s $in $out\n";
}
}
}

# wrap each command line in a call to the generic parallelizer when the
# step is marked parallelizable and jobs (cluster) or cores (multicore)
# are configured for the module
if ($parallelizer && defined($PARALLELIZE{$defined_step}) &&
((&get("$module:jobs") && $CLUSTER) ||
(&get("$module:cores") && $MULTICORE))) {
my $new_cmd;
my $i=0;
foreach my $single_cmd (split(/\n/,$cmd)) {
# symlink lines are passed through unchanged
if ($single_cmd =~ /^ln /) {
$new_cmd .= $single_cmd."\n";
}
elsif ($single_cmd =~ /^.+$/) {
# remember the input/output placeholders (including any text glued to
# them), then replace them in the command by %s
$single_cmd =~ /(EMS_IN_EMS\S*)/
|| die("ERROR: could not find EMS_IN_EMS in $single_cmd");
my $in = $1;
$single_cmd =~ /(EMS_OUT_EMS\S*)/
|| die("ERROR: could not find OUT in $single_cmd");
my $out = $1;

$single_cmd =~ s/EMS_IN_EMS\S*/\%s/;
$single_cmd =~ s/EMS_OUT_EMS\S*/\%s/;
$single_cmd =~ s/EMS_SLASH_OUT_EMS\S*/\%s/;

# temporary directory: lower-cased module name plus a per-line counter
my $tmp_dir = $module;
$tmp_dir =~ tr/A-Z/a-z/;
$tmp_dir .= "/tmp.$set.$stepname.$VERSION-".($i++);
# NOTE(review): the two tests below are independent, so a line is
# emitted twice when both $CLUSTER and $MULTICORE are set -- confirm
if ($CLUSTER) {
my $qflags = "";
my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
$qflags="--queue-flags \"$qsub_args\"" if ($CLUSTER && $qsub_args);
$new_cmd .= "$parallelizer $qflags -in $in -out $out -cmd '$single_cmd' -jobs ".&get("$module:jobs")." -tmpdir $dir/$tmp_dir\n";
}
if ($MULTICORE) {
$new_cmd .= "$parallelizer -in $in -out $out -cmd '$single_cmd' -cores ".&get("$module:cores")." -tmpdir $dir/$tmp_dir\n";
}
}
}

$cmd = $new_cmd;
$QSUB_STEP{$step_id}++;
}

# multiple references: hand the command (with mref-input-file and
# mref-output-file in place of the placeholders) to the multiref wrapper;
# the two patterns cover both orders of the placeholders
if (defined($multiref)) {
$cmd =~ s/^(.*)EMS_IN_EMS (.+)EMS_OUT_EMS(.*)$/$multiref '$1 mref-input-file $2 mref-output-file $3' EMS_IN_EMS EMS_OUT_EMS/;
$cmd =~ s/^(.+)EMS_OUT_EMS(.+)EMS_IN_EMS (.*)$/$multiref '$1 mref-output-file $2 mref-input-file $3' EMS_IN_EMS EMS_OUT_EMS/;
}

# input is array, but just specified as IN:
# the first EMS_IN_EMS receives all inputs, separated by spaces
if ($cmd !~ /EMS_IN1_EMS/ && (scalar @INPUT) > 1 ) {
my $in = join(" ",@INPUT);
$cmd =~ s/EMS_IN_EMS/$in/;
}

# input is defined as IN or IN0, IN1, IN2 (single digit index into @INPUT)
else {
if ($cmd =~ /EMS_IN\d*_EMS/ && scalar(@INPUT) == 0) {
die("ERROR: Step $step requires input from prior steps, but none defined.");
}
$cmd =~ s/EMS_IN(\d)_EMS/$INPUT[$1]/g;
$cmd =~ s/EMS_IN_EMS/$INPUT[0]/g;
}
$cmd =~ s/EMS_OUT_EMS/$output/g;
if (defined($STEP_TMPNAME{"$module:$stepname"})) {
my $tmp = $dir."/".$STEP_TMPNAME{"$module:$stepname"}.".$VERSION";
$cmd =~ s/EMS_TMP_EMS/$tmp/g;
}
# every literal "VERSION" in the command becomes the run number
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;

# replace variables ($name, ${name}) by the value of the corresponding
# setting; the optional form ($?name, $?{name}) becomes "" when the
# setting is missing, the plain form aborts through check_backoff_and_get
while ($cmd =~ /^([\S\s]*)\$(\??)\{([a-z][^\s\/\"\']*)\}([\S\s]*)$/i ||
$cmd =~ /^([\S\s]*)\$(\??)([a-z][^\s\/\"\']*)([\S\s]*)$/i) {
my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
my $value;
if ($optional eq '?') {
$value = &backoff_and_get(&extend_local_name($module,$set,$variable));
$value = "" unless $value;
}
else {
$value = &check_backoff_and_get(&extend_local_name($module,$set,$variable));
}
$cmd = $pre.$value.$post;
}

# deal with pipelined commands: an unescaped input redirection found
# after a pipe is moved in front of it; escaped "\<" then becomes "<"
$cmd =~ s/\|(.*[^\\])(\<\s*\S+) /$2 \| $1 /g;
$cmd =~ s/\\\</\</g;

# deal with gzipped input, line by line: read through zcat when the
# redirected file is missing but "<file>.gz" exists, or when the
# redirected file name ends in .gz
my $c = "";
foreach my $cmd (split(/[\n\r]+/,$cmd)) {
if ($cmd =~ /\<\s*(\S+) / && ! -e $1 && -e "$1.gz") {
$cmd =~ s/([^\n\r]+)\s*\<\s*(\S+) /zcat $2.gz \| $1 /;
}
else {
$cmd =~ s/([^\n\r]+)\s*\<\s*(\S+\.gz)/zcat $2 \| $1/;
}
$c .= $cmd."\n";
}
$cmd = $c;

# create directory for output
if ($output =~ /\//) {
my $out_dir = $output;
$out_dir =~ s/^(.+)\/[^\/]+$/$1/;
$cmd = "mkdir -p $out_dir\n$cmd";
}

&create_step($step_id,$cmd);
return 1;
}
|
|
|
|
|
|
|
# Write the bash script of a step.
#
# Parameters:
#   $step_id - index of the step in @DO_STEP
#   $cmd     - shell commands forming the body of the script
# The script sets PATH to the current $ENV{PATH}, changes to the working
# directory, creates the module's sub directory, runs $cmd and finally
# touches "<script>.DONE".
# Dies when the script file cannot be opened or written completely.
sub create_step {
    my ($step_id,$cmd) = @_;
    my ($module,$set,$step) = &deconstruct_name($DO_STEP[$step_id]);
    my $file = &versionize(&step_file2($module,$set,$step));
    my $dir = &check_and_get("GENERAL:working-dir");

    # sub directory: lower-cased module name, with two special cases
    my $subdir = $module;
    $subdir =~ tr/A-Z/a-z/;
    $subdir = "evaluation" if $subdir eq "reporting";
    $subdir = "lm" if $subdir eq "interpolated-lm";

    # three-argument open with a lexical handle: the file name is taken
    # literally and the handle cannot clash with other code
    open(my $STEP, '>', $file) or die "Cannot open $file: $!";
    print $STEP "#!/bin/bash\n\n";
    print $STEP "PATH=\"".$ENV{"PATH"}."\"\n";
    print $STEP "cd $dir\n";
    print $STEP "echo 'starting at '`date`' on '`hostname`\n";
    print $STEP "mkdir -p $dir/$subdir\n\n";
    print $STEP "$cmd\n\n";
    print $STEP "echo 'finished at '`date`\n";
    print $STEP "touch $file.DONE\n";
    # buffered write errors only show up at close
    close($STEP) or die "Cannot close $file: $!";
}
|
|
|
# Look up a setting without backoff; a missing setting is not an error.
# Parameter: name of the setting.
# Returns whatever check_and_get() returns in "allow_undef" mode.
sub get {
    my ($parameter) = @_;
    return &check_and_get($parameter,"allow_undef");
}
|
|
|
# Look up a setting in %CONFIG without backoff.
#
# Parameters:
#   $parameter   - full name of the setting
#   $allow_undef - when true, a missing setting returns nothing instead of
#                  ending the program
# Returns the first value of the setting.
# A missing required setting prints an error to STDERR and exits with
# status 1, so that callers of the program can detect the failure.
sub check_and_get {
    my ($parameter,$allow_undef) = @_;
    return $CONFIG{$parameter}[0] if defined($CONFIG{$parameter});

    return if $allow_undef;
    print STDERR "ERROR: you need to define $parameter\n";
    exit(1);
}
|
|
|
# Look up a setting with backoff; a missing setting is not an error.
# Parameter: name of the setting.
# Returns whatever check_backoff_and_get() returns in "allow_undef" mode.
sub backoff_and_get {
    my ($parameter) = @_;
    return &check_backoff_and_get($parameter,"allow_undef");
}
|
|
|
# Look up a setting with backoff and return its first value.
# All arguments are handed on to check_backoff_and_get_array(); when that
# yields nothing, the (false) result is returned unchanged.
sub check_backoff_and_get {
    my $values = &check_backoff_and_get_array(@_);
    return $values unless $values;
    return $values->[0];
}
|
|
|
# Look up all values of a setting with backoff; a missing setting is not
# an error.
# Parameter: name of the setting.
# Returns whatever check_backoff_and_get_array() returns in "allow_undef"
# mode.
sub backoff_and_get_array {
    my ($parameter) = @_;
    return &check_backoff_and_get_array($parameter,"allow_undef");
}
|
|
|
# Look up all values of a setting, backing off to more general names.
#
# Parameters:
#   $parameter   - name of the setting, "MODULE:...:name"
#   $allow_undef - when true, a missing setting returns nothing instead of
#                  ending the program
# Lookup order: the name as given; the name with its first middle part
# removed; the name with a second middle part removed (if there is one);
# the name with the module replaced by GENERAL.
# Returns the array reference stored in %CONFIG for the first name found.
# A missing required setting prints an error to STDERR and exits with
# status 1.
sub check_backoff_and_get_array {
    my ($parameter,$allow_undef) = @_;
    my $requested = $parameter;
    return $CONFIG{$parameter} if defined($CONFIG{$parameter});

    # remove set -> find setting for module
    $parameter =~ s/:[^:]+:/:/;
    return $CONFIG{$parameter} if defined($CONFIG{$parameter});

    # remove a further middle part, if the name still has one
    if ($parameter =~ /:[^:]+:/) {
        $parameter =~ s/:[^:]+:/:/;
        return $CONFIG{$parameter} if defined($CONFIG{$parameter});
    }

    # remove module -> find setting in GENERAL
    $parameter =~ s/^[^:]+:/GENERAL:/;
    return $CONFIG{$parameter} if defined($CONFIG{$parameter});

    return if $allow_undef;
    # name the setting that was actually asked for, and the most general
    # place where it may be defined instead
    my $hint = $requested eq $parameter ? "" : " (or a more general setting, up to $parameter)";
    print STDERR "ERROR: you need to define $requested$hint\n";
    exit(1);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
# Return the file for an input: the configured file if the setting
# (first three arguments: module, set, parameter) exists in %CONFIG,
# expanded with long_file_name(); otherwise the default output file of
# the step given by the last three arguments (module, set, step).
sub get_specified_or_default_file {
    my ($spec_module,$spec_set,$spec_parameter,
        $fallback_module,$fallback_set,$fallback_step) = @_;

    my $setting = &construct_name($spec_module,$spec_set,$spec_parameter);

    unless (defined($CONFIG{$setting})) {
        return &get_default_file($fallback_module,$fallback_set,$fallback_step);
    }

    my $configured = $CONFIG{$setting}[0];
    print "\t\texpanding $configured\n" if $VERBOSE;
    return &long_file_name($configured,$fallback_module,$fallback_set);
}
|
|
|
# Return the versioned temporary file name of a step.
#
# Parameters:
#   $module, $set, $step - identify the step; the base name comes from
#                          $STEP_TMPNAME{"$module:$step"}
#   $version             - run number; defaults to $VERSION when false
# With a set, the set name is inserted in front of the file's base name.
sub get_tmp_file {
    my ($module,$set,$step,$version) = @_;
    $version = $VERSION unless $version;

    my $name = $STEP_TMPNAME{"$module:$step"};
    $name =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g if $set;

    my $full_name = &long_file_name($name,$module,$set);
    return &versionize($full_name, $version);
}
|
|
|
# Return the default (versioned) output file of a step.
#
# Parameters: module, set and step name of the step.
# Steps that have an entry in %PASS produce no file of their own: the
# first dependency is followed instead, and when such a step has no
# dependency the file is taken from the setting named by the step's first
# input.  That input may list alternatives separated by "=OR="; all
# alternatives are first tried as exact settings, then with backoff.
# Dies when no alternative is configured or when the step has no default
# output name.
sub get_default_file {
    my ($default_module, $default_set, $default_step) = @_;

    # get step name
    my $step = &construct_name($default_module,$default_set,$default_step);

    # earlier step, if this step is passed
    my $i = $STEP_LOOKUP{$step};

    while (defined($PASS{$i})) {
        if (scalar @{$DEPENDENCY[$i]} == 0) {
            # no dependency left: the file has to be specified in the configuration
            my $out = $STEP_IN{&defined_step($step)}[0];
            my ($module,$set) = &deconstruct_name($step);
            my @out_options = split(/=OR=/,$out);

            # try each alternative as an exact setting first; the name must
            # be built from the alternative, not from the whole "=OR=" list
            foreach my $out_option (@out_options) {
                my $name = &construct_name($module,$set,$out_option);
                return &get($name) if &get($name);
            }
            # then try each alternative with backoff
            foreach my $out_option (@out_options) {
                my $name = &construct_name($module,$set,$out_option);
                return &backoff_and_get($name) if &backoff_and_get($name);
            }
            die("something is wrong with $out\n");
        }

        # continue with the first dependency of the passed step
        $i = $DEPENDENCY[$i][0];
        $step = $DO_STEP[$i];

        ($default_module,$default_set,$default_step) = &deconstruct_name($step);
    }

    # get file name
    my $default = $STEP_OUTNAME{&defined_step($step)};

    die("no specified default name for $step") unless $default;

    # insert the set name in front of the file's base name
    if ($default_set) {
        $default =~ s/^(.+\/)([^\/]+)$/$1$default_set.$2/g;
    }

    # if from a step that is re-used, take the version recorded for it;
    # very large values become the wildcard "*"
    my $version = 0;
    $version = $RE_USE[$STEP_LOOKUP{$step}] if defined($STEP_LOOKUP{$step}) && $#RE_USE >= $STEP_LOOKUP{$step};
    $version = "*" if $version > 1e6;
    $version = $VERSION unless $version;

    return &versionize(&long_file_name($default,$default_module,$default_set),
                       $version);
}
|
|
|
# Expand a file name to a full path.
#
# Parameters:
#   $file   - file name
#   $module - module name; its lower-cased form is used as directory for
#             names without any "/"
#   $set    - set name, "" for none
# Names starting with "/" or containing " /" are returned unchanged.
# Otherwise the name is placed below "<MODULE>[:<set>]:working-dir" if
# that setting exists, else below GENERAL:working-dir.
sub long_file_name {
    my ($file,$module,$set) = @_;
    return $file if $file =~ /^\// || $file =~ / \//;

    unless ($file =~ /\//) {
        (my $subdir = $module) =~ tr/A-Z/a-z/;
        $file = "$subdir/$file";
    }

    my $set_part = $set ne "" ? ":$set" : "";
    my $local_dir_setting = $module . $set_part . ":working-dir";

    my $base = defined($CONFIG{$local_dir_setting})
        ? $CONFIG{$local_dir_setting}[0]
        : &check_and_get("GENERAL:working-dir");
    return $base."/".$file;
}
|
|
|
# Set $VERSION to the next free run number.
#
# $VERSION becomes 1 plus the largest all-digit entry name found in
# "<GENERAL:working-dir>/steps", or 1 when the working directory or its
# steps directory does not exist or cannot be read.
sub compute_version_number {
    my $dir = &check_and_get("GENERAL:working-dir");
    $VERSION = 1;
    return unless -e $dir;

    # Read the directory directly instead of piping from "find": no shell
    # is involved (safe for unusual path names) and a missing steps
    # directory is handled explicitly.
    opendir(my $dh, "$dir/steps") or return;
    for my $entry (readdir($dh)) {
        next unless $entry =~ /\A(\d+)\z/;
        $VERSION = $1 + 1 if $1 >= $VERSION;
    }
    closedir($dh);
}
|
|
|
# Return the path of a file inside the steps directory of a run.
# Parameters: $file - file name, $run - run number.
sub steps_file {
    my ($file,$run) = @_;
    my $path = "steps/$run/$file";
    return $path;
}
|
|