#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$
# Usage:
# mert-moses.pl <input-text> <references> <decoder-executable> <decoder.ini>

# For other options see below or run 'mert-moses.pl --help'

# Notes:
# The input text and the references should be raw text files, one sentence per line.
# The references argument can be a prefix, in which case the files <prefix>0,
# <prefix>1, etc. are used.

# Excerpts from revision history
# 15 Aug 2017 option added: --transform-decoded-file (Joerg Tiedemann)
# Sept 2011   multi-threaded mert (Barry Haddow)
# 3 Aug 2011  Added random directions, historic best, pairwise ranked (PK)
# Jul 2011    simplifications (Ondrej Bojar)
#             -- rely on moses' -show-weights instead of parsing moses.ini
#                ... so moses is also run once *before* mert starts, checking
#                    the model to some extent
#             -- got rid of the 'triples' mess;
#                use --range to supply bounds for random starting values:
#                --range tm:-3..3 --range lm:-3..3
# 5 Aug 2009  Handling of different reference length policies (shortest, average, closest) for BLEU
#             and case-sensitive/insensitive evaluation (Nicola Bertoldi)
# 5 Jun 2008  Forked previous version to support new mert implementation.
# 13 Feb 2007 Better handling of default values for lambda, now works with multiple
#             models and lexicalized reordering
# 11 Oct 2006 Handle different input types through parameter --inputtype=[0|1]
#             (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi)
# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table)
#             useful if binary phrase tables are used (Nicola Bertoldi)
# 28 Aug 2006 Use either closest or average or shortest (default) reference
#             length as effective reference length
#             Use either normalization or not (default) of texts (Nicola Bertoldi)
# 31 Jul 2006 move gzip run*.out to avoid failure with restarts
#             adding default paths
# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again)
# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar)
# 27 Jul 2006 adding the safesystem() function to handle process failure
# 22 Jul 2006
#             fixed a bug about handling relative path of configuration file (Nicola Bertoldi)
# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi)
# 18 Jul 2006 adapted for Moses and cleaned up (PK)
# 21 Jan 2005 unified various versions, thorough cleanup (DWC)
#             now indexing accumulated n-best list solely by feature vectors
# 14 Dec 2004 reimplemented find_threshold_points in C (NMD)
# 25 Oct 2004 Use either average or shortest (default) reference
#             length as effective reference length (DWC)
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
use File::Path;
use File::Spec;
use File::Copy qw(move);
use Cwd;

# Root of the helper scripts: the directory of this script with a trailing
# "/training" stripped, unless the SCRIPTS_ROOTDIR environment variable is set.
my $SCRIPTS_ROOTDIR = $RealBin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});

## We preserve this bit of comments to keep the traditional weight ranges.
my $minimum_required_change_in_weights = 0.00001; # stop if no lambda changes more than this

my $verbose = 0;
my $usage = 0; # request for --help

# If no working directory is specified, the default is `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file
my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
                           # the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 0 -> serial)
my $___CACHE_MODEL = undef; # if models need to be copied to local disk from NFS
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder
my $continue = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
my $___FILTER_PHRASE_TABLE = 1; # filter phrase table
my $___TRANSFORM_DECODED_FILE = 0; # transform decoded file before scoring (script or command)
my $___PREDICTABLE_SEEDS = 0;
my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009]
my $___RANDOM_DIRECTIONS = 0; # search in random directions only
my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008]
my $___RANDOM_RESTARTS = 20;
my $___RETURN_BEST_DEV = 0; # return the best weights according to dev, not the last

# Flags related to PRO (Hopkins & May, 2011)
my $___PAIRWISE_RANKED_OPTIMIZER = 0; # flag to enable PRO.
my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer
my $___HISTORIC_INTERPOLATION = 0; # interpolate optimized weights with previous iteration's weights [Hopkins&May,2011,5.4.3]
# MegaM's options for PRO optimization.
# TODO: Should we also add these values to options of this script?
my $megam_default_options = "-fvals -maxi 30 -nobias binary";

# Flags related to Batch MIRA (Cherry & Foster, 2012)
my $___BATCH_MIRA = 0; # flag to enable batch MIRA

# Hypergraph mira
my $___HG_MIRA = 0;

# Train phrase model mixture weights with PRO (Haddow, NAACL 2012)
my $__PROMIX_TRAINING = undef; # Location of main script (contrib/promix/main.py)
# The phrase tables. These should be gzip text format.
my @__PROMIX_TABLES;
# used to filter output
my $__REMOVE_SEGMENTATION = "$SCRIPTS_ROOTDIR/ems/support/remove-segmentation-markup.perl";

my $__THREADS = 0;

# Parameter for effective reference length when computing BLEU score
# Default is to use shortest reference
# Use "--shortest" to use shortest reference length
# Use "--average" to use average reference length
# Use "--closest" to use closest reference length
# Only one between --shortest, --average and --closest can be set
# If more than one choice the default (--shortest) is used
my $___SHORTEST = 0;
my $___AVERAGE = 0;
my $___CLOSEST = 0;

# Use "--nocase" to compute case-insensitive scores
my $___NOCASE = 0;

# Use "--nonorm" to non normalize translation before computing scores
my $___NONORM = 0;

# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
my $___INPUTTYPE;


my $mertdir = undef; # path to new mert directory
my $mertargs = undef; # args to pass through to mert & extractor
my $mertmertargs = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
my $proargs = undef; # args to pass through to pro only

# Args to pass through to batch mira only.  This flag is useful to
# change MIRA's hyperparameters such as regularization parameter C,
# BLEU decay factor, and the number of iterations of MIRA.
my $batch_mira_args = undef; # args to pass through to batch MIRA only (--batch-mira-args)

my $filtercmd = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
my $moses_parallel_cmd = undef;
my $old_sge = 0; # set by --old-sge: assume sge<6.0
my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef work on all features
                                  # (others are fixed to the starting values)
my $___RANGES = undef;
my $___USE_CONFIG_WEIGHTS_FIRST = 0; # use weights in configuration file for first iteration
my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1)
                                  # -1 means all previous, i.e. from iteration 1
                                  # 0 means no previous data, i.e. from actual iteration
                                  # 1 means 1 previous data , i.e. from the actual iteration and from the previous one
                                  # and so on
my $maximum_iterations = 25;

# Multiple instance parallelization
my $___MULTI_MOSES = "$SCRIPTS_ROOTDIR/generic/multi_moses.py";
my $___USE_MULTI_MOSES = undef;

# Simulated post-editing
my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
my $___DEV_SYMAL = undef;
my $dev_symal_abs = undef;
my $working_dir_abs = undef;

# Parse the command line.  Any unknown or malformed option aborts the script.
# NOTE(review): --rootdir only changes $SCRIPTS_ROOTDIR from here on; the
# helper paths above that interpolate $SCRIPTS_ROOTDIR were already computed.
use Getopt::Long;
GetOptions(
  "working-dir=s" => \$___WORKING_DIR,
  "input=s" => \$___DEV_F,
  "inputtype=i" => \$___INPUTTYPE,
  "refs=s" => \$___DEV_E,
  "decoder=s" => \$___DECODER,
  "config=s" => \$___CONFIG,
  "nbest=i" => \$___N_BEST_LIST_SIZE,
  "lattice-samples=i" => \$___LATTICE_SAMPLES,
  "queue-flags=s" => \$queue_flags,
  "jobs=i" => \$___JOBS,
  "cache-model=s" => \$___CACHE_MODEL,
  "decoder-flags=s" => \$___DECODER_FLAGS,
  "continue" => \$continue,
  "skip-decoder" => \$skip_decoder,
  "shortest" => \$___SHORTEST,
  "average" => \$___AVERAGE,
  "closest" => \$___CLOSEST,
  "nocase" => \$___NOCASE,
  "nonorm" => \$___NONORM,
  "help" => \$usage,
  "verbose" => \$verbose,
  "mertdir=s" => \$mertdir,
  "mertargs=s" => \$mertargs,
  "extractorargs=s" => \$extractorargs,
  "proargs=s" => \$proargs,
  "mertmertargs=s" => \$mertmertargs,
  "rootdir=s" => \$SCRIPTS_ROOTDIR,
  "filtercmd=s" => \$filtercmd, # allow to override the default location
  "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets)
  "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location
  "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location
  "old-sge" => \$old_sge, # passed to moses-parallel
  "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow filtering of phrase tables
  "transform-decoded-file=s" => \$___TRANSFORM_DECODED_FILE, # transform decoded file to convert to standard word level format (script or command line)
  "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic
  "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points
  "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions
  "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions
  "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts
  "return-best-dev" => \$___RETURN_BEST_DEV, # return the best weights according to dev, not the last
  "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  "range=s@" => \$___RANGES,
  "use-config-weights-for-first-run" => \$___USE_CONFIG_WEIGHTS_FIRST, # use the weights in the configuration file when running the decoder for the first time
  "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, # number of previous step to consider when loading data (default =-1, i.e. all previous)
  "maximum-iterations=i" => \$maximum_iterations,
  "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
  "pro-starting-point" => \$___PRO_STARTING_POINT,
  "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
  "batch-mira" => \$___BATCH_MIRA,
  "hg-mira" => \$___HG_MIRA,
  "batch-mira-args=s" => \$batch_mira_args,
  "promix-training=s" => \$__PROMIX_TRAINING,
  "promix-table=s" => \@__PROMIX_TABLES,
  "threads=i" => \$__THREADS,
  "spe-symal=s" => \$___DEV_SYMAL,
  "multi-moses" => \$___USE_MULTI_MOSES
) or exit(1);

# the 4 required parameters can be supplied on the command line directly
# or using the --options
if (scalar @ARGV == 4) {
  # required parameters: input_file references_basename decoder_executable decoder_config
  $___DEV_F = shift;
  $___DEV_E = shift;
  $___DECODER = shift;
  $___CONFIG = shift;
}

# Print the usage text and stop when --help was given or a required
# parameter is missing.
if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) {
  print STDERR "usage: $0 input-text references decoder-executable decoder.ini
Options:
  --working-dir=mert-dir ... where all the files are created
  --nbest=100            ... how big nbestlist to generate
  --lattice-samples      ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
  --jobs=N               ... set this to anything to run moses in parallel
  --cache-model=STRING   ... local directory into which copy model before running decoder
  --mosesparallelcmd=STR ... use a different script instead of moses-parallel
  --queue-flags=STRING   ... anything you wish to pass to qsub, eg.
                             '-l ws06osssmt=true'. The default is: '-hard'
                             To reset the parameters, please use
                             --queue-flags=' '
                             (i.e. a space between the quotes).
  --decoder-flags=STRING ... extra parameters for the decoder
  --continue             ... continue from the last successful iteration
  --skip-decoder         ... skip the decoder run for the first time,
                             assuming that we got interrupted during
                             optimization
  --shortest --average --closest
                         ... Use shortest/average/closest reference length
                             as effective reference length (mutually exclusive)
  --nocase               ... Do not preserve case information; i.e.
                             case-insensitive evaluation (default is false).
  --nonorm               ... Do not use text normalization (flag is not active,
                             i.e. text is NOT normalized)
  --filtercmd=STRING     ... path to filter-model-given-input.pl
  --filterfile=STRING    ... path to alternative to input-text for filtering
                             model. useful for lattice decoding
  --rootdir=STRING       ... where do helpers reside (if not given explicitly)
  --mertdir=STRING       ... path to new mert implementation
  --mertargs=STRING      ... extra args for both extractor and mert
  --extractorargs=STRING ... extra args for extractor only
  --mertmertargs=STRING  ... extra args for mert only
  --old-sge              ... passed to parallelizers, assume Grid Engine < 6.0
  --inputtype=[0|1|2]    ... Handle different input types: (0 for text,
                             1 for confusion network, 2 for lattices,
                             default is 0)
  --no-filter-phrase-table
                         ... disallow filtering of phrase tables
                             (useful if binary phrase tables are available)
  --random-restarts=INT  ... number of random restarts (default: 20)
  --predictable-seeds    ... provide predictable seeds to mert so that random
                             restarts are the same on every run
  --range=tm:0..1,-1..1  ... specify min and max value for some features
                             --range can be repeated as needed.
                             The order of the various --range specifications
                             is important only within a feature name.
                             E.g.:
                               --range=tm:0..1,-1..1 --range=tm:0..2
                             is identical to:
                               --range=tm:0..1,-1..1,0..2
                             but not to:
                               --range=tm:0..2 --range=tm:0..1,-1..1
  --activate-features=STRING
                         ... comma-separated list of features to optimize,
                             others are fixed to the starting values
                             default: optimize all features
                             example: tm_0,tm_4,d_0
  --prev-aggregate-nbestlist=INT
                         ... number of previous step to consider when
                             loading data (default = $prev_aggregate_nbl_size)
                             -1 means all previous, i.e. from iteration 1
                              0 means no previous data, i.e. only the
                                current iteration
                              N means this and N previous iterations
  --maximum-iterations=ITERS
                         ... Maximum number of iterations.
                             Default: $maximum_iterations
  --return-best-dev      ... Return the weights according to dev bleu,
                             instead of returning the last iteration
  --random-directions    ... search only in random directions
  --number-of-random-directions=int
                         ... number of random directions
                             (also works with regular optimizer, default: 0)
  --pairwise-ranked      ... Use PRO for optimisation (Hopkins and May, emnlp 2011)
  --pro-starting-point   ... Use PRO to get a starting point for MERT
  --batch-mira           ... Use Batch MIRA for optimisation
                             (Cherry and Foster, NAACL 2012)
  --hg-mira              ... Use hypergraph MIRA, ie batch mira with
                             hypergraphs instead of kbests.
  --batch-mira-args=STRING
                         ... args to pass through to batch/hg MIRA.
                             This flag is useful to change MIRA's
                             hyperparameters such as regularization
                             parameter C, BLEU decay factor, and the
                             number of iterations of MIRA.
  --promix-training=STRING
                         ... PRO-based mixture model training
                             (Haddow, NAACL 2013)
  --promix-table=STRING  ... Phrase table for PRO-based mixture model
                             training. Repeat the option once per table
                             (at least 2 tables are required).
  --threads=NUMBER       ... Use multi-threaded mert (must be compiled in).
  --historic-interpolation=FLOAT
                         ... Interpolate optimized weights with prior
                             iterations' weight (parameter sets factor
                             [0;1] given to current weights)
  --spe-symal=SYMAL      ... Use simulated post-editing when decoding.
                             (SYMAL aligns input to refs)
  --multi-moses          ... Use multiple instances of moses instead of
                             threads for decoding (Use with
                             --decoder-flags='-threads N' to get N
                             instances, each of which uses a single
                             thread (overrides threads in moses.ini))
  --transform-decoded-file=STRING
                         ... transform n-best list before scoring,
                             STRING='bpe', script or command for transforming
";
  exit 1;
}


# Check validity of input parameters and set defaults if needed

print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# path of script for filtering phrase tables and running the decoder
$filtercmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "training", "filter-model-given-input.pl") if !defined $filtercmd;

# --filtercmd may carry arguments after the script path (the promix check
# below requires it to mention "Binarizer"), so only the first
# space-separated word is tested for executability.
my ($just_cmd_filtercmd, $x) = split(/ /, $filtercmd);

# WHY ... ! ___FILTER_PHRASE_TABLE ??? This doesn't make sense! [UG]
# if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
if ( ! -x $just_cmd_filtercmd && $___FILTER_PHRASE_TABLE) {
  warn "Filtering command not found: $filtercmd.";
  warn "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table";
  exit 1;
}

$qsubwrapper = File::Spec->catfile($SCRIPTS_ROOTDIR, "generic", "qsub-wrapper.pl") if !defined $qsubwrapper;

$moses_parallel_cmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "generic", "moses-parallel.pl")
  if !defined $moses_parallel_cmd;

# Default mert directory: "bin" next to the scripts root.
if (!defined $mertdir) {
  $mertdir = File::Spec->catfile(File::Basename::dirname($SCRIPTS_ROOTDIR), "bin");
  die "mertdir does not exist: $mertdir" if ! -x $mertdir;
  print STDERR "Assuming --mertdir=$mertdir\n";
}

my $mert_extract_cmd = File::Spec->catfile($mertdir, "extractor");
my $mert_mert_cmd = File::Spec->catfile($mertdir, "mert");
my $mert_pro_cmd = File::Spec->catfile($mertdir, "pro");
my $mert_mira_cmd = File::Spec->catfile($mertdir, "kbmira");
my $mert_eval_cmd = File::Spec->catfile($mertdir, "evaluator");

die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;
die "Not executable: $mert_mira_cmd" if ! -x $mert_mira_cmd;
die "Not executable: $mert_eval_cmd" if ! -x $mert_eval_cmd;

my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set to your installation

# PRO needs megam; try to download it into $mertdir when it is missing.
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
  print "Could not find $pro_optimizer, installing it in $mertdir\n";
  my $megam_url = "http://hal3.name/megam";
  if (&is_mac_osx()) {
    die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
  }

  `cd $mertdir; wget $megam_url/megam_i686.opt.gz;`;
  `gunzip $pro_optimizer.gz`;
  `chmod +x $pro_optimizer`;
  die("ERROR: Installation of megam_i686.opt failed! Install by hand from $megam_url") unless -x $pro_optimizer;
}

if ($__PROMIX_TRAINING) {
  die "Not executable $__PROMIX_TRAINING" unless -x $__PROMIX_TRAINING;
  die "For promix training, specify the tables using --promix-table arguments" unless @__PROMIX_TABLES;
  die "For mixture model, need at least 2 tables" unless scalar(@__PROMIX_TABLES) > 1;

  for my $TABLE (@__PROMIX_TABLES) {
    die "Phrase table $TABLE not found" unless -r $TABLE;
  }
  die "To use promix training, need to specify a filter and binarisation command" unless $filtercmd =~ /Binarizer/;
}

# --batch-mira-args stands in for --mertargs when the latter is not given.
if (!defined $mertargs) {
  if (defined $batch_mira_args) {
    $mertargs = $batch_mira_args;
  }
  else {
    $mertargs = "";
  }
}

# Pull --scconfig out of the shared args; its commas become spaces so that
# further settings can be appended before it is re-joined with commas below.
my $scconfig = undef;
if ($mertargs =~ /\-\-scconfig(?:\s+|=)(.+?)(\s|$)/) {
  $scconfig = $1;
  $scconfig =~ s/\,/ /g;
  $mertargs =~ s/\-\-scconfig(?:\s+|=)(.+?)(\s|$)//;
}

# Pull --sctype out of the shared args (default scorer type: BLEU).
my $sctype = "--sctype BLEU";
if ($mertargs =~ /(\-\-sctype(?:\s+|=).+?)(\s|$)/) {
  $sctype = $1;
  $mertargs =~ s/(\-\-sctype(?:\s+|=)+.+?)(\s|$)//;
}

# handling reference length strategy
$scconfig .= &setup_reference_length_type();

# handling case-insensitive flag
$scconfig .= &setup_case_config();

$scconfig =~ s/^\s+//;
$scconfig =~ s/\s+$//;
$scconfig =~ s/\s+/,/g;

$scconfig = "--scconfig $scconfig" if ($scconfig);

my $mert_extract_args = "$sctype $scconfig";

$extractorargs = "" unless $extractorargs;
$mert_extract_args .= " $extractorargs";

$mertmertargs = "" if !defined $mertmertargs;

$proargs = "" unless $proargs;

my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
# The leading space keeps the scorer options from fusing with the last
# user-supplied mert argument.
$mert_mert_args .= " $sctype $scconfig";
if ($___ACTIVATE_FEATURES) {
  $mert_mert_args .= " -o \"$___ACTIVATE_FEATURES\"";
}

die "Not executable: $just_cmd_filtercmd" if $___FILTER_PHRASE_TABLE && ! -x $just_cmd_filtercmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;

my $input_abs = ensure_full_path($___DEV_F);
die "File not found: $___DEV_F (interpreted as $input_abs)." if ! -e $input_abs;
$___DEV_F = $input_abs;

# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";

my $decoder_abs = ensure_full_path($___DECODER);
die "File not executable: $___DECODER (interpreted as $decoder_abs)." if ! -x $decoder_abs;
$___DECODER = $decoder_abs;

my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exist and store a list of all references
my @references;
if (-e $ref_abs) {
  push @references, $ref_abs;
}
else {
  # if multiple file, get a full list of the files
  # (tries <refs>0, <refs>1, ...; falls back to <refs>.ref0, <refs>.ref1, ...)
  my $part = 0;
  if (! -e $ref_abs . "0" && -e $ref_abs . ".ref0") {
    $ref_abs .= ".ref";
  }
  while (-e $ref_abs . $part) {
    push @references, $ref_abs . $part;
    $part++;
  }
  die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part;
}

my $config_abs = ensure_full_path($___CONFIG);
die "File not found: $___CONFIG (interpreted as $config_abs)." if !
-e $config_abs; $___CONFIG = $config_abs; # moses should use our config if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / || $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / || $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / || $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / || $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / || $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / ) { die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; } # Paths needed for simulated post-editing $working_dir_abs = ensure_full_path($___WORKING_DIR); if (defined $___DEV_SYMAL) { $dev_symal_abs = ensure_full_path($___DEV_SYMAL); } # as weights are normalized in the next steps (by cmert) # normalize initial LAMBDAs, too my $need_to_normalize = 1; #store current directory and create the working directory (if needed) my $cwd = Cwd::getcwd(); mkpath($___WORKING_DIR); # open local scope { #chdir to the working directory chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR"; # fixed file names my $mert_outfile = "mert.out"; my $mert_logfile = "mert.log"; my $weights_in_file = "init.opt"; my $weights_out_file = "weights.txt"; my $finished_step_file = "finished_step.txt"; # set start run my $start_run = 1; my $bestpoint = undef; my $devbleu = undef; my $sparse_weights_file = undef; my $prev_feature_file = undef; my $prev_score_file = undef; my $prev_init_file = undef; my @allnbests; # If we're doing promix training, need to make sure the appropriate # tables are in place my @_PROMIX_TABLES_BIN; if ($__PROMIX_TRAINING) { print STDERR "Training mixture model using promix\n"; for (my $i = 0; $i < scalar(@__PROMIX_TABLES); ++$i) { # Create filtered, binarised tables my $filtered_config = "moses_$i.ini"; substitute_ttable($___CONFIG, $filtered_config, $__PROMIX_TABLES[$i]); #TODO: Remove reordering table from 
config, as we don't need to filter # and binarise it. my $filtered_path = "filtered_$i"; my $___FILTER_F = $___DEV_F; $___FILTER_F = $filterfile if (defined $filterfile); my $cmd = "$filtercmd ./$filtered_path $filtered_config $___FILTER_F"; &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err", 1); push (@_PROMIX_TABLES_BIN,"$filtered_path/phrase-table.0-0.1.1"); } } if ($___FILTER_PHRASE_TABLE) { my $outdir = "filtered"; if (-e "$outdir/moses.ini") { print STDERR "Assuming the tables are already filtered, reusing $outdir/moses.ini\n"; } else { # filter the phrase tables with respect to input, use --decoder-flags print STDERR "filtering the phrase tables... ".`date`; my $___FILTER_F = $___DEV_F; $___FILTER_F = $filterfile if (defined $filterfile); my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F"; &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err", 1); } # make a backup copy of startup ini filepath $___CONFIG_ORIG = $___CONFIG; # the decoder should now use the filtered model $___CONFIG = "$outdir/moses.ini"; } else{ # do not filter phrase tables (useful if binary phrase tables are available) # use the original configuration file $___CONFIG_ORIG = $___CONFIG; } # we run moses to check validity of moses.ini and to obtain all the feature # names my $featlist = get_featlist_from_moses($___CONFIG); $featlist = insert_ranges_to_featlist($featlist, $___RANGES); # Mark which features are disabled if (defined $___ACTIVATE_FEATURES) { $featlist->{"enabled"} = undef; my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES; my %cnt; for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) { my $name = $featlist->{"names"}->[$i]; $cnt{$name} = 0 if !defined $cnt{$name}; $featlist->{"enabled"}->[$i] = $enabled{$name . "_" . 
$cnt{$name}}; $cnt{$name}++; } } else { # all enabled for(my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) { $featlist->{"enabled"}->[$i] = 1; } } print STDERR "MERT starting values and ranges for random generation:\n"; for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) { my $name = $featlist->{"names"}->[$i]; my $val = $featlist->{"values"}->[$i]; my $min = $featlist->{"mins"}->[$i]; my $max = $featlist->{"maxs"}->[$i]; my $enabled = $featlist->{"enabled"}->[$i]; printf STDERR " %5s = %7.3f", $name, $val; if ($enabled) { printf STDERR " (%5.2f .. %5.2f)\n", $min, $max; } else { print STDERR " --- inactive, not optimized ---\n"; } } if ($continue) { # getting the last finished step print STDERR "Trying to continue an interrupted optimization.\n"; open my $fh, '<', $finished_step_file or die "$finished_step_file: $!"; my $step = <$fh>; chomp $step; close $fh; print STDERR "Last finished step is $step\n"; # getting the first needed step my $firststep; if ($prev_aggregate_nbl_size == -1) { $firststep = 1; } else { $firststep = $step - $prev_aggregate_nbl_size + 1; $firststep = ($firststep > 0) ? $firststep : 1; } #checking if all needed data are available if ($firststep <= $step) { print STDERR "First previous needed data index is $firststep\n"; print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n"; for (my $prevstep = $firststep; $prevstep <= $step; $prevstep++) { print STDERR "Checking whether data of step $prevstep are available\n"; if (! -e "run$prevstep.features.dat") { die "Can't start from step $step, because run$prevstep.features.dat was not found!"; } else { if (defined $prev_feature_file) { $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat"; } else { $prev_feature_file = "run$prevstep.features.dat"; } } if (! 
-e "run$prevstep.scores.dat") { die "Can't start from step $step, because run$prevstep.scores.dat was not found!"; } else { if (defined $prev_score_file) { $prev_score_file = "${prev_score_file},run$prevstep.scores.dat"; } else { $prev_score_file = "run$prevstep.scores.dat"; } } if (! -e "run$prevstep.${weights_in_file}") { die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!"; } else{ if (defined $prev_init_file) { $prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}"; } else{ $prev_init_file = "run$prevstep.${weights_in_file}"; } } } if (! -e "run$step.weights.txt") { die "Can't start from step $step, because run$step.weights.txt was not found!"; } if (! -e "run$step.$mert_logfile") { die "Can't start from step $step, because run$step.$mert_logfile was not found!"; } if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz") { die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"; } print STDERR "All needed data are available\n"; print STDERR "Loading information from last step ($step)\n"; my %dummy; # sparse features ($bestpoint, $devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile", scalar @{$featlist->{"names"}}, \%dummy); die "Failed to parse mert.log, missed Best point there." 
if !defined $bestpoint || !defined $devbleu; print "($step) BEST at $step $bestpoint => $devbleu at ".`date`; my @newweights = split /\s+/, $bestpoint; # Sanity check: order of lambdas must match if (!$___HG_MIRA) { sanity_check_order_of_lambdas($featlist, "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |"); } else { print STDERR "WARN: No sanity check of order of features in hypergraph mira\n"; } # update my cache of lambda values $featlist->{"values"} = \@newweights; } else { print STDERR "No previous data are needed\n"; } $start_run = $step + 1; } ###### MERT MAIN LOOP my $run = $start_run - 1; my $oldallsorted = undef; my $allsorted = undef; my $nbest_file = undef; my $lsamp_file = undef; # Lattice samples my $hypergraph_dir = undef; my $orig_nbest_file = undef; # replaced if lattice sampling # For mixture modelling my @promix_weights; my $num_mixed_phrase_features; my $interpolated_config; my $uninterpolated_config; # backup of config without interpolated ttable while (1) { $run++; if ($maximum_iterations && $run > $maximum_iterations) { print "Maximum number of iterations exceeded - stopping\n"; last; } print "run $run start at ".`date`; if ($__PROMIX_TRAINING) { # Need to create an ini file for the interpolated phrase table if (!@promix_weights) { # Create initial weights, distributing evenly between tables # total number of weights is 1 less than number of phrase features, multiplied # by the number of tables $num_mixed_phrase_features = (grep { $_ eq 'tm' } @{$featlist->{"names"}}) - 1; @promix_weights = (1.0/scalar(@__PROMIX_TABLES)) x ($num_mixed_phrase_features * scalar(@__PROMIX_TABLES)); } # backup orig config, so we always add the table into it $uninterpolated_config= $___CONFIG unless $uninterpolated_config; # Interpolation my $interpolated_phrase_table = "interpolate"; for my $itable (@_PROMIX_TABLES_BIN) { $interpolated_phrase_table .= " 1:$itable"; } # Create an ini file for the interpolated phrase table $interpolated_config 
="moses.interpolated.ini"; substitute_ttable($uninterpolated_config, $interpolated_config, $interpolated_phrase_table, "99"); # Append the multimodel weights open(ITABLE,">>$interpolated_config") || die "Failed to append weights to $interpolated_config"; print ITABLE "\n"; print ITABLE "[weight-t-multimodel]\n"; #for my $feature (0..($num_mixed_phrase_features-1)) { # for my $table (0..(scalar(@__PROMIX_TABLES)-1)) { # print ITABLE $promix_weights[$table * $num_mixed_phrase_features + $feature]; # print ITABLE "\n"; # } #} for my $iweight (@promix_weights) { print ITABLE $iweight . "\n"; } close ITABLE; # the decoder should now use the interpolated model $___CONFIG = "$interpolated_config"; } # run beamdecoder with option to output nbestlists # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists # In case something dies later, we might wish to have a copy create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined $devbleu ? $devbleu : "--not-estimated--"), $sparse_weights_file); # Save dense weights to simplify best dev recovery { my $densefile = "run$run.dense"; my @vals = @{$featlist->{"values"}}; my @names = @{$featlist->{"names"}}; open my $denseout, '>', $densefile or die "Can't write $densefile (WD now $___WORKING_DIR)"; for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) { print $denseout "$names[$i]= $vals[$i]\n"; } close $denseout; } # skip running the decoder if the user wanted if (! 
$skip_decoder) { print "($run) run decoder to produce n-best lists\n"; ($nbest_file, $lsamp_file, $hypergraph_dir) = run_decoder($featlist, $run, $need_to_normalize); $need_to_normalize = 0; if ($___LATTICE_SAMPLES) { my $combined_file = "$nbest_file.comb"; safesystem("sort -k1,1n $nbest_file $lsamp_file > $combined_file") or die("failed to merge nbest and lattice samples"); safesystem("gzip -f $nbest_file; gzip -f $lsamp_file") or die "Failed to gzip nbests and lattice samples"; $orig_nbest_file = "$nbest_file.gz"; $orig_nbest_file = "$nbest_file.gz"; $lsamp_file = "$lsamp_file.gz"; $lsamp_file = "$lsamp_file.gz"; $nbest_file = "$combined_file"; } ## tranforming decoded file (n-best list) if ($___TRANSFORM_DECODED_FILE) { my $trans_file=$nbest_file.".transformed"; ## original code by Anoop Kunchukuttan ##Command to run: cat $nbest_file | sed 's, ||| ,""""",g' | awk -F '"""""' 'BEGIN{OFS=" ||| "}{gsub (" ", "", $2);gsub ("\\^"," ",$2);print}' > $trans_file ## my $trans_cmd="cat $nbest_file | sed 's, ||| ,!!!!!,g' | awk -F '!!!!!' 
'BEGIN{OFS=\" ||| \"}{gsub (\" \", \"\", \$2);gsub (\"\\\\^\",\" \",\$2);print}' > $trans_file"; my $trans_cmd; ## check if the argument is a file or includes space ## if there are spaces then assume that it specifies the command to be run if ((-e $___TRANSFORM_DECODED_FILE) || ($___TRANSFORM_DECODED_FILE=~/ /)){ $trans_cmd="cat $nbest_file | $___TRANSFORM_DECODED_FILE > $trans_file"; } ## otherwise assume BPE markup with @@ else{ $trans_cmd="cat $nbest_file | sed 's/\@\@ //g' > $trans_file"; } print STDERR "Executing data transformation command: $trans_cmd \n"; safesystem($trans_cmd) or die "The data transformation failed \n"; move $trans_file, $nbest_file ; } safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out" unless $___HG_MIRA; $nbest_file = $nbest_file.".gz"; } else { $nbest_file = "run$run.best$___N_BEST_LIST_SIZE.out.gz"; print "skipped decoder run $run\n"; $skip_decoder = 0; $need_to_normalize = 0; } # extract score statistics and features from the nbest lists print STDERR "Scoring the nbestlist.\n"; my $base_feature_file = "features.dat"; my $base_score_file = "scores.dat"; my $feature_file = "run$run.${base_feature_file}"; my $score_file = "run$run.${base_score_file}"; my $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r " . join(",", @references) . " -n $nbest_file"; if (! $___HG_MIRA) { $cmd .= " -d" if $__PROMIX_TRAINING; # Allow duplicates # remove segmentation $cmd .= " -l $__REMOVE_SEGMENTATION" if $__PROMIX_TRAINING; $cmd = &create_extractor_script($cmd, $___WORKING_DIR); &submit_or_exec($cmd, "extract.out","extract.err", 1); } # Create the initial weights file for mert: init.opt my @MIN = @{$featlist->{"mins"}}; my @MAX = @{$featlist->{"maxs"}}; my @CURR = @{$featlist->{"values"}}; my @NAME = @{$featlist->{"names"}}; open my $out, '>', $weights_in_file or die "Can't write $weights_in_file (WD now $___WORKING_DIR)"; print $out join(" ", @CURR) . "\n"; print $out join(" ", @MIN) . 
"\n"; # this is where we could pass MINS print $out join(" ", @MAX) . "\n"; # this is where we could pass MAXS close $out; # print join(" ", @NAME)."\n"; # make a backup copy labelled with this run number safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; my $DIM = scalar(@CURR); # number of lambdas # run mert $cmd = "$mert_mert_cmd -d $DIM $mert_mert_args"; my $mert_settings = " -n $___RANDOM_RESTARTS"; my $seed_settings = ""; if ($___PREDICTABLE_SEEDS) { my $seed = $run * 1000; $seed_settings .= " -r $seed"; } $mert_settings .= $seed_settings; if ($___RANDOM_DIRECTIONS) { if ($___NUM_RANDOM_DIRECTIONS == 0) { $mert_settings .= " -m 50"; } $mert_settings .= " -t random-direction"; } if ($___NUM_RANDOM_DIRECTIONS) { $mert_settings .= " -m $___NUM_RANDOM_DIRECTIONS"; } if ($__THREADS) { $mert_settings .= " --threads $__THREADS"; } my $ffiles = ""; my $scfiles = ""; if (defined $prev_feature_file) { $ffiles = "$prev_feature_file,$feature_file"; } else{ $ffiles = "$feature_file"; } if (defined $prev_score_file) { $scfiles = "$prev_score_file,$score_file"; } else{ $scfiles = "$score_file"; } my $mira_settings = ""; if (($___BATCH_MIRA || $___HG_MIRA) && $batch_mira_args) { $mira_settings .= "$batch_mira_args "; } #$mira_settings .= " --dense-init run$run.$weights_in_file"; $mira_settings .= " --dense-init run$run.dense"; if (-e "run$run.sparse-weights") { $mira_settings .= " --sparse-init run$run.sparse-weights"; } my $file_settings = " --ffile $ffiles --scfile $scfiles"; my $pro_file_settings = "--ffile " . join(" --ffile ", split(/,/, $ffiles)) . " --scfile " . join(" --scfile ", split(/,/, $scfiles)); push @allnbests, $nbest_file; my $promix_file_settings = "--scfile " . join(" --scfile ", split(/,/, $scfiles)) . " --nbest " . 
join(" --nbest ", @allnbests); if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) { $file_settings .= " --ifile $prev_init_file,run$run.$weights_in_file"; } else { $file_settings .= " --ifile run$run.$weights_in_file"; } $cmd .= $file_settings; my %sparse_weights; # sparse features my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data"; if ($___PAIRWISE_RANKED_OPTIMIZER) { # pro optimization $cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd"; &submit_or_exec($cmd, $mert_outfile, $mert_logfile, 1); } elsif ($___PRO_STARTING_POINT) { # First, run pro, then mert # run pro... my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd"; &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err", 1); # ... get results ... ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@promix_weights); # Get the pro outputs ready for mert. Add the weight ranges, # and a weight and range for the single sparse feature $cmd =~ s/--ifile (\S+)/--ifile run$run.init.pro/; open(MERT_START,$1); open(PRO_START,">run$run.init.pro"); print PRO_START $bestpoint." 1\n"; my $mert_line = ; $mert_line = ; chomp $mert_line; print PRO_START $mert_line." 0\n"; $mert_line = ; chomp $mert_line; print PRO_START $mert_line." 1\n"; close(PRO_START); # Write the sparse weights to file so mert can use them open(SPARSE_WEIGHTS,">run$run.merge-weights"); foreach my $fname (keys %sparse_weights) { print SPARSE_WEIGHTS "$fname $sparse_weights{$fname}\n"; } close(SPARSE_WEIGHTS); $cmd = $cmd." --sparse-weights run$run.merge-weights"; # ... and run mert $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/; &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? 
$__THREADS : 1) ); } elsif ($___BATCH_MIRA) { # batch MIRA optimization safesystem("echo 'not used' > $weights_out_file") or die; $cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile"; &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1); } elsif ($___HG_MIRA) { safesystem("echo 'not used' > $weights_out_file") or die; $mira_settings .= " --type hypergraph "; $mira_settings .= join(" ", map {"--reference $_"} @references); $mira_settings .= " --hgdir $hypergraph_dir "; #$mira_settings .= "--verbose "; $cmd = "$mert_mira_cmd $mira_settings $seed_settings -o $mert_outfile"; &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1); } elsif ($__PROMIX_TRAINING) { # PRO trained mixture model safesystem("echo 'not used' > $weights_out_file") or die; $cmd = "$__PROMIX_TRAINING $promix_file_settings"; $cmd .= " -t mix "; $cmd .= join(" ", map {"-p $_"} @_PROMIX_TABLES_BIN); $cmd .= " -i $___DEV_F"; print "Starting promix optimisation at " . `date`; &submit_or_exec($cmd, "$mert_outfile", $mert_logfile, 1); print "Finished promix optimisation at " . `date`; } else { # just mert &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) ); } die "Optimization failed, file $weights_out_file does not exist or is empty" if ! -s $weights_out_file; # backup copies if (! 
$___HG_MIRA) { safesystem("\\cp -f extract.err run$run.extract.err") or die; safesystem("\\cp -f extract.out run$run.extract.out") or die; } safesystem("\\cp -f $mert_outfile run$run.$mert_outfile") or die; safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die; safesystem("touch $mert_logfile run$run.$mert_logfile") or die; safesystem("\\cp -f $weights_out_file run$run.$weights_out_file") or die; # this one is needed for restarts, too if ($__PROMIX_TRAINING) { safesystem("\\cp -f $interpolated_config run$run.$interpolated_config") or die; } print "run $run end at ".`date`; ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights,\@promix_weights); my $merge_weight = 0; if ($__PROMIX_TRAINING) { print "New mixture weights: " . join(" ", @promix_weights) . "\n"; } die "Failed to parse mert.log, missed Best point there." if !defined $bestpoint || !defined $devbleu; print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`; # update my cache of lambda values my @newweights = split /\s+/, $bestpoint; if ($___PRO_STARTING_POINT) { $merge_weight = pop @newweights; } # interpolate with prior's interation weight, if historic-interpolation is specified if ($___HISTORIC_INTERPOLATION>0 && $run>3) { my %historic_sparse_weights; if (-e "run$run.sparse-weights") { open my $sparse_fh, '<', "run$run.sparse-weights" or die "run$run.sparse-weights: $!"; while (<$sparse_fh>) { chop; my ($feature, $weight) = split; $historic_sparse_weights{$feature} = $weight; } close $sparse_fh; } my $prev = $run - 1; my @historic_weights = split /\s+/, `cat run$prev.$weights_out_file`; for(my $i = 0; $i < scalar(@newweights); $i++) { $newweights[$i] = $___HISTORIC_INTERPOLATION * $newweights[$i] + (1 - $___HISTORIC_INTERPOLATION) * $historic_weights[$i]; } print "interpolate with " . join(",", @historic_weights) . " to " . 
join(",", @newweights); foreach (keys %sparse_weights) { $sparse_weights{$_} *= $___HISTORIC_INTERPOLATION; #print STDERR "sparse_weights{$_} *= $___HISTORIC_INTERPOLATION -> $sparse_weights{$_}\n"; } foreach (keys %historic_sparse_weights) { $sparse_weights{$_} += (1 - $___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_}; #print STDERR "sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_} -> $sparse_weights{$_}\n"; } } if ($___HISTORIC_INTERPOLATION > 0) { open my $weights_fh, '>', "run$run.$weights_out_file" or die "run$run.$weights_out_file: $!"; print $weights_fh join(" ", @newweights); close $weights_fh; } $featlist->{"values"} = \@newweights; if (scalar keys %sparse_weights) { $sparse_weights_file = "run" . ($run + 1) . ".sparse-weights"; open my $sparse_fh, '>', $sparse_weights_file or die "$sparse_weights_file: $!"; foreach my $feature (keys %sparse_weights) { my $sparse_weight = $sparse_weights{$feature}; if ($___PRO_STARTING_POINT) { $sparse_weight *= $merge_weight; } print $sparse_fh "$feature $sparse_weight\n"; } close $sparse_fh; } ## additional stopping criterion: weights have not changed my $shouldstop = 1; for (my $i = 0; $i < @CURR; $i++) { die "Lost weight! mert reported fewer weights (@newweights) than we gave it (@CURR)" if !defined $newweights[$i]; if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) { $shouldstop = 0; last; } } &save_finished_step($finished_step_file, $run); if ($shouldstop) { print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n"; last; } my $firstrun; if ($prev_aggregate_nbl_size == -1) { $firstrun = 1; } else { $firstrun = $run - $prev_aggregate_nbl_size + 1; $firstrun = ($firstrun > 0) ? 
$firstrun : 1; } print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n"; $prev_feature_file = undef; $prev_score_file = undef; $prev_init_file = undef; for (my $i = $firstrun; $i <= $run; $i++) { if (defined $prev_feature_file) { $prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}"; } else { $prev_feature_file = "run${i}.${base_feature_file}"; } if (defined $prev_score_file) { $prev_score_file = "${prev_score_file},run${i}.${base_score_file}"; } else { $prev_score_file = "run${i}.${base_score_file}"; } if (defined $prev_init_file) { $prev_init_file = "${prev_init_file},run${i}.${weights_in_file}"; } else { $prev_init_file = "run${i}.${weights_in_file}"; } } print "loading data from $prev_feature_file\n" if defined($prev_feature_file); print "loading data from $prev_score_file\n" if defined($prev_score_file); print "loading data from $prev_init_file\n" if defined($prev_init_file); } if (defined $allsorted) { safesystem ("\\rm -f $allsorted") or die; } safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die; if($___RETURN_BEST_DEV) { my $bestit=1; my $bestbleu=0; my $evalout = "eval.out"; for (my $i = 1; $i < $run; $i++) { my $candidate; if ($___HG_MIRA) { die "File not found: run$i.out" unless -r "run$i.out"; $candidate = "--candidate run$i.out"; } else { die "File not found: run$i.best$___N_BEST_LIST_SIZE.out.gz" unless -r "run$i.best$___N_BEST_LIST_SIZE.out.gz"; $candidate = "--nbest run$i.best$___N_BEST_LIST_SIZE.out.gz"; } my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . 
" $mert_extract_args $candidate"; $cmd .= " -l $__REMOVE_SEGMENTATION" if defined( $__PROMIX_TRAINING); &submit_or_exec($cmd, $evalout, "/dev/null", 1); open my $fh, '<', $evalout or die "Can't read $evalout : $!"; my $bleu = <$fh>; chomp $bleu; if($bleu > $bestbleu) { $bestbleu = $bleu; $bestit = $i; } close $fh; } print "copying weights from best iteration ($bestit, bleu=$bestbleu) to moses.ini\n"; my $best_sparse_file = undef; if(defined $sparse_weights_file) { $best_sparse_file = "run$bestit.sparse-weights"; } my $best_featlist = get_featlist_from_file("run$bestit.dense"); $best_featlist->{"untuneables"} = $featlist->{"untuneables"}; $best_featlist->{"allcomponentsuntuneable"} = $featlist->{"allcomponentsuntuneable"}; $best_featlist->{"skippeduntuneablecomponents"} = $featlist->{"skippeduntuneablecomponents"}; create_config($___CONFIG_ORIG, "./moses.ini", $best_featlist, $bestit, $bestbleu, $best_sparse_file); } else { create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu, $sparse_weights_file); } # just to be sure that we have the really last finished step marked &save_finished_step($finished_step_file, $run); #chdir back to the original directory # useless, just to remind we were not there chdir($cwd); print "Training finished at " . 
`date`;

} # end of local scope

# Read the optimizer's result and return ($bestpoint, $devbleu).
#
#   $outfile        optimizer output with one weight per line ("F<idx> <val>",
#                   "M<i>_<j> <val>" or "<sparse_name> <val>")
#   $logfile        optimizer log (searched for "Best point:" / "Best BLEU =")
#   $weight_count   number of dense weights expected
#   $sparse_weights hash ref, filled with sparse feature weights
#   $mix_weights    array ref, emptied and refilled with mixture weights
#
# For PRO, batch/hypergraph MIRA and promix the weights are parsed from
# $outfile and both dense and sparse weights are divided by the sum of the
# absolute dense weights; BLEU is "unknown" unless a MIRA log reports it.
# Otherwise (plain mert) the first "Best point: ... => ..." line of $logfile is
# used, and both return values stay undef if no such line exists.
sub get_weights_from_mert {
  my ($outfile, $logfile, $weight_count, $sparse_weights, $mix_weights) = @_;
  my ($bestpoint, $devbleu);
  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)
      || $___BATCH_MIRA || $__PROMIX_TRAINING || $___HG_MIRA) {
    open my $fh, '<', $outfile or die "Can't open $outfile: $!";
    my @WEIGHT = (0) x $weight_count;
    @$mix_weights = ();
    my $sum = 0.0;
    while (<$fh>) {
      if (/^F(\d+) ([\-\.\de]+)/) {              # regular features
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      } elsif (/^M(\d+_\d+) ([\-\.\de]+)/) {     # mix weights
        push @$mix_weights, $2;
      } elsif (/^(.+_.+) ([\-\.\de]+)/) {        # sparse features
        $$sparse_weights{$1} = $2;
      }
    }
    close $fh;
    die "It seems feature values are invalid or unable to read $outfile." if $sum < 1e-09;

    $devbleu = "unknown";
    $_ /= $sum for @WEIGHT;
    $$sparse_weights{$_} /= $sum for keys %{$sparse_weights};
    $bestpoint = join(" ", @WEIGHT);

    if ($___BATCH_MIRA || $___HG_MIRA) {
      # the last "Best BLEU" line in the log wins
      open my $fh2, '<', $logfile or die "Can't open $logfile: $!";
      while (<$fh2>) {
        if (/Best BLEU = ([\-\d\.]+)/) {
          $devbleu = $1;
        }
      }
      close $fh2;
    }
  } else {
    open my $fh, '<', $logfile or die "Can't open $logfile: $!";
    while (<$fh>) {
      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu = $2;
        last;
      }
    }
    close $fh;
  }
  return ($bestpoint, $devbleu);
}

# Run the decoder for iteration $run with the weights in $featlist.
#
# Returns ($nbest_filename, $lattice_sample_filename, $hypergraph_dir); the
# lattice sample filename is undef unless lattice sampling is enabled.
# If $need_to_normalize is true, a local copy of the weights is scaled so the
# absolute values sum to 1 before being passed to the decoder.
# Dies if the decoder fails or (outside hypergraph MIRA) if the n-best list
# reports the scores in a different order than $featlist->{"names"}.
sub run_decoder {
  my ($featlist, $run, $need_to_normalize) = @_;
  my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
  my $filename = sprintf($filename_template, $run);
  my $hypergraph_dir = "hypergraph";
  my $lsamp_filename = undef;
  if ($___LATTICE_SAMPLES) {
    my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out";
    $lsamp_filename = sprintf($lsamp_filename_template, $run);
  }

  # user-supplied parameters
  print "params = $___DECODER_FLAGS\n";

  # parameters to set all model weights (to override moses.ini)
  my @vals = @{$featlist->{"values"}};
  if ($need_to_normalize) {
    print STDERR "Normalizing lambdas: @vals\n";
    my $totlambda = 0;
    $totlambda += abs($_) for @vals;
    die "Cannot normalize lambdas: all weights are zero" if $totlambda == 0;
    $_ /= $totlambda for @vals;
  }

  # moses now does not seem accept "-tm X -tm Y" but needs "-tm X Y"
  my %model_weights;
  my $valcnt = 0;   # tuneable values emitted so far for the current feature
  my $offset = 0;   # untuneable placeholders emitted so far for the current feature
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    if (!defined $model_weights{$name}) {
      $model_weights{$name} = "$name=";
      # skippeduntuneablecomponents is keyed by the position *within* the
      # feature, so both counters must restart with every new feature
      $valcnt = 0;
      $offset = 0;
      while (defined $featlist->{"skippeduntuneablecomponents"}->{$name}{$valcnt+$offset}) {
        #$model_weights{$name} .= sprintf " %.6f", $oldvalues{$name}{$valcnt+$offset};
        $model_weights{$name} .= " x";
        $offset++;
      }
    }
    $model_weights{$name} .= sprintf " %.6f", $vals[$i];
    $valcnt++;
    while (defined $featlist->{"skippeduntuneablecomponents"}->{$name}{$valcnt+$offset}) {
      #$model_weights{$name} .= sprintf " %.6f", $oldvalues{$name}{$valcnt+$offset};
      $model_weights{$name} .= " x";
      $offset++;
    }
  }

  my $decoder_config = "";
  $decoder_config = "-weight-overwrite '" . join(" ", values %model_weights) ."'"
    unless $___USE_CONFIG_WEIGHTS_FIRST && $run==1;
  $decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights";
  $decoder_config .= " -report-segmentation" if $__PROMIX_TRAINING;
  print STDERR "DECODER_CFG = $decoder_config\n";
  print "decoder_config = $decoder_config\n";

  # run the decoder
  my $decoder_cmd;
  my $lsamp_cmd = "";
  if ($___LATTICE_SAMPLES) {
    $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
  }

  if (defined $___JOBS && $___JOBS > 1) {
    die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
    $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
    $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $decoder_cmd .= " -cache-model $___CACHE_MODEL" if defined($___CACHE_MODEL);
    $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
  } else {
    my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
    if ($___HG_MIRA) {
      safesystem("rm -rf $hypergraph_dir");
      $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
    }
    $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
    $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F";
    if (defined $___USE_MULTI_MOSES) {
      # If requested, prefix full decoder command with multi-moses wrapper
      $decoder_cmd = "$___MULTI_MOSES $decoder_cmd";
    }
    if (defined $___DEV_SYMAL) {
      # If simulating post-editing, route command through moses_sim_pe.py
      # Always use single (first) reference.  Simulated post-editing undefined for multiple references.
      $decoder_cmd = "$___MOSES_SIM_PE $decoder_cmd -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs";
    }
    # stdout redirection is appended exactly once, whatever wrappers were added
    $decoder_cmd .= " > run$run.out";
  }

  print STDERR "Executing: $decoder_cmd \n";
  safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";

  if (!$___HG_MIRA) {
    sanity_check_order_of_lambdas($featlist, $filename);
  } else {
    print STDERR "WARN: No sanity check of order of features in hypergraph mira\n";
  }
  return ($filename, $lsamp_filename, $hypergraph_dir);
}

# Attach "mins" and "maxs" arrays to $featlist from --range options.
#
#   $featlist  hash ref with a "names" array
#   $ranges    array ref of strings like "tm:-3..3,0..1" (may be undef);
#              a pair without a "name:" prefix reuses the previous name
#
# The i-th pair given for a name applies to the i-th occurrence of that name;
# occurrences without a pair get the default range 0..1.
# Dies on a malformed min or max, or when no name precedes a pair.
# Returns $featlist (modified in place).
sub insert_ranges_to_featlist {
  my $featlist = shift;
  my $ranges = shift;

  $ranges = [] if !defined $ranges;

  # first collect the ranges from options
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
      }
      my ($min, $max) = split /\.\./, $namedpair;
      $min = "" if !defined $min;   # e.g. an empty pair
      $max = "" if !defined $max;   # e.g. "tm:3" without ".."
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/;
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # now populate featlist
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}

# Die unless the score labels found in the n-best list (a filename or a pipe
# expression) are exactly $featlist->{"names"}, in the same order.
sub sanity_check_order_of_lambdas {
  my $featlist = shift;
  my $filename_or_stream = shift;

  my @expected_lambdas = @{$featlist->{"names"}};
  my @got = get_order_of_scores_from_nbestlist($filename_or_stream);
  die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas"
    if "@got" ne "@expected_lambdas";
}

# Run moses with the given config file and return the list of features and
# their initial values (see get_featlist_from_file for the structure).
# A non-empty ./features.list is reused instead of running the decoder.
sub get_featlist_from_moses {
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn && ! -z $featlistfn) {   # exists & not empty
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "";
    if ($___CACHE_MODEL) {
      # let the shell resolve the cached ini and hand it to the decoder
      $cmd = "MOSES_INI=`$SCRIPTS_ROOTDIR/ems/support/cache-model.perl $configfn $___CACHE_MODEL` && ";
      $configfn = "\$MOSES_INI";
    }
    $cmd .= "$___DECODER $___DECODER_FLAGS -config $configfn";
    $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $cmd .= " -show-weights";
    if (defined $___USE_MULTI_MOSES) {
      # Pass moses command through multi-moses script to handle threads properly
      $cmd = "$___MULTI_MOSES $cmd";
    }
    print STDERR "Executing: $cmd\n";
    &submit_or_exec($cmd, $featlistfn, "/dev/null", 1);
  }
  return get_featlist_from_file($featlistfn);
}

# Parse a feature list file ("name= v1 v2 ..." or "name UNTUNEABLE" per line).
#
# Returns a hash ref with:
#   names / values               parallel arrays, one entry per tuneable value
#   untuneables                  names of features marked "UNTUNEABLE"
#   allcomponentsuntuneable      dense features without any tuneable value
#   skippeduntuneablecomponents  {name}{position} = 1 for each
#                                UNTUNEABLECOMPONENT placeholder
# Lines matching neither form are ignored. If a value is neither a number nor
# UNTUNEABLECOMPONENT, all such errors are reported and the script exits with
# status 1.
sub get_featlist_from_file {
  my $featlistfn = shift;
  # read feature list
  my @names = ();
  my @startvalues = ();
  my @untuneables = ();
  my @allcomponentsuntuneable = ();
  my %skippeduntuneablecomponents = ();
  open my $fh, '<', $featlistfn or die "Can't read $featlistfn : $!";
  my $nr = 0;
  my @errs = ();
  while (<$fh>) {
    $nr++;
    chomp;
    if (/^(\S+)= (.+)$/) { # only for feature functions with dense features
      my ($longname, $valuesStr) = ($1, $2);
      my @values = split(/ /, $valuesStr);
      my $valcnt = 0;    # position of the value within this feature
      my $hastuneablecomponent = 0;
      foreach my $value (@values) {
        if ($value =~ /^UNTUNEABLECOMPONENT$/) {
          $skippeduntuneablecomponents{$longname}{$valcnt} = 1;
          $valcnt++;
        } elsif ($value =~ /^[+-]?[0-9.\-e]+$/) {
          push @names, $longname;
          push @startvalues, $value;
          $valcnt++;
          $hastuneablecomponent = 1;
        } else {
          push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n";
        }
      }
      if (!$hastuneablecomponent) {
        push @allcomponentsuntuneable, $longname;
      }
    } elsif (/^(\S+) UNTUNEABLE$/) {
      push @untuneables, $1;
    }
  }
  close $fh;

  if (scalar @errs) {
    warn join("", @errs);
    exit 1;
  }
  return {
    "names"                       => \@names,
    "values"                      => \@startvalues,
    "untuneables"                 => \@untuneables,
    "allcomponentsuntuneable"     => \@allcomponentsuntuneable,
    "skippeduntuneablecomponents" => \%skippeduntuneablecomponents,
  };
}

# Read the first line and interpret the
#   ||| label: num num num label2: num |||
# column in nbestlist; return the score labels in order, one entry per dense
# score. Sparse features (tokens of the form name_part=) are ignored.
sub get_order_of_scores_from_nbestlist {
  my $fname_or_source = shift;
  # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n";
  # Two-argument open is deliberate: callers pass either a plain filename or
  # a pipe expression such as "gunzip -c < file.gz |".
  open my $fh, $fname_or_source
    or die "Failed to get order of scores from nbestlist '$fname_or_source': $!";
  my $line = <$fh>;
  close $fh;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;
  my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line;
  $scores = "" if !defined $scores;   # line without a scores column
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  my $sparse = 0; # we ignore sparse features here
  foreach my $tok (split /\s+/, $scores) {
    if ($tok =~ /.+_.+=/) {
      $sparse = 1;
    } elsif ($tok =~ /^([a-z][0-9a-z]*)=/i) {
      $label = $1;
    } elsif ($tok =~ /^-?[-0-9.\-e]+$/) {
      if (!$sparse) {
        # a score found, remember it
        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
          if !defined $label;
        push @order, $label;
      }
      $sparse = 0;
    } else {
      die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}

# Write a new moses.ini to $outfn by cloning $infn and overriding
#   * every section named in --decoder-flags (values taken from the flags),
#   * [weight-file] when $sparse_weights_file is defined,
#   * [weight], rebuilt from $featlist; untuneable components keep the value
#     they had in $infn and fully untuneable features are copied verbatim.
# $iteration and $bleu_achieved only appear in the header comment.
# Side effect: $___DECODER_FLAGS is whitespace-normalized in place.
sub create_config {
  # TODO: too many arguments. you might want to consider using hashes
  my $infn                = shift; # source config
  my $outfn               = shift; # where to save the config
  my $featlist            = shift; # the lambdas we should write
  my $iteration           = shift; # just for verbosity
  my $bleu_achieved       = shift; # just for verbosity
  my $sparse_weights_file = shift; # only defined when optimizing sparse features

  my @keep_weights = ();

  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $val = $featlist->{"values"}->[$i];
    # ensure long name
    print STDERR "featlist: $name=$val \n";
  }

  my %P; # the hash of all parameters we wish to override

  # first convert the command line parameters to the hash
  # ensure local scope of vars
  {
    my $parameter = undef;
    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
    # /g is required: without it only the leading whitespace was trimmed and
    # only the first whitespace run collapsed, yielding empty "values" below
    $___DECODER_FLAGS =~ s/^\s*|\s*$//g;
    $___DECODER_FLAGS =~ s/\s+/ /g;
    foreach (split(/ /, $___DECODER_FLAGS)) {
      if (/^\--?([^\d].*)$/) {
        $parameter = $1;
      } else {
        my $value = $_;
        die "Found value with no -paramname before it: $value" if !defined $parameter;
        push @{$P{$parameter}}, $value;
      }
    }
  }

  if (defined($sparse_weights_file)) {
    push @{$P{"weight-file"}}, File::Spec->catfile($___WORKING_DIR, $sparse_weights_file);
  }

  # create new moses.ini decoder config file by cloning and overriding the original one
  open my $ini_fh, '<', $infn or die "Can't read $infn: $!";
  delete($P{"config"}); # never output
  print "Saving new config to: $outfn\n";
  open my $out, '>', $outfn or die "Can't write $outfn: $!";
  print $out "# MERT optimized configuration\n";
  print $out "# decoder $___DECODER\n";
  print $out "# BLEU $bleu_achieved on dev $___DEV_F\n";
  print $out "# We were before running iteration $iteration\n";
  print $out "# finished ".`date`;

  my %oldvalues = ();

  my $line = <$ini_fh>;
  while (1) {
    last unless $line;

    # skip until hit [parameter]
    if ($line !~ /^\[(.+)\]\s*$/) {
      $line = <$ini_fh>;
      # $line is undef at end of file; the loop guard above then ends the scan
      print $out $line if defined $line && ($line =~ /^\#/ || $line =~ /^\s+$/);
      next;
    }

    # parameter name
    my $parameter = $1;

    if ($parameter eq "weight") {
      # leave weights 'til last. We're changing it
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
        if ($line =~ /^(\S+)= (.+)$/) {
          my ($longname, $valuesStr) = ($1, $2);
          for my $untuneable (@{$featlist->{"untuneables"}}) {
            # if weight is untuneable, copy it into new config
            push @keep_weights, $line if $longname eq $untuneable;
          }
          for my $frozen (@{$featlist->{"allcomponentsuntuneable"}}) {
            # if all dense weights are untuneable, copy it into new config
            push @keep_weights, $line if $longname eq $frozen;
          }

          # NOTE(review): echoes the old weight string to STDOUT without a
          # newline; looks like leftover debugging, kept to leave logs unchanged
          print $valuesStr;
          my @values = split(/ /, $valuesStr);
          my $valcnt = 0;
          foreach my $value (@values) {
            if ($value =~ /^[+-]?[0-9.\-e]+$/) {
              $oldvalues{$longname}{$valcnt} = $value;
            }
            $valcnt++;
          }
        }
      }
    } elsif (defined($P{$parameter})) {
      # found a param (thread, verbose etc) that we're overriding. Leave to the end
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
      }
    } else {
      # unchanged parameter, write old
      print $out "[$parameter]\n";
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
        print $out $line;
      }
    }
  }

  # write all additional parameters
  foreach my $parameter (sort keys %P) {
    print $out "\n[$parameter]\n";
    foreach (@{$P{$parameter}}) {
      print $out $_."\n";
    }
  }

  # write all weights
  print $out "[weight]\n";

  my $prevName = "";
  my $outStr = "";
  my $valcnt = 0;   # tuneable values written so far for the current feature
  my $offset = 0;   # untuneable values written so far for the current feature
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $val = $featlist->{"values"}->[$i];

    if ($prevName ne $name) {
      print $out "$outStr\n";
      # positions in skippeduntuneablecomponents / %oldvalues are per feature,
      # so both counters restart here
      $valcnt = 0;
      $offset = 0;
      $outStr = "$name=";
      $prevName = $name;
      while (defined $featlist->{"skippeduntuneablecomponents"}->{$name}{$valcnt+$offset}) {
        $outStr .= " $oldvalues{$name}{$valcnt+$offset}";
        $offset++;
      }
    }
    $outStr .= " $val";
    $valcnt++;
    while (defined $featlist->{"skippeduntuneablecomponents"}->{$name}{$valcnt+$offset}) {
      $outStr .= " $oldvalues{$name}{$valcnt+$offset}";
      $offset++;
    }
  }
  print $out "$outStr\n";

  for (@keep_weights) {
    print $out $_;
  }

  close $ini_fh;
  close $out or die "Can't finish writing $outfn: $!";
  print STDERR "Saved: $outfn\n";
}

# Create a new ini file, with the first ttable replaced by the given one
# and its type set to text
#
# In practice the line following every "[ttable-file]" header is rewritten:
# field 0 becomes $ttable_type (default "0") and field 4 becomes $new_ttable.
# All other lines are copied unchanged.
sub substitute_ttable {
  my ($old_ini, $new_ini, $new_ttable, $ttable_type) = @_;
  $ttable_type = "0" unless defined($ttable_type);
  open my $new_fh, '>', $new_ini or die "Failed to create $new_ini";
  open my $old_fh, '<', $old_ini or die "Failed to open $old_ini";
  while (my $ini_line = <$old_fh>) {
    if ($ini_line =~ /\[ttable-file\]/) {
      print $new_fh "[ttable-file]\n";
      my $ttable_config = <$old_fh>;
      die "No phrase table line after [ttable-file] in $old_ini" if !defined $ttable_config;
      chomp $ttable_config;
      my @ttable_fields = split /\s+/, $ttable_config;
      $ttable_fields[0] = $ttable_type;
      $ttable_fields[4] = $new_ttable;
      print $new_fh join(" ", @ttable_fields) . "\n";
    } else {
      print $new_fh $ini_line;
    }
  }
  close $new_fh or die "Failed to write $new_ini: $!";
  close $old_fh;
}

# Run a command through system() and report how it ended.
# Returns true when the exit code is 0 and false otherwise; exits the whole
# script with status 1 if the command could not be started or was killed by
# a signal.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    warn "Failed to execute: @_\n $!";
    exit(1);
  } elsif ($? & 127) {
    # the command is passed as an argument, not spliced into the format, so
    # a '%' inside it cannot be mistaken for a conversion
    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  } else {
    my $exitcode = $? >> 8;
    warn "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# Turn $PATH into an absolute path relative to the current directory.
# The first "/nfsmnt" component is removed; a path that is then already
# absolute is returned as is, otherwise "/./", repeated slashes and
# "dir/.." pairs are collapsed textually (symlinks are not resolved).
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s/\/nfsmnt//;
  return $PATH if $PATH =~ /^\//;

  my $dir = Cwd::getcwd();
  $PATH = File::Spec->catfile($dir, $PATH);
  $PATH =~ s/[\r\n]//g;
  $PATH =~ s/\/\.\//\//g;
  $PATH =~ s/\/+/\//g;
  my $sanity = 0;   # upper bound on ".." elimination passes
  while ($PATH =~ /\/\.\.\// && $sanity++ < 10) {
    $PATH =~ s/\/+/\//g;
    $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
  }
  $PATH =~ s/\/[^\/]+\/\.\.$//;
  $PATH =~ s/\/+$//;
  $PATH =~ s/\/nfsmnt//;
  return $PATH;
}

# Run $cmd with stdout/stderr sent to the given files: through $qsubwrapper
# when more than one job was requested, as a local shell command otherwise.
# $threads (default 1) replaces the slot count of a "-pe smp N" queue flag.
# Dies if the command fails.
sub submit_or_exec {
  my ($cmd, $stdout, $stderr, $threads) = @_;
  print STDERR "exec: $cmd\n";
  if (defined $___JOBS && $___JOBS > 1) {
    # request fewer CPU slots, if not needed
    my $queue_flags_for_this_command = $queue_flags;
    $threads = 1 unless defined($threads);
    $queue_flags_for_this_command =~ s/(\-pe smp) \d+/$1 $threads/;
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags_for_this_command\" -stdout=$stdout -stderr=$stderr" )
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
  } else {
    safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
  }
}

# Write $outdir/extractor.sh, a bash script that changes into $outdir and
# runs $cmd, make it executable and return its path.
# (No prototype: the sub takes two arguments, which an empty "()" prototype
# would reject for any call made without the & sigil.)
sub create_extractor_script {
  my ($cmd, $outdir) = @_;
  my $script_path = File::Spec->catfile($outdir, "extractor.sh");

  open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
  print $out "#!/usr/bin/env bash\n";
  print $out "cd $outdir\n";
  print $out "$cmd\n";
  close $out;

  `chmod +x $script_path`;

  return $script_path;
}

# Record $step as the last finished iteration by overwriting $filename.
sub save_finished_step {
  my ($filename, $step) = @_;
  open my $fh, '>', $filename or die "$filename: $!";
  print $fh $step . "\n";
  close $fh;
}

# It returns a config for mert/extractor.
# Build the reference-length fragment of the mert/extractor config string.
# Returns " reflen:shortest", " reflen:average" or " reflen:closest" for the
# selected strategy (checked in that order), or "" when none is selected.
# Dies when more than one strategy flag is set.
sub setup_reference_length_type {
  my $selected = $___CLOSEST + $___AVERAGE + $___SHORTEST;
  die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"
    if $selected > 1;

  return " reflen:shortest" if $___SHORTEST;
  return " reflen:average"  if $___AVERAGE;
  return " reflen:closest"  if $___CLOSEST;
  return "";
}

# Build the case-sensitivity fragment of the config string:
# " case:false" when $___NOCASE is set, " case:true" otherwise.
sub setup_case_config {
  return $___NOCASE ? " case:false" : " case:true";
}

# Return 1 when running on macOS (Perl reports the OS as "darwin"), else 0.
sub is_mac_osx {
  return 1 if $^O eq "darwin";
  return 0;
}