#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <map>
|
|
|
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
|
|
|
#ifdef MPI_ENABLE |
|
#include <boost/mpi.hpp> |
|
namespace mpi = boost::mpi; |
|
#endif |
|
|
|
#include "Main.h" |
|
#include "Optimiser.h" |
|
#include "Hildreth.h" |
|
#include "HypothesisQueue.h" |
|
#include "moses/StaticData.h" |
|
#include "moses/ScoreComponentCollection.h" |
|
#include "moses/ThreadPool.h" |
|
#include "mert/BleuScorer.h" |
|
#include "moses/FeatureVector.h" |
|
|
|
#include "moses/FF/WordTranslationFeature.h" |
|
#include "moses/FF/PhrasePairFeature.h" |
|
#include "moses/FF/WordPenaltyProducer.h" |
|
#include "moses/LM/Base.h" |
|
#include "util/random.hh" |
|
|
|
using namespace Mira; |
|
using namespace std; |
|
using namespace Moses; |
|
namespace po = boost::program_options; |
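// Entry point for Mira training: parse the tuning options, load input and reference data,
// decode hope/fear/model n-best lists per sentence, and apply perceptron or MIRA weight
// updates, optionally distributed over MPI processes with periodic weight mixing and dumps.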
|
|
|
int main(int argc, char** argv) |
|
{ |
|
util::rand_init(); |
|
size_t rank = 0; |
|
size_t size = 1; |
|
#ifdef MPI_ENABLE |
|
mpi::environment env(argc,argv); |
|
mpi::communicator world; |
|
rank = world.rank(); |
|
size = world.size(); |
|
#endif |
|
|
|
bool help; |
|
int verbosity; |
|
string mosesConfigFile; |
|
string inputFile; |
|
vector<string> referenceFiles; |
|
vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds; |
|
|
|
size_t epochs; |
|
string learner; |
|
bool shuffle; |
|
size_t mixingFrequency; |
|
size_t weightDumpFrequency; |
|
string weightDumpStem; |
|
bool scale_margin;
bool scale_update = false;  // no --scale-update option is registered below, so initialise explicitly
|
size_t n; |
|
size_t batchSize; |
|
bool distinctNbest; |
|
bool accumulateWeights; |
|
float historySmoothing; |
|
bool scaleByInputLength, scaleByAvgInputLength; |
|
bool scaleByInverseLength, scaleByAvgInverseLength; |
|
float scaleByX; |
|
float slack; |
|
bool averageWeights; |
|
bool weightConvergence; |
|
float learning_rate; |
|
float mira_learning_rate; |
|
float perceptron_learning_rate; |
|
string decoder_settings; |
|
float min_weight_change; |
|
bool normaliseWeights, normaliseMargin; |
|
bool print_feature_values; |
|
bool historyBleu;
|
bool sentenceBleu; |
|
bool perceptron_update; |
|
bool hope_fear; |
|
bool model_hope_fear; |
|
size_t hope_n, fear_n; |
|
size_t bleu_smoothing_scheme; |
|
float min_oracle_bleu; |
|
float minBleuRatio, maxBleuRatio; |
|
bool boost; |
|
bool decode_hope, decode_fear, decode_model; |
|
string decode_filename; |
|
bool batchEqualsShard; |
|
bool sparseAverage, dumpMixedWeights, sparseNoAverage; |
|
int featureCutoff; |
|
bool pruneZeroWeights; |
|
bool printFeatureCounts, printNbestWithFeatures; |
|
bool avgRefLength; |
|
bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp; |
|
float scale_lm_factor, scale_wp_factor; |
|
bool kbest; |
|
string moses_src; |
|
float sigmoidParam; |
|
float bleuWeight, bleuWeight_hope, bleuWeight_fear; |
|
bool bleu_weight_lm; |
|
float bleu_weight_lm_factor; |
|
bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse; |
|
float l1_lambda, l2_lambda; |
|
bool most_violated, most_violated_reg, all_violated, max_bleu_diff; |
|
bool feature_confidence, signed_counts; |
|
float decay_core, decay_sparse, core_r0, sparse_r0; |
|
float bleu_weight_fear_factor; |
|
bool hildreth; |
|
float add2lm; |
|
|
|
|
|
bool realBleu = false, disableBleuFeature = false;  // set to true only in the --kbest branch below
|
bool rescaleSlack; |
|
bool makePairs; |
|
bool debug; |
|
bool reg_on_every_mix; |
|
size_t continue_epoch; |
|
bool modelPlusBleu, simpleHistoryBleu; |
|
po::options_description desc("Allowed options"); |
|
desc.add_options() |
|
("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on") |
|
("freq-reg", po::value<bool>(®_on_every_mix)->default_value(false), "Regularize after every weight mixing") |
|
("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only") |
|
("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only") |
|
("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint") |
|
("most-violated", po::value<bool>(&most_violated)->default_value(false), "Add most violated constraint") |
|
("all-violated", po::value<bool>(&all_violated)->default_value(false), "Add all violated constraints") |
|
("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Confidence-weighted learning") |
|
("signed-counts", po::value<bool>(&signed_counts)->default_value(false), "Use signed feature counts for CWL") |
|
("dbg", po::value<bool>(&debug)->default_value(true), "More debug output (alias for --debug)")
|
("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack") |
|
("debug", po::value<bool>(&debug)->default_value(true), "More debug output") |
|
("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation") |
|
("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights") |
|
("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update") |
|
("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations") |
|
("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu") |
|
|
|
("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective") |
|
("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope") |
|
("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear") |
|
|
|
("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features") |
|
("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features") |
|
("decay-core", po::value<float>(&decay_core)->default_value(0.01), "Decay for core feature learning rate") |
|
("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.01), "Decay for sparse feature learning rate") |
|
|
|
("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(true), "Make bleu weight depend on lm weight") |
|
("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor") |
|
("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor") |
|
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs") |
|
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update") |
|
("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature") |
|
("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)") |
|
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is sent to optimiser for weight adjustments")
|
("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)") |
|
("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates") |
|
("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file") |
|
("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold") |
|
("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes") |
|
("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective") |
|
("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective") |
|
("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective") |
|
("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations") |
|
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs") |
|
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step") |
|
("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights") |
|
("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs") |
|
("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features") |
|
("fear-n", po::value<size_t>(&fear_n)->default_value(1), "Number of fear translations used") |
|
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") |
|
("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history") |
|
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing") |
|
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)") |
|
("hope-n", po::value<size_t>(&hope_n)->default_value(2), "Number of hope translations used") |
|
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source") |
|
("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold") |
|
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm") |
|
("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)") |
|
("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))") |
|
("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization") |
|
("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization") |
|
("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear") |
|
("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear") |
|
("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference") |
|
("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score") |
|
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion") |
|
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)") |
|
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(10), "How often per epoch to mix weights, when using mpi") |
|
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation") |
|
("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory") |
|
("nbest,n", po::value<size_t>(&n)->default_value(30), "Number of translations in n-best list") |
|
("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder") |
|
("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1") |
|
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate") |
|
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values") |
|
("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch") |
|
("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out n-best lists together with their feature values")
|
("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights") |
|
("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights") |
|
("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights") |
|
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training") |
|
("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold") |
|
("kbest", po::value<bool>(&kbest)->default_value(true), "Select hope/fear pairs from a list of nbest translations") |
|
|
|
("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length") |
|
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale BLEU by (history of) input length") |
|
("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length") |
|
("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length") |
|
("scale-by-x", po::value<float>(&scaleByX)->default_value(0.1), "Scale the BLEU score by value x") |
|
("scale-lm", po::value<bool>(&scale_lm)->default_value(true), "Scale the language model feature") |
|
("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(0.5), "Scale the language model feature by this factor") |
|
("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature") |
|
("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor") |
|
("scale-margin", po::value<bool>(&scale_margin)->default_value(false), "Scale the margin by the Bleu score of the oracle translation")
|
("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentence-level Bleu scoring function")
|
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing") |
|
("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the asymptote that this sigmoid approaches")
|
("slack", po::value<float>(&slack)->default_value(0.05), "Use slack in optimiser") |
|
("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes") |
|
("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum") |
|
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge") |
|
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level") |
|
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(2), "How often per epoch to dump weights (mpi)") |
|
("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights"); |
|
|
|
po::options_description cmdline_options; |
|
cmdline_options.add(desc); |
|
po::variables_map vm; |
|
po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm); |
|
po::notify(vm); |
|
|
|
if (help) { |
|
std::cout << "Usage: " + string(argv[0]) |
|
+ " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl; |
|
std::cout << desc << std::endl; |
|
return 0; |
|
} |
|
|
|
const StaticData &staticData = StaticData::Instance(); |
|
|
|
bool trainWithMultipleFolds = false; |
|
if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) { |
|
if (rank == 0) |
|
cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl; |
|
trainWithMultipleFolds = true; |
|
} |
|
|
|
if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) { |
|
cerr << "Error: For dumping mixed weights, the mixing frequency must equal the weight dump frequency" << endl;
|
exit(1); |
|
} |
|
|
|
if ((sparseAverage || sparseNoAverage) && averageWeights) { |
|
cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl; |
|
exit(1); |
|
} |
|
|
|
if (trainWithMultipleFolds) { |
|
if (!mosesConfigFilesFolds.size()) { |
|
cerr << "Error: No moses ini files specified for training with folds" << endl; |
|
exit(1); |
|
} |
|
|
|
if (!inputFilesFolds.size()) { |
|
cerr << "Error: No input files specified for training with folds" << endl; |
|
exit(1); |
|
} |
|
|
|
if (!referenceFilesFolds.size()) { |
|
cerr << "Error: No reference files specified for training with folds" << endl; |
|
exit(1); |
|
} |
|
} else { |
|
if (mosesConfigFile.empty()) { |
|
cerr << "Error: No moses ini file specified" << endl; |
|
return 1; |
|
} |
|
|
|
if (inputFile.empty()) { |
|
cerr << "Error: No input file specified" << endl; |
|
return 1; |
|
} |
|
|
|
if (!referenceFiles.size()) { |
|
cerr << "Error: No reference files specified" << endl; |
|
return 1; |
|
} |
|
} |
|
|
|
|
|
vector<string> inputSentences; |
|
size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0; |
|
size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size(); |
|
vector<vector<string> > inputSentencesFolds(inputSize); |
|
vector<vector<string> > referenceSentences(refSize); |
|
|
|
|
|
size_t coresPerFold = 0, myFold = 0; |
|
if (trainWithMultipleFolds) { |
|
if (mosesConfigFilesFolds.size() > size) {
cerr << "Error: Number of folds must not exceed the number of cores" << endl;
exit(1);
}
coresPerFold = size/mosesConfigFilesFolds.size();
if (size % mosesConfigFilesFolds.size() > 0) {
cerr << "Error: Number of cores has to be a multiple of the number of folds" << endl;
exit(1);
}
|
|
|
if (rank == 0) |
|
cerr << "Number of cores per fold: " << coresPerFold << endl; |
|
myFold = rank/coresPerFold; |
|
cerr << "Rank " << rank << ", my fold: " << myFold << endl; |
|
} |
|
|
|
|
|
if (trainWithMultipleFolds) { |
|
if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) { |
|
cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl; |
|
exit(1); |
|
} |
|
VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl); |
|
|
|
if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) { |
|
cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl; |
|
exit(1); |
|
} |
|
if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) { |
|
cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != reference file length ("
<< referenceSentences[myFold].size() << ") (rank " << rank << ")" << endl;
|
exit(1); |
|
} |
|
VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl); |
|
} else { |
|
if (!loadSentences(inputFile, inputSentences)) { |
|
cerr << "Error: Failed to load input sentences from " << inputFile << endl; |
|
return 1; |
|
} |
|
|
|
for (size_t i = 0; i < referenceFiles.size(); ++i) { |
|
if (!loadSentences(referenceFiles[i], referenceSentences[i])) { |
|
cerr << "Error: Failed to load reference sentences from " |
|
<< referenceFiles[i] << endl; |
|
return 1; |
|
} |
|
if (referenceSentences[i].size() != inputSentences.size()) { |
|
cerr << "Error: Input file length (" << inputSentences.size() << ") != length of reference file " << i
<< " (" << referenceSentences[i].size() << ")" << endl;
|
return 1; |
|
} |
|
} |
|
} |
|
|
|
if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength) |
|
scaleByInputLength = false; |
|
|
|
if (historyBleu || simpleHistoryBleu) { |
|
sentenceBleu = false; |
|
cerr << "Using history Bleu. " << endl; |
|
} |
|
|
|
if (kbest) { |
|
realBleu = true; |
|
disableBleuFeature = true; |
|
cerr << "Using k-best lists and real Bleu scores, disabling the Bleu feature." << endl;
|
} |
|
|
|
|
|
|
|
boost::trim(decoder_settings); |
|
decoder_settings += " -mira -n-best-list - " + boost::lexical_cast<string>(n) + " distinct"; |
|
|
|
vector<string> decoder_params; |
|
boost::split(decoder_params, decoder_settings, boost::is_any_of("\t ")); |
|
|
|
|
|
decoder_params.push_back("-feature-add"); |
|
|
|
decoder_settings = "BleuScoreFeature tuneable=false references="; |
|
if (trainWithMultipleFolds) { |
|
decoder_settings += referenceFilesFolds[myFold]; |
|
} else { |
|
decoder_settings += referenceFiles[0]; |
|
for (size_t i=1; i < referenceFiles.size(); ++i) { |
|
decoder_settings += ","; |
|
decoder_settings += referenceFiles[i]; |
|
} |
|
} |
|
decoder_params.push_back(decoder_settings); |
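// Build the decoder from the (fold-specific) ini file, injecting a BleuScoreFeature via
// -feature-add that points at the reference files, and configure how sentence-level Bleu
// is computed (length scaling, smoothing scheme, history).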
|
|
|
string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile; |
|
VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl); |
|
MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params); |
|
decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, |
|
scaleByInverseLength, scaleByAvgInverseLength, |
|
scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu); |
|
bool chartDecoding = staticData.IsChart(); |
|
|
|
|
|
vector<size_t> order; |
|
if (trainWithMultipleFolds) { |
|
for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) { |
|
order.push_back(i); |
|
} |
|
} else { |
|
if (rank == 0) { |
|
for (size_t i = 0; i < inputSentences.size(); ++i) { |
|
order.push_back(i); |
|
} |
|
} |
|
} |
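// Choose the learner: MIRA (slack-constrained, with optional margin/update scaling and
// sigmoid margin normalisation) or a plain perceptron, which restricts decoding to 1-best lists.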
|
|
|
|
|
Optimiser* optimiser = NULL; |
|
if (learner == "mira") { |
|
if (rank == 0) { |
|
cerr << "Optimising using Mira" << endl; |
|
cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl; |
|
if (normaliseMargin) |
|
cerr << "sigmoid parameter: " << sigmoidParam << endl; |
|
} |
|
optimiser = new MiraOptimiser(slack, scale_margin, scale_update, boost, normaliseMargin, sigmoidParam); |
|
learning_rate = mira_learning_rate; |
|
perceptron_update = false; |
|
} else if (learner == "perceptron") { |
|
if (rank == 0) { |
|
cerr << "Optimising using Perceptron" << endl; |
|
} |
|
optimiser = new Perceptron(); |
|
learning_rate = perceptron_learning_rate; |
|
perceptron_update = true; |
|
model_hope_fear = false; |
|
hope_fear = false; |
|
n = 1; |
|
hope_n = 1; |
|
fear_n = 1; |
|
} else { |
|
cerr << "Error: Unknown optimiser: " << learner << endl; |
|
return 1; |
|
} |
|
|
|
|
|
if (batchSize > 1 && perceptron_update) { |
|
batchSize = 1; |
|
cerr << "Info: Setting batch size to 1 for perceptron update" << endl; |
|
} |
|
|
|
if (hope_n == 0) |
|
hope_n = n; |
|
if (fear_n == 0) |
|
fear_n = n; |
|
|
|
if (model_hope_fear || kbest) |
|
hope_fear = false; |
|
if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) { |
|
cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl; |
|
return 1; |
|
} |
|
|
|
#ifdef MPI_ENABLE |
|
if (!trainWithMultipleFolds) |
|
mpi::broadcast(world, order, 0); |
|
#endif |
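// Partition the sentence order into this process's shard: with folds, the cores assigned to a
// fold split that fold's data; otherwise the corpus is split evenly over all MPI ranks, with
// the last rank taking any remainder.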
|
|
|
|
|
vector<size_t> shard; |
|
if (trainWithMultipleFolds) { |
|
size_t shardSize = order.size()/coresPerFold; |
|
size_t shardStart = (size_t) (shardSize * (rank % coresPerFold)); |
|
size_t shardEnd = shardStart + shardSize; |
|
if (rank % coresPerFold == coresPerFold - 1) { |
|
shardEnd = order.size(); |
|
shardSize = shardEnd - shardStart; |
|
} |
|
VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl); |
|
VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl); |
|
shard.resize(shardSize); |
|
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); |
|
batchSize = 1; |
|
} else { |
|
size_t shardSize = order.size() / size; |
|
size_t shardStart = (size_t) (shardSize * rank); |
|
size_t shardEnd = (size_t) (shardSize * (rank + 1)); |
|
if (rank == size - 1) { |
|
shardEnd = order.size(); |
|
shardSize = shardEnd - shardStart; |
|
} |
|
VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl); |
|
VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl); |
|
shard.resize(shardSize); |
|
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); |
|
if (batchEqualsShard) |
|
batchSize = shardSize; |
|
} |
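// Prepare the starting weights: optionally add a constant to all LM weights (--add2lm)
// and L1-normalise before handing them back to the decoder.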
|
|
|
|
|
|
|
ScoreComponentCollection initialWeights = decoder->getWeights(); |
|
|
|
if (add2lm != 0) { |
|
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); |
|
for (size_t i = 0; i < statefulFFs.size(); ++i) { |
|
const StatefulFeatureFunction *ff = statefulFFs[i]; |
|
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); |
|
|
|
if (lm) { |
|
float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm; |
|
initialWeights.Assign(lm, lmWeight); |
|
cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl; |
|
} |
|
} |
|
} |
|
|
|
if (normaliseWeights) { |
|
initialWeights.L1Normalise(); |
|
cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl; |
|
} |
|
|
|
decoder->setWeights(initialWeights); |
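// Optionally tie the Bleu weight of the decoder objective to the summed absolute LM weights
// (--tie-bw-to-lm), then derive the hope and fear Bleu weights from it.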
|
|
|
|
|
if (bleu_weight_lm) { |
|
float lmSum = 0; |
|
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); |
|
for (size_t i = 0; i < statefulFFs.size(); ++i) { |
|
const StatefulFeatureFunction *ff = statefulFFs[i]; |
|
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); |
|
|
|
if (lm) { |
|
lmSum += abs(initialWeights.GetScoreForProducer(lm)); |
|
} |
|
} |
|
|
|
bleuWeight = lmSum * bleu_weight_lm_factor; |
|
if (!kbest) cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl; |
|
} |
|
|
|
|
|
if (bleuWeight_hope == -1) { |
|
bleuWeight_hope = bleuWeight; |
|
} |
|
if (bleuWeight_fear == -1) { |
|
bleuWeight_fear = bleuWeight; |
|
} |
|
bleuWeight_fear *= bleu_weight_fear_factor; |
|
if (!kbest) { |
|
cerr << "Bleu weight: " << bleuWeight << endl; |
|
cerr << "Bleu weight fear: " << bleuWeight_fear << endl; |
|
} |
|
|
|
if (decode_hope || decode_fear || decode_model) { |
|
size_t decode = 1; |
|
if (decode_fear) decode = 2; |
|
if (decode_model) decode = 3; |
|
decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight); |
|
} |
|
|
|
|
|
ScoreComponentCollection cumulativeWeights; |
|
ScoreComponentCollection cumulativeWeightsBinary; |
|
size_t numberOfUpdates = 0; |
|
size_t numberOfUpdatesThisEpoch = 0; |
|
|
|
time_t now; |
|
time(&now); |
|
cerr << "Rank " << rank << ", " << ctime(&now); |
|
|
|
float avgInputLength = 0; |
|
float sumOfInputs = 0; |
|
size_t numberOfInputs = 0; |
|
|
|
ScoreComponentCollection mixedWeights; |
|
ScoreComponentCollection mixedWeightsPrevious; |
|
ScoreComponentCollection mixedWeightsBeforePrevious; |
|
ScoreComponentCollection mixedAverageWeights; |
|
ScoreComponentCollection mixedAverageWeightsPrevious; |
|
ScoreComponentCollection mixedAverageWeightsBeforePrevious; |
|
|
|
bool stop = false; |
|
|
|
float epsilon = 0.0001; |
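// Per-feature learning rates for confidence-weighted learning: start at core_r0/sparse_r0
// and decay them as confidence counts accumulate.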
|
|
|
|
|
ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates; |
|
featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); |
|
cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl; |
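// Main training loop: one pass over this process's shard per epoch, with optional shuffling,
// periodic weight mixing across processes, and periodic weight dumps.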
|
|
|
for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) { |
|
if (shuffle) { |
|
if (trainWithMultipleFolds || rank == 0) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl; |
|
RandomIndex rindex; |
|
random_shuffle(order.begin(), order.end(), rindex); |
|
} |
|
|
|
#ifdef MPI_ENABLE |
|
if (!trainWithMultipleFolds) |
|
mpi::broadcast(world, order, 0); |
|
#endif |
|
|
|
|
|
if (trainWithMultipleFolds) { |
|
size_t shardSize = order.size()/coresPerFold; |
|
size_t shardStart = (size_t) (shardSize * (rank % coresPerFold)); |
|
size_t shardEnd = shardStart + shardSize; |
|
if (rank % coresPerFold == coresPerFold - 1) { |
|
shardEnd = order.size(); |
|
shardSize = shardEnd - shardStart; |
|
} |
|
VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl); |
|
VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl); |
|
shard.resize(shardSize); |
|
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); |
|
batchSize = 1; |
|
} else { |
|
size_t shardSize = order.size()/size; |
|
size_t shardStart = (size_t) (shardSize * rank); |
|
size_t shardEnd = (size_t) (shardSize * (rank + 1)); |
|
if (rank == size - 1) { |
|
shardEnd = order.size(); |
|
shardSize = shardEnd - shardStart; |
|
} |
|
VERBOSE(1, "Shard size: " << shardSize << endl); |
|
VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl); |
|
shard.resize(shardSize); |
|
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin()); |
|
if (batchEqualsShard) |
|
batchSize = shardSize; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
numberOfUpdatesThisEpoch = 0; |
|
|
|
if (!accumulateWeights) { |
|
cumulativeWeights.ZeroAll(); |
|
cumulativeWeightsBinary.ZeroAll(); |
|
} |
|
|
|
|
|
size_t weightMixingThisEpoch = 0; |
|
size_t weightEpochDump = 0; |
|
|
|
size_t shardPosition = 0; |
|
vector<size_t>::const_iterator sid = shard.begin(); |
|
while (sid != shard.end()) { |
|
|
|
vector<vector<ScoreComponentCollection> > featureValues; |
|
vector<vector<float> > bleuScores; |
|
vector<vector<float> > modelScores; |
|
|
|
|
|
vector<vector<ScoreComponentCollection> > featureValuesHope; |
|
vector<vector<ScoreComponentCollection> > featureValuesFear; |
|
vector<vector<float> > bleuScoresHope; |
|
vector<vector<float> > bleuScoresFear; |
|
vector<vector<float> > modelScoresHope; |
|
vector<vector<float> > modelScoresFear; |
|
|
|
|
|
ScoreComponentCollection mosesWeights = decoder->getWeights(); |
|
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl); |
|
|
|
if (historyBleu || simpleHistoryBleu) { |
|
decoder->printBleuFeatureHistory(cerr); |
|
} |
|
|
|
|
|
vector<float> oracleBleuScores; |
|
vector<float> oracleModelScores; |
|
vector<vector<const Word*> > oneBests; |
|
vector<ScoreComponentCollection> oracleFeatureValues; |
|
vector<size_t> inputLengths; |
|
vector<size_t> ref_ids; |
|
size_t actualBatchSize = 0; |
|
|
|
size_t examples_in_batch = 0; |
|
bool skip_example = false; |
|
for (size_t batchPosition = 0; batchPosition < batchSize && sid |
|
!= shard.end(); ++batchPosition) { |
|
string input; |
|
if (trainWithMultipleFolds) |
|
input = inputSentencesFolds[myFold][*sid]; |
|
else |
|
input = inputSentences[*sid]; |
|
|
|
Moses::Sentence *sentence = new Sentence(); |
|
stringstream in(input + "\n"); |
|
const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder(); |
|
sentence->Read(in,inputFactorOrder); |
|
cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \""; |
|
sentence->Print(cerr); |
|
cerr << "\"" << " (batch pos " << batchPosition << ")" << endl; |
|
size_t current_input_length = (*sentence).GetSize();
delete sentence;  // only needed for factor reading, printing and the length; free it here
|
|
|
if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) { |
|
sumOfInputs += current_input_length; |
|
++numberOfInputs; |
|
avgInputLength = sumOfInputs/numberOfInputs; |
|
decoder->setAvgInputLength(avgInputLength); |
|
cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl; |
|
} |
|
|
|
vector<ScoreComponentCollection> newFeatureValues; |
|
vector<float> newScores; |
|
if (model_hope_fear) { |
|
featureValues.push_back(newFeatureValues); |
|
bleuScores.push_back(newScores); |
|
modelScores.push_back(newScores); |
|
} |
|
if (hope_fear || perceptron_update) { |
|
featureValuesHope.push_back(newFeatureValues); |
|
featureValuesFear.push_back(newFeatureValues); |
|
bleuScoresHope.push_back(newScores); |
|
bleuScoresFear.push_back(newScores); |
|
modelScoresHope.push_back(newScores); |
|
modelScoresFear.push_back(newScores); |
|
if (historyBleu || simpleHistoryBleu || debug_model) { |
|
featureValues.push_back(newFeatureValues); |
|
bleuScores.push_back(newScores); |
|
modelScores.push_back(newScores); |
|
} |
|
} |
|
if (kbest) { |
|
|
|
featureValues.push_back(newFeatureValues); |
|
bleuScores.push_back(newScores); |
|
modelScores.push_back(newScores); |
|
|
|
|
|
featureValuesHope.push_back(newFeatureValues); |
|
featureValuesFear.push_back(newFeatureValues); |
|
bleuScoresHope.push_back(newScores); |
|
bleuScoresFear.push_back(newScores); |
|
modelScoresHope.push_back(newScores); |
|
modelScoresFear.push_back(newScores); |
|
} |
|
|
|
size_t ref_length; |
|
float avg_ref_length; |
|
|
|
if (print_weights) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl; |
|
if (print_core_weights) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: "; |
|
mosesWeights.PrintCoreFeatures(); |
|
cerr << endl; |
|
} |
|
|
|
|
|
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); |
|
for (size_t i = 0; i < statefulFFs.size(); ++i) { |
|
const StatefulFeatureFunction *ff = statefulFFs[i]; |
|
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); |
|
|
|
if (lm) { |
|
float lmWeight = mosesWeights.GetScoreForProducer(lm); |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl; |
|
if (lmWeight <= 0) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl; |
|
mosesWeights.Assign(lm, 0.1); |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", setting lm weight to 0.1" << endl;
|
} |
|
} |
|
} |
|
|
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl; |
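// Hope/fear decoding: the hope n-best list maximises model score + Bleu, the fear n-best list
// maximises model score - Bleu; their top hypotheses define the constraints for the update.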
|
if (hope_fear || perceptron_update) { |
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n << "-best hope translations" << endl;
|
vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope, |
|
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition], |
|
1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
vector<const Word*> oracle = outputHope[0]; |
|
decoder->cleanup(chartDecoding); |
|
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); |
|
avg_ref_length = ref_length; |
|
float hope_length_ratio = (float)oracle.size()/ref_length; |
|
cerr << endl; |
|
|
|
|
|
featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures(); |
|
|
|
vector<const Word*> bestModel; |
|
if (debug_model || historyBleu || simpleHistoryBleu) { |
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", 1-best wrt model score (debug or history)" << endl;
|
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight, |
|
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], |
|
1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
bestModel = outputModel[0]; |
|
decoder->cleanup(chartDecoding); |
|
cerr << endl; |
|
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); |
|
} |
|
|
|
|
|
|
|
float bleuRatioHopeFear = 0; |
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "-best fear translations" << endl;
|
vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear, |
|
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition], |
|
1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
vector<const Word*> fear = outputFear[0]; |
|
decoder->cleanup(chartDecoding); |
|
ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); |
|
avg_ref_length += ref_length; |
|
avg_ref_length /= 2; |
|
|
|
|
|
cerr << endl; |
|
for (size_t i = 0; i < fear.size(); ++i) |
|
delete fear[i]; |
|
|
|
|
|
featureValuesFear[batchPosition][0].IncrementSparseFearFeatures(); |
|
|
|
|
|
bool skip = false; |
|
bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0]; |
|
if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio) |
|
skip = true; |
|
if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio) |
|
skip = true; |
|
|
|
|
|
if (historyBleu || simpleHistoryBleu) { |
|
if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] && |
|
modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) { |
|
if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon && |
|
abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl; |
|
skip = true; |
|
} |
|
} |
|
if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] && |
|
modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) { |
|
if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon && |
|
abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl; |
|
skip = true; |
|
} |
|
} |
|
} |
|
if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) { |
|
if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) { |
|
|
|
skip = true; |
|
if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl; |
|
} else { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl; |
|
} |
|
} |
|
} |
|
|
|
if (skip) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl; |
|
featureValuesHope[batchPosition].clear(); |
|
featureValuesFear[batchPosition].clear(); |
|
bleuScoresHope[batchPosition].clear(); |
|
bleuScoresFear[batchPosition].clear(); |
|
if (historyBleu || simpleHistoryBleu || debug_model) { |
|
featureValues[batchPosition].clear(); |
|
bleuScores[batchPosition].clear(); |
|
} |
|
} else { |
|
examples_in_batch++; |
|
|
|
|
|
if (historyBleu || simpleHistoryBleu) { |
|
inputLengths.push_back(current_input_length); |
|
ref_ids.push_back(*sid); |
|
oneBests.push_back(bestModel); |
|
} |
|
} |
|
} |
|
if (model_hope_fear) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "-best hope translations" << endl;
|
size_t oraclePos = featureValues[batchPosition].size(); |
|
decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope, |
|
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], |
|
0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
|
|
|
|
inputLengths.push_back(current_input_length); |
|
ref_ids.push_back(*sid); |
|
decoder->cleanup(chartDecoding); |
|
|
|
|
|
cerr << endl; |
|
|
|
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]); |
|
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]); |
|
oracleModelScores.push_back(modelScores[batchPosition][oraclePos]); |
|
|
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "-best wrt model score" << endl;
|
if (historyBleu || simpleHistoryBleu) { |
|
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, |
|
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], |
|
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
vector<const Word*> bestModel = outputModel[0]; |
|
oneBests.push_back(bestModel); |
|
inputLengths.push_back(current_input_length); |
|
ref_ids.push_back(*sid); |
|
} else { |
|
decoder->getNBest(input, *sid, n, 0.0, bleuWeight, |
|
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], |
|
0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
} |
|
decoder->cleanup(chartDecoding); |
|
|
|
|
|
cerr << endl; |
|
|
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "-best fear translations" << endl;
|
decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear, |
|
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], |
|
0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
decoder->cleanup(chartDecoding); |
|
|
|
|
|
|
|
examples_in_batch++; |
|
} |
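// k-best branch: decode a single n-best list under the model score only, then select hope/fear
// pairs from it (most violated / all violated constraints, or maximum Bleu difference).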
|
if (kbest) { |
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "-best wrt model score" << endl;
|
if (historyBleu || simpleHistoryBleu) { |
|
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, |
|
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition], |
|
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
vector<const Word*> bestModel = outputModel[0]; |
|
oneBests.push_back(bestModel); |
|
inputLengths.push_back(current_input_length); |
|
ref_ids.push_back(*sid); |
|
} else { |
|
decoder->getNBest(input, *sid, n, 0.0, bleuWeight, |
|
featureValues[batchPosition], bleuScores[batchPosition], |
|
modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); |
|
} |
|
decoder->cleanup(chartDecoding); |
|
|
|
|
|
cerr << endl; |
|
|
|
examples_in_batch++; |
|
|
|
HypothesisQueue queueHope(hope_n); |
|
HypothesisQueue queueFear(fear_n); |
|
cerr << endl; |
|
if (most_violated || all_violated) { |
|
float bleuHope = -1000; |
|
float bleuFear = 1000; |
|
int indexHope = -1; |
|
int indexFear = -1; |
|
|
|
vector<float> bleuHopeList; |
|
vector<float> bleuFearList; |
|
vector<float> indexHopeList; |
|
vector<float> indexFearList; |
|
|
|
if (most_violated) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl; |
|
else if (all_violated) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints"; |
|
else |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope"; |
|
|
|
|
|
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { |
|
if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { |
|
if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) { |
|
if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { |
|
|
|
bleuHope = bleuScores[batchPosition][i]; |
|
indexHope = i; |
|
} |
|
} |
|
} else if (bleuScores[batchPosition][i] > bleuHope) { |
|
bleuHope = bleuScores[batchPosition][i]; |
|
indexHope = i; |
|
} |
|
} |
|
|
|
float currentViolation = 0; |
|
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { |
|
float bleuDiff = bleuHope - bleuScores[batchPosition][i]; |
|
float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i]; |
|
if ((bleuDiff > epsilon) && (modelDiff < bleuDiff)) { |
|
float diff = bleuDiff - modelDiff; |
|
if (diff > epsilon) { |
|
if (all_violated) { |
|
cerr << ".. adding pair"; |
|
bleuHopeList.push_back(bleuHope); |
|
bleuFearList.push_back(bleuScores[batchPosition][i]); |
|
indexHopeList.push_back(indexHope); |
|
indexFearList.push_back(i); |
|
} else if (most_violated && diff > currentViolation) { |
|
currentViolation = diff; |
|
bleuFear = bleuScores[batchPosition][i]; |
|
indexFear = i; |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl; |
|
} |
|
} |
|
} |
|
} |
|
|
|
if (most_violated) { |
|
if (currentViolation > 0) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl; |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl; |
|
bleuScoresHope[batchPosition].push_back(bleuHope); |
|
bleuScoresFear[batchPosition].push_back(bleuFear); |
|
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]); |
|
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]); |
|
float modelScoreHope = modelScores[batchPosition][indexHope]; |
|
float modelScoreFear = modelScores[batchPosition][indexFear]; |
|
if (most_violated_reg) { |
|
|
|
float reg = currentViolation/4; |
|
modelScoreHope += abs(reg); |
|
modelScoreFear -= abs(reg); |
|
float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear); |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl; |
|
} |
|
modelScoresHope[batchPosition].push_back(modelScoreHope); |
|
modelScoresFear[batchPosition].push_back(modelScoreFear); |
|
|
|
featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures(); |
|
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures(); |
|
} else { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl; |
|
skip_example = true;
|
} |
|
} else cerr << endl; |
|
} |
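// Maximum-Bleu-difference selection: rank hypotheses by +/-Bleu (plus model score if
// --model-plus-bleu) and pair the top hope_n hope candidates with the top fear_n fear candidates.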
|
if (max_bleu_diff) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl; |
|
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { |
|
float hopeScore = bleuScores[batchPosition][i]; |
|
if (modelPlusBleu) hopeScore += modelScores[batchPosition][i]; |
|
BleuIndexPair hope(hopeScore, i); |
|
queueHope.Push(hope); |
|
|
|
float fearScore = -1*(bleuScores[batchPosition][i]); |
|
if (modelPlusBleu) fearScore += modelScores[batchPosition][i]; |
|
BleuIndexPair fear(fearScore, i); |
|
queueFear.Push(fear); |
|
} |
|
skip_example = false;
|
} |
|
cerr << endl; |
|
|
|
vector<BleuIndexPair> hopeList, fearList; |
|
for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop()); |
|
for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop()); |
|
for (size_t i=0; i<hopeList.size(); ++i) { |
|
|
|
size_t indexHope = hopeList[i].second; |
|
float bleuHope = bleuScores[batchPosition][indexHope]; |
|
for (size_t j=0; j<fearList.size(); ++j) { |
|
|
|
size_t indexFear = fearList[j].second; |
|
float bleuFear = bleuScores[batchPosition][indexFear]; |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl; |
|
bleuScoresHope[batchPosition].push_back(bleuHope); |
|
bleuScoresFear[batchPosition].push_back(bleuFear); |
|
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]); |
|
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]); |
|
float modelScoreHope = modelScores[batchPosition][indexHope]; |
|
float modelScoreFear = modelScores[batchPosition][indexFear]; |
|
|
|
modelScoresHope[batchPosition].push_back(modelScoreHope); |
|
modelScoresFear[batchPosition].push_back(modelScoreFear); |
|
|
|
featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures(); |
|
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures(); |
|
} |
|
} |
|
if (!makePairs) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", summing up hope and fear vectors, no pairs" << endl;
|
} |
|
|
|
|
|
++sid; |
|
++actualBatchSize; |
|
++shardPosition; |
|
} |
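// Batch assembled: compute losses (model-hope-fear), zero the BleuScoreFeature weight, apply
// optional LM/word-penalty scaling and learning rates, then run the optimiser.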
|
|
|
if (examples_in_batch == 0 || (kbest && skip_example)) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl; |
|
} else { |
|
vector<vector<float> > losses(actualBatchSize); |
|
if (model_hope_fear) { |
|
|
|
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) { |
|
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) { |
|
losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]); |
|
} |
|
} |
|
} |
|
|
|
|
|
vector<FeatureFunction*>::const_iterator iter; |
|
const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions(); |
|
for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) { |
|
if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") { |
|
mosesWeights.Assign(*iter, 0); |
|
break; |
|
} |
|
} |
|
|
|
|
|
if (scale_lm) { |
|
cerr << "scale lm" << endl; |
|
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions(); |
|
for (size_t i = 0; i < statefulFFs.size(); ++i) { |
|
const StatefulFeatureFunction *ff = statefulFFs[i]; |
|
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff); |
|
|
|
if (lm) { |
|
|
|
if (model_hope_fear) { |
|
scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch); |
|
} else { |
|
scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch); |
|
scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch); |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
if (scale_wp) { |
|
|
|
WordPenaltyProducer &wp = WordPenaltyProducer::InstanceNonConst(); |
|
|
|
|
|
if (model_hope_fear) { |
|
scaleFeatureScore(&wp, scale_wp_factor, featureValues, rank, epoch); |
|
} else { |
|
scaleFeatureScore(&wp, scale_wp_factor, featureValuesHope, rank, epoch); |
|
scaleFeatureScore(&wp, scale_wp_factor, featureValuesFear, rank, epoch); |
|
} |
|
} |
|
|
|
|
|
if (print_feature_values) { |
|
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl; |
|
if (model_hope_fear) printFeatureValues(featureValues); |
|
else { |
|
cerr << "hope: " << endl; |
|
printFeatureValues(featureValuesHope); |
|
cerr << "fear: " << endl; |
|
printFeatureValues(featureValuesFear); |
|
} |
|
} |
|
|
|
|
|
if (feature_confidence) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl; |
|
if (model_hope_fear) { |
|
applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0); |
|
} else { |
|
applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0); |
|
applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0); |
|
} |
|
} else { |
|
|
|
cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl; |
|
if (core_r0 != 1.0 || sparse_r0 != 1.0) { |
|
if (model_hope_fear) { |
|
applyLearningRates(featureValues, core_r0, sparse_r0); |
|
} else { |
|
applyLearningRates(featureValuesHope, core_r0, sparse_r0); |
|
applyLearningRates(featureValuesFear, core_r0, sparse_r0); |
|
} |
|
} |
|
} |
|
|
|
|
|
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl); |
|
size_t update_status = 1; |
|
ScoreComponentCollection weightUpdate; |
|
if (perceptron_update) { |
|
vector<vector<float> > dummy1; |
|
update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope, |
|
featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch); |
|
} else if (hope_fear) { |
|
if (bleuScoresHope[0][0] >= min_oracle_bleu) { |
|
if (hope_n == 1 && fear_n == 1 && batchSize == 1 && !hildreth) {
|
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate, |
|
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], |
|
bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch); |
|
} else |
|
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, |
|
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, |
|
modelScoresFear, learning_rate, rank, epoch); |
|
} else |
|
update_status = 1; |
|
} else if (kbest) { |
|
if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; |
|
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically( |
|
weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0], |
|
bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0], |
|
modelScoresFear[0][0], learning_rate, rank, epoch); |
|
} else { |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl; |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl; |
|
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, |
|
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope, |
|
modelScoresFear, learning_rate, rank, epoch); |
|
} |
|
} else { |
|
|
|
update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate, |
|
featureValues, losses, bleuScores, modelScores, oracleFeatureValues, |
|
oracleBleuScores, oracleModelScores, learning_rate, rank, epoch); |
|
} |
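// update_status == 0 means the optimiser produced a weight update: apply it, optionally update
// confidence counts and per-feature learning rates, normalise, and accumulate for averaging.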
|
|
|
|
|
|
|
if (update_status == 0) { |
|
|
|
if (debug) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl; |
|
|
|
if (feature_confidence) { |
|
|
|
confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts); |
|
|
|
|
|
featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); |
|
} |
|
|
|
|
|
mosesWeights.PlusEquals(weightUpdate); |
|
|
|
if (normaliseWeights) |
|
mosesWeights.L1Normalise(); |
|
|
|
cumulativeWeights.PlusEquals(mosesWeights); |
|
if (sparseAverage) { |
|
ScoreComponentCollection binary; |
|
binary.SetToBinaryOf(mosesWeights); |
|
cumulativeWeightsBinary.PlusEquals(binary); |
|
} |
|
|
|
++numberOfUpdates; |
|
++numberOfUpdatesThisEpoch; |
|
if (averageWeights) { |
|
ScoreComponentCollection averageWeights(cumulativeWeights); |
|
if (accumulateWeights) { |
|
averageWeights.DivideEquals(numberOfUpdates); |
|
} else { |
|
averageWeights.DivideEquals(numberOfUpdatesThisEpoch); |
|
} |
|
|
|
mosesWeights = averageWeights; |
|
} |
|
|
|
|
|
decoder->setWeights(mosesWeights); |
|
|
|
} |
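// Keep the pseudo-document Bleu history in sync with the current 1-best translations.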
|
|
|
|
|
if (historyBleu || simpleHistoryBleu) { |
|
for (size_t i = 0; i < oneBests.size(); ++i) |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1-best length: " << oneBests[i].size() << " ";
|
decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch); |
|
deleteTranslations(oneBests); |
|
} |
|
} |
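// Weight mixing: periodically reduce (sum) the per-process weights onto rank 0, average them
// (core and/or sparse), optionally prune and regularise, then broadcast the mixed weights back.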
|
|
|
|
|
size_t generalShardSize; |
|
if (trainWithMultipleFolds) |
|
generalShardSize = order.size()/coresPerFold; |
|
else |
|
generalShardSize = order.size()/size; |
|
|
|
size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency; |
|
size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency; |
|
bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize); |
|
|
|
|
|
if (mix) { |
|
#ifdef MPI_ENABLE |
|
cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl; |
|
|
|
mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0); |
|
|
|
|
|
|
|
ScoreComponentCollection totalBinary; |
|
if (sparseAverage) { |
|
ScoreComponentCollection binary; |
|
binary.SetToBinaryOf(mosesWeights); |
|
mpi::reduce(world, binary, totalBinary, SCCPlus(), 0); |
|
} |
|
        if (rank == 0) {
          if (sparseNoAverage)
            mixedWeights.CoreDivideEquals(size);
          else if (sparseAverage)
            mixedWeights.DivideEquals(totalBinary);
          else
            mixedWeights.DivideEquals(size);

          if (normaliseWeights) {
            mixedWeights.L1Normalise();
          }

          ++weightMixingThisEpoch;

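          // Housekeeping on the mixed weights: optionally drop zero-weighted features
          // and, at the last mixing of the epoch, apply the sparse feature cutoff.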
          if (pruneZeroWeights) {
            size_t pruned = mixedWeights.PruneZeroWeightFeatures();
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " zero-weighted features pruned from mixedWeights." << endl;

            pruned = cumulativeWeights.PruneZeroWeightFeatures();
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
          }

          if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
            size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " features pruned from mixedWeights." << endl;

            pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " features pruned from cumulativeWeights." << endl;
          }

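          // L1/L2-regularise the mixed weights, either at every mixing or only
          // at the last mixing of the epoch.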
          if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
            if (l1_regularize) {
              size_t pruned;
              if (l1_reg_sparse)
                pruned = mixedWeights.SparseL1Regularize(l1_lambda);
              else
                pruned = mixedWeights.L1Regularize(l1_lambda);
              cerr << "Rank " << rank << ", epoch " << epoch << ", "
                   << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
            }
            if (l2_regularize) {
              if (l2_reg_sparse)
                mixedWeights.SparseL2Regularize(l2_lambda);
              else
                mixedWeights.L2Regularize(l2_lambda);
              cerr << "Rank " << rank << ", epoch " << epoch << ", "
                   << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
            }
          }
        }

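        // All processes continue with the mixed weights.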
        mpi::broadcast(world, mixedWeights, 0);
        decoder->setWeights(mixedWeights);
        mosesWeights = mixedWeights;
#endif
#ifndef MPI_ENABLE
        mixedWeights = mosesWeights;
#endif
      }

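      // Save the current mixed weights under "<stem>_mixed_<epoch>" (plus the
      // dump index when dumping more than once per epoch).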
      if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
        ostringstream filename;
        if (epoch < 10)
          filename << weightDumpStem << "_mixed_0" << epoch;
        else
          filename << weightDumpStem << "_mixed_" << epoch;

        if (weightDumpFrequency > 1)
          filename << "_" << weightEpochDump;

        mixedWeights.Save(filename.str());
        cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
      }
      if (dumpMixedWeights) {
        if (mix && rank == 0 && !weightDumpStem.empty()) {
          ostringstream filename;
          if (epoch < 10)
            filename << weightDumpStem << "_0" << epoch;
          else
            filename << weightDumpStem << "_" << epoch;

          if (weightDumpFrequency > 1)
            filename << "_" << weightEpochDump;

          cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
          mixedWeights.Save(filename.str());
          ++weightEpochDump;
        }
      } else {
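        // Default dump: average the cumulative weights over the number of updates,
        // combine the per-process averages, and save the result on the master.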
        if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
          cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
          ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
          bool proceed = false;
          if (accumulateWeights) {
            if (numberOfUpdates > 0) {
              tmpAverageWeights.DivideEquals(numberOfUpdates);
              proceed = true;
            }
          } else {
            if (numberOfUpdatesThisEpoch > 0) {
              if (sparseNoAverage)
                tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
              else if (sparseAverage)
                tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
              else
                tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
              proceed = true;
            }
          }

          if (proceed) {
#ifdef MPI_ENABLE
            mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
            ScoreComponentCollection totalBinary;
            if (sparseAverage) {
              ScoreComponentCollection binary;
              binary.SetToBinaryOf(mosesWeights);
              mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
            }
#endif
#ifndef MPI_ENABLE
            mixedAverageWeights = tmpAverageWeights;
            ScoreComponentCollection totalBinary;
#endif
            if (rank == 0 && !weightDumpStem.empty()) {
              if (sparseNoAverage)
                mixedAverageWeights.CoreDivideEquals(size);
              else if (sparseAverage)
                mixedAverageWeights.DivideEquals(totalBinary);
              else
                mixedAverageWeights.DivideEquals(size);

              if (normaliseWeights) {
                mixedAverageWeights.L1Normalise();
              }

              ostringstream filename;
              if (epoch < 10) {
                filename << weightDumpStem << "_0" << epoch;
              } else {
                filename << weightDumpStem << "_" << epoch;
              }

              if (weightDumpFrequency > 1) {
                filename << "_" << weightEpochDump;
              }

              cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
              mixedAverageWeights.Save(filename.str());
              ++weightEpochDump;

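              // After the last dump of the epoch, optionally regularise the averaged
              // weights and save them again with a "_reg" suffix.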
              if (weightEpochDump == weightDumpFrequency) {
                if (l1_regularize) {
                  size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
                  cerr << "Rank " << rank << ", epoch " << epoch << ", "
                       << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
                }
                if (l2_regularize) {
                  mixedAverageWeights.SparseL2Regularize(l2_lambda);
                  cerr << "Rank " << rank << ", epoch " << epoch << ", "
                       << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
                }

                if (l1_regularize || l2_regularize) {
                  filename << "_reg";
                  cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
                  mixedAverageWeights.Save(filename.str());
                }
              }

              if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
                stringstream s1, s2;
                s1 << "sparse_feature_hope_counts" << "_" << epoch;
                s2 << "sparse_feature_fear_counts" << "_" << epoch;
                ofstream sparseFeatureCountsHope(s1.str().c_str());
                ofstream sparseFeatureCountsFear(s2.str().c_str());

                mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
                mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
                sparseFeatureCountsHope.close();
                sparseFeatureCountsFear.close();
              }
            }
          }
        }
      }
    }
    cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;

    if (historyBleu || simpleHistoryBleu) {
      cerr << "Bleu feature history after epoch " << epoch << endl;
      decoder->printBleuFeatureHistory(cerr);
    }

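    // Sum the number of updates made this epoch across all processes;
    // if nothing changed anywhere, stop training.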
    size_t sumUpdates;
    size_t *sendbuf_uint, *recvbuf_uint;
    sendbuf_uint = (size_t *) malloc(sizeof(size_t));
    recvbuf_uint = (size_t *) malloc(sizeof(size_t));
#ifdef MPI_ENABLE
    sendbuf_uint[0] = numberOfUpdatesThisEpoch;
    recvbuf_uint[0] = 0;
    MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
    sumUpdates = recvbuf_uint[0];
#endif
#ifndef MPI_ENABLE
    sumUpdates = numberOfUpdatesThisEpoch;
#endif
    free(sendbuf_uint);
    free(recvbuf_uint);
    if (rank == 0 && sumUpdates == 0) {
      cerr << "\nNo weight updates during this epoch.. stopping." << endl;
      stop = true;
#ifdef MPI_ENABLE
      mpi::broadcast(world, stop, 0);
#endif
    }

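    // Convergence check: if the largest per-feature change relative to both of the
    // two previous epochs stays below min_weight_change, write a "stopping" marker
    // file and finish.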
    if (!stop) {
      if (weightConvergence) {
        bool reached = true;
        if (rank == 0 && (epoch >= 2)) {
          ScoreComponentCollection firstDiff, secondDiff;
          if (dumpMixedWeights) {
            firstDiff = mixedWeights;
            firstDiff.MinusEquals(mixedWeightsPrevious);
            secondDiff = mixedWeights;
            secondDiff.MinusEquals(mixedWeightsBeforePrevious);
          } else {
            firstDiff = mixedAverageWeights;
            firstDiff.MinusEquals(mixedAverageWeightsPrevious);
            secondDiff = mixedAverageWeights;
            secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
          }
          VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
          VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);

          if (firstDiff.GetLInfNorm() >= min_weight_change)
            reached = false;
          if (secondDiff.GetLInfNorm() >= min_weight_change)
            reached = false;
          if (reached) {
            stop = true;
            cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
            ScoreComponentCollection dummy;
            ostringstream endfilename;
            endfilename << "stopping";
            dummy.Save(endfilename.str());
          }
        }

        mixedWeightsBeforePrevious = mixedWeightsPrevious;
        mixedWeightsPrevious = mixedWeights;
        mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
        mixedAverageWeightsPrevious = mixedAverageWeights;
#ifdef MPI_ENABLE
        mpi::broadcast(world, stop, 0);
#endif
      }
    }
  }

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  if (rank == 0) {
    ScoreComponentCollection dummy;
    ostringstream endfilename;
    endfilename << "finished";
    dummy.Save(endfilename.str());
  }

  delete decoder;
  exit(0);
}

bool loadSentences(const string& filename, vector<string>& sentences)
{
  ifstream in(filename.c_str());
  if (!in)
    return false;
  string line;
  while (getline(in, line))
    sentences.push_back(line);
  return true;
}

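// Returns true when the current shard position falls on a mixing/dumping boundary.
// For batch sizes greater than one, every position covered by the batch is checked,
// so a boundary inside the batch still triggers the action.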
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0) return false;
  if (actual_batch_size > 1) {
    bool mix_or_dump = false;
    size_t numberSubtracts = actual_batch_size;
    do {
      if (shard_position % mix_or_dump_base == 0) {
        mix_or_dump = true;
        break;
      }
      --shard_position;
      --numberSubtracts;
    } while (numberSubtracts > 0);
    return mix_or_dump;
  } else {
    return ((shard_position % mix_or_dump_base) == 0);
  }
}

void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
{
  for (size_t i = 0; i < featureValues.size(); ++i) {
    for (size_t j = 0; j < featureValues[i].size(); ++j) {
      cerr << featureValues[i][j] << endl;
    }
  }
  cerr << endl;
}

void deleteTranslations(vector<vector<const Word*> > &translations)
{
  for (size_t i = 0; i < translations.size(); ++i) {
    for (size_t j = 0; j < translations[i].size(); ++j) {
      delete translations[i][j];
    }
  }
}

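// Decode the dev input set according to the hope (decode == 1), fear (decode == 2)
// or normal model objective, sharding the sentences across MPI ranks and writing
// per-rank 1-best and n-best output files.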
void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
{
  if (decode == 1)
    cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
  else if (decode == 2)
    cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
  else
    cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;

  vector<size_t> order;
  for (size_t i = 0; i < inputSentences.size(); ++i)
    order.push_back(i);

  vector<size_t> shard;
  float shardSize = (float) (order.size()) / size;
  size_t shardStart = (size_t) (shardSize * rank);
  size_t shardEnd = (size_t) (shardSize * (rank + 1));
  if (rank == size - 1) {
    shardEnd = inputSentences.size();
    shardSize = shardEnd - shardStart;
  }
  VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
  VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
  shard.resize(shardSize);
  copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());

  stringstream fname;
  fname << filename << ".rank" << rank;
  filename = fname.str();
  ostringstream filename_nbest;
  filename_nbest << filename << "." << n << "best";
  ofstream out(filename.c_str());
  ofstream nbest_out((filename_nbest.str()).c_str());
  if (!out) {
    ostringstream msg;
    msg << "Unable to open " << fname.str();
    throw runtime_error(msg.str());
  }
  if (!nbest_out) {
    ostringstream msg;
    msg << "Unable to open " << filename_nbest.str();
    throw runtime_error(msg.str());
  }

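  // Translate every sentence in this rank's shard: the best hypothesis goes to the
  // 1-best file, all hypotheses to the '|||'-separated n-best file.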
  for (size_t i = 0; i < shard.size(); ++i) {
    size_t sid = shard[i];
    string& input = inputSentences[sid];

    vector<vector<ScoreComponentCollection> > dummyFeatureValues;
    vector<vector<float> > dummyBleuScores;
    vector<vector<float> > dummyModelScores;

    vector<ScoreComponentCollection> newFeatureValues;
    vector<float> newScores;
    dummyFeatureValues.push_back(newFeatureValues);
    dummyBleuScores.push_back(newScores);
    dummyModelScores.push_back(newScores);

    float factor = 0.0;
    if (decode == 1) factor = 1.0;
    if (decode == 2) factor = -1.0;
    cerr << "Rank " << rank << ", translating sentence " << sid << endl;
    bool realBleu = false;
    vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
        dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
    cerr << endl;
    decoder->cleanup(StaticData::Instance().IsChart());

    for (size_t i = 0; i < nbestOutput.size(); ++i) {
      vector<const Word*> output = nbestOutput[i];
      stringstream translation;
      for (size_t k = 0; k < output.size(); ++k) {
        Word* w = const_cast<Word*>(output[k]);
        translation << w->GetString(0);
        translation << " ";
      }

      if (i == 0)
        out << translation.str() << endl;
      nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
                " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
    }
  }

  out.close();
  nbest_out.close();
  cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time_t now;
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  delete decoder;
  exit(0);
}

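// Pre-scale feature vectors before optimisation: either with one global pair of
// rates (core vs. sparse features) or with the per-feature learning rates,
// presumably falling back to sparse_r0 for features without a rate entry.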
void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
{
  for (size_t i=0; i<featureValues.size(); ++i)
    for (size_t j=0; j<featureValues[i].size(); ++j)
      featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
}

void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
{
  for (size_t i=0; i<featureValues.size(); ++i)
    for (size_t j=0; j<featureValues[i].size(); ++j)
      featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
}

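// Multiply the score(s) of one feature function by a fixed factor in every
// candidate's feature vector; the first variant handles single-valued features,
// the second handles features with multiple components.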
void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
{
  string name = sp->GetScoreProducerDescription();

  float featureScore;
  for (size_t i=0; i<featureValues.size(); ++i) {
    for (size_t j=0; j<featureValues[i].size(); ++j) {
      featureScore = featureValues[i][j].GetScoreForProducer(sp);
      featureValues[i][j].Assign(sp, featureScore*scaling_factor);
    }
  }
}

void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
{
  string name = sp->GetScoreProducerDescription();

  for (size_t i=0; i<featureValues.size(); ++i) {
    for (size_t j=0; j<featureValues[i].size(); ++j) {
      vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
      for (size_t k=0; k<featureScores.size(); ++k)
        featureScores[k] *= scaling_factor;
      featureValues[i][j].Assign(sp, featureScores);
    }
  }
}