|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <algorithm> |
|
#include <cmath> |
|
#include <iterator> |
|
|
|
#define BOOST_FILESYSTEM_VERSION 3 |
|
#include <boost/filesystem.hpp> |
|
#include <boost/lexical_cast.hpp> |
|
|
|
#include "util/exception.hh" |
|
#include "util/file_piece.hh" |
|
|
|
#include "Scorer.h" |
|
#include "HopeFearDecoder.h" |
|
|
|
using namespace std; |
|
namespace fs = boost::filesystem; |
|
|
|
namespace MosesTuning |
|
{ |
|
|
|
// Factor by which the hope hypothesis' sentence-level BLEU is multiplied before
// it is compared with the magnitude of its model score in the safe-hope check
// of NbestHopeFearDecoder::HopeFear.
static const ValType BLEU_RATIO = 5; |
|
|
|
std::pair<MiraWeightVector*,size_t> |
|
InitialiseWeights(const string& denseInitFile, const string& sparseInitFile, |
|
const string& type, bool verbose) |
|
{ |
|
|
|
vector<parameter_t> initParams; |
|
if(!denseInitFile.empty()) { |
|
ifstream opt(denseInitFile.c_str()); |
|
string buffer; |
|
if (opt.fail()) { |
|
cerr << "could not open dense initfile: " << denseInitFile << endl; |
|
exit(3); |
|
} |
|
if (verbose) cerr << "Reading dense features:" << endl; |
|
parameter_t val; |
|
getline(opt,buffer); |
|
if (buffer.find_first_of("=") == buffer.npos) { |
|
UTIL_THROW_IF(type == "hypergraph", util::Exception, "For hypergraph version, require dense features in 'name= value' format"); |
|
cerr << "WARN: dense features in deprecated Moses mert format. Prefer 'name= value' format." << endl; |
|
istringstream strstrm(buffer); |
|
while(strstrm >> val) { |
|
initParams.push_back(val); |
|
if(verbose) cerr << val << endl; |
|
} |
|
} else { |
|
vector<string> names; |
|
string last_name = ""; |
|
size_t feature_ctr = 1; |
|
do { |
|
size_t equals = buffer.find_last_of("="); |
|
UTIL_THROW_IF(equals == buffer.npos, util::Exception, "Incorrect format in dense feature file: '" |
|
<< buffer << "'"); |
|
string name = buffer.substr(0,equals); |
|
names.push_back(name); |
|
initParams.push_back(boost::lexical_cast<ValType>(buffer.substr(equals+2))); |
|
|
|
|
|
if (name != last_name) feature_ctr = 1; |
|
last_name = name; |
|
if (feature_ctr>1) { |
|
stringstream namestr; |
|
namestr << names.back() << "_" << feature_ctr; |
|
names[names.size()-1] = namestr.str(); |
|
if (feature_ctr == 2) { |
|
stringstream namestr; |
|
namestr << names[names.size()-2] << "_" << (feature_ctr-1); |
|
names[names.size()-2] = namestr.str(); |
|
} |
|
} |
|
++feature_ctr; |
|
|
|
} while(getline(opt,buffer)); |
|
|
|
|
|
|
|
for (size_t i = 0; i < names.size(); ++i) { |
|
size_t id = SparseVector::encode(names[i]); |
|
assert(id == i); |
|
if (verbose) cerr << names[i] << " " << initParams[i] << endl; |
|
} |
|
|
|
} |
|
|
|
opt.close(); |
|
} |
|
size_t initDenseSize = initParams.size(); |
|
|
|
if(!sparseInitFile.empty()) { |
|
if(initDenseSize==0) { |
|
cerr << "sparse initialization requires dense initialization" << endl; |
|
exit(3); |
|
} |
|
ifstream opt(sparseInitFile.c_str()); |
|
if(opt.fail()) { |
|
cerr << "could not open sparse initfile: " << sparseInitFile << endl; |
|
exit(3); |
|
} |
|
int sparseCount=0; |
|
parameter_t val; |
|
std::string name; |
|
while(opt >> name >> val) { |
|
size_t id = SparseVector::encode(name) + initDenseSize; |
|
while(initParams.size()<=id) initParams.push_back(0.0); |
|
initParams[id] = val; |
|
sparseCount++; |
|
} |
|
cerr << "Found " << sparseCount << " initial sparse features" << endl; |
|
opt.close(); |
|
} |
|
|
|
return pair<MiraWeightVector*,size_t>(new MiraWeightVector(initParams), initDenseSize); |
|
} |
|
|
|
// Scores the given averaged weights over the whole data set: for every
// sentence the statistics returned by MaxModel are summed component-wise, and
// the scorer turns the totals into a single score.
ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv)
{
  vector<ValType> totals(scorer_->NumberOfScores(),0);
  reset();
  while (!finished()) {
    vector<ValType> sentStats;
    MaxModel(wv,&sentStats);
    for (size_t k = 0; k < sentStats.size(); ++k) {
      totals[k] += sentStats[k];
    }
    next();
  }
  return scorer_->calculateScore(totals);
}
|
|
|
// Sets up an n-best based decoder over the given feature and score files.
// With streaming set, a StreamingHypPackEnumerator is used; otherwise a
// RandomAccessHypPackEnumerator, which also receives the no_shuffle flag.
NbestHopeFearDecoder::NbestHopeFearDecoder(
  const vector<string>& featureFiles,
  const vector<string>& scoreFiles,
  bool streaming,
  bool no_shuffle,
  bool safe_hope,
  Scorer* scorer
) : safe_hope_(safe_hope)
{
  scorer_ = scorer;
  if (!streaming) {
    train_.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
    return;
  }
  train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
}
|
|
|
|
|
void NbestHopeFearDecoder::next() |
|
{ |
|
train_->next(); |
|
} |
|
|
|
bool NbestHopeFearDecoder::finished() |
|
{ |
|
return train_->finished(); |
|
} |
|
|
|
void NbestHopeFearDecoder::reset() |
|
{ |
|
train_->reset(); |
|
} |
|
|
|
void NbestHopeFearDecoder::HopeFear( |
|
const std::vector<ValType>& backgroundBleu, |
|
const MiraWeightVector& wv, |
|
HopeFearData* hopeFear |
|
) |
|
{ |
|
|
|
|
|
|
|
ValType hope_scale = 1.0; |
|
size_t hope_index=0, fear_index=0, model_index=0; |
|
ValType hope_score=0, fear_score=0, model_score=0; |
|
for(size_t safe_loop=0; safe_loop<2; safe_loop++) { |
|
ValType hope_bleu=0, hope_model=0; |
|
for(size_t i=0; i< train_->cur_size(); i++) { |
|
const MiraFeatureVector& vec=train_->featuresAt(i); |
|
ValType score = wv.score(vec); |
|
ValType bleu = scorer_->calculateSentenceLevelBackgroundScore(train_->scoresAt(i),backgroundBleu); |
|
|
|
if(i==0 || (hope_scale*score + bleu) > hope_score) { |
|
hope_score = hope_scale*score + bleu; |
|
hope_index = i; |
|
hope_bleu = bleu; |
|
hope_model = score; |
|
} |
|
|
|
if(i==0 || (score - bleu) > fear_score) { |
|
fear_score = score - bleu; |
|
fear_index = i; |
|
} |
|
|
|
if(i==0 || score > model_score) { |
|
model_score = score; |
|
model_index = i; |
|
} |
|
} |
|
|
|
|
|
hope_bleu *= BLEU_RATIO; |
|
if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale) |
|
hope_scale = abs(hope_bleu) / abs(hope_model); |
|
else break; |
|
} |
|
hopeFear->modelFeatures = train_->featuresAt(model_index); |
|
hopeFear->hopeFeatures = train_->featuresAt(hope_index); |
|
hopeFear->fearFeatures = train_->featuresAt(fear_index); |
|
|
|
hopeFear->hopeStats = train_->scoresAt(hope_index); |
|
hopeFear->hopeBleu = scorer_->calculateSentenceLevelBackgroundScore(hopeFear->hopeStats, backgroundBleu); |
|
const vector<float>& fear_stats = train_->scoresAt(fear_index); |
|
hopeFear->fearBleu = scorer_->calculateSentenceLevelBackgroundScore(fear_stats, backgroundBleu); |
|
|
|
hopeFear->modelStats = train_->scoresAt(model_index); |
|
hopeFear->hopeFearEqual = (hope_index == fear_index); |
|
} |
|
|
|
void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats) |
|
{ |
|
|
|
size_t max_index=0; |
|
ValType max_score=0; |
|
for(size_t i=0; i<train_->cur_size(); i++) { |
|
MiraFeatureVector vec(train_->featuresAt(i)); |
|
ValType score = wv.score(vec); |
|
if(i==0 || score > max_score) { |
|
max_index = i; |
|
max_score = score; |
|
} |
|
} |
|
*stats = train_->scoresAt(max_index); |
|
} |
|
|
|
|
|
|
|
// Loads the references and every hypergraph file found in hypergraphDir.
//
// Each directory entry other than the one named "weights" is read as a graph
// whose sentence id is the numeric stem of its file name. The graph is pruned
// under the supplied weights to hg_pruning * (reference length) edges and
// stored by sentence id. Sentence ids 0..N-1 (N = number of stored graphs)
// are then listed and, unless no_shuffle is set, shuffled.
//
// Throws when streaming is requested, when the directory does not exist, or
// when no reference files are given. safe_hope is not used in this body.
HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
(
  const string& hypergraphDir,
  const vector<string>& referenceFiles,
  size_t num_dense,
  bool streaming,
  bool no_shuffle,
  bool safe_hope,
  size_t hg_pruning,
  const MiraWeightVector& wv,
  Scorer* scorer
) :
  num_dense_(num_dense)
{
  UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs");
  UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist");
  UTIL_THROW_IF(!referenceFiles.size(), util::Exception, "No reference files supplied");
  references_.Load(referenceFiles, vocab_);

  SparseVector weights;
  wv.ToSparse(&weights,num_dense_);
  scorer_ = scorer;

  static const string kWeights = "weights";
  const fs::directory_iterator endOfDir;
  size_t loaded = 0;

  cerr << "Reading hypergraphs" << endl;
  for (fs::directory_iterator entry(hypergraphDir); entry != endOfDir; ++entry) {
    const fs::path& hgpath = entry->path();
    if (!(hgpath.filename() == kWeights)) {
      // Read the full graph; the file name stem is the sentence id.
      Graph fullGraph(vocab_);
      size_t sentenceId = boost::lexical_cast<size_t>(hgpath.stem().string());
      util::scoped_fd fd(util::OpenReadOrThrow(hgpath.string().c_str()));

      util::FilePiece file(fd.release());
      ReadGraph(file,fullGraph);

      // Keep only a pruned copy, sized relative to the reference length.
      size_t edgeBudget = hg_pruning * references_.Length(sentenceId);
      boost::shared_ptr<Graph> pruned(new Graph(vocab_));
      fullGraph.Prune(pruned.get(), weights, edgeBudget);
      graphs_[sentenceId] = pruned;

      // Progress output on stderr.
      ++loaded;
      if (loaded % 10 == 0) cerr << ".";
      if (loaded % 400 == 0) cerr << " [count=" << loaded << "]\n";
    }
  }
  cerr << endl << "Done" << endl;

  const size_t graphCount = graphs_.size();
  sentenceIds_.resize(graphCount);
  for (size_t k = 0; k < graphCount; ++k) {
    sentenceIds_[k] = k;
  }
  if (!no_shuffle) {
    random_shuffle(sentenceIds_.begin(), sentenceIds_.end());
  }
}
|
|
|
void HypergraphHopeFearDecoder::reset() |
|
{ |
|
sentenceIdIter_ = sentenceIds_.begin(); |
|
} |
|
|
|
void HypergraphHopeFearDecoder::next() |
|
{ |
|
sentenceIdIter_++; |
|
} |
|
|
|
bool HypergraphHopeFearDecoder::finished() |
|
{ |
|
return sentenceIdIter_ == sentenceIds_.end(); |
|
} |
|
|
|
void HypergraphHopeFearDecoder::HopeFear( |
|
const vector<ValType>& backgroundBleu, |
|
const MiraWeightVector& wv, |
|
HopeFearData* hopeFear |
|
) |
|
{ |
|
size_t sentenceId = *sentenceIdIter_; |
|
SparseVector weights; |
|
wv.ToSparse(&weights, num_dense_); |
|
const Graph& graph = *(graphs_[sentenceId]); |
|
|
|
|
|
HgHypothesis hopeHypo, fearHypo, modelHypo; |
|
for(size_t safe_loop=0; safe_loop<2; safe_loop++) { |
|
|
|
|
|
Viterbi(graph, weights, 1, references_, sentenceId, backgroundBleu, &hopeHypo); |
|
|
|
|
|
Viterbi(graph, weights, -1, references_, sentenceId, backgroundBleu, &fearHypo); |
|
|
|
|
|
Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
break; |
|
} |
|
|
|
hopeFear->modelFeatures = MiraFeatureVector(modelHypo.featureVector, num_dense_); |
|
hopeFear->hopeFeatures = MiraFeatureVector(hopeHypo.featureVector, num_dense_); |
|
hopeFear->fearFeatures = MiraFeatureVector(fearHypo.featureVector, num_dense_); |
|
|
|
|
|
|
|
|
|
|
|
vector<ValType> fearStats(scorer_->NumberOfScores()); |
|
hopeFear->hopeStats.reserve(scorer_->NumberOfScores()); |
|
hopeFear->modelStats.reserve(scorer_->NumberOfScores()); |
|
for (size_t i = 0; i < fearStats.size(); ++i) { |
|
hopeFear->modelStats.push_back(modelHypo.bleuStats[i]); |
|
hopeFear->hopeStats.push_back(hopeHypo.bleuStats[i]); |
|
|
|
fearStats[i] = fearHypo.bleuStats[i]; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hopeFear->hopeBleu = sentenceLevelBackgroundBleu(hopeFear->hopeStats, backgroundBleu); |
|
hopeFear->fearBleu = sentenceLevelBackgroundBleu(fearStats, backgroundBleu); |
|
|
|
|
|
hopeFear->hopeFearEqual = true; |
|
if (hopeFear->hopeFearEqual) { |
|
for (size_t i = 0; i < fearStats.size(); ++i) { |
|
if (fearStats[i] != hopeFear->hopeStats[i]) { |
|
hopeFear->hopeFearEqual = false; |
|
break; |
|
} |
|
} |
|
} |
|
hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures); |
|
} |
|
|
|
void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats) |
|
{ |
|
assert(!finished()); |
|
HgHypothesis bestHypo; |
|
size_t sentenceId = *sentenceIdIter_; |
|
SparseVector weights; |
|
wv.ToSparse(&weights, num_dense_); |
|
vector<ValType> bg(scorer_->NumberOfScores()); |
|
|
|
Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo); |
|
stats->resize(bestHypo.bleuStats.size()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < bestHypo.bleuStats.size(); ++i) { |
|
(*stats)[i] = bestHypo.bleuStats[i]; |
|
} |
|
} |
|
|
|
|
|
|
|
}; |
|
|