#include <string>
#include <map>
#include <limits>
#include <vector>
#include <boost/unordered_map.hpp>
#include <boost/functional/hash.hpp>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/PP/CountsPhraseProperty.h"
#include "moses/TranslationOptionList.h"
#include "moses/TranslationOption.h"
#include "moses/Util.h"
#include "moses/TypeDef.h"
#include "moses/StaticData.h"
#include "moses/Phrase.h"
#include "moses/AlignmentInfo.h"
#include "moses/AlignmentInfoCollection.h"
#include "moses/Word.h"
#include "moses/FactorCollection.h"
#include "Normalizer.h"
#include "Classifier.h"
#include "VWFeatureBase.h"
#include "TabbedSentence.h"
#include "ThreadLocalByFeatureStorage.h"
#include "TrainingLoss.h"
#include "VWTargetSentence.h"
#include "VWState.h"
#include "VW.h"
namespace Moses
{
VW::VW(const std::string &line)
: StatefulFeatureFunction(1, line)
, TLSTargetSentence(this)
, m_train(false)
, m_sentenceStartWord(Word())
{
ReadParameters();
Discriminative::ClassifierFactory *classifierFactory = m_train
? new Discriminative::ClassifierFactory(m_modelPath)
: new Discriminative::ClassifierFactory(m_modelPath, m_vwOptions);
m_tlsClassifier = new TLSClassifier(this, *classifierFactory);
m_tlsFutureScores = new TLSFloatHashMap(this);
m_tlsComputedStateExtensions = new TLSStateExtensions(this);
m_tlsTranslationOptionFeatures = new TLSFeatureVectorMap(this);
m_tlsTargetContextFeatures = new TLSFeatureVectorMap(this);
if (! m_normalizer) {
VERBOSE(1, "VW :: No loss function specified, assuming logistic loss.\n");
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
}
if (! m_trainingLoss) {
VERBOSE(1, "VW :: Using basic 1/0 loss calculation in training.\n");
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
}
// create a virtual beginning-of-sentence word with all factors replaced by <S>
const Factor *bosFactor = FactorCollection::Instance().AddFactor(BOS_);
for (size_t i = 0; i < MAX_NUM_FACTORS; i++)
m_sentenceStartWord.SetFactor(i, bosFactor);
}
VW::~VW()
{
delete m_tlsClassifier;
delete m_normalizer;
// TODO delete more stuff
}
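// evaluate a hypothesis at decoding time; this only does real work when
// target-context features are enabled (each translation option's classifier
// score then depends on the preceding target words, so it cannot be fully
// pre-computed in EvaluateTranslationOptionListWithSourceContext())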
FFState* VW::EvaluateWhenApplied(
const Hypothesis& curHypo,
const FFState* prevState,
ScoreComponentCollection* accumulator) const
{
VERBOSE(3, "VW :: Evaluating translation options\n");
const VWState& prevVWState = *static_cast<const VWState *>(prevState);
const std::vector<VWFeatureBase*>& contextFeatures =
VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription());
if (contextFeatures.empty()) {
// no target context features => we already evaluated everything in
// EvaluateTranslationOptionListWithSourceContext(). Nothing to do now,
// no state information to track.
return new VWState();
}
size_t spanStart = curHypo.GetTranslationOption().GetStartPos();
size_t spanEnd = curHypo.GetTranslationOption().GetEndPos();
  // compute the cache key from the previous state and the current source span
size_t cacheKey = MakeCacheKey(prevState, spanStart, spanEnd);
boost::unordered_map<size_t, FloatHashMap> &computedStateExtensions
= *m_tlsComputedStateExtensions->GetStored();
if (computedStateExtensions.find(cacheKey) == computedStateExtensions.end()) {
// we have not computed this set of translation options yet
const TranslationOptionList *topts =
curHypo.GetManager().getSntTranslationOptions()->GetTranslationOptionList(spanStart, spanEnd);
const InputType& input = curHypo.GetManager().GetSource();
Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored();
// extract target context features
size_t contextHash = prevVWState.hash();
FeatureVectorMap &contextFeaturesCache = *m_tlsTargetContextFeatures->GetStored();
FeatureVectorMap::const_iterator contextIt = contextFeaturesCache.find(contextHash);
if (contextIt == contextFeaturesCache.end()) {
// we have not extracted features for this context yet
const Phrase &targetContext = prevVWState.GetPhrase();
Discriminative::FeatureVector contextVector;
const AlignmentInfo *alignInfo = TransformAlignmentInfo(curHypo, targetContext.GetSize());
for(size_t i = 0; i < contextFeatures.size(); ++i)
(*contextFeatures[i])(input, targetContext, *alignInfo, classifier, contextVector);
contextFeaturesCache[contextHash] = contextVector;
VERBOSE(3, "VW :: context cache miss\n");
} else {
// context already in cache, simply put feature IDs in the classifier object
classifier.AddLabelIndependentFeatureVector(contextIt->second);
VERBOSE(3, "VW :: context cache hit\n");
}
std::vector<float> losses(topts->size());
for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) {
const TranslationOption *topt = topts->Get(toptIdx);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
size_t toptHash = hash_value(*topt);
// start with pre-computed source-context-only VW scores
losses[toptIdx] = m_tlsFutureScores->GetStored()->find(toptHash)->second;
// add all features associated with this translation option
// (pre-computed when evaluated with source context)
const Discriminative::FeatureVector &targetFeatureVector =
m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second;
classifier.AddLabelDependentFeatureVector(targetFeatureVector);
// add classifier score with context+target features only to the total loss
losses[toptIdx] += classifier.Predict(MakeTargetLabel(targetPhrase));
}
// normalize classifier scores to get a probability distribution
(*m_normalizer)(losses);
// fill our cache with the results
FloatHashMap &toptScores = computedStateExtensions[cacheKey];
for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) {
const TranslationOption *topt = topts->Get(toptIdx);
size_t toptHash = hash_value(*topt);
toptScores[toptHash] = FloorScore(TransformScore(losses[toptIdx]));
}
VERBOSE(3, "VW :: cache miss\n");
} else {
VERBOSE(3, "VW :: cache hit\n");
}
// now our cache is guaranteed to contain the required score, simply look it up
std::vector<float> newScores(m_numScoreComponents);
size_t toptHash = hash_value(curHypo.GetTranslationOption());
newScores[0] = computedStateExtensions[cacheKey][toptHash];
VERBOSE(3, "VW :: adding score: " << newScores[0] << "\n");
accumulator->PlusEquals(this, newScores);
return new VWState(prevVWState, curHypo);
}
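// the initial decoder state: a target-side context consisting entirely of
// beginning-of-sentence markers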
const FFState* VW::EmptyHypothesisState(const InputType &input) const
{
size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription());
Phrase initialPhrase;
for (size_t i = 0; i < maxContextSize; i++)
initialPhrase.AddWord(m_sentenceStartWord);
return new VWState(initialPhrase);
}
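// evaluate all translation options for one source span; in training mode,
// extracted features are fed to VW as training examples, in decoding mode
// the trained classifier scores each option (the score is final unless
// target-context features are in use, in which case only a partial "future"
// score is cached here and completed in EvaluateWhenApplied())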
void VW::EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{
Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored();
if (translationOptionList.size() == 0)
return; // nothing to do
VERBOSE(3, "VW :: Evaluating translation options\n");
// which feature functions do we use (on the source and target side)
const std::vector<VWFeatureBase*>& sourceFeatures =
VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription());
const std::vector<VWFeatureBase*>& contextFeatures =
VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription());
const std::vector<VWFeatureBase*>& targetFeatures =
VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription());
// only use stateful score computation when needed
bool haveTargetContextFeatures = ! contextFeatures.empty();
const Range &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange();
if (m_train) {
//
// extract features for training the classifier (only call this when using vwtrainer, not in Moses!)
//
// find which topts are correct
std::vector<bool> correct(translationOptionList.size());
std::vector<int> startsAt(translationOptionList.size());
std::set<int> uncoveredStartingPositions;
for (size_t i = 0; i < translationOptionList.size(); i++) {
      std::pair<bool, int> isCorrect = IsCorrectTranslationOption(*translationOptionList.Get(i));
correct[i] = isCorrect.first;
startsAt[i] = isCorrect.second;
if (isCorrect.first) {
uncoveredStartingPositions.insert(isCorrect.second);
}
}
// optionally update translation options using leave-one-out
std::vector<bool> keep = (m_leaveOneOut.size() > 0)
? LeaveOneOut(translationOptionList, correct)
: std::vector<bool>(translationOptionList.size(), true);
while (! uncoveredStartingPositions.empty()) {
int currentStart = *uncoveredStartingPositions.begin();
uncoveredStartingPositions.erase(uncoveredStartingPositions.begin());
// check whether we (still) have some correct translation
int firstCorrect = -1;
for (size_t i = 0; i < translationOptionList.size(); i++) {
if (keep[i] && correct[i] && startsAt[i] == currentStart) {
firstCorrect = i;
break;
}
}
// do not train if there are no positive examples
if (firstCorrect == -1) {
VERBOSE(3, "VW :: skipping topt collection, no correct translation for span at current tgt start position\n");
continue;
}
// the first correct topt can be used by some loss functions
const TargetPhrase &correctPhrase = translationOptionList.Get(firstCorrect)->GetTargetPhrase();
      // feature extraction *at prediction time* outputs feature hashes which can be cached;
      // this is training time, so we simply store everything in this dummyVector
Discriminative::FeatureVector dummyVector;
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, sourceRange, classifier, dummyVector);
// build target-side context
Phrase targetContext;
for (size_t i = 0; i < maxContextSize; i++)
targetContext.AddWord(m_sentenceStartWord);
const Phrase *targetSent = GetStored()->m_sentence;
// word alignment info shifted by context size
AlignmentInfo contextAlignment = TransformAlignmentInfo(*GetStored()->m_alignment, maxContextSize, currentStart);
if (currentStart > 0)
targetContext.Append(targetSent->GetSubString(Range(0, currentStart - 1)));
// extract target-context features
for(size_t i = 0; i < contextFeatures.size(); ++i)
(*contextFeatures[i])(input, targetContext, contextAlignment, classifier, dummyVector);
// go over topts, extract target side features and train the classifier
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
// this topt was discarded by leaving one out
if (! keep[toptIdx])
continue;
// extract target-side features for each topt
const TargetPhrase &targetPhrase = translationOptionList.Get(toptIdx)->GetTargetPhrase();
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, targetPhrase, classifier, dummyVector);
bool isCorrect = correct[toptIdx] && startsAt[toptIdx] == currentStart;
float loss = (*m_trainingLoss)(targetPhrase, correctPhrase, isCorrect);
// train classifier on current example
classifier.Train(MakeTargetLabel(targetPhrase), loss);
}
}
} else {
//
// predict using a trained classifier, use this in decoding (=at test time)
//
std::vector<float> losses(translationOptionList.size());
Discriminative::FeatureVector outFeaturesSourceNamespace;
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, sourceRange, classifier, outFeaturesSourceNamespace);
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
const TranslationOption *topt = translationOptionList.Get(toptIdx);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
Discriminative::FeatureVector outFeaturesTargetNamespace;
// extract target-side features for each topt
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, targetPhrase, classifier, outFeaturesTargetNamespace);
// cache the extracted target features (i.e. features associated with given topt)
// for future use at decoding time
size_t toptHash = hash_value(*topt);
m_tlsTranslationOptionFeatures->GetStored()->insert(
std::make_pair(toptHash, outFeaturesTargetNamespace));
// get classifier score
losses[toptIdx] = classifier.Predict(MakeTargetLabel(targetPhrase));
}
// normalize classifier scores to get a probability distribution
std::vector<float> rawLosses = losses;
(*m_normalizer)(losses);
// update scores of topts
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
TranslationOption *topt = *(translationOptionList.begin() + toptIdx);
if (! haveTargetContextFeatures) {
// no target context features; evaluate the FF now
std::vector<float> newScores(m_numScoreComponents);
newScores[0] = FloorScore(TransformScore(losses[toptIdx]));
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.PlusEquals(this, newScores);
topt->UpdateScore();
} else {
// We have target context features => this is just a partial score,
// do not add it to the score component collection.
size_t toptHash = hash_value(*topt);
// Subtract the score contribution of target-only features, otherwise it would
// be included twice.
Discriminative::FeatureVector emptySource;
const Discriminative::FeatureVector &targetFeatureVector =
m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second;
classifier.AddLabelIndependentFeatureVector(emptySource);
classifier.AddLabelDependentFeatureVector(targetFeatureVector);
float targetOnlyLoss = classifier.Predict(VW_DUMMY_LABEL);
float futureScore = rawLosses[toptIdx] - targetOnlyLoss;
m_tlsFutureScores->GetStored()->insert(std::make_pair(toptHash, futureScore));
}
}
}
}
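// parse feature-function arguments from the moses.ini line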
void VW::SetParameter(const std::string& key, const std::string& value)
{
if (key == "train") {
m_train = Scan<bool>(value);
} else if (key == "path") {
m_modelPath = value;
} else if (key == "vw-options") {
m_vwOptions = value;
} else if (key == "leave-one-out-from") {
m_leaveOneOut = value;
} else if (key == "training-loss") {
// which type of loss to use for training
if (value == "basic") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
} else if (value == "bleu") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBLEU();
} else {
UTIL_THROW2("Unknown training loss type:" << value);
}
} else if (key == "loss") {
    // which normalizer to use (theoretically depends on the loss function used for training the
    // classifier: squared/logistic/hinge/..., hence the name "loss")
if (value == "logistic") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
} else if (value == "squared") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer();
} else {
UTIL_THROW2("Unknown loss type:" << value);
}
} else {
StatefulFeatureFunction::SetParameter(key, value);
}
}
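// called before each new input sentence: reset the per-sentence caches and,
// in training mode, read the target sentence and its word alignment from the
// tabbed input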
void VW::InitializeForInput(ttasksptr const& ttask)
{
// do not keep future cost estimates across sentences!
m_tlsFutureScores->GetStored()->clear();
// invalidate our caches after each sentence
m_tlsComputedStateExtensions->GetStored()->clear();
  // clearing these caches is not strictly necessary (identical target
  // contexts and translation options would produce identical features the
  // next time we extract them) but we clear them anyway to keep them from
  // growing indefinitely large
m_tlsTargetContextFeatures->GetStored()->clear();
m_tlsTranslationOptionFeatures->GetStored()->clear();
InputType const& source = *(ttask->GetSource().get());
// tabbed sentence is assumed only in training
if (! m_train)
return;
UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput,
"This feature function requires the TabbedSentence input type");
const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source);
UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2,
"TabbedSentence must contain target<tab>alignment");
// target sentence represented as a phrase
Phrase *target = new Phrase();
target->CreateFromString(
Output
, StaticData::Instance().options()->output.factor_order
, tabbedSentence.GetColumns()[0]
, NULL);
// word alignment between source and target sentence
// we don't store alignment info in AlignmentInfoCollection because we keep alignments of whole
// sentences, not phrases
AlignmentInfo *alignment = new AlignmentInfo(tabbedSentence.GetColumns()[1]);
VWTargetSentence &targetSent = *GetStored();
targetSent.Clear();
targetSent.m_sentence = target;
targetSent.m_alignment = alignment;
// pre-compute max- and min- aligned points for faster translation option checking
targetSent.SetConstraints(source.GetSize());
}
/*************************************************************************************
* private methods
************************************************************************************/
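// build word alignment information for the target-side context of the current
// hypothesis by walking backwards over the previous hypotheses, mapping their
// term alignment points to context-relative positions (the rightmost context
// word gets index contextSize - 1)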
const AlignmentInfo *VW::TransformAlignmentInfo(const Hypothesis &curHypo, size_t contextSize) const
{
std::set<std::pair<size_t, size_t> > alignmentPoints;
const Hypothesis *contextHypo = curHypo.GetPrevHypo();
int idxInContext = contextSize - 1;
int processedWordsInHypo = 0;
while (idxInContext >= 0 && contextHypo) {
int idxInHypo = contextHypo->GetCurrTargetLength() - 1 - processedWordsInHypo;
if (idxInHypo >= 0) {
const AlignmentInfo &hypoAlign = contextHypo->GetCurrTargetPhrase().GetAlignTerm();
std::set<size_t> alignedToTgt = hypoAlign.GetAlignmentsForTarget(idxInHypo);
size_t srcOffset = contextHypo->GetCurrSourceWordsRange().GetStartPos();
BOOST_FOREACH(size_t srcIdx, alignedToTgt) {
alignmentPoints.insert(std::make_pair(srcOffset + srcIdx, idxInContext));
}
processedWordsInHypo++;
idxInContext--;
} else {
processedWordsInHypo = 0;
contextHypo = contextHypo->GetPrevHypo();
}
}
return AlignmentInfoCollection::Instance().Add(alignmentPoints);
}
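// training-time variant: restrict the sentence-level alignment to the (up to)
// contextSize target words preceding currentStart, shifting target indices by
// contextSize to account for the <S> padding prepended to the context phrase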
AlignmentInfo VW::TransformAlignmentInfo(const AlignmentInfo &alignInfo, size_t contextSize, int currentStart) const
{
std::set<std::pair<size_t, size_t> > alignmentPoints;
for (int i = std::max(0, currentStart - (int)contextSize); i < currentStart; i++) {
std::set<size_t> alignedToTgt = alignInfo.GetAlignmentsForTarget(i);
BOOST_FOREACH(size_t srcIdx, alignedToTgt) {
alignmentPoints.insert(std::make_pair(srcIdx, i + contextSize));
}
}
return AlignmentInfo(alignmentPoints);
}
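// determine whether a translation option matches the reference translation:
// use the pre-computed alignment constraints to find the target span aligned
// to the source span (extended over adjacent unaligned words) and search for
// the target phrase within that window; returns (true, start position in the
// target sentence) on a match, (false, -1) otherwise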
std::pair<bool, int> VW::IsCorrectTranslationOption(const TranslationOption &topt) const
{
//std::cerr << topt.GetSourceWordsRange() << std::endl;
int sourceStart = topt.GetSourceWordsRange().GetStartPos();
int sourceEnd = topt.GetSourceWordsRange().GetEndPos();
const VWTargetSentence &targetSentence = *GetStored();
// [targetStart, targetEnd] spans aligned target words
int targetStart = targetSentence.m_sentence->GetSize();
int targetEnd = -1;
// get the left-most and right-most alignment point within source span
for(int i = sourceStart; i <= sourceEnd; ++i) {
if(targetSentence.m_sourceConstraints[i].IsSet()) {
if(targetStart > targetSentence.m_sourceConstraints[i].GetMin())
targetStart = targetSentence.m_sourceConstraints[i].GetMin();
if(targetEnd < targetSentence.m_sourceConstraints[i].GetMax())
targetEnd = targetSentence.m_sourceConstraints[i].GetMax();
}
}
// there was no alignment
if(targetEnd == -1)
return std::make_pair(false, -1);
//std::cerr << "Shorter: " << targetStart << " " << targetEnd << std::endl;
// [targetStart2, targetEnd2] spans unaligned words left and right of [targetStart, targetEnd]
int targetStart2 = targetStart;
for(int i = targetStart2; i >= 0 && !targetSentence.m_targetConstraints[i].IsSet(); --i)
targetStart2 = i;
int targetEnd2 = targetEnd;
for(int i = targetEnd2;
i < targetSentence.m_sentence->GetSize() && !targetSentence.m_targetConstraints[i].IsSet();
++i)
targetEnd2 = i;
//std::cerr << "Longer: " << targetStart2 << " " << targetEnd2 << std::endl;
const TargetPhrase &tphrase = topt.GetTargetPhrase();
//std::cerr << tphrase << std::endl;
// if target phrase is shorter than inner span return false
if(tphrase.GetSize() < targetEnd - targetStart + 1)
return std::make_pair(false, -1);
// if target phrase is longer than outer span return false
if(tphrase.GetSize() > targetEnd2 - targetStart2 + 1)
return std::make_pair(false, -1);
// for each possible starting point
for(int tempStart = targetStart2; tempStart <= targetStart; tempStart++) {
bool found = true;
// check if the target phrase is within longer span
for(int i = tempStart; i <= targetEnd2 && i < tphrase.GetSize() + tempStart; ++i) {
if(tphrase.GetWord(i - tempStart) != targetSentence.m_sentence->GetWord(i)) {
found = false;
break;
}
}
// return true if there was a match
if(found) {
//std::cerr << "Found" << std::endl;
return std::make_pair(true, tempStart);
}
}
return std::make_pair(false, -1);
}
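// leave-one-out for training: discount the current training example from the
// raw phrase-table counts (read from the "Counts" phrase property), recompute
// the translation probabilities of the feature function named by m_leaveOneOut
// and discard options whose joint count drops to zero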
std::vector<bool> VW::LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const
{
UTIL_THROW_IF2(m_leaveOneOut.size() == 0 || ! m_train, "LeaveOneOut called in wrong setting!");
float sourceRawCount = 0.0;
  const float ONE = 1.0001; // slightly over 1 so that subtracting counts below is robust to floating-point rounding
std::vector<bool> keepOpt;
for (size_t i = 0; i < topts.size(); i++) {
TranslationOption *topt = *(topts.begin() + i);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
// extract raw counts from phrase-table property
const CountsPhraseProperty *property =
static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts"));
if (! property) {
VERBOSE(2, "VW :: Counts not found for topt! Is this an OOV?\n");
// keep all translation opts without updating, this is either OOV or bad usage...
keepOpt.assign(topts.size(), true);
return keepOpt;
}
if (sourceRawCount == 0.0) {
sourceRawCount = property->GetSourceMarginal() - ONE; // discount one occurrence of the source phrase
if (sourceRawCount <= 0) {
// no translation options survived, source phrase was a singleton
keepOpt.assign(topts.size(), false);
return keepOpt;
}
}
float discount = correct[i] ? ONE : 0.0;
float target = property->GetTargetMarginal() - discount;
float joint = property->GetJointCount() - discount;
if (discount != 0.0) VERBOSE(3, "VW :: leaving one out!\n");
if (joint > 0) {
// topt survived leaving one out, update its scores
const FeatureFunction *feature = &FindFeatureFunction(m_leaveOneOut);
std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(feature);
UTIL_THROW_IF2(scores.size() != 4, "Unexpected number of scores in feature " << m_leaveOneOut);
scores[0] = TransformScore(joint / target); // P(f|e)
scores[2] = TransformScore(joint / sourceRawCount); // P(e|f)
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.Assign(feature, scores);
topt->UpdateScore();
keepOpt.push_back(true);
} else {
      // the source and target phrase only occurred together once, discard topt
VERBOSE(2, "VW :: discarded topt when leaving one out\n");
keepOpt.push_back(false);
}
}
return keepOpt;
}
} // namespace Moses