/*
 * LanguageModel.cpp
 *
 *  Created on: 29 Oct 2015
 *      Author: hieu
 */
#include <vector>
#include "LanguageModel.h"
#include "../Phrase.h"
#include "../System.h"
#include "../PhraseBased/Manager.h"
#include "../PhraseBased/Hypothesis.h"
#include "../PhraseBased/TargetPhraseImpl.h"
#include "../FF/PointerState.h"
#include "../legacy/Util2.h"
#include "../legacy/InputFileStream.h"
#include "../legacy/Bitmap.h"

using namespace std;

namespace Moses2
{

struct LMState: public PointerState
{
  LMState() :
      PointerState()
  {
    // uninitialised
  }

  void Set(MemPool &pool, void *lms, const std::vector<const Factor*> &context)
  {
    lmstate = lms;

    numWords = context.size();
    lastWords = (const Factor**) pool.Allocate(
        sizeof(const Factor*) * numWords);
    for (size_t i = 0; i < numWords; ++i) {
      lastWords[i] = context[i];
    }
  }

  void Init(MemPool &pool, const Factor *factor)
  {
    lmstate = NULL;

    numWords = 1;
    lastWords = (const Factor**) pool.Allocate(sizeof(const Factor*));
    lastWords[0] = factor;
  }

  size_t numWords;
  const Factor** lastWords;
};

////////////////////////////////////////////////////////////////////////////////////////
LanguageModel::LanguageModel(size_t startInd, const std::string &line) :
    StatefulFeatureFunction(startInd, line), m_oov(-100)
{
  ReadParameters();
}

LanguageModel::~LanguageModel()
{
  // TODO Auto-generated destructor stub
}

void LanguageModel::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  size_t lineNum = 0;
  string line;
  while (getline(infile, line)) {
    if (++lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    // each data line: prob <tab> ngram [<tab> backoff]
    vector<string> substrings = Tokenize(line, "\t");

    if (substrings.size() < 2) continue;

    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));

    if (substrings[1] == "<unk>") {
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram, stored in reverse (most recent word first) to match query order
    vector<string> key = Tokenize(substrings[1], " ");

    vector<const Factor*> factorKey(key.size());
    for (size_t i = 0; i < key.size(); ++i) {
      factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false);
    }

    m_root.insert(factorKey, LMScores(prob, backoff));
  }
}

void LanguageModel::SetParameter(const std::string& key,
    const std::string& value)
{
  if (key == "path") {
    m_path = value;
  }
  else if (key == "factor") {
    m_factorType = Scan<FactorType>(value);
  }
  else if (key == "order") {
    m_order = Scan<size_t>(value);
  }
  else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}

FFState* LanguageModel::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<LMState>()) LMState();
}

void LanguageModel::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  LMState &stateCast = static_cast<LMState&>(state);

  MemPool &pool = mgr.GetPool();
  stateCast.Init(pool, m_bos);
}

void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase,
    Scores &scores, SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;
  SCORE nonFullScore = 0;
  vector<const Factor*> context;
//  context.push_back(m_bos);
  context.reserve(m_order);

  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      std::pair<SCORE, void*> fromScoring = Score(context);
      score += fromScoring.first;
    }
    else {
      std::pair<SCORE, void*> fromScoring = Score(context);
      nonFullScore += fromScoring.first;
    }
  }

  scores.PlusEquals(system, *this, score);

  SCORE weightedScore = Scores::CalcWeightedScore(system, *this, nonFullScore);
  estimatedScore += weightedScore;
}

void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<SCFG::Word> &source, const TargetPhrase<SCFG::Word> &targetPhrase,
    Scores &scores, SCORE &estimatedScore) const
{
}

void LanguageModel::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  const LMState &prevLMState = static_cast<const LMState&>(prevState);
  size_t numWords = prevLMState.numWords;

  // context is held backwards
  vector<const Factor*> context(numWords);
  for (size_t i = 0; i < numWords; ++i) {
    context[i] = prevLMState.lastWords[i];
  }

  //DebugContext(context);

  SCORE score = 0;
  std::pair<SCORE, void*> fromScoring;
  const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();
  for (size_t i = 0; i < tp.GetSize(); ++i) {
    const Word &word = tp[i];
    const Factor *factor = word[m_factorType];
    ShiftOrPush(context, factor);
    fromScoring = Score(context);
    score += fromScoring.first;
  }

  const Bitmap &bm = hypo.GetBitmap();
  if (bm.IsComplete()) {
    // everything translated
    ShiftOrPush(context, m_eos);
    fromScoring = Score(context);
    score += fromScoring.first;
    fromScoring.second = NULL;
    context.clear();
  }
  else {
    assert(context.size());
    if (context.size() == m_order) {
      context.resize(context.size() - 1);
    }
  }

  scores.PlusEquals(mgr.system, *this, score);

  // return state
  //DebugContext(context);
  LMState &stateCast = static_cast<LMState&>(state);
  MemPool &pool = mgr.GetPool();
  stateCast.Set(pool, fromScoring.second, context);
}

void LanguageModel::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  // context is most-recent-first: grow while below the LM order,
  // shift existing words right and put the new word at the front
  if (context.size() < m_order) {
    context.resize(context.size() + 1);
  }
  assert(context.size());

  for (size_t i = context.size() - 1; i > 0; --i) {
    context[i] = context[i - 1];
  }

  context[0] = factor;
}

std::pair<SCORE, void*> LanguageModel::Score(
    const std::vector<const Factor*> &context) const
{
  //cerr << "context=";
  //DebugContext(context);

  std::pair<SCORE, void*> ret;

  typedef Node<const Factor*, LMScores> LMNode;
  const LMNode *node = m_root.getNode(context);

  if (node) {
    ret.first = node->getValue().prob;
    ret.second = (void*) node;
  }
  else {
    // exact ngram not found: add the history's backoff weight
    // and score the ngram with the oldest word dropped
    SCORE backoff = 0;
    std::vector<const Factor*> backOffContext(context.begin() + 1,
        context.end());
    node = m_root.getNode(backOffContext);
    if (node) {
      backoff = node->getValue().backoff;
    }

    std::vector<const Factor*> newContext(context.begin(), context.end() - 1);
    std::pair<SCORE, void*> newRet = Score(newContext);

    ret.first = backoff + newRet.first;
    ret.second = newRet.second;
  }

  //cerr << "score=" << ret.first << endl;

  return ret;
}

SCORE LanguageModel::BackoffScore(
    const std::vector<const Factor*> &context) const
{
  //cerr << "backoff=";
  //DebugContext(context);

  SCORE ret;
  size_t stoppedAtInd;
  const Node<const Factor*, LMScores> &node = m_root.getNode(context,
      stoppedAtInd);

  if (stoppedAtInd == context.size()) {
    // found entire ngram
    ret = node.getValue().backoff;
  }
  else {
    if (stoppedAtInd == 0) {
      ret = m_oov;
      stoppedAtInd = 1;
    }
    else {
      ret = node.getValue().backoff;
    }

    // recursive
    std::vector<const Factor*> backoff(context.begin() + stoppedAtInd,
        context.end());
    ret += BackoffScore(backoff);
  }

  return ret;
}

void LanguageModel::DebugContext(
    const std::vector<const Factor*> &context) const
{
  for (size_t i = 0; i < context.size(); ++i) {
    cerr << context[i]->GetString() << " ";
  }
  cerr << endl;
}

void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}

}