|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "ParallelBackoff.h" |
|
|
|
#include <vector> |
|
#include <string> |
|
#include <sstream> |
|
#include <fstream> |
|
|
|
#include "MultiFactor.h" |
|
#include "moses/Word.h" |
|
#include "moses/Factor.h" |
|
#include "moses/FactorTypeSet.h" |
|
#include "moses/FactorCollection.h" |
|
#include "moses/Phrase.h" |
|
#include "moses/TypeDef.h" |
|
#include "moses/Util.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef __APPLE__ |
|
#define HAVE_ZOPEN |
|
#endif |
|
|
|
#include "FNgramSpecs.h" |
|
#include "FNgramStats.h" |
|
#include "FactoredVocab.h" |
|
#include "FNgram.h" |
|
#include "wmatrix.h" |
|
#include "Vocab.h" |
|
#include "File.h" |
|
|
|
using namespace std; |
|
|
|
namespace Moses |
|
{ |
|
|
|
namespace |
|
{ |
|
class LanguageModelParallelBackoff : public LanguageModelMultiFactor |
|
{ |
|
private: |
|
std::vector<FactorType> m_factorTypesOrdered; |
|
|
|
FactoredVocab *m_srilmVocab; |
|
FNgram *m_srilmModel; |
|
VocabIndex m_unknownId; |
|
VocabIndex m_wtid; |
|
VocabIndex m_wtbid; |
|
VocabIndex m_wteid; |
|
FNgramSpecs<FNgramCount>* fnSpecs; |
|
|
|
std::map<size_t, VocabIndex>* lmIdMap; |
|
std::fstream* debugStream; |
|
|
|
WidMatrix *widMatrix; |
|
|
|
public: |
|
LanguageModelParallelBackoff(const std::string &line) |
|
:LanguageModelMultiFactor(line) { |
|
} |
|
|
|
~LanguageModelParallelBackoff(); |
|
|
|
bool Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder); |
|
|
|
VocabIndex GetLmID( const std::string &str ) const; |
|
|
|
VocabIndex GetLmID( const Factor *factor, FactorType ft ) const; |
|
|
|
void CreateFactors(); |
|
|
|
LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const; |
|
const FFState *GetNullContextState() const; |
|
const FFState *GetBeginSentenceState() const; |
|
FFState *NewState(const FFState *from) const; |
|
}; |
|
|
|
LanguageModelParallelBackoff::~LanguageModelParallelBackoff() |
|
{ |
|
|
|
} |
|
|
|
|
|
bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder) |
|
{ |
|
|
|
cerr << "Loading Language Model Parallel Backoff!!!\n"; |
|
widMatrix = new ::WidMatrix(); |
|
m_factorTypes = FactorMask(factorTypes); |
|
m_srilmVocab = new ::FactoredVocab(); |
|
|
|
|
|
fnSpecs = 0; |
|
File f(filePath.c_str(),"r"); |
|
fnSpecs = new ::FNgramSpecs<FNgramCount>(f,*m_srilmVocab, 0); |
|
|
|
cerr << "Loaded fnSpecs!\n"; |
|
|
|
m_srilmVocab->unkIsWord() = true; |
|
m_srilmVocab->nullIsWord() = true; |
|
m_srilmVocab->toLower() = false; |
|
|
|
FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs); |
|
|
|
factoredStats->debugme(2); |
|
|
|
cerr << "Factored stats\n"; |
|
|
|
FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs); |
|
|
|
cerr << "FNgram object created\n"; |
|
|
|
fngramLM->skipOOVs = false; |
|
|
|
if (!factoredStats->read()) { |
|
cerr << "error reading in counts in factor file\n"; |
|
exit(1); |
|
} |
|
|
|
cerr << "Factored stats read!\n"; |
|
|
|
factoredStats->estimateDiscounts(); |
|
factoredStats->computeCardinalityFunctions(); |
|
factoredStats->sumCounts(); |
|
|
|
cerr << "Another three operations made!\n"; |
|
|
|
if (!fngramLM->read()) { |
|
cerr << "format error in lm file\n"; |
|
exit(1); |
|
} |
|
|
|
cerr << "fngramLM reads!\n"; |
|
|
|
m_filePath = filePath; |
|
m_nGramOrder= nGramOrder; |
|
|
|
m_factorTypesOrdered= factorTypes; |
|
|
|
m_unknownId = m_srilmVocab->unkIndex(); |
|
|
|
cerr << "m_unknowdId = " << m_unknownId << endl; |
|
|
|
m_srilmModel = fngramLM; |
|
|
|
cerr << "Create factors...\n"; |
|
|
|
CreateFactors(); |
|
|
|
cerr << "Factors created! \n"; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return true; |
|
} |
|
|
|
/**
 * Looks up @p str in the SRILM vocabulary.
 *
 * @param str  vocabulary string to resolve
 * @return the result of FactoredVocab::getIndex(), called with m_unknownId
 *         as its second argument
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const std::string &str ) const
{
  const char *token = str.c_str();
  const VocabIndex lmId = m_srilmVocab->getIndex(token, m_unknownId);
  return lmId;
}
|
|
|
/**
 * Resolves a factor to its SRILM vocabulary index via lmIdMap.
 *
 * @param factor  factor whose id forms the upper part of the lookup key
 * @param ft      position added to factor id * 10 to form the key
 * @return the mapped index, or m_unknownId when the key is not in the map
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const Factor *factor, size_t ft ) const
{
  const size_t key = factor->GetId() * 10 + ft;
  const std::map<size_t, VocabIndex>::const_iterator hit = lmIdMap->find(key);
  return (hit == lmIdMap->end()) ? m_unknownId : hit->second;
}
|
|
|
void LanguageModelParallelBackoff::CreateFactors() |
|
{ |
|
|
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance(); |
|
|
|
lmIdMap = new std::map<size_t, VocabIndex>(); |
|
|
|
|
|
VocabString str; |
|
VocabIter iter(*m_srilmVocab); |
|
|
|
iter.init(); |
|
|
|
size_t pomFactorTypeNum = 0; |
|
|
|
|
|
while ( (str = iter.next()) != NULL) { |
|
|
|
if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') { |
|
continue; |
|
} |
|
VocabIndex lmId = GetLmID(str); |
|
pomFactorTypeNum = str[0] - 'a'; |
|
|
|
size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId(); |
|
(*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId; |
|
} |
|
|
|
size_t factorIdStart; |
|
size_t factorIdEnd; |
|
|
|
|
|
for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) { |
|
FactorType factorType = m_factorTypesOrdered[index]; |
|
m_sentenceStartWord[index] = factorCollection.AddFactor(Output, factorType, BOS_); |
|
|
|
|
|
m_sentenceEndWord[index] = factorCollection.AddFactor(Output, factorType, EOS_); |
|
|
|
factorIdStart = m_sentenceStartWord[index]->GetId(); |
|
factorIdEnd = m_sentenceEndWord[index]->GetId(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_); |
|
(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_); |
|
|
|
cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl; |
|
|
|
} |
|
|
|
m_wtid = GetLmID("W-<unk>"); |
|
m_wtbid = GetLmID("W-<s>"); |
|
m_wteid = GetLmID("W-</s>"); |
|
|
|
cerr << "W-<unk> index: " << m_wtid << endl; |
|
cerr << "W-<s> index: " << m_wtbid << endl; |
|
cerr << "W-</s> index: " << m_wteid << endl; |
|
|
|
|
|
} |
|
|
|
/**
 * Scores the last word of @p contextFactor given the preceding words.
 *
 * Each word becomes one row of a word-id matrix: column 0 holds the "W-"
 * marker id (sentence start, sentence end, or the generic one), and column
 * j + 1 holds the LM id of the j-th configured factor (0 when the word has
 * no such factor). The matrix is handed to FNgram::wordProb().
 *
 * @param contextFactor  words of the n-gram, the scored word last
 * @return transformed and floored score plus the unknown-word flag; for an
 *         empty context the score is 0 and the flag is false
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & ) const
{
  const size_t contextSize = contextFactor.size();

  // With no words, contextSize - 1 would wrap around and be passed to
  // wordProb() as the position to score; there is nothing to score.
  if (contextSize == 0) {
    LMResult empty;
    empty.score = 0.0;
    empty.unknown = false;
    return empty;
  }

  // One matrix shared by every call of this function.
  // NOTE(review): being a function-local static, it is shared between
  // threads as well - confirm callers never score concurrently.
  static WidMatrix widMatrix;

  const size_t numFactors = m_factorTypesOrdered.size();

  // The boundary ids are the same for every row, so resolve them once.
  const VocabIndex sentenceStartId = GetLmID(m_sentenceStartWord[0], 0);
  const VocabIndex sentenceEndId = GetLmID(m_sentenceEndWord[0], 0);

  for (size_t i = 0; i < contextSize; ++i) {
    ::memset(widMatrix[i], 0, (numFactors + 1) * sizeof(VocabIndex));

    const Word &word = *contextFactor[i];

    for (size_t j = 0; j < numFactors; ++j) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    // Column 0 is chosen from the id of the first factor.
    if (widMatrix[i][1] == sentenceStartId) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == sentenceEndId) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }

  LMResult ret;
  ret.score = m_srilmModel->wordProb( widMatrix, contextSize - 1, contextSize );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = (widMatrix[contextSize - 1][0] == m_unknownId);
  return ret;
}
|
|
|
|
|
/**
 * This model produces no state objects.
 *
 * @return always NULL; the argument is ignored
 */
FFState *LanguageModelParallelBackoff::NewState(const FFState * /* from */) const
{
  FFState *const noState = NULL;
  return noState;
}
|
|
|
/**
 * This model keeps no state for the empty context.
 *
 * @return always NULL
 */
const FFState *LanguageModelParallelBackoff::GetNullContextState() const
{
  const FFState *const noState = NULL;
  return noState;
}
|
|
|
/**
 * This model keeps no state for the beginning of a sentence.
 *
 * @return always NULL
 */
const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
{
  const FFState *const noState = NULL;
  return noState;
}
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|