|
#include <fstream> |
|
#include "GlobalLexicalModel.h" |
|
#include "moses/StaticData.h" |
|
#include "moses/InputFileStream.h" |
|
#include "moses/TranslationOption.h" |
|
#include "moses/TranslationTask.h" |
|
#include "moses/FactorCollection.h" |
|
#include "util/exception.hh" |
|
|
|
using namespace std; |
|
|
|
namespace Moses |
|
{ |
|
// Construct the feature from its moses.ini line; registers one dense score.
// Parses the feature-line parameters, then builds the pseudo-word **BIAS**
// that Load()/ScorePhrase() use as the per-target-word bias feature key.
GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
  : StatelessFeatureFunction(1, line)
{
  std::cerr << "Creating global lexical model...\n";
  ReadParameters();

  // Fix: guard before indexing m_inputFactorsVec[0] below -- a missing
  // "input-factor" parameter previously caused undefined behavior instead
  // of a diagnosable configuration error.
  UTIL_THROW_IF2(m_inputFactorsVec.empty(),
                 "GlobalLexicalModel requires an input-factor parameter");

  // Define the bias pseudo-word on the first input factor; it never occurs
  // in real text, so it can be looked up like any other source word.
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_bias = new Word();
  const Factor* factor = factorCollection.AddFactor( Input, m_inputFactorsVec[0], "**BIAS**" );
  m_bias->SetFactor( m_inputFactorsVec[0], factor );
}
|
|
|
void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value) |
|
{ |
|
if (key == "path") { |
|
m_filePath = value; |
|
} else if (key == "input-factor") { |
|
m_inputFactorsVec = Tokenize<FactorType>(value,","); |
|
} else if (key == "output-factor") { |
|
m_outputFactorsVec = Tokenize<FactorType>(value,","); |
|
} else { |
|
StatelessFeatureFunction::SetParameter(key, value); |
|
} |
|
} |
|
|
|
// Release the lexicon: both the outer keys (target words) and the inner
// keys (source words) were allocated with new in Load() and are owned here.
GlobalLexicalModel::~GlobalLexicalModel()
{
  DoubleHash::const_iterator iter;
  for(iter = m_hash.begin(); iter != m_hash.end(); ++iter ) {
    // SingleHash is the mapped type of DoubleHash (same typedef used in
    // ScorePhrase); pre-increment avoids copying the heavy iterators.
    SingleHash::const_iterator iter2;
    for(iter2 = iter->second.begin(); iter2 != iter->second.end(); ++iter2 ) {
      delete iter2->first;
    }
    delete iter->first;
  }
  // Fix: the bias pseudo-word allocated in the constructor was never freed.
  delete m_bias;
}
|
|
|
// Load the lexicon file into m_hash.
// File format, one entry per line: "<target-word> <source-word> <score>",
// where each word may carry multiple factors joined by the configured
// factor delimiter. Throws on syntax errors (wrong token count, too few
// factors on a word).
void GlobalLexicalModel::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string& oFactorDelimiter = opts->output.factor_delimiter;
  const std::string& iFactorDelimiter = opts->input.factor_delimiter;

  VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);

  m_inputFactors = FactorMask(m_inputFactorsVec);
  m_outputFactors = FactorMask(m_outputFactorsVec);
  InputFileStream inFile(m_filePath);

  size_t lineNum = 0;
  string line;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize<string>(line, " ");

    if (token.size() != 3) {
      UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
    }

    // Build the target (output) word from the first token.
    Word *outWord = new Word();
    vector<string> factorString = Tokenize( token[0], oFactorDelimiter );
    // Fix: validate the factor count before indexing factorString[i] --
    // a malformed entry previously caused an out-of-bounds read.
    if (factorString.size() < m_outputFactorsVec.size()) {
      UTIL_THROW2("Too few output factors at " << m_filePath << ":" << lineNum << ":" << line);
    }
    for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Output;
      const FactorType& factorType = m_outputFactorsVec[i];
      const Factor* factor
      = factorCollection.AddFactor( direction, factorType, factorString[i] );
      outWord->SetFactor( factorType, factor );
    }

    // Build the source (input) word from the second token.
    Word *inWord = new Word();
    factorString = Tokenize( token[1], iFactorDelimiter );
    if (factorString.size() < m_inputFactorsVec.size()) {
      UTIL_THROW2("Too few input factors at " << m_filePath << ":" << lineNum << ":" << line);
    }
    for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Input;
      const FactorType& factorType = m_inputFactorsVec[i];
      const Factor* factor
      = factorCollection.AddFactor( direction, factorType, factorString[i] );
      inWord->SetFactor( factorType, factor );
    }

    float score = Scan<float>(token[2]);

    // Insert, deduplicating keys: the hashes compare words by value, so a
    // repeated target or source word must not be inserted twice -- delete
    // the freshly allocated duplicate instead.
    DoubleHash::iterator keyOutWord = m_hash.find( outWord );
    if( keyOutWord == m_hash.end() ) {
      m_hash[outWord][inWord] = score;
    } else {
      delete outWord; // target word already present
      SingleHash &inner = keyOutWord->second;
      SingleHash::iterator keyInWord = inner.find( inWord );
      if( keyInWord == inner.end() ) {
        inner[inWord] = score;
      } else {
        // Fix: a duplicate (target, source) entry previously leaked the
        // new inWord, since operator[] keeps the existing equal key.
        keyInWord->second = score;
        delete inWord;
      }
    }
  }
}
|
|
|
// Stash the current source sentence in thread-local storage so ScorePhrase()
// can read it. This model needs the whole source sentence, so only plain
// sentence input is supported (not lattices or confusion networks).
void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask)
{
  UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput,
                 "GlobalLexicalModel works only with sentence input.");
  // Fix: use static_cast for the checked downcast from InputType --
  // reinterpret_cast does not perform the pointer adjustment required
  // when the derived class uses multiple inheritance.
  Sentence const* s = static_cast<Sentence const*>(ttask->GetSource().get());
  m_local.reset(new ThreadLocalStorage);
  m_local->input = s;
}
|
|
|
float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const |
|
{ |
|
const Sentence& input = *(m_local->input); |
|
float score = 0; |
|
for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { |
|
float sum = 0; |
|
const Word& targetWord = targetPhrase.GetWord( targetIndex ); |
|
VERBOSE(2,"glm " << targetWord << ": "); |
|
const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord ); |
|
if( targetWordHash != m_hash.end() ) { |
|
SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias ); |
|
if( inputWordHash != targetWordHash->second.end() ) { |
|
VERBOSE(2,"*BIAS* " << inputWordHash->second); |
|
sum += inputWordHash->second; |
|
} |
|
|
|
boost::unordered_set< const Word*, UnorderedComparer<Word>, UnorderedComparer<Word> > alreadyScored; |
|
for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) { |
|
const Word& inputWord = input.GetWord( inputIndex ); |
|
if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) { |
|
SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord ); |
|
if( inputWordHash != targetWordHash->second.end() ) { |
|
VERBOSE(2," " << inputWord << " " << inputWordHash->second); |
|
sum += inputWordHash->second; |
|
} |
|
alreadyScored.insert( &inputWord ); |
|
} |
|
} |
|
} |
|
|
|
VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl); |
|
score += FloorScore( log(1/(1+exp(-sum))) ); |
|
} |
|
return score; |
|
} |
|
|
|
float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const |
|
{ |
|
LexiconCache& m_cache = m_local->cache; |
|
const LexiconCache::const_iterator query = m_cache.find( &targetPhrase ); |
|
if ( query != m_cache.end() ) { |
|
return query->second; |
|
} |
|
|
|
float score = ScorePhrase( targetPhrase ); |
|
m_cache.insert( pair<const TargetPhrase*, float>(&targetPhrase, score) ); |
|
|
|
return score; |
|
} |
|
|
|
void GlobalLexicalModel::EvaluateWithSourceContext(const InputType &input |
|
, const InputPath &inputPath |
|
, const TargetPhrase &targetPhrase |
|
, const StackVec *stackVec |
|
, ScoreComponentCollection &scoreBreakdown |
|
, ScoreComponentCollection *estimatedScores) const |
|
{ |
|
scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) ); |
|
} |
|
|
|
bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const |
|
{ |
|
for (size_t i = 0; i < m_outputFactors.size(); ++i) { |
|
if (m_outputFactors[i]) { |
|
if (!mask[i]) { |
|
return false; |
|
} |
|
} |
|
} |
|
|
|
return true; |
|
} |
|
|
|
} |
|
|