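// Global lexical model: a stateless feature function that scores each word
// of a target phrase against every word of the input sentence plus a
// constant **BIAS** word, summing per-word-pair weights read from a
// plain-text model file and passing each sum through a sigmoid.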
#include <fstream>
#include "GlobalLexicalModel.h"
#include "moses/StaticData.h"
#include "moses/InputFileStream.h"
#include "moses/TranslationOption.h"
#include "moses/TranslationTask.h"
#include "moses/FactorCollection.h"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
: StatelessFeatureFunction(1, line)
{
std::cerr << "Creating global lexical model...\n";
ReadParameters();
// define bias word
FactorCollection &factorCollection = FactorCollection::Instance();
m_bias = new Word();
const Factor* factor = factorCollection.AddFactor( Input, m_inputFactorsVec[0], "**BIAS**" );
m_bias->SetFactor( m_inputFactorsVec[0], factor );
}
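// The **BIAS** word built above acts as an always-present pseudo input
// word: ScorePhrase() looks it up for every target word, so the model file
// can carry a per-target-word intercept alongside the real word pairs.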
void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
{
if (key == "path") {
m_filePath = value;
} else if (key == "input-factor") {
m_inputFactorsVec = Tokenize<FactorType>(value,",");
} else if (key == "output-factor") {
m_outputFactorsVec = Tokenize<FactorType>(value,",");
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
}
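// Illustrative only (not taken from this file): a moses.ini feature line
// exercising the keys handled above might look like
//
//   GlobalLexicalModel name=GLM0 path=/some/model/file input-factor=0 output-factor=0
//
// Unrecognised keys fall through to StatelessFeatureFunction::SetParameter.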
GlobalLexicalModel::~GlobalLexicalModel()
{
// delete words in the hash data structure
DoubleHash::const_iterator iter;
for(iter = m_hash.begin(); iter != m_hash.end(); iter++ ) {
SingleHash::const_iterator iter2;
for(iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++ ) {
delete iter2->first; // delete input word
}
delete iter->first; // delete output word
}
}
void GlobalLexicalModel::Load(AllOptions::ptr const& opts)
{
m_options = opts;
FactorCollection &factorCollection = FactorCollection::Instance();
const std::string& oFactorDelimiter = opts->output.factor_delimiter;
const std::string& iFactorDelimiter = opts->input.factor_delimiter;
VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);
m_inputFactors = FactorMask(m_inputFactorsVec);
m_outputFactors = FactorMask(m_outputFactorsVec);
InputFileStream inFile(m_filePath);
// reading in data one line at a time
size_t lineNum = 0;
string line;
while(getline(inFile, line)) {
++lineNum;
vector<string> token = Tokenize<string>(line, " ");
if (token.size() != 3) { // format checking
UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
}
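// Each line thus holds three space-separated fields: an output (target)
// word, an input (source) word, and a weight, e.g. (hypothetical data):
//   house Haus 0.42
//   house **BIAS** -1.3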
// create the output word
Word *outWord = new Word();
vector<string> factorString = Tokenize( token[0], oFactorDelimiter );
for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
const FactorDirection& direction = Output;
const FactorType& factorType = m_outputFactorsVec[i];
const Factor* factor
= factorCollection.AddFactor( direction, factorType, factorString[i] );
outWord->SetFactor( factorType, factor );
}
// create the input word
Word *inWord = new Word();
factorString = Tokenize( token[1], iFactorDelimiter );
for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
const FactorDirection& direction = Input;
const FactorType& factorType = m_inputFactorsVec[i];
const Factor* factor
= factorCollection.AddFactor( direction, factorType, factorString[i] );
inWord->SetFactor( factorType, factor );
}
// maximum entropy feature score
float score = Scan<float>(token[2]);
// std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl;
// store feature in hash
DoubleHash::iterator keyOutWord = m_hash.find( outWord );
if( keyOutWord == m_hash.end() ) {
m_hash[outWord][inWord] = score;
} else { // an entry for outWord already exists; reuse it and delete the duplicate to avoid a leak
(keyOutWord->second)[inWord] = score;
delete outWord;
}
}
}
void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask)
{
UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput,
"GlobalLexicalModel works only with sentence input.");
Sentence const* s = static_cast<Sentence const*>(ttask->GetSource().get());
m_local.reset(new ThreadLocalStorage);
m_local->input = s;
}
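// Every new translation task swaps in a fresh ThreadLocalStorage, so the
// cached scores and input-sentence pointer of one sentence can never leak
// into the decoding of the next.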
float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
{
const Sentence& input = *(m_local->input);
float score = 0;
for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
float sum = 0;
const Word& targetWord = targetPhrase.GetWord( targetIndex );
VERBOSE(2,"glm " << targetWord << ": ");
const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
if( targetWordHash != m_hash.end() ) {
SingleHash::const_iterator biasHash = targetWordHash->second.find( m_bias );
if( biasHash != targetWordHash->second.end() ) {
VERBOSE(2,"*BIAS* " << biasHash->second);
sum += biasHash->second;
}
boost::unordered_set< const Word*, UnorderedComparer<Word>, UnorderedComparer<Word> > alreadyScored; // do not score a word twice
for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
const Word& inputWord = input.GetWord( inputIndex );
if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
if( inputWordHash != targetWordHash->second.end() ) {
VERBOSE(2," " << inputWord << " " << inputWordHash->second);
sum += inputWordHash->second;
}
alreadyScored.insert( &inputWord );
}
}
}
// Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] )
VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
score += FloorScore( log(1/(1+exp(-sum))) );
}
return score;
}
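// Worked example with hypothetical weights: for target word "house" with
// w(house, Haus) = 0.42 and w(house, **BIAS**) = -1.3, an input sentence
// containing "Haus" gives sum = 0.42 - 1.3 = -0.88, so this word adds
// log(1 / (1 + exp(0.88))) ~= -1.23 to the phrase score.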
float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
{
LexiconCache& cache = m_local->cache;
const LexiconCache::const_iterator query = cache.find( &targetPhrase );
if ( query != cache.end() ) {
return query->second;
}
float score = ScorePhrase( targetPhrase );
cache.insert( pair<const TargetPhrase*, float>(&targetPhrase, score) );
//VERBOSE(2, "add to cache " << targetPhrase << ": " << score << endl);
return score;
}
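// Caching by TargetPhrase address is safe because the cache lives in the
// per-task ThreadLocalStorage reset in InitializeForInput(), so pointers
// cached for one input sentence are never consulted for another.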
void GlobalLexicalModel::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedScores) const
{
scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) );
}
bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
{
for (size_t i = 0; i < m_outputFactors.size(); ++i) {
if (m_outputFactors[i]) {
if (!mask[i]) {
return false;
}
}
}
return true;
}
}