|
|
|
|
|
#include "LexicalReorderingTable.h" |
|
#include "moses/InputFileStream.h" |
|
#include "moses/StaticData.h" |
|
#include "moses/TranslationModel/PhraseDictionary.h" |
|
#include "moses/GenerationDictionary.h" |
|
#include "moses/TargetPhrase.h" |
|
#include "moses/TargetPhraseCollection.h" |
|
#include "moses/TranslationTask.h" |
|
|
|
#if !defined WIN32 || defined __MINGW32__ || defined HAVE_CMPH |
|
#include "moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h" |
|
#endif |
|
|
|
namespace Moses |
|
{ |
|
|
|
|
|
std::string auxClearString(const std::string& str) |
|
{ |
|
int i = 0, j = str.size()-1; |
|
while(i <= j) { |
|
if(' ' != str[i]) { |
|
break; |
|
} else { |
|
++i; |
|
} |
|
} |
|
while(j >= i) { |
|
if(' ' != str[j]) { |
|
break; |
|
} else { |
|
--j; |
|
} |
|
} |
|
return str.substr(i,j-i+1); |
|
} |
|
|
|
void auxAppend(IPhrase& head, const IPhrase& tail) |
|
{ |
|
head.reserve(head.size()+tail.size()); |
|
for(size_t i = 0; i < tail.size(); ++i) { |
|
head.push_back(tail[i]); |
|
} |
|
} |
|
|
|
LexicalReorderingTable* |
|
LexicalReorderingTable:: |
|
LoadAvailable(const std::string& filePath, |
|
const FactorList& f_factors, |
|
const FactorList& e_factors, |
|
const FactorList& c_factors) |
|
{ |
|
|
|
#ifdef HAVE_CMPH |
|
LexicalReorderingTable *compactLexr = NULL; |
|
compactLexr = LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors); |
|
if(compactLexr) |
|
return compactLexr; |
|
#endif |
|
LexicalReorderingTable* ret; |
|
if (FileExists(filePath+".binlexr.idx") ) |
|
ret = new LexicalReorderingTableTree(filePath, f_factors, |
|
e_factors, c_factors); |
|
else |
|
ret = new LexicalReorderingTableMemory(filePath, f_factors, |
|
e_factors, c_factors); |
|
return ret; |
|
} |
|
|
|
LexicalReorderingTableMemory:: |
|
LexicalReorderingTableMemory(const std::string& filePath, |
|
const std::vector<FactorType>& f_factors, |
|
const std::vector<FactorType>& e_factors, |
|
const std::vector<FactorType>& c_factors) |
|
: LexicalReorderingTable(f_factors, e_factors, c_factors) |
|
{ |
|
LoadFromFile(filePath); |
|
} |
|
|
|
LexicalReorderingTableMemory:: |
|
~LexicalReorderingTableMemory() { } |
|
|
|
std::vector<float> |
|
LexicalReorderingTableMemory::GetScore(const Phrase& f, |
|
const Phrase& e, |
|
const Phrase& c) |
|
{ |
|
|
|
|
|
TableType::const_iterator r; |
|
std::string key; |
|
if(0 == c.GetSize()) { |
|
key = MakeKey(f,e,c); |
|
r = m_Table.find(key); |
|
if(m_Table.end() != r) { |
|
return r->second; |
|
} |
|
} else { |
|
|
|
for(size_t i = 0; i <= c.GetSize(); ++i) { |
|
Phrase sub_c(c.GetSubString(Range(i,c.GetSize()-1))); |
|
key = MakeKey(f,e,sub_c); |
|
r = m_Table.find(key); |
|
if(m_Table.end() != r) { |
|
return r->second; |
|
} |
|
} |
|
} |
|
return Scores(); |
|
} |
|
|
|
void |
|
LexicalReorderingTableMemory:: |
|
DbgDump(std::ostream* out) const |
|
{ |
|
TableType::const_iterator i; |
|
for(i = m_Table.begin(); i != m_Table.end(); ++i) { |
|
*out << " key: '" << i->first << "' score: "; |
|
*out << "(num scores: " << (i->second).size() << ")"; |
|
for(size_t j = 0; j < (i->second).size(); ++j) |
|
*out << (i->second)[j] << " "; |
|
|
|
*out << "\n"; |
|
} |
|
}; |
|
|
|
std::string |
|
LexicalReorderingTableMemory::MakeKey(const Phrase& f, |
|
const Phrase& e, |
|
const Phrase& c) const |
|
{ |
|
return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)), |
|
auxClearString(e.GetStringRep(m_FactorsE)), |
|
auxClearString(c.GetStringRep(m_FactorsC))); |
|
} |
|
|
|
std::string |
|
LexicalReorderingTableMemory::MakeKey(const std::string& f, |
|
const std::string& e, |
|
const std::string& c) const |
|
{ |
|
std::string key; |
|
if(!f.empty()) key += f; |
|
if(!m_FactorsE.empty()) { |
|
if(!key.empty()) { |
|
key += "|||"; |
|
} |
|
key += e; |
|
} |
|
if(!m_FactorsC.empty()) { |
|
if(!key.empty()) { |
|
key += "|||"; |
|
} |
|
key += c; |
|
} |
|
return key; |
|
} |
|
|
|
void |
|
LexicalReorderingTableMemory:: |
|
LoadFromFile(const std::string& filePath) |
|
{ |
|
std::string fileName = filePath; |
|
if(!FileExists(fileName) && FileExists(fileName+".gz")) |
|
fileName += ".gz"; |
|
|
|
InputFileStream file(fileName); |
|
std::string line(""), key(""); |
|
int numScores = -1; |
|
std::cerr << "Loading table into memory..."; |
|
while(!getline(file, line).eof()) { |
|
std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||"); |
|
int t = 0 ; |
|
std::string f(""),e(""),c(""); |
|
|
|
if(!m_FactorsF.empty()) { |
|
|
|
f = auxClearString(tokens.at(t)); |
|
++t; |
|
} |
|
if(!m_FactorsE.empty()) { |
|
|
|
e = auxClearString(tokens.at(t)); |
|
++t; |
|
} |
|
if(!m_FactorsC.empty()) { |
|
|
|
c = auxClearString(tokens.at(t)); |
|
++t; |
|
} |
|
|
|
std::vector<float> p = Scan<float>(Tokenize(tokens.at(t))); |
|
|
|
if(-1 == numScores) { |
|
numScores = (int)p.size(); |
|
} |
|
if((int)p.size() != numScores) { |
|
TRACE_ERR( "found inconsistent number of probabilities... found " |
|
<< p.size() << " expected " << numScores << std::endl); |
|
exit(0); |
|
} |
|
std::transform(p.begin(),p.end(),p.begin(),TransformScore); |
|
std::transform(p.begin(),p.end(),p.begin(),FloorScore); |
|
|
|
m_Table[MakeKey(f,e,c)] = p; |
|
} |
|
std::cerr << "done.\n"; |
|
} |
|
|
|
LexicalReorderingTableTree:: |
|
LexicalReorderingTableTree(const std::string& filePath, |
|
const std::vector<FactorType>& f_factors, |
|
const std::vector<FactorType>& e_factors, |
|
const std::vector<FactorType>& c_factors) |
|
: LexicalReorderingTable(f_factors, e_factors, c_factors) |
|
, m_UseCache(false) |
|
, m_FilePath(filePath) |
|
{ |
|
m_Table.reset(new PrefixTreeMap()); |
|
m_Table->Read(m_FilePath+".binlexr"); |
|
} |
|
|
|
LexicalReorderingTableTree:: |
|
~LexicalReorderingTableTree() |
|
{ } |
|
|
|
Scores |
|
LexicalReorderingTableTree:: |
|
GetScore(const Phrase& f, const Phrase& e, const Phrase& c) |
|
{ |
|
if((!m_FactorsF.empty() && 0 == f.GetSize()) |
|
|| (!m_FactorsE.empty() && 0 == e.GetSize())) { |
|
|
|
|
|
|
|
|
|
return Scores(); |
|
} |
|
|
|
CacheType::iterator i; |
|
|
|
if(m_UseCache) { |
|
std::pair<CacheType::iterator, bool> r; |
|
r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates())); |
|
if(!r.second) return auxFindScoreForContext((r.first)->second, c); |
|
i = r.first; |
|
} else if((i = m_Cache.find(MakeCacheKey(f,e))) != m_Cache.end()) |
|
|
|
return auxFindScoreForContext(i->second, c); |
|
|
|
|
|
Candidates cands; |
|
m_Table->GetCandidates(MakeTableKey(f,e), &cands); |
|
if(cands.empty()) return Scores(); |
|
if(m_UseCache) i->second = cands; |
|
|
|
if(m_FactorsC.empty()) { |
|
UTIL_THROW_IF2(1 != cands.size(), "Error"); |
|
return cands[0].GetScore(0); |
|
} else return auxFindScoreForContext(cands, c); |
|
}; |
|
|
|
Scores |
|
LexicalReorderingTableTree:: |
|
auxFindScoreForContext(const Candidates& cands, const Phrase& context) |
|
{ |
|
if(m_FactorsC.empty()) { |
|
UTIL_THROW_IF2(cands.size() > 1, "Error"); |
|
return (cands.size() == 1) ? cands[0].GetScore(0) : Scores(); |
|
} else { |
|
std::vector<std::string> cvec; |
|
for(size_t i = 0; i < context.GetSize(); ++i) |
|
cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false)); |
|
|
|
IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId); |
|
IPhrase sub_c; |
|
IPhrase::iterator start = c.begin(); |
|
for(size_t j = 0; j <= context.GetSize(); ++j, ++start) { |
|
sub_c.assign(start, c.end()); |
|
for(size_t cand = 0; cand < cands.size(); ++cand) { |
|
IPhrase p = cands[cand].GetPhrase(0); |
|
if(cands[cand].GetPhrase(0) == sub_c) |
|
return cands[cand].GetScore(0); |
|
} |
|
} |
|
return Scores(); |
|
} |
|
} |
|
|
|
void |
|
LexicalReorderingTableTree:: |
|
InitializeForInput(ttasksptr const& ttask) |
|
{ |
|
const InputType& input = *ttask->GetSource(); |
|
ClearCache(); |
|
if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)) { |
|
Cache(*cn); |
|
} else if (dynamic_cast<Sentence const*>(&input)) { |
|
|
|
DisableCache(); |
|
} |
|
if (!m_Table.get()) { |
|
|
|
m_Table.reset(new PrefixTreeMap()); |
|
m_Table->Read(m_FilePath+".binlexr"); |
|
} |
|
}; |
|
|
|
bool |
|
LexicalReorderingTableTree:: |
|
Create(std::istream& inFile, const std::string& outFileName) |
|
{ |
|
typedef PrefixTreeSA<LabelId,OFF_T> PSA; |
|
|
|
std::string |
|
line, |
|
ofn(outFileName+".binlexr.srctree"), |
|
oft(outFileName+".binlexr.tgtdata"), |
|
ofi(outFileName+".binlexr.idx"), |
|
ofsv(outFileName+".binlexr.voc0"), |
|
oftv(outFileName+".binlexr.voc1"); |
|
|
|
FILE *os = fOpen(ofn.c_str(),"wb"); |
|
FILE *ot = fOpen(oft.c_str(),"wb"); |
|
|
|
PSA *psa = new PSA; |
|
PSA::setDefault(InvalidOffT); |
|
WordVoc* voc[3]; |
|
|
|
LabelId currFirstWord = InvalidLabelId; |
|
IPhrase currKey; |
|
|
|
Candidates cands; |
|
std::vector<OFF_T> vo; |
|
size_t lnc = 0; |
|
size_t numTokens = 0; |
|
size_t numKeyTokens = 0; |
|
while(getline(inFile, line)) { |
|
++lnc; |
|
if(0 == lnc % 10000) TRACE_ERR("."); |
|
IPhrase key; |
|
Scores score; |
|
|
|
std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||"); |
|
std::string w; |
|
if(1 == lnc) { |
|
|
|
numTokens = tokens.size(); |
|
if(tokens.size() == 2) { |
|
|
|
numKeyTokens = 1; |
|
voc[0] = new WordVoc(); |
|
voc[1] = 0; |
|
} else if(3 == tokens.size() || 4 == tokens.size()) { |
|
|
|
numKeyTokens = 2; |
|
voc[0] = new WordVoc(); |
|
voc[1] = new WordVoc(); |
|
voc[2] = voc[1]; |
|
} |
|
} else { |
|
|
|
UTIL_THROW_IF2(numTokens != tokens.size(), |
|
"Lines do not have the same number of tokens"); |
|
} |
|
size_t phrase = 0; |
|
for(; phrase < numKeyTokens; ++phrase) { |
|
|
|
if(phrase >=1) key.push_back(PrefixTreeMap::MagicWord); |
|
std::istringstream is(tokens[phrase]); |
|
while(is >> w) key.push_back(voc[phrase]->add(w)); |
|
} |
|
|
|
|
|
std::vector<IPhrase> tgt_phrases; |
|
tgt_phrases.resize(numTokens - numKeyTokens - 1); |
|
for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) { |
|
std::istringstream is(tokens[numKeyTokens + j]); |
|
while(is >> w) tgt_phrases[j].push_back(voc[phrase]->add(w)); |
|
} |
|
|
|
|
|
std::istringstream is(tokens[numTokens-1]); |
|
while(is >> w) score.push_back(atof(w.c_str())); |
|
|
|
|
|
std::transform(score.begin(),score.end(),score.begin(),TransformScore); |
|
std::transform(score.begin(),score.end(),score.begin(),FloorScore); |
|
std::vector<Scores> scores; |
|
scores.push_back(score); |
|
|
|
if(key.empty()) { |
|
TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n"); |
|
continue; |
|
} |
|
|
|
|
|
if(currFirstWord == InvalidLabelId) currFirstWord = key[0]; |
|
if(currKey.empty()) { |
|
currKey = key; |
|
|
|
UTIL_THROW_IF2(psa == NULL, "Object not yet created"); |
|
PSA::Data& d = psa->insert(key); |
|
if(d == InvalidOffT) d = fTell(ot); |
|
else { |
|
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" |
|
<< lnc << "): '" << line << "\n"); |
|
return false; |
|
} |
|
} |
|
|
|
if(currKey != key) { |
|
|
|
currKey = key; |
|
|
|
cands.writeBin(ot); |
|
cands.clear(); |
|
|
|
if(key[0] != currFirstWord) { |
|
|
|
PTF pf; |
|
if(currFirstWord >= vo.size()) |
|
vo.resize(currFirstWord+1,InvalidOffT); |
|
vo[currFirstWord] = fTell(os); |
|
pf.create(*psa, os); |
|
delete psa; |
|
psa = new PSA; |
|
currFirstWord = key[0]; |
|
} |
|
|
|
|
|
UTIL_THROW_IF2(psa == NULL, "Object not yet created"); |
|
PSA::Data& d = psa->insert(key); |
|
if(d == InvalidOffT) d = fTell(ot); |
|
else { |
|
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" |
|
<< lnc << "): '" << line << "\n"); |
|
return false; |
|
} |
|
} |
|
cands.push_back(GenericCandidate(tgt_phrases, scores)); |
|
} |
|
if (lnc == 0) { |
|
TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl); |
|
return false; |
|
} |
|
cands.writeBin(ot); |
|
cands.clear(); |
|
|
|
PTF pf; |
|
if(currFirstWord >= vo.size()) |
|
vo.resize(currFirstWord+1,InvalidOffT); |
|
vo[currFirstWord] = fTell(os); |
|
pf.create(*psa,os); |
|
delete psa; |
|
psa=0; |
|
|
|
fClose(os); |
|
fClose(ot); |
|
FILE *oi = fOpen(ofi.c_str(),"wb"); |
|
fWriteVector(oi,vo); |
|
fClose(oi); |
|
|
|
if(voc[0]) { |
|
voc[0]->Write(ofsv); |
|
delete voc[0]; |
|
} |
|
if(voc[1]) { |
|
voc[1]->Write(oftv); |
|
delete voc[1]; |
|
} |
|
return true; |
|
} |
|
|
|
std::string |
|
LexicalReorderingTableTree:: |
|
MakeCacheKey(const Phrase& f, const Phrase& e) const |
|
{ |
|
std::string key; |
|
if(!m_FactorsF.empty()) |
|
key += auxClearString(f.GetStringRep(m_FactorsF)); |
|
|
|
if(!m_FactorsE.empty()) { |
|
if(!key.empty()) { |
|
key += "|||"; |
|
} |
|
key += auxClearString(e.GetStringRep(m_FactorsE)); |
|
} |
|
return key; |
|
}; |
|
|
|
IPhrase |
|
LexicalReorderingTableTree:: |
|
MakeTableKey(const Phrase& f, const Phrase& e) const |
|
{ |
|
IPhrase key; |
|
std::vector<std::string> keyPart; |
|
if(!m_FactorsF.empty()) { |
|
for(size_t i = 0; i < f.GetSize(); ++i) |
|
keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false)); |
|
auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId)); |
|
keyPart.clear(); |
|
} |
|
if(!m_FactorsE.empty()) { |
|
if(!key.empty()) key.push_back(PrefixTreeMap::MagicWord); |
|
for(size_t i = 0; i < e.GetSize(); ++i) |
|
keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false)); |
|
auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId)); |
|
} |
|
return key; |
|
}; |
|
|
|
|
|
struct State { |
|
State(PPimp* t, const std::string& p) |
|
: pos(t), path(p) { } |
|
|
|
PPimp* pos; |
|
std::string path; |
|
}; |
|
|
|
void |
|
LexicalReorderingTableTree:: |
|
auxCacheForSrcPhrase(const Phrase& f) |
|
{ |
|
if(m_FactorsE.empty()) { |
|
|
|
Candidates cands; |
|
m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands); |
|
m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands; |
|
} else { |
|
ObjectPool<PPimp> pool; |
|
PPimp* pPos = m_Table->GetRoot(); |
|
|
|
|
|
for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i) |
|
pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId); |
|
|
|
if(pPos && pPos->isValid()) |
|
pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord); |
|
|
|
if(!pPos || !pPos->isValid()) |
|
return; |
|
|
|
|
|
std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||"; |
|
|
|
std::vector<State> stack; |
|
stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),"")); |
|
Candidates cands; |
|
while(!stack.empty()) { |
|
if(stack.back().pos->isValid()) { |
|
LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx); |
|
std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId); |
|
|
|
m_Table->GetCandidates(*stack.back().pos,&cands); |
|
if(!cands.empty()) m_Cache[cache_key + auxClearString(next_path)] = cands; |
|
cands.clear(); |
|
PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0)); |
|
++stack.back().pos->idx; |
|
stack.push_back(State(next_pos,next_path)); |
|
} else stack.pop_back(); |
|
} |
|
} |
|
} |
|
|
|
void |
|
LexicalReorderingTableTree:: |
|
Cache(const ConfusionNet& ) |
|
{ |
|
return; |
|
} |
|
|
|
void |
|
LexicalReorderingTableTree:: |
|
Cache(const Sentence& input) |
|
{ |
|
|
|
size_t prev_cache_size = m_Cache.size(); |
|
size_t max_phrase_length = input.GetSize(); |
|
for(size_t len = 0; len <= max_phrase_length; ++len) { |
|
for(size_t start = 0; start+len <= input.GetSize(); ++start) { |
|
Phrase f = input.GetSubString(Range(start, start+len)); |
|
auxCacheForSrcPhrase(f); |
|
} |
|
} |
|
std::cerr << "Cached " << m_Cache.size() - prev_cache_size |
|
<< " new primary reordering table keys\n"; |
|
} |
|
} |
|
|
|
|