|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "moses/TranslationModel/PhraseDictionaryGroup.h" |
|
|
|
#include <boost/foreach.hpp> |
|
#include <boost/unordered_map.hpp> |
|
|
|
#include "util/exception.hh" |
|
|
|
using namespace std; |
|
using namespace boost; |
|
|
|
namespace Moses |
|
{ |
|
|
|
// Constructs an empty model group from its feature line in the config.
// All aggregation options default to off; ReadParameters() then applies
// the key=value pairs from the line via SetParameter().
PhraseDictionaryGroup::PhraseDictionaryGroup(const string &line)
  : PhraseDictionary(line, true),
    m_numModels(0),           // set when the "members" option is parsed
    m_totalModelScores(0),    // computed in Load() from the member models
    m_phraseCounts(false),
    m_wordCounts(false),
    m_modelBitmapCounts(false),
    m_restrict(false),
    m_haveDefaultScores(false),
    m_defaultAverageOthers(false),
    m_scoresPerModel(0),      // per-model score count, set in Load()
    m_haveMmsaptLrFunc(false)
{
  ReadParameters();
}
|
|
|
void PhraseDictionaryGroup::SetParameter(const string& key, const string& value) |
|
{ |
|
if (key == "members") { |
|
m_memberPDStrs = Tokenize(value, ","); |
|
m_numModels = m_memberPDStrs.size(); |
|
m_seenByAll = dynamic_bitset<>(m_numModels); |
|
m_seenByAll.set(); |
|
} else if (key == "restrict") { |
|
m_restrict = Scan<bool>(value); |
|
} else if (key == "phrase-counts") { |
|
m_phraseCounts = Scan<bool>(value); |
|
} else if (key == "word-counts") { |
|
m_wordCounts = Scan<bool>(value); |
|
} else if (key == "model-bitmap-counts") { |
|
m_modelBitmapCounts = Scan<bool>(value); |
|
} else if (key =="default-scores") { |
|
m_haveDefaultScores = true; |
|
m_defaultScores = Scan<float>(Tokenize(value, ",")); |
|
} else if (key =="default-average-others") { |
|
m_defaultAverageOthers = Scan<bool>(value); |
|
} else if (key =="mmsapt-lr-func") { |
|
m_haveMmsaptLrFunc = true; |
|
} else { |
|
PhraseDictionary::SetParameter(key, value); |
|
} |
|
} |
|
|
|
// Resolves the member phrase tables by name, verifies the score layout,
// and initializes default scores.
//
// Throws (UTIL_THROW_IF2) when a named member cannot be found, when the
// configured num-features does not match the computed feature count, or
// when default-average-others is used with members of differing score
// counts (averaging requires a uniform per-model layout).
void PhraseDictionaryGroup::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();
  m_pdFeature.push_back(const_cast<PhraseDictionaryGroup*>(this));
  size_t numScoreComponents = 0;

  // Locate each configured member among all registered phrase dictionaries.
  BOOST_FOREACH(const string& pdName, m_memberPDStrs) {
    bool pdFound = false;
    BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl()) {
      if (pd->GetScoreProducerDescription() == pdName) {
        pdFound = true;
        m_memberPDs.push_back(pd);
        size_t nScores = pd->GetNumScoreComponents();
        numScoreComponents += nScores;
        if (m_scoresPerModel == 0) {
          // First member fixes the expected per-model score count.
          m_scoresPerModel = nScores;
        } else if (m_defaultAverageOthers) {
          UTIL_THROW_IF2(nScores != m_scoresPerModel,
                         m_description << ": member models must have the same number of scores when using default-average-others");
        }
      }
    }
    UTIL_THROW_IF2(!pdFound,
                   m_description << ": could not find member phrase table " << pdName);
  }
  m_totalModelScores = numScoreComponents;

  // Optional indicator features appended after the member model scores.
  if (m_phraseCounts) {
    numScoreComponents += m_numModels;
  }
  if (m_wordCounts) {
    numScoreComponents += m_numModels;
  }
  if (m_modelBitmapCounts) {
    // One feature per non-empty subset of models: 2^N - 1.  Computed with
    // an integer shift instead of pow() so there is no floating-point
    // rounding or implicit double -> size_t conversion.
    numScoreComponents += (static_cast<size_t>(1) << m_numModels) - 1;
  }
  UTIL_THROW_IF2(numScoreComponents != m_numScoreComponents,
                 m_description << ": feature count mismatch: specify \"num-features=" << numScoreComponents << "\" and supply " << numScoreComponents << " weights");

#ifdef PT_UG
  // Cache a pointer to each member's Mmsapt lexical-reordering function
  // slot.  Pointer-to-pointer because the function object may be
  // installed after Load() runs — TODO confirm against Mmsapt lifecycle.
  if (m_haveMmsaptLrFunc) {
    BOOST_FOREACH(PhraseDictionary* pd, m_memberPDs) {
      m_mmsaptLrFuncs.push_back(&(static_cast<Mmsapt*>(pd)->m_lr_func));
    }
  }
#endif

  if (m_haveDefaultScores) {
    UTIL_THROW_IF2(m_defaultScores.size() != m_numScoreComponents,
                   m_description << ": number of specified default scores is unequal to number of member model scores");
  } else {
    // No explicit defaults: fall back to all zeros.
    m_defaultScores = vector<float>(m_numScoreComponents, 0);
  }
}
|
|
|
// Deliberate no-op: this wrapper has no per-sentence state of its own.
// Member models receive their own InitializeForInput() calls from the
// framework — presumably; verify against FeatureFunction dispatch.
void PhraseDictionaryGroup::InitializeForInput(const ttasksptr& ttask)
{
  // Nothing to initialize for the group itself.
}
|
|
|
void PhraseDictionaryGroup::GetTargetPhraseCollectionBatch( |
|
const ttasksptr& ttask, const InputPathList& inputPathQueue) const |
|
{ |
|
|
|
BOOST_FOREACH(const InputPath* inputPath, inputPathQueue) { |
|
const Phrase& phrase = inputPath->GetPhrase(); |
|
BOOST_FOREACH(const PhraseDictionary* pd, m_memberPDs) { |
|
pd->PrefixExists(ttask, phrase); |
|
} |
|
} |
|
|
|
BOOST_FOREACH(InputPath* inputPath, inputPathQueue) { |
|
const Phrase &phrase = inputPath->GetPhrase(); |
|
TargetPhraseCollection::shared_ptr targetPhrases = |
|
this->GetTargetPhraseCollectionLEGACY(ttask, phrase); |
|
inputPath->SetTargetPhrases(*this, targetPhrases, NULL); |
|
} |
|
} |
|
|
|
// Task-less overload is unsupported: the group must forward the
// translation task to its member models, so callers have to use the
// (ttask, src) overload below.  Always throws.
TargetPhraseCollection::shared_ptr PhraseDictionaryGroup::GetTargetPhraseCollectionLEGACY(
  const Phrase& src) const
{
  UTIL_THROW2("Don't call me without the translation task.");
}
|
|
|
// Builds the combined collection for src, prunes it to the table limit,
// and registers it for cleanup once the current sentence is finished.
TargetPhraseCollection::shared_ptr
PhraseDictionaryGroup::
GetTargetPhraseCollectionLEGACY(const ttasksptr& ttask, const Phrase& src) const
{
  TargetPhraseCollection::shared_ptr collection =
    CreateTargetPhraseCollection(ttask, src);
  // Keep only the best m_tableLimit entries.
  collection->NthElement(m_tableLimit);
  // CacheForCleanup mutates the cache; this method is const, hence the cast.
  const_cast<PhraseDictionaryGroup*>(this)->CacheForCleanup(collection);
  return collection;
}
|
|
|
// Merges the target phrase collections of all member models for a single
// source phrase.  Phrases that compare equal across members share one
// entry whose score vector concatenates every member's scores (positions
// for unseen members keep the default scores).  Optional count/bitmap
// features and default-averaging are applied in a second pass.
TargetPhraseCollection::shared_ptr
PhraseDictionaryGroup::
CreateTargetPhraseCollection(const ttasksptr& ttask, const Phrase& src) const
{
  // phraseList preserves first-seen order; phraseMap deduplicates by
  // phrase equality (UnorderedComparer<Phrase> hashes/compares content,
  // not pointers) and carries the accumulated group scores.
  vector<TargetPhrase*> phraseList;
  typedef unordered_map<const TargetPhrase*, PDGroupPhrase, UnorderedComparer<Phrase>, UnorderedComparer<Phrase> > PhraseMap;
  PhraseMap phraseMap;

  // Pass 1: collect phrases from each member; offset is the start of
  // model i's score slice inside the concatenated score vector.
  size_t offset = 0;
  for (size_t i = 0; i < m_numModels; ++i) {
    const PhraseDictionary& pd = *m_memberPDs[i];
    TargetPhraseCollection::shared_ptr
    ret_raw = pd.GetTargetPhraseCollectionLEGACY(ttask, src);

    if (ret_raw != NULL) {
      BOOST_FOREACH(const TargetPhrase* targetPhrase, *ret_raw) {
        // Member's raw dense scores for this phrase.
        vector<float> raw_scores =
          targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd);

        PhraseMap::iterator iter = phraseMap.find(targetPhrase);
        if (iter == phraseMap.end()) {
          // New phrase.  With "restrict", only phrases known to the
          // first model (i == 0) may enter the collection.
          if (m_restrict && i > 0) {
            continue;
          }

          // Take a copy we own; strip the member's dense contribution
          // so only the group's own feature assigns scores:
          // invert -> re-evaluate -> zero out.
          TargetPhrase* phrase = new TargetPhrase(*targetPhrase);
          phrase->GetScoreBreakdown().InvertDenseFeatures(&pd);
          vector<FeatureFunction*> pd_feature;
          pd_feature.push_back(m_memberPDs[i]);
          const vector<FeatureFunction*> pd_feature_const(pd_feature);
          phrase->EvaluateInIsolation(src, pd_feature_const);
          phrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);

          phraseList.push_back(phrase);
          // Group entry starts from the configured default scores.
          phraseMap[targetPhrase] = PDGroupPhrase(phrase, m_defaultScores, m_numModels);
        } else {
          // Phrase already present: merge this member's extra
          // (sparse/cached) scores into the existing copy.
          TargetPhrase* phrase = iter->second.m_targetPhrase;
          BOOST_FOREACH(const TargetPhrase::ScoreCache_t::value_type pair, targetPhrase->GetExtraScores()) {
            phrase->SetExtraScores(pair.first, pair.second);
          }
        }

        // Re-find when the entry was just inserted (iter was end()).
        PDGroupPhrase& pdgPhrase = (iter == phraseMap.end()) ? phraseMap.find(targetPhrase)->second : iter->second;

        // Write this member's scores into its slice of the group vector.
        for (size_t j = 0; j < pd.GetNumScoreComponents(); ++j) {
          pdgPhrase.m_scores[offset + j] = raw_scores[j];
        }

        // Record that model i produced this phrase.
        pdgPhrase.m_seenBy[i] = true;
      }
    }
    offset += pd.GetNumScoreComponents();
  }

  // Pass 2: finalize each phrase — indicator features, optional
  // averaging for unseen members, then score assignment and evaluation.
  TargetPhraseCollection::shared_ptr ret(new TargetPhraseCollection);
  const vector<FeatureFunction*> pd_feature_const(m_pdFeature);
  BOOST_FOREACH(TargetPhrase* phrase, phraseList) {
    PDGroupPhrase& pdgPhrase = phraseMap.find(phrase)->second;

    // Indicator features start right after the member model scores.
    size_t offset = m_totalModelScores;

    // Per-model phrase-count indicators (1 if model i saw the phrase).
    if (m_phraseCounts) {
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = 1;
        }
      }
      offset += m_numModels;
    }

    // Per-model word-count features (target length if model i saw it).
    if (m_wordCounts) {
      size_t wc = pdgPhrase.m_targetPhrase->GetSize();
      for (size_t i = 0; i < m_numModels; ++i) {
        if (pdgPhrase.m_seenBy[i]) {
          pdgPhrase.m_scores[offset + i] = wc;
        }
      }
      offset += m_numModels;
    }

    // One indicator per non-empty model subset: the seen-by bitmap's
    // integer value (1 .. 2^N-1) selects the feature; m_seenByAll is
    // all-ones so its to_ulong() equals the number of subset features.
    if (m_modelBitmapCounts) {
      pdgPhrase.m_scores[offset + (pdgPhrase.m_seenBy.to_ulong() - 1)] = 1;
      offset += m_seenByAll.to_ulong();
    }

    // Optionally replace the default scores of models that did NOT see
    // this phrase with the average of the models that did.
    if (m_defaultAverageOthers) {
      if (pdgPhrase.m_seenBy != m_seenByAll) {
        vector<float> avgScores(m_scoresPerModel, 0);
        size_t seenBy = 0;
        offset = 0;

        // Sum the score slices of all models that saw the phrase.
        for (size_t i = 0; i < m_numModels; ++i) {
          if (pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              avgScores[j] += pdgPhrase.m_scores[offset + j];
            }
            seenBy += 1;
          }
          offset += m_scoresPerModel;
        }

        // Divide to get the per-feature average.
        for (size_t j = 0; j < m_scoresPerModel; ++j) {
          avgScores[j] /= seenBy;
        }

        // Copy the averages into the slices of unseen models.
        offset = 0;
        for (size_t i = 0; i < m_numModels; ++i) {
          if (!pdgPhrase.m_seenBy[i]) {
            for (size_t j = 0; j < m_scoresPerModel; ++j) {
              pdgPhrase.m_scores[offset + j] = avgScores[j];
            }
          }
          offset += m_scoresPerModel;
        }
#ifdef PT_UG
        // Same averaging for Mmsapt lexical-reordering extra scores.
        if (m_haveMmsaptLrFunc) {
          SPTR<Scores> avgLRScores;
          size_t seenBy = 0;

          // Accumulate LR scores from members that saw the phrase and
          // have an LR function installed.
          for (size_t i = 0; i < m_numModels; ++i) {
            const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];

            if (pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
              const Scores* scores = pdgPhrase.m_targetPhrase->GetExtraScores(lrFunc);
              if (!avgLRScores) {
                avgLRScores.reset(new Scores(*scores));
              } else {
                for (size_t j = 0; j < scores->size(); ++j) {
                  (*avgLRScores)[j] += (*scores)[j];
                }
              }
              seenBy += 1;
            }
          }

          if (avgLRScores) {
            // Average, then assign to every unseen member's LR slot.
            for (size_t j = 0; j < avgLRScores->size(); ++j) {
              (*avgLRScores)[j] /= seenBy;
            }

            for (size_t i = 0; i < m_numModels; ++i) {
              const LexicalReordering* lrFunc = *m_mmsaptLrFuncs[i];
              if (!pdgPhrase.m_seenBy[i] && lrFunc != NULL) {
                pdgPhrase.m_targetPhrase->SetExtraScores(lrFunc, avgLRScores);
              }
            }
          }
        }
#endif
      }
    }

    // Assign the final group scores and evaluate with this feature only.
    phrase->GetScoreBreakdown().Assign(this, pdgPhrase.m_scores);

    phrase->EvaluateInIsolation(src, pd_feature_const);
    ret->Add(phrase);
  }

  return ret;
}
|
|
|
// Chart-based decoding is not supported for grouped phrase tables;
// always throws.
ChartRuleLookupManager*
PhraseDictionaryGroup::
CreateRuleLookupManager(const ChartParser &,
                        const ChartCellCollectionBase&, size_t)
{
  UTIL_THROW(util::Exception, "Phrase table used in chart decoder");
}
|
|
|
|
|
// Remembers a generated collection so it can be released after the
// current sentence has been fully processed.
void PhraseDictionaryGroup::CacheForCleanup(TargetPhraseCollection::shared_ptr tpc)
{
  GetPhraseCache().push_back(tpc);
}
|
|
|
// Releases all collections cached for the finished sentence, then lets
// each member model run its own per-sentence cleanup.
void
PhraseDictionaryGroup::
CleanUpAfterSentenceProcessing(const InputType &source)
{
  GetPhraseCache().clear();
  CleanUpComponentModels(source);
}
|
|
|
void PhraseDictionaryGroup::CleanUpComponentModels(const InputType &source) |
|
{ |
|
for (size_t i = 0; i < m_numModels; ++i) { |
|
m_memberPDs[i]->CleanUpAfterSentenceProcessing(source); |
|
} |
|
} |
|
|
|
} |
|
|