#include "mmsapt.h" |
|
#include <boost/foreach.hpp> |
|
#include <boost/scoped_ptr.hpp> |
|
#include <boost/intrusive_ptr.hpp> |
|
#include <boost/tokenizer.hpp> |
|
#include <boost/thread/locks.hpp> |
|
#include <algorithm> |
|
#include "util/exception.hh" |
|
#include <set> |
|
#include "util/usage.hh" |
|
|
|
namespace Moses |
|
{ |
|
using namespace sapt; |
|
using namespace std; |
|
using namespace boost; |
|
|
|
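// Map a Moses phrase to a sequence of numeric token ids by looking up each
// word's surface form (restricted to the given factors) in the token index V.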
void |
|
fillIdSeq(Phrase const& mophrase, std::vector<FactorType> const& ifactors, |
|
TokenIndex const& V, vector<id_type>& dest) |
|
{ |
|
dest.resize(mophrase.GetSize()); |
|
for (size_t i = 0; i < mophrase.GetSize(); ++i) |
|
{ |
|
|
|
dest[i] = V[mophrase.GetWord(i).GetString(ifactors, false)]; |
|
} |
|
} |
|
|
|
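// Parse a "key1=value1; key2=value2; ..." specification (e.g. a feature line
// from moses.ini) into a key/value map.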
void |
|
parseLine(string const& line, map<string,string> & param) |
|
{ |
|
char_separator<char> sep("; "); |
|
tokenizer<char_separator<char> > tokens(line,sep); |
|
BOOST_FOREACH(string const& t,tokens) |
|
{ |
|
size_t i = t.find_first_not_of(" ="); |
|
size_t j = t.find_first_of(" =",i+1); |
|
size_t k = t.find_first_not_of(" =",j+1); |
|
UTIL_THROW_IF2(i == string::npos || k == string::npos, |
|
"[" << HERE << "] " |
|
<< "Parameter specification error near '" |
|
<< t << "' in moses ini line\n" |
|
<< line); |
|
assert(i != string::npos); |
|
assert(k != string::npos); |
|
param[t.substr(i,j-i)] = t.substr(k);
|
} |
|
} |
|
|
|
vector<string> const& |
|
Mmsapt:: |
|
GetFeatureNames() const |
|
{ |
|
return m_feature_names; |
|
} |
|
|
|
Mmsapt:: |
|
Mmsapt(string const& line) |
|
: PhraseDictionary(line, false) |
|
, btfix(new mmbitext) |
|
, m_bias_log(NULL) |
|
, m_bias_loglevel(0) |
|
#ifndef NO_MOSES |
|
, m_lr_func(NULL) |
|
#endif |
|
, m_sampling_method(random_sampling) |
|
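  // distinct per-instance addresses, used only as unique keys for storing
  // task-specific data in a ContextScope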
, bias_key(((char*)this)+3) |
|
, cache_key(((char*)this)+2) |
|
, context_key(((char*)this)+1) |
|
, m_track_coord(false) |
|
|
|
|
|
|
|
{ |
|
init(line); |
|
setup_local_feature_functions(); |
|
|
|
|
|
|
|
|
|
|
|
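  // Set the features used for scoring extracted phrases. This must happen
  // after setup_local_feature_functions(), since features may be disabled
  // there, and before Load(), so that the full feature list is known when
  // the phrase table is loaded.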
SetFeaturesToApply(); |
|
|
|
} |
|
|
|
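// Read additional parameters from an Mmsapt config file. Each non-comment
// line has the form "<key> = <value...>"; keys that already have a value
// (e.g. from the moses.ini line) are left untouched.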
void |
|
Mmsapt:: |
|
read_config_file(string fname, map<string,string>& param) |
|
{ |
|
string line; |
|
ifstream config(fname.c_str()); |
|
while (getline(config,line)) |
|
{ |
|
if (line.empty() || line[0] == '#') continue;
|
char_separator<char> sep(" \t"); |
|
tokenizer<char_separator<char> > tokens(line,sep); |
|
tokenizer<char_separator<char> >::const_iterator t = tokens.begin(); |
|
if (t == tokens.end()) continue; |
|
string& foo = param[*t++]; |
|
if (t == tokens.end() || foo.size()) continue; |
|
|
|
UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(), |
|
"Syntax error in Mmsapt config file '" << fname << "'."); |
|
for (foo = *t++; t != tokens.end(); foo += " " + *t++); |
|
} |
|
} |
|
|
|
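// Register a phrase scorer: record its offset into the feature vector and
// the names and properties (log-valued / integer-valued) of the features it
// produces.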
void |
|
Mmsapt:: |
|
register_ff(SPTR<pscorer> const& ff, vector<SPTR<pscorer> > & registry) |
|
{ |
|
registry.push_back(ff); |
|
ff->setIndex(m_feature_names.size()); |
|
for (int i = 0; i < ff->fcnt(); ++i) |
|
{ |
|
m_feature_names.push_back(ff->fname(i)); |
|
m_is_logval.push_back(ff->isLogVal(i)); |
|
m_is_integer.push_back(ff->isIntegerValued(i)); |
|
} |
|
} |
|
|
|
bool Mmsapt::isLogVal(int i) const { return m_is_logval.at(i); } |
|
bool Mmsapt::isInteger(int i) const { return m_is_integer.at(i); } |
|
|
|
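// Parse a comma-separated list of factor indices for the given parameter
// key (default: factor 0).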
void |
|
Mmsapt:: |
|
parse_factor_spec(std::vector<FactorType>& flist, std::string const key) |
|
{ |
|
pair<string,string> dflt(key, "0"); |
|
string factors = this->param.insert(dflt).first->second; |
|
size_t p = 0, q = factors.find(','); |
|
for (; q < factors.size(); q = factors.find(',', p=q+1)) |
|
flist.push_back(atoi(factors.substr(p, q-p).c_str())); |
|
flist.push_back(atoi(factors.substr(p).c_str())); |
|
} |
|
|
|
|
|
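// Parse the configuration line (plus optional config file) and initialize
// all settings that do not require the bitext to be loaded yet.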
void Mmsapt::init(string const& line) |
|
{ |
|
map<string,string>::const_iterator m; |
|
parseLine(line,this->param); |
|
|
|
this->m_numScoreComponents = atoi(param["num-features"].c_str()); |
|
|
|
m = param.find("config"); |
|
if (m != param.end()) |
|
read_config_file(m->second,param); |
|
|
|
m = param.find("base"); |
|
if (m != param.end()) |
|
{ |
|
m_bname = m->second; |
|
m = param.find("path"); |
|
UTIL_THROW_IF2((m != param.end() && m->second != m_bname), |
|
"Conflicting aliases for path:\n" |
|
<< "path=" << string(m->second) << "\n" |
|
<< "base=" << m_bname.c_str() ); |
|
} |
|
else m_bname = param["path"]; |
|
L1 = param["L1"]; |
|
L2 = param["L2"]; |
|
|
|
UTIL_THROW_IF2(m_bname.size() == 0, "Missing corpus base name at " << HERE); |
|
UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE); |
|
UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE); |
|
|
|
|
|
parse_factor_spec(m_ifactor,"input-factor"); |
|
parse_factor_spec(m_ofactor,"output-factor"); |
|
|
|
|
|
m_inputFactors = FactorMask(m_ifactor); |
|
m_outputFactors = FactorMask(m_ofactor); |
|
|
|
pair<string,string> dflt = pair<string,string> ("smooth",".01"); |
|
m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string> ("lexalpha","0"); |
|
m_lex_alpha = atof(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string> ("sample","1000"); |
|
m_default_sample_size = atoi(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string> ("min-sample","0"); |
|
m_min_sample_size = atoi(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string>("workers","0"); |
|
m_workers = atoi(param.insert(dflt).first->second.c_str()); |
|
if (m_workers == 0) m_workers = StaticData::Instance().ThreadCount(); |
|
else m_workers = min(m_workers,size_t(boost::thread::hardware_concurrency())); |
|
|
|
dflt = pair<string,string>("bias-loglevel","0"); |
|
m_bias_loglevel = atoi(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string>("table-limit","20"); |
|
m_tableLimit = atoi(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string>("cache","100000"); |
|
m_cache_size = max(10000,atoi(param.insert(dflt).first->second.c_str())); |
|
|
|
m_cache.reset(new TPCollCache(m_cache_size)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
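  // Defaults for the standard phrase-scoring features (forward/backward
  // phrase probabilities, length ratio, rareness, log counts, coherence,
  // provenance, cumulative bias); explicit settings override these.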
param.insert(pair<string,string>("pfwd", "g")); |
|
param.insert(pair<string,string>("pbwd", "g")); |
|
param.insert(pair<string,string>("lenrat", "1")); |
|
param.insert(pair<string,string>("rare", "1")); |
|
param.insert(pair<string,string>("logcnt", "0")); |
|
param.insert(pair<string,string>("coh", "0")); |
|
param.insert(pair<string,string>("prov", "0")); |
|
param.insert(pair<string,string>("cumb", "0")); |
|
|
|
poolCounts = true; |
|
|
|
|
|
if ((m = param.find("bias")) != param.end()) |
|
m_bias_file = m->second; |
|
|
|
if ((m = param.find("bias-server")) != param.end()) |
|
m_bias_server = m->second; |
|
if (m_bias_loglevel) |
|
{ |
|
dflt = pair<string,string>("bias-logfile","/dev/stderr"); |
|
param.insert(dflt); |
|
} |
|
if ((m = param.find("bias-logfile")) != param.end()) |
|
{ |
|
m_bias_logfile = m->second; |
|
if (m_bias_logfile == "/dev/stderr") |
|
m_bias_log = &std::cerr; |
|
else if (m_bias_logfile == "/dev/stdout") |
|
m_bias_log = &std::cout; |
|
else |
|
{ |
|
m_bias_logger.reset(new std::ofstream(m_bias_logfile.c_str())); |
|
m_bias_log = m_bias_logger.get(); |
|
} |
|
} |
|
|
|
if ((m = param.find("lr-func")) != param.end()) |
|
m_lr_func_name = m->second; |
|
|
|
|
|
if ((m = param.find("lrfunc")) != param.end()) |
|
m_lr_func_name = m->second; |
|
|
|
if ((m = param.find("extra")) != param.end()) |
|
m_extra_data = m->second; |
|
|
|
if ((m = param.find("method")) != param.end()) |
|
{ |
|
if (m->second == "random") |
|
m_sampling_method = random_sampling; |
|
else if (m->second == "ranked") |
|
m_sampling_method = ranked_sampling; |
|
else if (m->second == "ranked2") |
|
m_sampling_method = ranked_sampling2; |
|
else if (m->second == "full") |
|
m_sampling_method = full_coverage; |
|
else UTIL_THROW2("unrecognized specification 'method=" << m->second
|
<< "' in line:\n" << line); |
|
} |
|
|
|
dflt = pair<string,string>("tuneable","true"); |
|
m_tuneable = Scan<bool>(param.insert(dflt).first->second.c_str()); |
|
|
|
dflt = pair<string,string>("feature-sets","standard"); |
|
m_feature_set_names = Tokenize(param.insert(dflt).first->second.c_str(), ","); |
|
m = param.find("name"); |
|
if (m != param.end()) m_name = m->second; |
|
|
|
|
|
|
|
|
|
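  // Optional sentence coordinates: "coord=<space>:<file>[,...]" loads one
  // coordinate vector per sentence of the static bitext for each named
  // coordinate space, so that extracted phrases can be tagged with the
  // coordinates of the sentences they were sampled from.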
param.insert(pair<string,string>("coord","0")); |
|
if(param["coord"] != "0") |
|
{ |
|
m_track_coord = true; |
|
vector<string> coord_instances = Tokenize(param["coord"], ","); |
|
BOOST_FOREACH(std::string instance, coord_instances) |
|
{ |
|
vector<string> toks = Moses::Tokenize(instance, ":"); |
|
string space = toks[0]; |
|
string file = toks[1]; |
|
|
|
m_coord_spaces.push_back(StaticData::InstanceNonConst().MapCoordSpace(space)); |
|
|
|
m_sid_coord_list.push_back(vector<SPTR<vector<float> > >()); |
|
vector<SPTR<vector<float> > >& sid_coord = m_sid_coord_list.back();
|
|
|
sid_coord.reserve(btfix->T1->size()); |
|
string line; |
|
cerr << "Loading coordinate lines for space \"" << space << "\" from " << file << endl; |
|
iostreams::filtering_istream in; |
|
ugdiss::open_input_stream(file, in); |
|
while(getline(in, line)) |
|
{ |
|
SPTR<vector<float> > coord(new vector<float>); |
|
Scan<float>(*coord, Tokenize(line)); |
|
sid_coord.push_back(coord); |
|
} |
|
cerr << "Loaded " << sid_coord.size() << " lines" << endl; |
|
} |
|
} |
|
|
|
|
|
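  // Sanity check: reject any parameter name we do not know about.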
vector<string> known_parameters; known_parameters.reserve(50); |
|
known_parameters.push_back("L1"); |
|
known_parameters.push_back("L2"); |
|
known_parameters.push_back("Mmsapt"); |
|
known_parameters.push_back("PhraseDictionaryBitextSampling"); |
|
|
|
known_parameters.push_back("base"); |
|
known_parameters.push_back("bias"); |
|
known_parameters.push_back("bias-server"); |
|
known_parameters.push_back("bias-logfile"); |
|
known_parameters.push_back("bias-loglevel"); |
|
known_parameters.push_back("cache"); |
|
known_parameters.push_back("coh"); |
|
known_parameters.push_back("config"); |
|
known_parameters.push_back("coord"); |
|
known_parameters.push_back("cumb"); |
|
known_parameters.push_back("extra"); |
|
known_parameters.push_back("feature-sets"); |
|
known_parameters.push_back("input-factor"); |
|
known_parameters.push_back("lenrat"); |
|
known_parameters.push_back("lexalpha"); |
|
|
|
known_parameters.push_back("logcnt"); |
|
known_parameters.push_back("lr-func"); |
|
known_parameters.push_back("lrfunc"); |
|
known_parameters.push_back("method"); |
|
known_parameters.push_back("name"); |
|
known_parameters.push_back("num-features"); |
|
known_parameters.push_back("output-factor"); |
|
known_parameters.push_back("path"); |
|
known_parameters.push_back("pbwd"); |
|
known_parameters.push_back("pfwd"); |
|
known_parameters.push_back("prov"); |
|
known_parameters.push_back("rare"); |
|
known_parameters.push_back("sample"); |
|
known_parameters.push_back("min-sample"); |
|
known_parameters.push_back("smooth"); |
|
known_parameters.push_back("table-limit"); |
|
known_parameters.push_back("tuneable"); |
|
known_parameters.push_back("unal"); |
|
known_parameters.push_back("workers"); |
|
sort(known_parameters.begin(),known_parameters.end()); |
|
for (map<string,string>::iterator m = param.begin(); m != param.end(); ++m) |
|
{ |
|
UTIL_THROW_IF2(!binary_search(known_parameters.begin(), |
|
known_parameters.end(), m->first), |
|
HERE << ": Unknown parameter specification for Mmsapt: " |
|
<< m->first); |
|
} |
|
} |
|
|
|
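// Load per-sentence sampling bias weights from a file.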
void |
|
Mmsapt:: |
|
load_bias(string const fname) |
|
{ |
|
m_bias = btfix->loadSentenceBias(fname); |
|
} |
|
|
|
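// Load additional parallel data into the dynamic (in-memory) bitext.
// Expects <bname><L1>.txt.gz, <bname><L2>.txt.gz and
// <bname><L1>-<L2>.symal.gz; if 'locking' is set, the update is performed
// under the write lock.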
void |
|
Mmsapt:: |
|
load_extra_data(string bname, bool locking = true) |
|
{ |
|
using namespace boost; |
|
using namespace ugdiss; |
|
|
|
|
|
|
|
|
|
vector<string> text1,text2,symal; |
|
string line; |
|
boost::iostreams::filtering_istream in1,in2,ina; |
|
|
|
open_input_stream(bname+L1+".txt.gz",in1); |
|
open_input_stream(bname+L2+".txt.gz",in2); |
|
open_input_stream(bname+L1+"-"+L2+".symal.gz",ina); |
|
|
|
while(getline(in1,line)) text1.push_back(line); |
|
while(getline(in2,line)) text2.push_back(line); |
|
while(getline(ina,line)) symal.push_back(line); |
|
|
|
scoped_ptr<boost::unique_lock<shared_mutex> > guard; |
|
if (locking) guard.reset(new boost::unique_lock<shared_mutex>(m_lock)); |
|
btdyn = btdyn->add(text1,text2,symal); |
|
assert(btdyn); |
|
cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl; |
|
} |
|
|
|
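// Conditionally instantiate and register a phrase scorer of type fftype
// according to the parameter value for ffname: an empty value or "0"
// disables the feature; if a registry is given explicitly, the scorer is
// added there; otherwise a spec ending in '+' creates separate instances
// for the static and the dynamic bitext, and anything else a single
// instance scoring pooled counts.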
template<typename fftype> |
|
void |
|
Mmsapt:: |
|
check_ff(string const ffname, vector<SPTR<pscorer> >* registry) |
|
{ |
|
string const& spec = param[ffname]; |
|
if (spec == "" || spec == "0") return; |
|
if (registry) |
|
{ |
|
SPTR<fftype> ff(new fftype(spec)); |
|
register_ff(ff, *registry); |
|
} |
|
else if (spec[spec.size()-1] == '+') |
|
{ |
|
SPTR<fftype> ff(new fftype(spec)); |
|
register_ff(ff, m_active_ff_fix); |
|
ff.reset(new fftype(spec)); |
|
register_ff(ff, m_active_ff_dyn); |
|
} |
|
else |
|
{ |
|
SPTR<fftype> ff(new fftype(spec)); |
|
register_ff(ff, m_active_ff_common); |
|
} |
|
} |
|
|
|
template<typename fftype> |
|
void |
|
Mmsapt:: |
|
check_ff(string const ffname, float const xtra, |
|
vector<SPTR<pscorer> >* registry) |
|
{ |
|
string const& spec = param[ffname]; |
|
if (spec == "" || spec == "0") return; |
|
if (registry) |
|
{ |
|
SPTR<fftype> ff(new fftype(xtra,spec)); |
|
register_ff(ff, *registry); |
|
} |
|
else if (spec[spec.size()-1] == '+') |
|
{ |
|
SPTR<fftype> ff(new fftype(xtra,spec)); |
|
register_ff(ff, m_active_ff_fix); |
|
ff.reset(new fftype(xtra,spec)); |
|
register_ff(ff, m_active_ff_dyn); |
|
} |
|
else |
|
{ |
|
SPTR<fftype> ff(new fftype(xtra,spec)); |
|
register_ff(ff, m_active_ff_common); |
|
} |
|
} |
|
|
|
void |
|
Mmsapt:: |
|
Load(AllOptions::ptr const& opts) |
|
{ |
|
Load(opts, true); |
|
} |
|
|
|
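// Instantiate the phrase-scoring feature functions selected via
// "feature-sets=..." ("standard" and "datasource" are recognized) and
// register them as static-only, dynamic-only, or common scorers.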
void |
|
Mmsapt |
|
::setup_local_feature_functions() |
|
{ |
|
boost::unique_lock<boost::shared_mutex> lock(m_lock); |
|
|
|
BOOST_FOREACH(string const& fsname, m_feature_set_names) |
|
{ |
|
|
|
if (fsname == "standard") |
|
{ |
|
|
|
string lexfile = m_bname + L1 + "-" + L2 + ".lex"; |
|
SPTR<PScoreLex1<Token> > |
|
ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile)); |
|
register_ff(ff,m_active_ff_common); |
|
|
|
|
|
check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common); |
|
check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common); |
|
check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common); |
|
check_ff<PScoreCumBias<Token> >("cumb", &m_active_ff_common); |
|
check_ff<PScoreLengthRatio<Token> > ("lenrat", &m_active_ff_common); |
|
|
|
|
|
|
|
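      // Instantiated via the spec-driven path of check_ff: a spec ending in
      // '+' creates separate scorers for the static and the dynamic bitext,
      // anything else a single scorer over pooled counts.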
check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf); |
|
check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf); |
|
check_ff<PScoreLogCnt<Token> >("logcnt"); |
|
|
|
|
|
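      // Provenance rewards are always computed separately for the static
      // and the dynamic bitext.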
check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix); |
|
check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn); |
|
} |
|
|
|
|
|
|
|
else if (fsname == "datasource") |
|
{ |
|
SPTR<PScorePC<Token> > ffpcnt(new PScorePC<Token>("pcnt")); |
|
register_ff(ffpcnt,m_active_ff_common); |
|
SPTR<PScoreWC<Token> > ffwcnt(new PScoreWC<Token>("wcnt")); |
|
register_ff(ffwcnt,m_active_ff_common); |
|
} |
|
} |
|
|
|
this->m_numScoreComponents = this->m_feature_names.size(); |
|
this->m_numTuneableComponents = this->m_numScoreComponents; |
|
} |
|
|
|
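// Load the phrase table proper: load the registered feature functions, open
// the memory-mapped static bitext, create the (initially empty) dynamic
// bitext, and load the sentence bias file and extra parallel data if
// configured.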
void |
|
Mmsapt:: |
|
Load(AllOptions::ptr const& opts, bool with_checks) |
|
{ |
|
m_options = opts; |
|
boost::unique_lock<boost::shared_mutex> lock(m_lock); |
|
|
|
BOOST_FOREACH(SPTR<pscorer>& ff, m_active_ff_fix) ff->load(); |
|
BOOST_FOREACH(SPTR<pscorer>& ff, m_active_ff_dyn) ff->load(); |
|
BOOST_FOREACH(SPTR<pscorer>& ff, m_active_ff_common) ff->load(); |
|
#if 0 |
|
if (with_checks) |
|
{ |
|
UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents, |
|
"At " << HERE << ": number of feature values provided by " |
|
<< "Phrase table (" << this->m_feature_names.size() |
|
<< ") does not match number specified in Moses config file (" |
|
<< this->m_numScoreComponents << ")!\n";); |
|
} |
|
#endif |
|
|
|
m_thread_pool.reset(new ug::ThreadPool(max(m_workers,size_t(1)))); |
|
|
|
|
|
|
|
|
|
btfix->m_num_workers = this->m_workers; |
|
btfix->open(m_bname, L1, L2); |
|
btfix->setDefaultSampleSize(m_default_sample_size); |
|
|
|
btdyn.reset(new imbitext(btfix->V1, btfix->V2, m_default_sample_size, m_workers)); |
|
if (m_bias_file.size()) |
|
load_bias(m_bias_file); |
|
|
|
if (m_extra_data.size()) |
|
load_extra_data(m_extra_data, false); |
|
|
|
#if 0 |
|
|
|
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC; |
|
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t; |
|
wlex21.resize(COOC.numCols); |
|
for (size_t r = 0; r < COOC.numRows; ++r) |
|
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c) |
|
wlex21[c->id].push_back(r); |
|
COOCraw.open(m_bname + L1 + "-" + L2 + ".coc"); |
|
#endif |
|
assert(btdyn); |
|
|
|
} |
|
|
|
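// Add a single sentence pair (with its word alignment) to the dynamic
// bitext.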
void |
|
Mmsapt:: |
|
add(string const& s1, string const& s2, string const& a) |
|
{ |
|
vector<string> S1(1,s1); |
|
vector<string> S2(1,s2); |
|
vector<string> ALN(1,a); |
|
boost::unique_lock<boost::shared_mutex> guard(m_lock); |
|
btdyn = btdyn->add(S1,S2,ALN); |
|
} |
|
|
|
|
|
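// Build a Moses TargetPhrase from a phrase pair found in the static bitext
// (fix), the dynamic bitext (dyn), or both, applying all registered feature
// functions to produce the score vector.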
TargetPhrase* |
|
Mmsapt:: |
|
mkTPhrase(ttasksptr const& ttask, |
|
Phrase const& src, |
|
PhrasePair<Token>* fix, |
|
PhrasePair<Token>* dyn, |
|
SPTR<Bitext<Token> > const& dynbt) const |
|
{ |
|
UTIL_THROW_IF2(!fix && !dyn, HERE << |
|
": Can't create target phrase from nothing."); |
|
vector<float> fvals(this->m_numScoreComponents); |
|
PhrasePair<Token> pool = fix ? *fix : *dyn; |
|
if (fix) |
|
{ |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix) |
|
(*ff)(*btfix, *fix, &fvals); |
|
} |
|
if (dyn) |
|
{ |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn) |
|
(*ff)(*dynbt, *dyn, &fvals); |
|
} |
|
|
|
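  // Pool the counts: if the pair occurs in both bitexts, add the dynamic
  // counts; if it occurs in only one, look up the target side in the other
  // bitext to obtain at least its marginal occurrence count, and let each
  // scorer decide (allowPooling) whether to use the pooled statistics or
  // only the counts from its own bitext.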
if (fix && dyn) { pool += *dyn; } |
|
else if (fix) |
|
{ |
|
PhrasePair<Token> zilch; zilch.init(); |
|
TSA<Token>::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2); |
|
if (m.size() == fix->len2) |
|
zilch.raw2 = m.approxOccurrenceCount(); |
|
pool += zilch; |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_dyn) |
|
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); |
|
} |
|
else if (dyn) |
|
{ |
|
PhrasePair<Token> zilch; zilch.init(); |
|
TSA<Token>::tree_iterator m(btfix->I2.get(), dyn->start2, dyn->len2); |
|
if (m.size() == dyn->len2) |
|
zilch.raw2 = m.approxOccurrenceCount(); |
|
pool += zilch; |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_fix) |
|
(*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); |
|
} |
|
if (fix) |
|
{ |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common) |
|
(*ff)(*btfix, pool, &fvals); |
|
} |
|
else |
|
{ |
|
BOOST_FOREACH(SPTR<pscorer> const& ff, m_active_ff_common) |
|
(*ff)(*dynbt, pool, &fvals); |
|
} |
|
|
|
TargetPhrase* tp = new TargetPhrase(const_cast<ttasksptr&>(ttask), this); |
|
Token const* x = fix ? fix->start2 : dyn->start2; |
|
uint32_t len = fix ? fix->len2 : dyn->len2; |
|
for (uint32_t k = 0; k < len; ++k, x = x->next()) |
|
{ |
|
StringPiece wrd = (*(btfix->V2))[x->id()]; |
|
Word w; |
|
w.CreateFromString(Output, m_ofactor, wrd, false); |
|
tp->AddWord(w); |
|
} |
|
tp->SetAlignTerm(pool.aln); |
|
tp->GetScoreBreakdown().Assign(this, fvals); |
|
|
|
tp->EvaluateInIsolation(src, m_featuresToApply); |
|
|
|
#ifndef NO_MOSES |
|
if (m_lr_func) |
|
{ |
|
LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType(); |
|
LRModel::Direction dir = m_lr_func->GetModel().GetDirection(); |
|
SPTR<Scores> scores(new Scores()); |
|
pool.fill_lr_vec(dir, mdl, *scores); |
|
tp->SetExtraScores(m_lr_func, scores); |
|
} |
|
#endif |
|
|
|
|
|
if (m_track_coord) |
|
{ |
|
BOOST_FOREACH(uint32_t const sid, *pool.sids) |
|
{ |
|
for(size_t i = 0; i < m_coord_spaces.size(); ++i) |
|
{ |
|
tp->PushCoord(m_coord_spaces[i], m_sid_coord_list[i][sid]); |
|
} |
|
} |
|
|
} |
|
|
|
return tp; |
|
} |
|
|
|
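// Batch lookup: first touch every input phrase via PrefixExists(), which
// kicks off background sampling for phrases found in the static bitext,
// then collect the actual target phrase collections.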
void |
|
Mmsapt:: |
|
GetTargetPhraseCollectionBatch(ttasksptr const& ttask, |
|
const InputPathList &inputPathQueue) const |
|
{ |
|
InputPathList::const_iterator iter; |
|
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) |
|
{ |
|
InputPath &inputPath = **iter; |
|
const Phrase &phrase = inputPath.GetPhrase(); |
|
PrefixExists(ttask, phrase); |
|
} |
|
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) |
|
{ |
|
InputPath &inputPath = **iter; |
|
const Phrase &phrase = inputPath.GetPhrase(); |
|
TargetPhraseCollection::shared_ptr targetPhrases |
|
= this->GetTargetPhraseCollectionLEGACY(ttask,phrase); |
|
inputPath.SetTargetPhrases(*this, targetPhrases, NULL); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
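// Look up translations of src: map the source phrase to token ids, locate
// it in the static and the dynamic bitext, fetch the cached target phrase
// collection, and fill it from sampled phrase statistics if it is still
// empty.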
TargetPhraseCollection::shared_ptr |
|
Mmsapt:: |
|
GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const |
|
{ |
|
SPTR<TPCollWrapper> ret; |
|
|
|
|
|
|
|
vector<id_type> sphrase; |
|
fillIdSeq(src, m_ifactor, *(btfix->V1), sphrase); |
|
if (sphrase.size() == 0) return ret; |
|
|
|
|
|
|
|
|
|
SPTR<imBitext<Token> > dyn; |
|
{ |
|
boost::unique_lock<boost::shared_mutex> guard(m_lock); |
|
assert(btdyn); |
|
dyn = btdyn; |
|
} |
|
assert(dyn); |
|
|
|
|
|
TSA<Token>::tree_iterator mfix(btfix->I1.get(), &sphrase[0], sphrase.size()); |
|
TSA<Token>::tree_iterator mdyn(dyn->I1.get()); |
|
if (dyn->I1.get()) |
|
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) |
|
mdyn.extend(sphrase[i]); |
|
|
|
if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) |
|
return ret; |
|
|
|
|
|
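  // Cache key: the phrase id within the respective bitext, with the lowest
  // bit marking whether it stems from the static (0) or the dynamic (1)
  // bitext.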
uint64_t phrasekey = (mfix.size() == sphrase.size() |
|
? (mfix.getPid()<<1) |
|
: (mdyn.getPid()<<1)+1); |
|
|
|
|
|
SPTR<ContextScope> const& scope = ttask->GetScope(); |
|
SPTR<TPCollCache> cache = scope->get<TPCollCache>(cache_key); |
|
if (!cache) cache = m_cache; |
|
|
|
ret = cache->get(phrasekey, dyn->revision()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
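  // Double-checked locking: if another thread has already filled this cache
  // entry, return it; otherwise upgrade to an exclusive lock, re-check, and
  // populate it below.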
boost::upgrade_lock<boost::shared_mutex> rlock(ret->lock); |
|
if (ret->GetSize()) return ret; |
|
|
|
|
|
boost::upgrade_to_unique_lock<boost::shared_mutex> wlock(rlock); |
|
|
|
if (ret->GetSize()) return ret; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SPTR<pstats> sfix,sdyn; |
|
|
|
if (mfix.size() == sphrase.size()) |
|
{ |
|
SPTR<ContextForQuery> context = scope->get<ContextForQuery>(btfix.get()); |
|
SPTR<pstats> const* foo = context->cache1->get(mfix.getPid()); |
|
if (foo) { sfix = *foo; sfix->wait(); } |
|
else |
|
{ |
|
BitextSampler<Token> s(btfix, mfix, context->bias, |
|
m_min_sample_size, |
|
m_default_sample_size, |
|
m_sampling_method, |
|
m_track_coord); |
|
s(); |
|
sfix = s.stats(); |
|
} |
|
} |
|
|
|
if (mdyn.size() == sphrase.size()) |
|
sdyn = dyn->lookup(ttask, mdyn); |
|
|
|
vector<PhrasePair<Token> > ppfix,ppdyn; |
|
PhrasePair<Token>::SortByTargetIdSeq sort_by_tgt_id; |
|
if (sfix) |
|
{ |
|
expand(mfix, *btfix, *sfix, ppfix, m_bias_log); |
|
sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); |
|
} |
|
if (sdyn) |
|
{ |
|
expand(mdyn, *dyn, *sdyn, ppdyn, m_bias_log); |
|
sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id); |
|
} |
|
|
|
|
|
PhrasePair<Token>::SortByTargetIdSeq sorter; |
|
size_t i = 0; size_t k = 0; |
|
while (i < ppfix.size() && k < ppdyn.size()) |
|
{ |
|
int cmp = sorter.cmp(ppfix[i], ppdyn[k]); |
|
if (cmp < 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); |
|
else if (cmp == 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],&ppdyn[k++],dyn)); |
|
else ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); |
|
} |
|
while (i < ppfix.size()) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); |
|
while (k < ppdyn.size()) ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); |
|
|
|
|
|
if (m_tableLimit) ret->Prune(true, m_tableLimit); |
|
else ret->Prune(true,ret->GetSize()); |
|
|
|
#if 1 |
|
if (m_bias_log && m_lr_func && m_bias_loglevel > 3) |
|
{ |
|
PhrasePair<Token>::SortDescendingByJointCount sorter; |
|
sort(ppfix.begin(), ppfix.end(),sorter); |
|
BOOST_FOREACH(PhrasePair<Token> const& pp, ppfix) |
|
{ |
|
|
|
pp.print(*m_bias_log,*btfix->V1, *btfix->V2, m_lr_func->GetModel()); |
|
} |
|
} |
|
#endif |
|
return ret; |
|
} |
|
|
|
size_t |
|
Mmsapt:: |
|
SetTableLimit(size_t limit) |
|
{ |
|
std::swap(m_tableLimit,limit); |
|
return limit; |
|
} |
|
|
|
void |
|
Mmsapt:: |
|
CleanUpAfterSentenceProcessing(ttasksptr const& ttask) |
|
{ } |
|
|
|
|
|
ChartRuleLookupManager* |
|
Mmsapt:: |
|
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &) |
|
{ |
|
throw "CreateRuleLookupManager is currently not supported in Mmsapt!"; |
|
} |
|
|
|
ChartRuleLookupManager* |
|
Mmsapt:: |
|
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &, |
|
size_t ) |
|
{ |
|
throw "CreateRuleLookupManager is currently not supported in Mmsapt!"; |
|
} |
|
|
|
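// Set up the document-level sampling bias for this translation task, either
// from context weights provided with the input or by querying the
// configured bias server with the current context window.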
void |
|
Mmsapt:: |
|
setup_bias(ttasksptr const& ttask) |
|
{ |
|
SPTR<ContextScope> const& scope = ttask->GetScope(); |
|
SPTR<ContextForQuery> context; |
|
context = scope->get<ContextForQuery>(btfix.get(), true); |
|
if (context->bias) return; |
|
|
|
|
|
SPTR<std::map<std::string, float> const> w; |
|
w = ttask->GetScope()->GetContextWeights(); |
|
if (w && !w->empty()) |
|
{ |
|
if (m_bias_log) |
|
*m_bias_log << "BIAS WEIGHTS GIVEN WITH INPUT at " << HERE << endl; |
|
context->bias = btfix->SetupDocumentBias(*w, m_bias_log); |
|
} |
|
else if (m_bias_server.size() && ttask->GetContextWindow()) |
|
{ |
|
|
|
string context_words; |
|
BOOST_FOREACH(string const& line, *ttask->GetContextWindow()) |
|
{ |
|
if (context_words.size()) context_words += " "; |
|
context_words += line; |
|
} |
|
if (context_words.size()) |
|
{ |
|
if (m_bias_log) |
|
*m_bias_log << "GETTING BIAS FROM SERVER at " << HERE << endl |
|
<< "BIAS LOOKUP CONTEXT: " << context_words << endl; |
|
context->bias |
|
= btfix->SetupDocumentBias(m_bias_server,context_words,m_bias_log); |
|
|
|
|
|
ttask->GetScope()->SetContextWeights(context->bias->getBiasMap()); |
|
} |
|
} |
|
if (context->bias) |
|
{ |
|
context->bias_log = m_bias_log; |
|
context->bias->loglevel = m_bias_loglevel; |
|
} |
|
} |
|
|
|
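// Per-input setup: create or reuse the scope-local target phrase cache,
// set up the sampling bias, make sure the per-query statistics caches
// exist, and resolve the lexical reordering feature function on first use.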
void |
|
Mmsapt:: |
|
InitializeForInput(ttasksptr const& ttask) |
|
{ |
|
boost::unique_lock<boost::shared_mutex> mylock(m_lock); |
|
|
|
SPTR<ContextScope> const& scope = ttask->GetScope(); |
|
SPTR<TPCollCache> localcache = scope->get<TPCollCache>(cache_key); |
|
SPTR<ContextForQuery> context = scope->get<ContextForQuery>(btfix.get(), true); |
|
boost::unique_lock<boost::shared_mutex> ctxlock(context->lock); |
|
|
|
|
|
|
|
if (!localcache) |
|
{ |
|
|
|
setup_bias(ttask); |
|
if (context->bias) |
|
{ |
|
localcache.reset(new TPCollCache(m_cache_size)); |
|
} |
|
else localcache = m_cache; |
|
scope->set<TPCollCache>(cache_key, localcache); |
|
} |
|
|
|
if (!context->cache1) context->cache1.reset(new pstats::cache_t); |
|
if (!context->cache2) context->cache2.reset(new pstats::cache_t); |
|
|
|
#ifndef NO_MOSES |
|
if (m_lr_func_name.size() && m_lr_func == NULL) |
|
{ |
|
FeatureFunction* lr = &FeatureFunction::FindFeatureFunction(m_lr_func_name); |
|
m_lr_func = dynamic_cast<LexicalReordering*>(lr); |
|
UTIL_THROW_IF2(m_lr_func == NULL, "FF " << m_lr_func_name
|
<< " does not seem to be a lexical reordering function!"); |
|
|
|
} |
|
#endif |
|
} |
|
|
|
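// Check whether the given source phrase occurs in the static or the dynamic
// bitext; for hits in the static bitext, schedule sampling of its
// translations on the thread pool so that the statistics are ready (or at
// least under way) by the time the phrase is actually looked up.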
bool |
|
Mmsapt:: |
|
PrefixExists(ttasksptr const& ttask, Moses::Phrase const& phrase) const |
|
{ |
|
if (phrase.GetSize() == 0) return false; |
|
SPTR<ContextScope> const& scope = ttask->GetScope(); |
|
|
|
vector<id_type> myphrase; |
|
fillIdSeq(phrase, m_ifactor, *btfix->V1, myphrase); |
|
|
|
TSA<Token>::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); |
|
if (mfix.size() == myphrase.size()) |
|
{ |
|
SPTR<ContextForQuery> context = scope->get<ContextForQuery>(btfix.get(), true); |
|
uint64_t pid = mfix.getPid(); |
|
if (!context->cache1->get(pid)) |
|
{ |
|
BitextSampler<Token> s(btfix, mfix, context->bias, |
|
m_min_sample_size, m_default_sample_size, |
|
m_sampling_method, m_track_coord); |
|
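        // cache1->get(pid, s.stats()) returns the cached entry for pid,
        // seeding it with s.stats() if nothing is cached yet; only the
        // thread whose stats object actually ended up in the cache
        // schedules the sampling job.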
if (*context->cache1->get(pid, s.stats()) == s.stats()) |
|
m_thread_pool->add(s); |
|
} |
|
|
|
|
|
return true; |
|
} |
|
|
|
SPTR<imBitext<Token> > dyn; |
|
{ |
|
boost::unique_lock<boost::shared_mutex> guard(m_lock); |
|
dyn = btdyn; |
|
} |
|
assert(dyn); |
|
TSA<Token>::tree_iterator mdyn(dyn->I1.get()); |
|
if (dyn->I1.get()) |
|
{ |
|
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) |
|
mdyn.extend(myphrase[i]); |
|
|
|
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_coord); |
|
} |
|
return mdyn.size() == myphrase.size(); |
|
} |
|
|
|
#if 0 |
|
void |
|
Mmsapt |
|
::Release(ttasksptr const& ttask, TargetPhraseCollection::shared_ptr*& tpc) const |
|
{ |
|
if (!tpc) |
|
{ |
|
|
|
return; |
|
} |
|
SPTR<TPCollCache> cache = ttask->GetScope()->get<TPCollCache>(cache_key); |
|
|
|
TPCollWrapper const* foo = static_cast<TPCollWrapper const*>(tpc); |
|
|
|
|
|
|
|
if (cache) cache->release(static_cast<TPCollWrapper const*>(tpc)); |
|
tpc = NULL; |
|
} |
|
#endif |
|
|
|
bool Mmsapt |
|
::ProvidesPrefixCheck() const { return true; } |
|
|
|
string const& Mmsapt |
|
::GetName() const { return m_name; } |
|
|
|
vector<float> |
|
Mmsapt |
|
::DefaultWeights() const |
|
{ return vector<float>(this->GetNumScoreComponents(), 1.); } |
|
|
|
} |
|
|