sakharamg's picture
Uploading all files
158b61b
// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
#if 0
#include <stdint.h>
#include <string>
#include <vector>
#include <cassert>
#include <iomanip>
#include <algorithm>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include "ug_mm_ttrack.h"
#include "ug_mm_tsa.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include "ug_typedefs.h"
#include "tpt_pickler.h"
#include "ug_bitext.h"
#include "ug_lexical_phrase_scorer2.h"
#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace Moses::bitext;
#define CACHING_THRESHOLD 1000
#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
size_t mctr=0,xctr=0;
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
mmbitext bt;
float lbsmooth = .005;
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
PScoreLex<Token> calc_lex(1.0);
PScoreWC<Token> apply_wp;
vector<float> fweights;
void
nbest_phrasepairs(uint64_t const pid1,
pstats const& ps,
vector<PhrasePair> & nbest)
{
pstats::trg_map_t::const_iterator m;
vector<size_t> idx(nbest.size());
size_t i=0;
for (m = ps.trg.begin();
m != ps.trg.end() && i < nbest.size();
++m)
{
// cout << m->second.rcnt() << " " << ps.good << endl;
if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
continue;
nbest[i].init(pid1,ps,5);
nbest[i].update(m->first,m->second);
calc_pfwd(bt, nbest[i]);
calc_pbwd(bt, nbest[i]);
calc_lex(bt, nbest[i]);
apply_wp(bt, nbest[i]);
nbest[i].eval(fweights);
idx[i] = i;
++i;
}
// cout << i << " " << nbest.size() << endl;
if (i < nbest.size())
{
// cout << "Resizing from " << nbest.size() << " to " << i << endl;
nbest.resize(i);
idx.resize(i);
}
VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
if (m != ps.trg.end())
{
make_heap(idx.begin(),idx.end(),sorter);
PhrasePair cand;
cand.init(pid1,ps,5);
for (; m != ps.trg.end(); ++m)
{
if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
continue;
cand.update(m->first,m->second);
calc_pfwd(bt, cand);
calc_pbwd(bt, cand);
calc_lex(bt, cand);
apply_wp(bt, cand);
cand.eval(fweights);
if (cand < nbest[idx[0]]) continue;
pop_heap(idx.begin(),idx.end(),sorter);
nbest[idx.back()] = cand;
push_heap(idx.begin(),idx.end(),sorter);
}
}
sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
}
int main(int argc, char* argv[])
{
// assert(argc == 4);
#if 0
#if 0
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
string L1 = "de";
string L2 = "en";
size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
char c = *base.rbegin();
if (c != '/' && c != '.')
base += ".";
fweights.resize(5,.25);
fweights[0] = 1;
bt.open(base,L1,L2);
bt.setDefaultSampleSize(max_samples);
size_t i;
i = calc_pfwd.init(0,.05,'g');
i = calc_pbwd.init(i,.05,'g');
i = calc_lex.init(i,base+L1+"-"+L2+".lex");
i = apply_wp.init(i);
string line;
while (getline(cin,line))
{
vector<id_type> snt;
bt.V1->fillIdSeq(line,snt);
for (size_t i = 0; i < snt.size(); ++i)
{
TSA<Token>::tree_iterator m(bt.I1.get());
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
bt.prep(m);
}
// continue;
for (size_t i = 0; i < snt.size(); ++i)
{
TSA<Token>::tree_iterator m(bt.I1.get());
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
{
uint64_t spid = m.getPid();
SPTR<pstats> s = bt.lookup(m);
for (size_t j = i; j <= k; ++j)
cout << (*bt.V1)[snt[j]] << " ";
cout << s->good << "/"
<< s->sample_cnt << "/"
<< s->raw_cnt << endl;
// vector<PhrasePair> nbest(min(s->trg.size(),size_t(20)));
vector<PhrasePair> nbest(s->trg.size());
nbest_phrasepairs(spid, *s, nbest);
BOOST_FOREACH(PhrasePair const& pp, nbest)
{
uint32_t sid,off,len;
parse_pid(pp.p2,sid,off,len);
uint32_t stop = off + len;
// cout << sid << " " << off << " " << len << endl;
Token const* o = bt.T2->sntStart(sid);
cout << " " << setw(6) << pp.score << " ";
for (uint32_t i = off; i < stop; ++i)
cout << (*bt.V2)[o[i].id()] << " ";
cout << pp.joint << "/"
<< pp.raw1 << "/"
<< pp.raw2 << " |";
BOOST_FOREACH(float f, pp.fvals)
cout << " " << f;
cout << endl;
}
}
}
}
#endif
exit(0);
}
#endif