sakharamg
/

NMTKD

Model card Files Files and versions Community

File size: 5,069 Bytes

158b61b

// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
#if 0
#include <stdint.h>
#include <string>
#include <vector>
#include <cassert>
#include <iomanip>
#include <algorithm>

#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"

#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>

#include "ug_mm_ttrack.h"
#include "ug_mm_tsa.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include "ug_typedefs.h"
#include "tpt_pickler.h"
#include "ug_bitext.h"
#include "ug_lexical_phrase_scorer2.h"
#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace Moses::bitext;

// File-scope configuration and shared state used by nbest_phrasepairs() and
// main() below.

// Not referenced anywhere in the visible part of this file.
#define CACHING_THRESHOLD 1000
// Shorthand for Boost's binomial lower-bound-on-p helper; not referenced in
// the visible part of this file.
#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
// Counters; not referenced in the visible part of this file.
size_t mctr=0,xctr=0;

// Token type and memory-mapped bitext type used throughout this program.
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
// The single bitext instance that all scorers and lookups operate on.
mmbitext bt;


// NOTE(review): presumably a smoothing parameter for lower-bound estimates;
// it is not read in the visible part of this file -- confirm before relying
// on it.
float lbsmooth = .005;


// Scorer objects applied, in this order, to every candidate phrase pair in
// nbest_phrasepairs(). The meaning of the constructor argument of calc_lex
// is defined by PScoreLex (declared elsewhere).
PScorePfwd<Token> calc_pfwd;
PScorePbwd<Token> calc_pbwd;
PScoreLex<Token>  calc_lex(1.0);
PScoreWC<Token>   apply_wp;
// Feature weights handed to PhrasePair::eval() when scoring a candidate.
vector<float> fweights;

void
nbest_phrasepairs(uint64_t const  pid1,
		  pstats   const& ps,
		  vector<PhrasePair> & nbest)
{
  pstats::trg_map_t::const_iterator m;
  vector<size_t> idx(nbest.size());
  size_t i=0;
  for (m  = ps.trg.begin();
       m != ps.trg.end() && i < nbest.size();
       ++m)
    {
      // cout << m->second.rcnt() << " " << ps.good << endl;
      if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
	continue;
      nbest[i].init(pid1,ps,5);
      nbest[i].update(m->first,m->second);
      calc_pfwd(bt, nbest[i]);
      calc_pbwd(bt, nbest[i]);
      calc_lex(bt, nbest[i]);
      apply_wp(bt, nbest[i]);
      nbest[i].eval(fweights);
      idx[i] = i;
      ++i;
    }
  // cout << i << " " << nbest.size() << endl;
  if (i < nbest.size())
    {
      // cout << "Resizing from " << nbest.size() << " to " << i << endl;
      nbest.resize(i);
      idx.resize(i);
    }
  VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
  if (m != ps.trg.end())
    {
      make_heap(idx.begin(),idx.end(),sorter);
      PhrasePair cand;
      cand.init(pid1,ps,5);
      for (; m != ps.trg.end(); ++m)
	{
	  if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
	    continue;
	  cand.update(m->first,m->second);
	  calc_pfwd(bt, cand);
	  calc_pbwd(bt, cand);
	  calc_lex(bt, cand);
	  apply_wp(bt, cand);
	  cand.eval(fweights);
	  if (cand < nbest[idx[0]]) continue;
	  pop_heap(idx.begin(),idx.end(),sorter);
	  nbest[idx.back()] = cand;
	  push_heap(idx.begin(),idx.end(),sorter);
	}
    }
  sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
}

// Program entry point.
//
// As written, everything between the outer "#if 0" and its matching "#endif"
// is excluded from compilation, so the compiled function only calls exit(0).
//
// The disabled body opens the bitext, initializes the scorers, and then, for
// every line read from stdin, looks up every source phrase that can be
// matched in the source-side index and prints its counts followed by its
// scored translation candidates.
int main(int argc, char* argv[])
{
  // assert(argc == 4);
#if 0
  // Inner switch: the first branch takes base path and language tags from the
  // command line; the second (the one selected by "#if 0") uses hard-coded
  // values and takes only the sample size from argv[1] (default 1000).
#if 0
  string base = argv[1];
  string L1   = argv[2];
  string L2   = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  // Append "." to the base name unless it already ends in '/' or '.'.
  char c = *base.rbegin();
  if (c != '/' && c != '.')
    base += ".";

  // Five feature weights: the first is 1, the remaining four are .25.
  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  // Each init() call receives the value returned by the previous one.
  // NOTE(review): presumably the next free feature index -- confirm against
  // the scorer classes.
  size_t i;
  i = calc_pfwd.init(0,.05,'g');
  i = calc_pbwd.init(i,.05,'g');
  i = calc_lex.init(i,base+L1+"-"+L2+".lex");
  i = apply_wp.init(i);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);
      // First pass: call bt.prep() for every phrase of the sentence that can
      // be extended in the source-side index.
      for (size_t i = 0; i < snt.size(); ++i)
  	{
  	  TSA<Token>::tree_iterator m(bt.I1.get());
	  for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
	    bt.prep(m);
	}
      // continue;
      // Second pass: look up each matched phrase and print its statistics
      // and scored translation candidates.
      for (size_t i = 0; i < snt.size(); ++i)
      	{
      	  TSA<Token>::tree_iterator m(bt.I1.get());
      	  for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
      	    {
	      uint64_t spid = m.getPid();
      	      SPTR<pstats> s = bt.lookup(m);
	      // Print the source phrase (tokens i..k) and its counts as
	      // good/sample_cnt/raw_cnt.
      	      for (size_t j = i; j <= k; ++j)
      		cout << (*bt.V1)[snt[j]] << " ";
      	      cout << s->good << "/"
		   << s->sample_cnt << "/"
		   << s->raw_cnt << endl;
	      // vector<PhrasePair> nbest(min(s->trg.size(),size_t(20)));
	      // One slot per target candidate, i.e. no n-best cut-off here.
	      vector<PhrasePair> nbest(s->trg.size());
	      nbest_phrasepairs(spid, *s, nbest);
	      BOOST_FOREACH(PhrasePair const& pp, nbest)
		{
		  // Decode the target phrase id into sentence id, offset and
		  // length, then print the corresponding target tokens.
		  uint32_t sid,off,len;
		  parse_pid(pp.p2,sid,off,len);
		  uint32_t stop = off + len;
		  // cout << sid << " " << off << " " << len << endl;
		  Token const* o = bt.T2->sntStart(sid);
		  cout << "   " << setw(6) << pp.score << " ";
		  for (uint32_t i = off; i < stop; ++i)
		    cout << (*bt.V2)[o[i].id()] << " ";
		  // Counts as joint/raw1/raw2, followed by the feature values.
		  cout << pp.joint << "/"
		       << pp.raw1  << "/"
		       << pp.raw2  << " |";
		  BOOST_FOREACH(float f, pp.fvals)
		    cout << " " << f;
		  cout << endl;
		}
      	    }
      	}
    }
#endif
    exit(0);
}
#endif