|
|
|
#include <boost/program_options.hpp> |
|
#include <boost/algorithm/string/predicate.hpp> |
|
#include <boost/format.hpp> |
|
#include "mm/ug_bitext.h" |
|
#include "mm/tpt_typedefs.h" |
|
#include "mm/ug_prime_sampling1.h" |
|
#include "generic/sorting/VectorIndexSorter.h" |
|
#include "generic/sorting/NBestList.h" |
|
#include <string> |
|
|
|
using namespace std; |
|
using namespace Moses; |
|
using namespace Moses::bitext; |
|
namespace po=boost::program_options; |
|
using namespace boost::algorithm; |
|
typedef L2R_Token<SimpleWordId> Token; |
|
typedef mmBitext<Token> mmbitext; |
|
typedef Bitext<Token>::tsa tsa; |
|
typedef imTtrack<Token> imttrack; |
|
typedef imTSA<Token> imtsa; |
|
|
|
string bname, bname1, bname2, L1, L2, Q1, Q2; |
|
size_t maxhits; |
|
void interpret_args(int ac, char* av[]); |
|
|
|
TokenIndex V1; |
|
TokenIndex V2; |
|
sptr<mmTtrack<Token> > C1; |
|
sptr<mmTtrack<Token> > C2; |
|
mmTSA<Token> I1; |
|
|
|
void |
|
open_bitext() |
|
{ |
|
C1.reset(new mmTtrack<Token>); |
|
if (L2.size()) |
|
{ |
|
bname1 = bname + L1 + "."; |
|
bname2 = bname + L2 + "."; |
|
} |
|
else if (L1.size()) |
|
{ |
|
bname1 = bname; |
|
bname2 = L1; |
|
} |
|
else bname1 = bname; |
|
|
|
if (bname2.size()) C2.reset(new mmTtrack<Token>); |
|
|
|
C1->open(bname1+"mct"); |
|
I1.open(bname1+"sfa", C1); |
|
V1.open(bname1+"tdx"); |
|
V1.setDynamic(true); |
|
|
|
if (bname2.size()) |
|
{ |
|
C2->open(bname2+"mct"); |
|
V2.open(bname2+"tdx"); |
|
} |
|
|
|
} |
|
|
|
sptr<imttrack> |
|
read_input() |
|
{ |
|
sptr<vector<vector<Token> > > crp(new vector<vector<Token> >); |
|
crp->reserve(1000); |
|
string line; |
|
while (getline(cin,line)) |
|
{ |
|
crp->push_back(vector<Token>()); |
|
fill_token_seq(V1, line, crp->back()); |
|
} |
|
sptr<imttrack> ret(new imttrack (crp)); |
|
return ret; |
|
} |
|
|
|
sptr<NBestList<uint32_t, VectorIndexSorter<float> > > |
|
nbest(TSA<Token>::tree_iterator const& r, vector<float> const& hits, |
|
vector<float>& score, VectorIndexSorter<float>& sorter, |
|
size_t const nbest_size) |
|
{ |
|
typedef NBestList<uint32_t, VectorIndexSorter<float> > nbest_list_t; |
|
sptr<nbest_list_t> ret(new nbest_list_t(nbest_size, sorter)); |
|
bitvector mycheck(hits.size()); |
|
tsa::ArrayEntry I(r.lower_bound(-1)); |
|
char const* stop = r.upper_bound(-1); |
|
while (I.next < stop) |
|
{ |
|
r.root->readEntry(I.next,I); |
|
if (mycheck[I.sid]) continue; |
|
score[I.sid] = hits[I.sid] / r.root->getCorpus()->sntLen(I.sid); |
|
ret->add(I.sid); |
|
mycheck.set(I.sid); |
|
} |
|
return ret; |
|
} |
|
|
|
int main(int argc, char* argv[]) |
|
{ |
|
interpret_args(argc, argv); |
|
open_bitext(); |
|
sptr<imttrack> icrp = read_input(); |
|
imtsa newIdx(icrp,NULL); |
|
sptr<SentenceBias> hits = prime_sampling1(I1, newIdx, 1000); |
|
vector<float> score(hits->size()); |
|
VectorIndexSorter<float> sorter(score); |
|
for (size_t s = 0; s < icrp->size(); ++s) |
|
{ |
|
size_t stop = icrp->sntLen(s); |
|
Token const* t = icrp->sntStart(s); |
|
cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl; |
|
for (size_t i = 0; i < stop; ++i) |
|
{ |
|
TSA<Token>::tree_iterator r(&I1); |
|
for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) |
|
{ |
|
if (r.ca() < 3) continue; |
|
cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl; |
|
if (r.ca() > 10000) continue; |
|
sptr<NBestList<uint32_t, VectorIndexSorter<float> > > top; |
|
top = nbest(r, *hits, score, sorter, 5); |
|
for (size_t n = 0; n < top->size(); ++n) |
|
{ |
|
cout << "[" << n << ": " << score[(*top)[n]] |
|
<< " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n" |
|
<< toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n"; |
|
if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n"; |
|
cout << endl; |
|
} |
|
} |
|
} |
|
|
|
} |
|
} |
|
|
|
void |
|
interpret_args(int ac, char* av[]) |
|
{ |
|
po::variables_map vm; |
|
po::options_description o("Options"); |
|
o.add_options() |
|
|
|
("help,h", "print this message") |
|
("maxhits,n", po::value<size_t>(&maxhits)->default_value(25), |
|
"max. number of hits") |
|
("q1", po::value<string>(&Q1), "query in L1") |
|
("q2", po::value<string>(&Q2), "query in L2") |
|
; |
|
|
|
po::options_description h("Hidden Options"); |
|
h.add_options() |
|
("bname", po::value<string>(&bname), "base name of corpus") |
|
("L1", po::value<string>(&L1), "L1 tag") |
|
("L2", po::value<string>(&L2), "L2 tag") |
|
; |
|
|
|
h.add(o); |
|
po::positional_options_description a; |
|
a.add("bname",1); |
|
a.add("L1",1); |
|
a.add("L2",1); |
|
|
|
po::store(po::command_line_parser(ac,av) |
|
.options(h) |
|
.positional(a) |
|
.run(),vm); |
|
po::notify(vm); |
|
if (vm.count("help")) |
|
{ |
|
cout << "\nusage:\n\t" << av[0] |
|
<< " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl; |
|
cout << o << endl; |
|
exit(0); |
|
} |
|
} |
|
|