|
|
|
|
|
|
|
|
|
#include <boost/foreach.hpp> |
|
#include <boost/format.hpp> |
|
#include <boost/tokenizer.hpp> |
|
#include <boost/shared_ptr.hpp> |
|
#include <algorithm> |
|
#include <iostream> |
|
#include "mm/ug_bitext.h" |
|
#include "generic/file_io/ug_stream.h" |
|
#include <string> |
|
#include <sstream> |
|
#include "mm/ug_bitext_sampler.h" |
|
|
|
#include <boost/program_options.hpp> |
|
#include <boost/math/distributions/binomial.hpp> |
|
|
|
|
|
|
|
namespace po=boost::program_options; |
|
using namespace Moses; |
|
using namespace sapt; |
|
using namespace std; |
|
using namespace boost; |
|
|
|
// Token type used below to instantiate the corpus track (mmTtrack) and the |
// suffix array (mmTSA). |
typedef sapt::L2R_Token<sapt::SimpleWordId> Token; |
|
// Shorthand for the corpus track type; not referenced in the code visible here. |
typedef mmTtrack<Token> ttrack_t; |
|
|
|
// The four variables below are filled in by interpret_args() and read by main(). |
|
// Length of the n-grams to look up; bound to --ngram-size / -n (default 5). |
size_t ngram_size; |
|
// When non-zero, main() prints one line per n-gram lookup; bound to |
// --verbose / -v (default 0). |
size_t verbosity; |
|
// First positional argument; main() appends ".tdx", ".mct" and ".sfa" to it to |
// find the vocabulary, corpus track and suffix array files. |
string bname; |
|
// Remaining positional arguments; main() reads each of them line by line. |
vector<string> ifiles; |
|
|
|
// Parses the command line into the variables above (defined further down). |
void interpret_args(int ac, char* av[]); |
|
|
|
|
|
void |
|
dump(mmTSA<Token>::tree_iterator& m, TokenIndex& V) |
|
{ |
|
if (m.size()) cout << m.str(NULL) << endl; |
|
if (m.size()) cout << m.str(&V) << endl; |
|
if (m.down()) |
|
{ |
|
do { dump(m, V); } while (m.over()); |
|
m.up(); |
|
} |
|
} |
|
|
|
int |
|
main(int argc, char* argv[]) |
|
{ |
|
interpret_args(argc,argv); |
|
TokenIndex V; |
|
V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex(); |
|
boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>); |
|
T->open(bname+".mct"); |
|
mmTSA<Token> I; I.open(bname+".sfa", T); |
|
|
|
string line; |
|
BOOST_FOREACH(string const& file, ifiles) |
|
{ |
|
size_t total_ngrams=0; |
|
float matched_ngrams=0; |
|
ifstream in(file.c_str()); |
|
while(getline(in,line)) |
|
{ |
|
|
|
vector<id_type> snt; |
|
V.fillIdSeq(line,snt); |
|
if (snt.size() < ngram_size) continue; |
|
total_ngrams += snt.size() - ngram_size + 1; |
|
for (size_t i = 0; i + ngram_size <= snt.size(); ++i) |
|
|
|
{ |
|
mmTSA<Token>::tree_iterator m(&I); |
|
size_t stop = min(snt.size(), i+ngram_size); |
|
size_t k = i; |
|
while (k < stop && m.extend(snt[k])) ++k; |
|
if (verbosity) cout << i << " " << k-i << " " << m.str(&V) << endl; |
|
if (k - i == ngram_size) |
|
++matched_ngrams; |
|
} |
|
} |
|
printf ("%5.1f%% matched %zu-grams (%.0f/%zu): %s\n", |
|
(100 * matched_ngrams / total_ngrams), ngram_size, |
|
matched_ngrams, total_ngrams, file.c_str()); |
|
} |
|
} |
|
|
|
void |
|
interpret_args(int ac, char* av[]) |
|
{ |
|
po::variables_map vm; |
|
po::options_description o("Options"); |
|
o.add_options() |
|
|
|
("help,h", "print this message") |
|
("ngram-size,n", po::value<size_t>(&ngram_size)->default_value(5), |
|
"sample size") |
|
("verbose,v", po::value<size_t>(&verbosity)->default_value(0), |
|
"verbosity") |
|
; |
|
|
|
po::options_description h("Hidden Options"); |
|
h.add_options() |
|
("bname", po::value<string>(&bname), "base name of corpus") |
|
("ifiles", po::value<vector<string> >(&ifiles), "input files") |
|
; |
|
|
|
h.add(o); |
|
po::positional_options_description a; |
|
a.add("bname",1); |
|
a.add("ifiles",-1); |
|
|
|
po::store(po::command_line_parser(ac,av) |
|
.options(h) |
|
.positional(a) |
|
.run(),vm); |
|
po::notify(vm); |
|
if (vm.count("help")) |
|
{ |
|
std::cout << "\nusage:\n\t" << av[0] |
|
<< " [options] <model file stem>" << std::endl; |
|
std::cout << o << std::endl; |
|
exit(0); |
|
} |
|
} |
|
|