File size: 3,445 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
// read text from the input files given on the command line, report the percentage of n-grams covered
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <algorithm>
#include <iostream>
#include "mm/ug_bitext.h"
#include "generic/file_io/ug_stream.h"
#include <string>
#include <sstream>
#include "mm/ug_bitext_sampler.h"
#include <boost/program_options.hpp>
#include <boost/math/distributions/binomial.hpp>
// #include "LSA.h"
namespace po=boost::program_options;
using namespace Moses;
using namespace sapt;
using namespace std;
using namespace boost;
// Token type used for the corpus track and the mmTSA index in this tool.
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
// Memory-mapped token track over Token.
// NOTE(review): this alias is not referenced in the visible code.
typedef mmTtrack<Token> ttrack_t;
// Length n of the n-grams to look up; filled by interpret_args()
// from --ngram-size / -n (default 5).
size_t ngram_size;
// Non-zero makes main() print one diagnostic line per n-gram start position;
// filled by interpret_args() from --verbose / -v (default 0).
size_t verbosity;
// Base name of the corpus; main() opens bname + ".tdx", ".mct" and ".sfa".
string bname;
// Input text files whose n-gram coverage is reported (one report line each).
vector<string> ifiles;
// Parses the command line into the four globals above.
void interpret_args(int ac, char* av[]);
// Recursively print every node reachable from the iterator's current position.
//
// For each node with a non-empty path, two lines go to stdout: m.str(NULL)
// followed by m.str(&V) (presumably token IDs, then the words looked up
// in V -- confirm against tree_iterator::str). The iterator is moved down/over
// during the traversal and moved back up before returning.
void
dump(mmTSA<Token>::tree_iterator& m, TokenIndex& V)
{
  if (m.size())
    {
      cout << m.str(NULL) << endl;
      cout << m.str(&V) << endl;
    }

  // nothing below this node: leave the iterator where it is
  if (!m.down()) return;

  // visit the first child and then each of its siblings
  do
    {
      dump(m, V);
    }
  while (m.over());

  // restore the position we were called with
  m.up();
}
// For every input file, report the percentage of its n-grams (n = ngram_size)
// that can be found in the corpus index stored under base name `bname`.
//
// Opens bname.tdx (vocabulary), bname.mct (token track) and bname.sfa
// (mmTSA index over the track). Each input line is mapped to a sequence of
// token IDs; every window of ngram_size consecutive tokens is looked up by
// extending a fresh tree iterator one token at a time. Lines shorter than
// ngram_size contribute no n-grams.
//
// Output: one line per readable input file on stdout,
//   "<pct>% matched <n>-grams (<matched>/<total>): <file>"
// plus, if verbosity is non-zero, one line per window with its start
// position, the matched prefix length and the matched prefix.
// Files that cannot be opened are reported on stderr and skipped.
int
main(int argc, char* argv[])
{
  interpret_args(argc,argv);

  TokenIndex V;
  V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex();
  boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>);
  T->open(bname+".mct");
  mmTSA<Token> I; I.open(bname+".sfa", T);

  string line;
  BOOST_FOREACH(string const& file, ifiles)
    {
      ifstream in(file.c_str());
      if (!in)
        {
          // Without this check an unreadable file was silently reported
          // as "nan%" coverage.
          cerr << "cannot open input file: " << file << endl;
          continue;
        }

      // Integer counters: a float counter stops increasing at 2^24
      // because x + 1 == x for larger floats.
      size_t total_ngrams   = 0;
      size_t matched_ngrams = 0;

      while (getline(in,line))
        {
          vector<id_type> snt;
          V.fillIdSeq(line,snt);
          if (snt.size() < ngram_size) continue;
          total_ngrams += snt.size() - ngram_size + 1;
          for (size_t i = 0; i + ngram_size <= snt.size(); ++i)
            {
              mmTSA<Token>::tree_iterator m(&I);
              // the loop condition guarantees the window fits into snt
              size_t const stop = i + ngram_size;
              size_t k = i;
              while (k < stop && m.extend(snt[k])) ++k;
              if (verbosity)
                cout << i << " " << k-i << " " << m.str(&V) << endl;
              if (k == stop)
                ++matched_ngrams;
            }
        }

      // Avoid 0/0 (NaN) for empty files or files with only short lines.
      double const pct = total_ngrams
        ? 100.0 * static_cast<double>(matched_ngrams) / total_ngrams
        : 0.0;
      printf ("%5.1f%% matched %zu-grams (%zu/%zu): %s\n",
              pct, ngram_size, matched_ngrams, total_ngrams, file.c_str());
    }
  return 0;
}
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("ngram-size,n", po::value<size_t>(&ngram_size)->default_value(5),
"sample size")
("verbose,v", po::value<size_t>(&verbosity)->default_value(0),
"verbosity")
;
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name of corpus")
("ifiles", po::value<vector<string> >(&ifiles), "input files")
;
h.add(o);
po::positional_options_description a;
a.add("bname",1);
a.add("ifiles",-1);
po::store(po::command_line_parser(ac,av)
.options(h)
.positional(a)
.run(),vm);
po::notify(vm);
if (vm.count("help"))
{
std::cout << "\nusage:\n\t" << av[0]
<< " [options] <model file stem>" << std::endl;
std::cout << o << std::endl;
exit(0);
}
}
|