File size: 3,445 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-

// read text from the given input files; report, per file, the percentage of
// n-grams that are covered by the corpus index

#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <algorithm>
#include <iostream>
#include "mm/ug_bitext.h"
#include "generic/file_io/ug_stream.h"
#include <string>
#include <sstream>
#include "mm/ug_bitext_sampler.h"

#include <boost/program_options.hpp>
#include <boost/math/distributions/binomial.hpp>

// #include "LSA.h"

namespace po=boost::program_options;
using namespace Moses;
using namespace sapt;
using namespace std;
using namespace boost;

// Token type used for the corpus track and the index opened in main().
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
// Convenience alias for a track over Token; not referenced in the visible code.
typedef mmTtrack<Token> ttrack_t;

// Command-line settings; all four are filled in by interpret_args().
// Length of the n-grams to look up (option --ngram-size/-n, default 5).
size_t ngram_size;
// Non-zero enables per-position match output in main() (option --verbose/-v,
// default 0).
size_t verbosity;
// Base name of the corpus (first positional argument); main() opens
// <bname>.tdx, <bname>.mct and <bname>.sfa.
string bname;
// Input text files to analyse (all remaining positional arguments).
vector<string> ifiles;

// Parses the command line into the variables above; defined at the end of
// this file.
void interpret_args(int ac, char* av[]);


// Debugging aid: walk the subtree below tree iterator m depth-first and, for
// every node with a non-zero size(), print m.str() twice -- first without a
// vocabulary (NULL), then resolved through V. The iterator is moved during
// the walk and brought back up to its starting level before returning.
void
dump(mmTSA<Token>::tree_iterator& m, TokenIndex& V)
{
  if (m.size())
    {
      cout << m.str(NULL) << endl;
      cout << m.str(&V) << endl;
    }

  // Nothing below this node: done.
  if (!m.down()) return;

  // Visit the first child, then each sibling for as long as over() succeeds.
  for (bool more = true; more; more = m.over())
    dump(m, V);
  m.up();
}

// Entry point. Loads the vocabulary (<bname>.tdx), the corpus track
// (<bname>.mct) and the index (<bname>.sfa), then, for every input file,
// counts how many of its n-grams of length ngram_size can be matched in the
// index and prints one summary line per file to stdout.
//
// Lines with fewer than ngram_size tokens contribute no n-grams.
//
// Returns 0 on success and 1 if at least one input file could not be opened
// (the remaining files are still processed).
int
main(int argc, char* argv[])
{
  interpret_args(argc,argv);
  TokenIndex V;
  V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex();
  boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>);
  T->open(bname+".mct");
  mmTSA<Token> I; I.open(bname+".sfa", T);

  int status = 0;
  string line;
  BOOST_FOREACH(string const& file, ifiles)
    {
      ifstream in(file.c_str());
      if (!in)
        {
          // Report unreadable input explicitly instead of printing a
          // meaningless 0/0 statistic for it.
          cerr << "cannot open input file: " << file << endl;
          status = 1;
          continue;
        }

      // Integer counters: a float counter stops increasing at 2^24.
      size_t total_ngrams = 0;
      size_t matched_ngrams = 0;
      while (getline(in,line))
        {
          vector<id_type> snt;
          V.fillIdSeq(line,snt);
          if (snt.size() < ngram_size) continue;
          total_ngrams += snt.size() - ngram_size + 1;
          for (size_t i = 0; i + ngram_size <= snt.size(); ++i)
            {
              mmTSA<Token>::tree_iterator m(&I);
              // The loop condition guarantees stop <= snt.size().
              size_t const stop = i + ngram_size;
              size_t k = i;
              // Extend the match token by token; give up at the first
              // token for which extend() reports failure.
              while (k < stop && m.extend(snt[k])) ++k;
              if (verbosity) cout << i << " " << k-i << " " << m.str(&V) << endl;
              if (k == stop)
                ++matched_ngrams;
            }
        }

      // Avoid 0/0 (NaN) for empty files or files with only short lines.
      double const percent = total_ngrams
        ? 100.0 * matched_ngrams / total_ngrams
        : 0.0;
      printf ("%5.1f%% matched %zu-grams (%zu/%zu): %s\n",
              percent, ngram_size,
              matched_ngrams, total_ngrams, file.c_str());
    }
  return status;
}

void
interpret_args(int ac, char* av[])
{
  po::variables_map vm;
  po::options_description o("Options");
  o.add_options()

    ("help,h",  "print this message")
    ("ngram-size,n", po::value<size_t>(&ngram_size)->default_value(5),
     "sample size")
    ("verbose,v", po::value<size_t>(&verbosity)->default_value(0),
     "verbosity")
    ;

  po::options_description h("Hidden Options");
  h.add_options()
    ("bname", po::value<string>(&bname), "base name of corpus")
    ("ifiles", po::value<vector<string> >(&ifiles), "input files")
    ;

  h.add(o);
  po::positional_options_description a;
  a.add("bname",1);
  a.add("ifiles",-1);

  po::store(po::command_line_parser(ac,av)
            .options(h)
            .positional(a)
            .run(),vm);
  po::notify(vm);
  if (vm.count("help"))
    {
      std::cout << "\nusage:\n\t" << av[0]
                << " [options] <model file stem>" << std::endl;
      std::cout << o << std::endl;
      exit(0);
    }
}