// Listing metadata carried over from the page this file was copied from: file size 6,090 bytes, id 158b61b.
#include <unistd.h>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <re2/re2.h>

#include "Parameters.h"
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif

//
// @about
// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
//
// Usage, as stated on the members below: construct from a Parameters object,
// call init() before any other method, then use the tokenize / detokenize /
// splitter entry points. Call reset() after each contiguous sequence of lines
// when sentence splitting is on.
//
// NOTE(review): the class stores raw re2::RE2 pointers (prot_pat_vec) and the
// destructor is documented as freeing dynamically compiled expressions, yet no
// copy constructor or copy assignment is declared. Copying a Tokenizer
// therefore looks unsafe (presumably a double free) -- confirm against the
// implementation before copying instances.
//
class Tokenizer {

private:

    // Character classes used by the implementation (not referenced in this
    // header beyond the declaration).
    typedef enum {
        empty = 0,
        blank,
        upper, // upper case
        letta, // extended word class (includes number, hyphen)
        numba,
        hyphn,
        stops, // blank to stops are "extended word class" variants
        quote, // init & fini = {',"}
        pinit, // init (includes INVERT_*)
        pfini, // fini
        pfpct, // fini + pct
        marks,
        limit
    } charclass_t;

    std::size_t nthreads;
    std::size_t chunksize;
    std::string cfg_dir;

    // non-breaking prefixes (numeric) utf8
    std::set<std::string> nbpre_num_set;
    // non-breaking prefixes (other) utf8
    std::set<std::string> nbpre_gen_set;

    // non-breaking prefixes (numeric) ucs4
    std::set<std::wstring> nbpre_num_ucs4;
    // non-breaking prefixes (other) ucs4
    std::set<std::wstring> nbpre_gen_ucs4;

    // compiled protected patterns (raw pointers; see the class-level note)
    std::vector<re2::RE2 *> prot_pat_vec;

protected:

    // language
    std::string lang_iso;
    bool english_p; // is lang_iso "en"
    bool latin_p; // is lang_iso "fr" or "it"

    // option flags; their values are set outside this header
    bool skip_xml_p;
    bool skip_alltags_p;
    bool entities_p;
    bool escape_p;
    bool unescape_p;
    bool aggressive_hyphen_p;
    bool supersub_p;
    bool url_p;
    bool downcase_p;
    bool normalize_p;
    bool penn_p;
    bool narrow_latin_p;
    bool narrow_kana_p;
    bool refined_p;
    bool drop_bad_p;
    bool splits_p;
    bool verbose_p;
    bool para_marks_p;
    bool split_breaks_p;

    // return counts of general and numeric prefixes loaded
    // used by init(), parameterized by lang_iso
    std::pair<int,int> load_prefixes(std::ifstream& ifs);

    // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
    void protected_tokenize(std::string& inplace);

    // used for boost::thread
    //
    // Callable that tokenizes every string of `in` into the same index of
    // `out`. Both vectors are held by reference, so they must outlive the
    // callable; `tokenizer` is a non-owning pointer.
    struct VectorTokenizerCallable {
        Tokenizer *tokenizer;
        std::vector<std::string>& in;
        std::vector<std::string>& out;

        VectorTokenizerCallable(Tokenizer *_tokenizer,
                                std::vector<std::string>& _in,
                                std::vector<std::string>& _out)
            : tokenizer(_tokenizer)
            , in(_in)
            , out(_out) {
        }

        // Resizes `out` to match `in`, then fills it element by element.
        void operator()() {
            out.resize(in.size());
            for (std::size_t ii = 0; ii < in.size(); ++ii) {
                if (in[ii].empty()) {
                    // empty lines are passed through without tokenizing
                    out[ii] = in[ii];
                } else {
                    // tokenize() selects penn_tokenize or quik_tokenize via penn_p,
                    // which is the same choice this loop used to spell out by hand
                    out[ii] = tokenizer->tokenize(in[ii]);
                }
            }
        }
    };

public:

    Tokenizer(); // UNIMPL

    // no throw
    Tokenizer(const Parameters& _params);

    // frees dynamically compiled expressions
    ~Tokenizer();

    // required before other methods, may throw
    void init(const char *cfg_dir_path = 0);

    void set_config_dir(const std::string& _cfg_dir);

    // required after processing a contiguous sequence of lines when sentence splitting is on
    void reset();

    // simultaneous sentence splitting not yet implemented
    // returns the current value of the splits_p flag
    bool splitting() const { return splits_p; }

    // escapes the characters in the set &|"'<> after tokenization (moses special characters)
    bool escape(std::string& inplace);

    // used in detokenizer, converts entities into characters
    // if escape_p is set, does not unescape moses special tokens, thus
    // escape_p and unescape_p can be used together usefully
    bool unescape(std::string& inplace);

    // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
    std::size_t tokenize(std::istream& is, std::ostream& os);

    // quik-tokenize padded line buffer to return string
    std::string quik_tokenize(const std::string& buf);

    // penn-tokenize padded line buffer to return string // untested
    std::string penn_tokenize(const std::string& buf);

    // select-tokenize padded line buffer to return string
    // (penn_tokenize when penn_p is set, quik_tokenize otherwise)
    std::string tokenize(const std::string& buf) {
        return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
    }

    // tokenize with output argument
    void tokenize(const std::string& buf, std::string& outs) {
        outs = tokenize(buf);
    }

    // tokenize to a vector: runs the select-tokenizer on `in` and splits the
    // result on whitespace into individual tokens
    std::vector<std::string> tokens(const std::string& in) {
        // tokenize() already dispatches on penn_p, so no separate check is needed here
        std::istringstream tokss(tokenize(in));
        std::vector<std::string> outv;
        std::copy(std::istream_iterator<std::string>(tokss),
                  std::istream_iterator<std::string>(),
                  std::back_inserter(outv));
        return outv;
    }

    // streaming detokenizer reads from is, writes to os, preserving breaks
    std::size_t detokenize(std::istream& is, std::ostream &os);

    // detokenize padded line buffer to return string
    std::string detokenize(const std::string& buf);

    // detokenize with output argument
    void detokenize(const std::string& buf, std::string& outs) {
        outs = detokenize(buf);
    }

    // detokenize from a vector
    // every token is written followed by a single space (so the joined buffer
    // ends with a trailing space) before being handed to detokenize(buf)
    std::string detokenize(const std::vector<std::string>& inv) {
        std::ostringstream oss;
        std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
        return detokenize(oss.str());
    }

    // split a string on sentence boundaries (approximately)
    std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);

    // split sentences from input stream and write one per line on output stream
    std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);

}; // end class Tokenizer

#ifdef TOKENIZER_NAMESPACE
} // end namespace TOKENIZER_NAMESPACE
#endif