#include #include #include #include #include #include #include #include #include #include #include #include #include "Parameters.h" #ifdef TOKENIZER_NAMESPACE namespace TOKENIZER_NAMESPACE { #endif // // @about // Tokenizer implements the process of Koehn's tokenizer.perl via RE2 // class Tokenizer { private: typedef enum { empty = 0, blank, upper, // upper case letta, // extended word class (includes number, hyphen) numba, hyphn, stops, // blank to stops are "extended word class" variants quote, // init & fini = {',"} pinit, // init (includes INVERT_*) pfini, // fini pfpct, // fini + pct marks, limit } charclass_t; std::size_t nthreads; std::size_t chunksize; std::string cfg_dir; // non-breaking prefixes (numeric) utf8 std::set nbpre_num_set; // non-breaking prefixes (other) utf8 std::set nbpre_gen_set; // non-breaking prefixes (numeric) ucs4 std::set nbpre_num_ucs4; // non-breaking prefixes (other) ucs4 std::set nbpre_gen_ucs4; // compiled protected patterns std::vector prot_pat_vec; protected: // language std::string lang_iso; bool english_p; // is lang_iso "en" bool latin_p; // is lang_iso "fr" or "it" bool skip_xml_p; bool skip_alltags_p; bool entities_p; bool escape_p; bool unescape_p; bool aggressive_hyphen_p; bool supersub_p; bool url_p; bool downcase_p; bool normalize_p; bool penn_p; bool narrow_latin_p; bool narrow_kana_p; bool refined_p; bool drop_bad_p; bool splits_p; bool verbose_p; bool para_marks_p; bool split_breaks_p; // return counts of general and numeric prefixes loaded std::pair load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants void protected_tokenize(std::string& inplace); // used for boost::thread struct VectorTokenizerCallable { Tokenizer *tokenizer; std::vector& in; std::vector& out; VectorTokenizerCallable(Tokenizer *_tokenizer, std::vector& _in, std::vector& _out) : tokenizer(_tokenizer) , in(_in) , out(_out) { }; void operator()() { out.resize(in.size()); for (std::size_t ii = 0; ii < in.size(); ++ii) if (in[ii].empty()) out[ii] = in[ii]; else if (tokenizer->penn_p) out[ii] = tokenizer->penn_tokenize(in[ii]); else out[ii] = tokenizer->quik_tokenize(in[ii]); }; }; public: Tokenizer(); // UNIMPL // no throw Tokenizer(const Parameters& _params); // frees dynamically compiled expressions ~Tokenizer(); // required before other methods, may throw void init(const char *cfg_dir_path = 0); void set_config_dir(const std::string& _cfg_dir); // required after processing a contiguous sequence of lines when sentence splitting is on void reset(); // simultaneous sentence splitting not yet implemented bool splitting() const { return splits_p; } // escapes chars the set &|"'<> after tokenization (moses special characters) bool escape(std::string& inplace); // used in detokenizer, converts entities into characters // if escape_p is set, does not unescape moses special tokens, thus // escape_p and unescape_p can be used together usefully bool unescape(std::string& inplace); // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting) std::size_t tokenize(std::istream& is, std::ostream& os); // quik-tokenize padded line buffer to return string std::string quik_tokenize(const std::string& buf); // penn-tokenize padded line buffer to return string // untested std::string penn_tokenize(const std::string& buf); // select-tokenize padded line buffer to return string std::string tokenize(const std::string& buf) { return penn_p ? penn_tokenize(buf) : quik_tokenize(buf); } // tokenize with output argument void tokenize(const std::string& buf, std::string& outs) { outs = tokenize(buf); } // tokenize to a vector std::vector tokens(const std::string& in) { std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in)); std::vector outv; std::copy(std::istream_iterator(tokss), std::istream_iterator(), std::back_inserter(outv)); return outv; } // streaming detokenizer reads from is, writes to os, preserving breaks std::size_t detokenize(std::istream& is, std::ostream &os); // detokenize padded line buffer to return string std::string detokenize(const std::string& buf); void detokenize(const std::string& buf, std::string& outs) { outs = detokenize(buf); } // detokenize from a vector std::string detokenize(const std::vector& inv) { std::ostringstream oss; std::copy(inv.begin(), inv.end(), std::ostream_iterator(oss," ")); return detokenize(oss.str()); } // split a string on sentence boundaries (approximately) std::vector splitter(const std::string &istr,bool *continuation_p = 0); // split sentences from input stream and write one per line on output stream std::pair splitter(std::istream& is, std::ostream& os); }; // end class Tokenizer #ifdef TOKENIZER_NAMESPACE }; #endif