|
#include <string> |
|
#include <iostream> |
|
#include <cstdlib> |
|
#include <fstream> |
|
#include <sstream> |
|
#include <unordered_map> |
|
#include <set> |
|
#include <vector> |
|
#include <iterator> |
|
#include <stdexcept> |
|
|
|
#include <re2/re2.h> |
|
#include <unistd.h> |
|
|
|
#include "Parameters.h" |
|
|
|
#ifdef TOKENIZER_NAMESPACE |
|
namespace TOKENIZER_NAMESPACE { |
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
class Tokenizer { |
|
|
|
private: |
|
|
|
typedef enum { |
|
empty = 0, |
|
blank, |
|
upper, |
|
letta, |
|
numba, |
|
hyphn, |
|
stops, |
|
quote, |
|
pinit, |
|
pfini, |
|
pfpct, |
|
marks, |
|
limit |
|
} charclass_t; |
|
|
|
std::size_t nthreads; |
|
std::size_t chunksize; |
|
std::string cfg_dir; |
|
|
|
|
|
std::set<std::string> nbpre_num_set; |
|
|
|
std::set<std::string> nbpre_gen_set; |
|
|
|
|
|
std::set<std::wstring> nbpre_num_ucs4; |
|
|
|
std::set<std::wstring> nbpre_gen_ucs4; |
|
|
|
|
|
std::vector<re2::RE2 *> prot_pat_vec; |
|
|
|
protected: |
|
|
|
|
|
std::string lang_iso; |
|
bool english_p; |
|
bool latin_p; |
|
bool skip_xml_p; |
|
bool skip_alltags_p; |
|
bool entities_p; |
|
bool escape_p; |
|
bool unescape_p; |
|
bool aggressive_hyphen_p; |
|
bool supersub_p; |
|
bool url_p; |
|
bool downcase_p; |
|
bool normalize_p; |
|
bool penn_p; |
|
bool narrow_latin_p; |
|
bool narrow_kana_p; |
|
bool refined_p; |
|
bool drop_bad_p; |
|
bool splits_p; |
|
bool verbose_p; |
|
bool para_marks_p; |
|
bool split_breaks_p; |
|
|
|
|
|
std::pair<int,int> load_prefixes(std::ifstream& ifs); |
|
|
|
|
|
void protected_tokenize(std::string& inplace); |
|
|
|
|
|
struct VectorTokenizerCallable { |
|
Tokenizer *tokenizer; |
|
std::vector<std::string>& in; |
|
std::vector<std::string>& out; |
|
|
|
VectorTokenizerCallable(Tokenizer *_tokenizer, |
|
std::vector<std::string>& _in, |
|
std::vector<std::string>& _out) |
|
: tokenizer(_tokenizer) |
|
, in(_in) |
|
, out(_out) { |
|
}; |
|
|
|
void operator()() { |
|
out.resize(in.size()); |
|
for (std::size_t ii = 0; ii < in.size(); ++ii) |
|
if (in[ii].empty()) |
|
out[ii] = in[ii]; |
|
else if (tokenizer->penn_p) |
|
out[ii] = tokenizer->penn_tokenize(in[ii]); |
|
else |
|
out[ii] = tokenizer->quik_tokenize(in[ii]); |
|
}; |
|
}; |
|
|
|
public: |
|
|
|
Tokenizer(); |
|
|
|
|
|
Tokenizer(const Parameters& _params); |
|
|
|
|
|
~Tokenizer(); |
|
|
|
|
|
void init(const char *cfg_dir_path = 0); |
|
|
|
void set_config_dir(const std::string& _cfg_dir); |
|
|
|
|
|
void reset(); |
|
|
|
|
|
bool splitting() const { return splits_p; } |
|
|
|
|
|
bool escape(std::string& inplace); |
|
|
|
|
|
|
|
|
|
bool unescape(std::string& inplace); |
|
|
|
|
|
std::size_t tokenize(std::istream& is, std::ostream& os); |
|
|
|
|
|
std::string quik_tokenize(const std::string& buf); |
|
|
|
|
|
std::string penn_tokenize(const std::string& buf); |
|
|
|
|
|
std::string tokenize(const std::string& buf) { |
|
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf); |
|
} |
|
|
|
|
|
void tokenize(const std::string& buf, std::string& outs) { |
|
outs = tokenize(buf); |
|
} |
|
|
|
|
|
std::vector<std::string> tokens(const std::string& in) { |
|
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in)); |
|
std::vector<std::string> outv; |
|
std::copy(std::istream_iterator<std::string>(tokss), |
|
std::istream_iterator<std::string>(), |
|
std::back_inserter(outv)); |
|
return outv; |
|
} |
|
|
|
|
|
std::size_t detokenize(std::istream& is, std::ostream &os); |
|
|
|
|
|
std::string detokenize(const std::string& buf); |
|
|
|
void detokenize(const std::string& buf, std::string& outs) { |
|
outs = detokenize(buf); |
|
} |
|
|
|
|
|
std::string detokenize(const std::vector<std::string>& inv) { |
|
std::ostringstream oss; |
|
std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," ")); |
|
return detokenize(oss.str()); |
|
} |
|
|
|
|
|
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0); |
|
|
|
|
|
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os); |
|
|
|
}; |
|
|
|
#ifdef TOKENIZER_NAMESPACE |
|
}; |
|
#endif |
|
|