#include "tokenizer.h"
#include "Parameters.h"

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <unistd.h>  // ::access

#ifdef TOKENIZER_NAMESPACE
using namespace TOKENIZER_NAMESPACE;
#endif

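// Print the usage summary and option table to stderr.
// Illustrative invocations (binary name and paths are examples only):
//   tokenizer -S en < raw.txt > sentences.txt
//   tokenizer -D -o detok.txt tokens.txt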
void
usage(const char *path)
{
    std::cerr << "Usage: " << path << " [-{a|b|B|d|D|e|E|k|n|N|p|r|s|S|T|u|U|v|w|x|X|y}]* [-t N[,C]] [LL] [-{c|o} PATH]* INFILE*" << std::endl;
    std::cerr << " -a -- aggressive hyphenization" << std::endl;
    std::cerr << " -b -- drop bad bytes" << std::endl;
    std::cerr << " -B -- splitter will split on linebreak" << std::endl;
    std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
    std::cerr << " -d -- downcase" << std::endl;
    std::cerr << " -D -- detokenize" << std::endl;
    std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
    std::cerr << " -E -- preserve entities during tokenization" << std::endl;
    std::cerr << " -h -- show this help" << std::endl;
    std::cerr << " -k -- narrow kana" << std::endl;
    std::cerr << " -n -- narrow latin" << std::endl;
    std::cerr << " -N -- normalize" << std::endl;
    std::cerr << " -o OUT -- output file path" << std::endl;
    std::cerr << " -p -- penn treebank style" << std::endl;
    std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
    std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
    std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
    std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
    std::cerr << " -t N[,C] -- use N threads (default 1), chunk size C lines" << std::endl;
    std::cerr << " -u -- disable url handling" << std::endl;
    std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
    std::cerr << " -v -- verbose" << std::endl;
    std::cerr << " -w -- word filter" << std::endl;
    std::cerr << " -x -- skip xml tag lines" << std::endl;
    std::cerr << " -X -- split only, with <P> marks" << std::endl;
    std::cerr << " -y -- skip all xml tags" << std::endl;
    std::cerr << "Default is -c ., stdin, stdout." << std::endl;
    std::cerr << "LL in {en,fr,it} affects contraction handling; LL also selects the nonbreaking prefix file" << std::endl;
    std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
}

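// Reduce a token to a lowercased word form: letters are lowercased,
// digits pass through, and internal '-' or '\'' are kept. An empty
// string is returned for tokens the quirk rules below reject, e.g.
// "foo." (the scan stops at the '.') or digit/letter/digit shapes
// such as "1a2".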
std::string token_word(const std::string& in) {
    int pos = -1;
    int digits_prefixed = 0;
    int nalpha = 0;
    int len = in.size();
    std::vector<char> cv;
    int last_quirk = -1;
    while (++pos < len) {
        // unsigned char: passing a negative char to the <cctype>
        // functions is undefined behavior
        unsigned char ch = in.at(pos);
        if (std::isdigit(ch)) {
            if (digits_prefixed > 0) {
                last_quirk = pos;
                break;
            }
            digits_prefixed--;
            cv.push_back(std::tolower(ch));
        } else if (std::isalpha(ch)) {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            cv.push_back(std::tolower(ch));
            nalpha++;
        } else {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            last_quirk = pos;
            // internal hyphens and apostrophes are kept; any other
            // character ends the scan
            if ((ch == '-' || ch == '\'') && pos != 0) {
                cv.push_back(ch);
            } else {
                break;
            }
        }
    }
    // discard tokens that stopped on a quirk or that contain digits
    // but no letters
    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
        cv.clear();
    return std::string(cv.begin(), cv.end());
}

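// Tokenize each input line and write one sentence of lowercased word
// forms per output line (empty tokens from the tokenizer mark sentence
// breaks). Returns the number of lines written.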
int copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
    int nlines = 0;
    std::string line;
    while (std::getline(ifs, line)) {
        if (line.empty())
            continue;
        std::vector<std::string> tokens(tize.tokens(line));
        int count = 0;
        bool was_break = false;

        for (auto& token: tokens) {
            if (token.empty()) {
                // an empty token marks a sentence break
                if (count || was_break) {
                    ofs << std::endl;
                    count = 0;
                    nlines++;
                    was_break = true;
                    continue;
                }
            }
            was_break = false;

            std::string word(token_word(token));
            if (word.empty())
                continue;

            if (count++)
                ofs << ' ';
            ofs << word;
        }

        if (count) {
            ofs << std::endl;
            nlines++;
        }
    }
    return nlines;
}

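// Entry point. The default mode is also inferred from the program name:
// a binary whose name contains "detokenize" starts in detokenizing mode,
// one containing "splitter" in sentence-splitting mode.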
int main(int ac, char **av)
{
    int rc = 0;
    Parameters params;

    const char *prog = av[0];
    bool next_cfg_p = false;
    bool next_output_p = false;
    bool next_threads_p = false;

    bool detokenize_p = std::strstr(av[0], "detokenize") != nullptr;
    if (!detokenize_p)
        params.split_p = std::strstr(av[0], "splitter") != nullptr;

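    // Parse arguments: flags, pending option values for -o/-c/-t, a bare
    // two-letter ISO language code, and input paths.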
    while (++av, --ac) {
        if (**av == '-') {
            switch (av[0][1]) {
            case 'a':
                params.aggro_p = true;
                break;
            case 'b':
                params.drop_bad_p = true;
                break;
            case 'B':
                params.split_breaks_p = true;
                break;
            case 'c':
                next_cfg_p = true;
                break;
            case 'd':
                params.downcase_p = true;
                break;
            case 'D':
                detokenize_p = !detokenize_p;
                break;
            case 'e':
                params.escape_p = !params.escape_p;
                break;
            case 'E':
                params.entities_p = true;
                break;
            case 'h':
                usage(prog);
                exit(0);
            case 'k':
                params.narrow_kana_p = true;
                break;
            case 'n':
                params.narrow_latin_p = true;
                break;
            case 'N':
                params.normalize_p = true;
                break;
            case 'o':
                next_output_p = true;
                break;
            case 'p':
                params.penn_p = true;
                break;
            case 'r':
                params.refined_p = true;
                break;
            case 's':
                params.supersub_p = true;
                break;
            case 'S':
                params.split_p = !params.split_p;
                break;
            case 'T':
                params.notokenization_p = true;
                params.para_marks_p = false;
                break;
            case 't':
                next_threads_p = true;
                break;
            case 'U':
                params.unescape_p = true;
                break;
            case 'u':
                params.url_p = false;
                break;
            case 'v':
                params.verbose_p = true;
                break;
            case 'w':
                params.words_p = true;
                break;
            case 'x':
                params.detag_p = true;
                break;
            case 'X':
                params.notokenization_p = true;
                params.para_marks_p = true;
                break;
            case 'y':
                params.alltag_p = true;
                break;
            case 'l':
                // -l is accepted but currently ignored
                break;
            default:
                std::cerr << "Unknown option: " << *av << std::endl;
                ::exit(1);
            }
        } else if (next_output_p) {
            // value for -o
            next_output_p = false;
            params.out_path = *av;
        } else if (next_cfg_p) {
            // value for -c
            next_cfg_p = false;
            params.cfg_path = *av;
        } else if (next_threads_p) {
            // value for -t: N[,C]
            next_threads_p = false;
            char *comma = std::strchr(*av, ',');
            if (comma) {
                *comma++ = 0;
                params.chunksize = std::strtoul(comma, nullptr, 0);
            }
            params.nthreads = std::strtoul(*av, nullptr, 0);
        } else if (params.lang_iso.empty() && std::strlen(*av) == 2
                   && !std::isdigit(static_cast<unsigned char>(**av))) {
            // pending option values are consumed above, so a two-letter
            // path (e.g. "-o en") is not mistaken for a language code
            params.lang_iso = *av;
        } else {
            params.args.push_back(std::string(*av));
        }
    }

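    // Resolve the config (pattern) directory when -c was not given:
    // $TOKENIZER_SHARED_DIR, then ../share/moses, ../share,
    // ./scripts/share, the current directory, and finally directories
    // derived from the executable's own path.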
    if (!params.cfg_path) {
        params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
    }
    if (!params.cfg_path) {
        if (!::access("../share/.", X_OK)) {
            if (!::access("../share/moses/.", X_OK)) {
                params.cfg_path = "../share/moses";
            } else {
                params.cfg_path = "../share";
            }
        } else if (!::access("./scripts/share/.", X_OK)) {
            params.cfg_path = "./scripts/share";
        } else if (!::access("./nonbreaking_prefix.en", R_OK)) {
            params.cfg_path = ".";
        } else {
            const char *slash = std::strrchr(prog, '/');
            if (slash) {
                std::string cfg_dir_str(prog, slash - prog);
                std::string cfg_shr_str(cfg_dir_str);
                cfg_shr_str.append("/shared");
                std::string cfg_mos_str(cfg_shr_str);
                cfg_mos_str.append("/moses");
                if (!::access(cfg_mos_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_mos_str.c_str());
                } else if (!::access(cfg_shr_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_shr_str.c_str());
                } else if (!::access(cfg_dir_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_dir_str.c_str());
                }
            }
        }
    }
    if (params.cfg_path && params.verbose_p) {
        std::cerr << "config path: " << params.cfg_path << std::endl;
    }

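    // Write to the -o path when given, otherwise to stdout; the language
    // defaults to English.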
    std::unique_ptr<std::ofstream> pofs;
    if (!params.out_path.empty()) {
        pofs.reset(new std::ofstream(params.out_path.c_str()));
    }
    std::ostream& ofs(pofs ? *pofs : std::cout);

    if (params.lang_iso.empty())
        params.lang_iso = "en";

    Tokenizer tize(params);
    tize.init();
    std::pair<std::size_t, std::size_t> plines = { 0, 0 };

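    // Dispatch on mode: word extraction (-w), detokenization (-D),
    // split-only (-T/-X), or tokenization; stdin is used when no input
    // paths were given.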
    if (params.words_p) {
        if (params.args.empty()) {
            plines.first += copy_words(tize, std::cin, ofs);
        } else {
            for (std::string& arg : params.args) {
                try {
                    std::ifstream ifs(arg.c_str());
                    plines.first += copy_words(tize, ifs, ofs);
                } catch (...) {
                    std::cerr << "Exception extracting words from path " << arg << std::endl;
                }
            }
        }
    } else if (params.args.empty()) {
        if (detokenize_p) {
            plines.first = tize.detokenize(std::cin, ofs);
        } else if (params.notokenization_p) {
            plines = tize.splitter(std::cin, ofs);
        } else {
            plines.first = tize.tokenize(std::cin, ofs);
        }
    } else {
        for (std::string& arg : params.args) {
            try {
                std::ifstream ifs(arg.c_str());
                // accumulate counts across input files
                if (detokenize_p) {
                    plines.first += tize.detokenize(ifs, ofs);
                } else if (params.notokenization_p) {
                    std::pair<std::size_t, std::size_t> counts = tize.splitter(ifs, ofs);
                    plines.first += counts.first;
                    plines.second += counts.second;
                } else {
                    plines.first += tize.tokenize(ifs, ofs);
                }
            } catch (...) {
                std::cerr << "Exception tokenizing from path " << arg << std::endl;
            }
        }
    }

    if (params.verbose_p) {
        std::cerr << "%%% " << plines.first << " lines." << std::endl;
        if (plines.second) {
            std::cerr << "%%% " << plines.second << " sentences." << std::endl;
        }
    }
    return rc;
}