#include "tokenizer.h"
#include "Parameters.h"

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <unistd.h>  // ::access

#ifdef TOKENIZER_NAMESPACE
using namespace TOKENIZER_NAMESPACE;
#endif

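// Print the usage summary and option table to stderr.
// Illustrative invocations (binary name and paths are examples only):
//   tokenizer -S en < raw.txt > sentences.txt
//   tokenizer -D -o detok.txt tokens.txt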
void
usage(const char *path)
{
    std::cerr << "Usage: " << path << " [-{a|b|B|d|D|e|E|k|n|N|p|r|s|S|T|u|U|v|w|x|X|y}]* [-t N[,C]] [LL] [-{c|o} PATH]* INFILE*" << std::endl;
    std::cerr << " -a -- aggressive hyphenization" << std::endl;
    std::cerr << " -b -- drop bad bytes" << std::endl;
    std::cerr << " -B -- splitter will split on linebreak" << std::endl;
    std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
    std::cerr << " -d -- downcase" << std::endl;
    std::cerr << " -D -- detokenize" << std::endl;
    std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
    std::cerr << " -E -- preserve entities during tokenization" << std::endl;
    std::cerr << " -h -- show this help" << std::endl;
    std::cerr << " -k -- narrow kana" << std::endl;
    std::cerr << " -n -- narrow latin" << std::endl;
    std::cerr << " -N -- normalize" << std::endl;
    std::cerr << " -o OUT -- output file path" << std::endl;
    std::cerr << " -p -- penn treebank style" << std::endl;
    std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
    std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
    std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
    std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
    std::cerr << " -t N[,C] -- use N threads (default 1), chunk size C lines" << std::endl;
    std::cerr << " -u -- disable url handling" << std::endl;
    std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
    std::cerr << " -v -- verbose" << std::endl;
    std::cerr << " -w -- word filter" << std::endl;
    std::cerr << " -x -- skip xml tag lines" << std::endl;
    std::cerr << " -X -- split only, with <P> marks" << std::endl;
    std::cerr << " -y -- skip all xml tags" << std::endl;
    std::cerr << "Default is -c ., stdin, stdout." << std::endl;
    std::cerr << "LL in {en,fr,it} affects contraction handling; LL also selects the nonbreaking prefix file" << std::endl;
    std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
}

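// Reduce a token to a lowercased word form: letters are lowercased,
// digits pass through, and internal '-' or '\'' are kept. An empty
// string is returned for tokens the quirk rules below reject, e.g.
// "foo." (the scan stops at the '.') or digit/letter/digit shapes
// such as "1a2".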
std::string token_word(const std::string& in) {
    int pos = -1;
    int digits_prefixed = 0;
    int nalpha = 0;
    int len = in.size();
    std::vector<char> cv;
    int last_quirk = -1;
    while (++pos < len) {
        // unsigned char: passing a negative char to the <cctype>
        // functions is undefined behavior
        unsigned char ch = in.at(pos);
        if (std::isdigit(ch)) {
            if (digits_prefixed > 0) {
                last_quirk = pos;
                break;
            }
            digits_prefixed--;
            cv.push_back(std::tolower(ch));
        } else if (std::isalpha(ch)) {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            cv.push_back(std::tolower(ch));
            nalpha++;
        } else {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            last_quirk = pos;
            // internal hyphens and apostrophes are kept; any other
            // character ends the scan
            if ((ch == '-' || ch == '\'') && pos != 0) {
                cv.push_back(ch);
            } else {
                break;
            }
        }
    }
    // discard tokens that stopped on a quirk or that contain digits
    // but no letters
    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
        cv.clear();
    return std::string(cv.begin(), cv.end());
}

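// Tokenize each input line and write one sentence of lowercased word
// forms per output line (empty tokens from the tokenizer mark sentence
// breaks). Returns the number of lines written.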
int copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
    int nlines = 0;
    std::string line;
    while (std::getline(ifs, line)) {
        if (line.empty())
            continue;
        std::vector<std::string> tokens(tize.tokens(line));
        int count = 0;
        bool was_break = false;

        for (auto& token: tokens) {
            if (token.empty()) {
                // an empty token marks a sentence break
                if (count || was_break) {
                    ofs << std::endl;
                    count = 0;
                    nlines++;
                    was_break = true;
                    continue;
                }
            }
            was_break = false;

            std::string word(token_word(token));
            if (word.empty())
                continue;

            if (count++)
                ofs << ' ';
            ofs << word;
        }

        if (count) {
            ofs << std::endl;
            nlines++;
        }
    }
    return nlines;
}

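// Entry point. The default mode is also inferred from the program name:
// a binary whose name contains "detokenize" starts in detokenizing mode,
// one containing "splitter" in sentence-splitting mode.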
int main(int ac, char **av)
{
    int rc = 0;
    Parameters params;

    const char *prog = av[0];
    bool next_cfg_p = false;
    bool next_output_p = false;
    bool next_threads_p = false;

    bool detokenize_p = std::strstr(av[0], "detokenize") != nullptr;
    if (!detokenize_p)
        params.split_p = std::strstr(av[0], "splitter") != nullptr;

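    // Parse arguments: flags, pending option values for -o/-c/-t, a bare
    // two-letter ISO language code, and input paths.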
    while (++av, --ac) {
        if (**av == '-') {
            switch (av[0][1]) {
            case 'a':
                params.aggro_p = true;
                break;
            case 'b':
                params.drop_bad_p = true;
                break;
            case 'B':
                params.split_breaks_p = true;
                break;
            case 'c':
                next_cfg_p = true;
                break;
            case 'd':
                params.downcase_p = true;
                break;
            case 'D':
                detokenize_p = !detokenize_p;
                break;
            case 'e':
                params.escape_p = !params.escape_p;
                break;
            case 'E':
                params.entities_p = true;
                break;
            case 'h':
                usage(prog);
                exit(0);
            case 'k':
                params.narrow_kana_p = true;
                break;
            case 'n':
                params.narrow_latin_p = true;
                break;
            case 'N':
                params.normalize_p = true;
                break;
            case 'o':
                next_output_p = true;
                break;
            case 'p':
                params.penn_p = true;
                break;
            case 'r':
                params.refined_p = true;
                break;
            case 's':
                params.supersub_p = true;
                break;
            case 'S':
                params.split_p = !params.split_p;
                break;
            case 'T':
                params.notokenization_p = true;
                params.para_marks_p = false;
                break;
            case 't':
                next_threads_p = true;
                break;
            case 'U':
                params.unescape_p = true;
                break;
            case 'u':
                params.url_p = false;
                break;
            case 'v':
                params.verbose_p = true;
                break;
            case 'w':
                params.words_p = true;
                break;
            case 'x':
                params.detag_p = true;
                break;
            case 'X':
                params.notokenization_p = true;
                params.para_marks_p = true;
                break;
            case 'y':
                params.alltag_p = true;
                break;
            case 'l':
                // -l is accepted but currently ignored
                break;
            default:
                std::cerr << "Unknown option: " << *av << std::endl;
                ::exit(1);
            }
        } else if (next_output_p) {
            // value for -o
            next_output_p = false;
            params.out_path = *av;
        } else if (next_cfg_p) {
            // value for -c
            next_cfg_p = false;
            params.cfg_path = *av;
        } else if (next_threads_p) {
            // value for -t: N[,C]
            next_threads_p = false;
            char *comma = std::strchr(*av, ',');
            if (comma) {
                *comma++ = 0;
                params.chunksize = std::strtoul(comma, nullptr, 0);
            }
            params.nthreads = std::strtoul(*av, nullptr, 0);
        } else if (params.lang_iso.empty() && std::strlen(*av) == 2
                   && !std::isdigit(static_cast<unsigned char>(**av))) {
            // pending option values are consumed above, so a two-letter
            // path (e.g. "-o en") is not mistaken for a language code
            params.lang_iso = *av;
        } else {
            params.args.push_back(std::string(*av));
        }
    }

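    // Resolve the config (pattern) directory when -c was not given:
    // $TOKENIZER_SHARED_DIR, then ../share/moses, ../share,
    // ./scripts/share, the current directory, and finally directories
    // derived from the executable's own path.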
    if (!params.cfg_path) {
        params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
    }
    if (!params.cfg_path) {
        if (!::access("../share/.", X_OK)) {
            if (!::access("../share/moses/.", X_OK)) {
                params.cfg_path = "../share/moses";
            } else {
                params.cfg_path = "../share";
            }
        } else if (!::access("./scripts/share/.", X_OK)) {
            params.cfg_path = "./scripts/share";
        } else if (!::access("./nonbreaking_prefix.en", R_OK)) {
            params.cfg_path = ".";
        } else {
            const char *slash = std::strrchr(prog, '/');
            if (slash) {
                std::string cfg_dir_str(prog, slash - prog);
                std::string cfg_shr_str(cfg_dir_str);
                cfg_shr_str.append("/shared");
                std::string cfg_mos_str(cfg_shr_str);
                cfg_mos_str.append("/moses");
                if (!::access(cfg_mos_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_mos_str.c_str());
                } else if (!::access(cfg_shr_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_shr_str.c_str());
                } else if (!::access(cfg_dir_str.c_str(), X_OK)) {
                    params.cfg_path = strdup(cfg_dir_str.c_str());
                }
            }
        }
    }
    if (params.cfg_path && params.verbose_p) {
        std::cerr << "config path: " << params.cfg_path << std::endl;
    }

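    // Write to the -o path when given, otherwise to stdout; the language
    // defaults to English.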
    std::unique_ptr<std::ofstream> pofs;
    if (!params.out_path.empty()) {
        pofs.reset(new std::ofstream(params.out_path.c_str()));
    }
    std::ostream& ofs(pofs ? *pofs : std::cout);

    if (params.lang_iso.empty())
        params.lang_iso = "en";

    Tokenizer tize(params);
    tize.init();
    std::pair<std::size_t, std::size_t> plines = { 0, 0 };

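    // Dispatch on mode: word extraction (-w), detokenization (-D),
    // split-only (-T/-X), or tokenization; stdin is used when no input
    // paths were given.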
    if (params.words_p) {
        if (params.args.empty()) {
            plines.first += copy_words(tize, std::cin, ofs);
        } else {
            for (std::string& arg : params.args) {
                try {
                    std::ifstream ifs(arg.c_str());
                    plines.first += copy_words(tize, ifs, ofs);
                } catch (...) {
                    std::cerr << "Exception extracting words from path " << arg << std::endl;
                }
            }
        }
    } else if (params.args.empty()) {
        if (detokenize_p) {
            plines.first = tize.detokenize(std::cin, ofs);
        } else if (params.notokenization_p) {
            plines = tize.splitter(std::cin, ofs);
        } else {
            plines.first = tize.tokenize(std::cin, ofs);
        }
    } else {
        for (std::string& arg : params.args) {
            try {
                std::ifstream ifs(arg.c_str());
                // accumulate counts across input files
                if (detokenize_p) {
                    plines.first += tize.detokenize(ifs, ofs);
                } else if (params.notokenization_p) {
                    std::pair<std::size_t, std::size_t> counts = tize.splitter(ifs, ofs);
                    plines.first += counts.first;
                    plines.second += counts.second;
                } else {
                    plines.first += tize.tokenize(ifs, ofs);
                }
            } catch (...) {
                std::cerr << "Exception tokenizing from path " << arg << std::endl;
            }
        }
    }

    if (params.verbose_p) {
        std::cerr << "%%% " << plines.first << " lines." << std::endl;
        if (plines.second) {
            std::cerr << "%%% " << plines.second << " sentences." << std::endl;
        }
    }
    return rc;
}