|
#include "tokenizer.h" |
|
#include <re2/stringpiece.h> |
|
#include <sstream> |
|
#include <iterator> |
|
#include <memory> |
|
#include <vector> |
|
#include <algorithm> |
|
#include <cstring> |
|
#include <set> |
|
#include <glib.h> |
|
#include <stdexcept> |
|
#include <boost/thread.hpp> |
|
|
|
namespace { |
|
|
|
|
|
|
|
// Precompiled RE2 patterns and small constants shared by the tokenizer
// routines in this file.

// Tag-like span: '<', then one or more of '/', '!' or letters, then anything
// up to the next '>'.
RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
// One or more consecutive spaces.
RE2 mult_spc_x(" +");
// A line that starts with '<' and ends with '>'.
RE2 tag_line_x("^<.+>$");
// An empty or whitespace-only line.
RE2 white_line_x("^\\s*$");
// A slash directly between two letter/digit characters.
RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])");
// A single period (not preceded by a period) followed by optional closing
// brackets/quotes and an optional space at the end of the text.
RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$");
// A question mark or exclamation mark.
RE2 qx_x("([?!])");
// Any bracket, parenthesis, brace or angle bracket.
RE2 braces_x("([\\]\\[\\(\\){}<>])");
// An apostrophe followed by a space and preceded by a non-apostrophe.
RE2 endq_x("([^'])' ");
// Any letter.
RE2 letter_x("\\p{L}");
// Text starting with a lowercase letter.
RE2 lower_x("^\\p{Ll}");
// Text starting with a numeric character.
RE2 sinteger_x("^\\p{N}");
// Optional numeric punctuation/digits, digits, a hyphen, then a suffix that
// ends in a letter.
RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
// Optional numeric punctuation/symbols/digits followed by digits.
// ('%' is written without a backslash: "\%" is not a valid C++ escape.)
RE2 quasinumeric_x("[-.;:@\\\\#%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
// Letter/digit, then one or more "other number" characters, then a
// lowercase letter.
RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");

// Opening context (space, '(', '[', '{' or '<') followed by a double quote.
RE2 x1_v_d("([ ([{<])\"");
// Opening context followed by two backticks.
RE2 x1_v_gg("([ ([{<])``");
// Opening context followed by exactly one backtick and a non-backtick.
RE2 x1_v_g("([ ([{<])`([^`])");
// Opening context followed by an apostrophe.
RE2 x1_v_q("([ ([{<])'");
// Comma between two non-numeric characters.
RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])");
// Comma after a numeric character and before a non-numeric one.
RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])");
// Comma after a non-numeric character and before a numeric one.
RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])");
// A single symbol: ; : @ # $ % & or any currency / other-symbol character.
RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])");
// Apostrophe, one of s/S/m/M/d/D, then a space.
RE2 contract_x("'([sSmMdD]) ");
// Run of '(', '{', inverted question mark or inverted exclamation mark.
RE2 right_x("[({¿¡]+");
// Run of , . ? ! : ; % currency symbols, '}' or ')'.
RE2 left_x("[,.?!:;\\%\\p{Sc}})]+");
// Text starting with an optional N/n, an apostrophe, then a letter.
RE2 curr_en_x("^[Nn]?[\'][\\p{L}]");
// Text ending in one or more letters/digits.
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$");
// Letters/digits followed by an apostrophe.
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']");
// Text starting with zero or more letters/digits.
RE2 post_fr_x("^[\\p{L}\\p{N}]*");

// A single ASCII space, used as a replacement string and as a one-byte
// separator.
const char *
SPC_BYTE = " ";
|
|
|
|
|
|
|
inline bool |
|
class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) { |
|
while (s < e) { |
|
GUnicodeType tclass = g_unichar_type(*s); |
|
if (tclass == gclass) |
|
return true; |
|
switch (tclass) { |
|
case G_UNICODE_SPACING_MARK: |
|
case G_UNICODE_LINE_SEPARATOR: |
|
case G_UNICODE_PARAGRAPH_SEPARATOR: |
|
case G_UNICODE_SPACE_SEPARATOR: |
|
++s; |
|
continue; |
|
break; |
|
default: |
|
return false; |
|
} |
|
} |
|
return false; |
|
} |
|
|
|
|
|
// Escape sequences emitted by Tokenizer::escape(), indexed by the character
// they replace.  The index order is relied upon by escape(); do not reorder.
const char *ESCAPE_MOSES[] = {
    "&#124;", // 0: |
    "&#91;",  // 1: [
    "&#93;",  // 2: ]
    "&amp;",  // 3: &
    "&lt;",   // 4: <
    "&gt;",   // 5: >
    "&apos;", // 6: '
    "&quot;", // 7: "
};

// The same escape sequences as a set, for membership tests.
const std::set<std::string>
ESCAPE_SET = {
    std::string(ESCAPE_MOSES[0]),
    std::string(ESCAPE_MOSES[1]),
    std::string(ESCAPE_MOSES[2]),
    std::string(ESCAPE_MOSES[3]),
    std::string(ESCAPE_MOSES[4]),
    std::string(ESCAPE_MOSES[5]),
    std::string(ESCAPE_MOSES[6]),
    std::string(ESCAPE_MOSES[7]),
};
|
|
|
const std::map<std::wstring,gunichar> |
|
ENTITY_MAP = { |
|
{ std::wstring(L"""), L'"' }, |
|
{ std::wstring(L"&"), L'&' }, |
|
{ std::wstring(L"'"), L'\'' }, |
|
{ std::wstring(L"<"), L'<' }, |
|
{ std::wstring(L">"), L'>' }, |
|
{ std::wstring(L" "), L'\u00A0' }, |
|
{ std::wstring(L"¡"), L'\u00A1' }, |
|
{ std::wstring(L"¢"), L'\u00A2' }, |
|
{ std::wstring(L"£"), L'\u00A3' }, |
|
{ std::wstring(L"¤"), L'\u00A4' }, |
|
{ std::wstring(L"¥"), L'\u00A5' }, |
|
{ std::wstring(L"¦"), L'\u00A6' }, |
|
{ std::wstring(L"§"), L'\u00A7' }, |
|
{ std::wstring(L"¨"), L'\u00A8' }, |
|
{ std::wstring(L"©"), L'\u00A9' }, |
|
{ std::wstring(L"ª"), L'\u00AA' }, |
|
{ std::wstring(L"«"), L'\u00AB' }, |
|
{ std::wstring(L"¬"), L'\u00AC' }, |
|
{ std::wstring(L"­"), L'\u00AD' }, |
|
{ std::wstring(L"®"), L'\u00AE' }, |
|
{ std::wstring(L"¯"), L'\u00AF' }, |
|
{ std::wstring(L"°"), L'\u00B0' }, |
|
{ std::wstring(L"±"), L'\u00B1' }, |
|
{ std::wstring(L"²"), L'\u00B2' }, |
|
{ std::wstring(L"³"), L'\u00B3' }, |
|
{ std::wstring(L"´"), L'\u00B4' }, |
|
{ std::wstring(L"µ"), L'\u00B5' }, |
|
{ std::wstring(L"¶"), L'\u00B6' }, |
|
{ std::wstring(L"·"), L'\u00B7' }, |
|
{ std::wstring(L"¸"), L'\u00B8' }, |
|
{ std::wstring(L"¹"), L'\u00B9' }, |
|
{ std::wstring(L"º"), L'\u00BA' }, |
|
{ std::wstring(L"»"), L'\u00BB' }, |
|
{ std::wstring(L"¼"), L'\u00BC' }, |
|
{ std::wstring(L"½"), L'\u00BD' }, |
|
{ std::wstring(L"¾"), L'\u00BE' }, |
|
{ std::wstring(L"¿"), L'\u00BF' }, |
|
{ std::wstring(L"À"), L'\u00C0' }, |
|
{ std::wstring(L"Á"), L'\u00C1' }, |
|
{ std::wstring(L"Â"), L'\u00C2' }, |
|
{ std::wstring(L"Ã"), L'\u00C3' }, |
|
{ std::wstring(L"Ä"), L'\u00C4' }, |
|
{ std::wstring(L"Å"), L'\u00C5' }, |
|
{ std::wstring(L"Æ"), L'\u00C6' }, |
|
{ std::wstring(L"Ç"), L'\u00C7' }, |
|
{ std::wstring(L"È"), L'\u00C8' }, |
|
{ std::wstring(L"É"), L'\u00C9' }, |
|
{ std::wstring(L"Ê"), L'\u00CA' }, |
|
{ std::wstring(L"Ë"), L'\u00CB' }, |
|
{ std::wstring(L"Ì"), L'\u00CC' }, |
|
{ std::wstring(L"Í"), L'\u00CD' }, |
|
{ std::wstring(L"Î"), L'\u00CE' }, |
|
{ std::wstring(L"Ï"), L'\u00CF' }, |
|
{ std::wstring(L"Ð"), L'\u00D0' }, |
|
{ std::wstring(L"Ñ"), L'\u00D1' }, |
|
{ std::wstring(L"Ò"), L'\u00D2' }, |
|
{ std::wstring(L"Ó"), L'\u00D3' }, |
|
{ std::wstring(L"Ô"), L'\u00D4' }, |
|
{ std::wstring(L"Õ"), L'\u00D5' }, |
|
{ std::wstring(L"Ö"), L'\u00D6' }, |
|
{ std::wstring(L"×"), L'\u00D7' }, |
|
{ std::wstring(L"Ø"), L'\u00D8' }, |
|
{ std::wstring(L"Ù"), L'\u00D9' }, |
|
{ std::wstring(L"Ú"), L'\u00DA' }, |
|
{ std::wstring(L"Û"), L'\u00DB' }, |
|
{ std::wstring(L"Ü"), L'\u00DC' }, |
|
{ std::wstring(L"Ý"), L'\u00DD' }, |
|
{ std::wstring(L"Þ"), L'\u00DE' }, |
|
{ std::wstring(L"ß"), L'\u00DF' }, |
|
{ std::wstring(L"à"), L'\u00E0' }, |
|
{ std::wstring(L"á"), L'\u00E1' }, |
|
{ std::wstring(L"â"), L'\u00E2' }, |
|
{ std::wstring(L"ã"), L'\u00E3' }, |
|
{ std::wstring(L"ä"), L'\u00E4' }, |
|
{ std::wstring(L"å"), L'\u00E5' }, |
|
{ std::wstring(L"æ"), L'\u00E6' }, |
|
{ std::wstring(L"ç"), L'\u00E7' }, |
|
{ std::wstring(L"è"), L'\u00E8' }, |
|
{ std::wstring(L"é"), L'\u00E9' }, |
|
{ std::wstring(L"ê"), L'\u00EA' }, |
|
{ std::wstring(L"ë"), L'\u00EB' }, |
|
{ std::wstring(L"ì"), L'\u00EC' }, |
|
{ std::wstring(L"í"), L'\u00ED' }, |
|
{ std::wstring(L"î"), L'\u00EE' }, |
|
{ std::wstring(L"ï"), L'\u00EF' }, |
|
{ std::wstring(L"ð"), L'\u00F0' }, |
|
{ std::wstring(L"ñ"), L'\u00F1' }, |
|
{ std::wstring(L"ò"), L'\u00F2' }, |
|
{ std::wstring(L"ó"), L'\u00F3' }, |
|
{ std::wstring(L"ô"), L'\u00F4' }, |
|
{ std::wstring(L"õ"), L'\u00F5' }, |
|
{ std::wstring(L"ö"), L'\u00F6' }, |
|
{ std::wstring(L"÷"), L'\u00F7' }, |
|
{ std::wstring(L"ø"), L'\u00F8' }, |
|
{ std::wstring(L"ù"), L'\u00F9' }, |
|
{ std::wstring(L"ú"), L'\u00FA' }, |
|
{ std::wstring(L"û"), L'\u00FB' }, |
|
{ std::wstring(L"ü"), L'\u00FC' }, |
|
{ std::wstring(L"ý"), L'\u00FD' }, |
|
{ std::wstring(L"þ"), L'\u00FE' }, |
|
{ std::wstring(L"ÿ"), L'\u00FF' }, |
|
{ std::wstring(L"Œ"), L'\u0152' }, |
|
{ std::wstring(L"œ"), L'\u0153' }, |
|
{ std::wstring(L"Š"), L'\u0160' }, |
|
{ std::wstring(L"š"), L'\u0161' }, |
|
{ std::wstring(L"Ÿ"), L'\u0178' }, |
|
{ std::wstring(L"ƒ"), L'\u0192' }, |
|
{ std::wstring(L"ˆ"), L'\u02C6' }, |
|
{ std::wstring(L"˜"), L'\u02DC' }, |
|
{ std::wstring(L"Α"), L'\u0391' }, |
|
{ std::wstring(L"Β"), L'\u0392' }, |
|
{ std::wstring(L"Γ"), L'\u0393' }, |
|
{ std::wstring(L"Δ"), L'\u0394' }, |
|
{ std::wstring(L"Ε"), L'\u0395' }, |
|
{ std::wstring(L"Ζ"), L'\u0396' }, |
|
{ std::wstring(L"Η"), L'\u0397' }, |
|
{ std::wstring(L"Θ"), L'\u0398' }, |
|
{ std::wstring(L"Ι"), L'\u0399' }, |
|
{ std::wstring(L"Κ"), L'\u039A' }, |
|
{ std::wstring(L"Λ"), L'\u039B' }, |
|
{ std::wstring(L"Μ"), L'\u039C' }, |
|
{ std::wstring(L"Ν"), L'\u039D' }, |
|
{ std::wstring(L"Ξ"), L'\u039E' }, |
|
{ std::wstring(L"Ο"), L'\u039F' }, |
|
{ std::wstring(L"Π"), L'\u03A0' }, |
|
{ std::wstring(L"Ρ"), L'\u03A1' }, |
|
{ std::wstring(L"Σ"), L'\u03A3' }, |
|
{ std::wstring(L"Τ"), L'\u03A4' }, |
|
{ std::wstring(L"Υ"), L'\u03A5' }, |
|
{ std::wstring(L"Φ"), L'\u03A6' }, |
|
{ std::wstring(L"Χ"), L'\u03A7' }, |
|
{ std::wstring(L"Ψ"), L'\u03A8' }, |
|
{ std::wstring(L"Ω"), L'\u03A9' }, |
|
{ std::wstring(L"α"), L'\u03B1' }, |
|
{ std::wstring(L"β"), L'\u03B2' }, |
|
{ std::wstring(L"γ"), L'\u03B3' }, |
|
{ std::wstring(L"δ"), L'\u03B4' }, |
|
{ std::wstring(L"ε"), L'\u03B5' }, |
|
{ std::wstring(L"ζ"), L'\u03B6' }, |
|
{ std::wstring(L"η"), L'\u03B7' }, |
|
{ std::wstring(L"θ"), L'\u03B8' }, |
|
{ std::wstring(L"ι"), L'\u03B9' }, |
|
{ std::wstring(L"κ"), L'\u03BA' }, |
|
{ std::wstring(L"λ"), L'\u03BB' }, |
|
{ std::wstring(L"μ"), L'\u03BC' }, |
|
{ std::wstring(L"ν"), L'\u03BD' }, |
|
{ std::wstring(L"ξ"), L'\u03BE' }, |
|
{ std::wstring(L"ο"), L'\u03BF' }, |
|
{ std::wstring(L"π"), L'\u03C0' }, |
|
{ std::wstring(L"ρ"), L'\u03C1' }, |
|
{ std::wstring(L"ς"), L'\u03C2' }, |
|
{ std::wstring(L"σ"), L'\u03C3' }, |
|
{ std::wstring(L"τ"), L'\u03C4' }, |
|
{ std::wstring(L"υ"), L'\u03C5' }, |
|
{ std::wstring(L"φ"), L'\u03C6' }, |
|
{ std::wstring(L"χ"), L'\u03C7' }, |
|
{ std::wstring(L"ψ"), L'\u03C8' }, |
|
{ std::wstring(L"ω"), L'\u03C9' }, |
|
{ std::wstring(L"ϑ"), L'\u03D1' }, |
|
{ std::wstring(L"ϒ"), L'\u03D2' }, |
|
{ std::wstring(L"ϖ"), L'\u03D6' }, |
|
{ std::wstring(L" "), L'\u2002' }, |
|
{ std::wstring(L" "), L'\u2003' }, |
|
{ std::wstring(L" "), L'\u2009' }, |
|
{ std::wstring(L"‌"), L'\u200C' }, |
|
{ std::wstring(L"‍"), L'\u200D' }, |
|
{ std::wstring(L"‎"), L'\u200E' }, |
|
{ std::wstring(L"‏"), L'\u200F' }, |
|
{ std::wstring(L"–"), L'\u2013' }, |
|
{ std::wstring(L"—"), L'\u2014' }, |
|
{ std::wstring(L"‘"), L'\u2018' }, |
|
{ std::wstring(L"’"), L'\u2019' }, |
|
{ std::wstring(L"‚"), L'\u201A' }, |
|
{ std::wstring(L"“"), L'\u201C' }, |
|
{ std::wstring(L"”"), L'\u201D' }, |
|
{ std::wstring(L"„"), L'\u201E' }, |
|
{ std::wstring(L"†"), L'\u2020' }, |
|
{ std::wstring(L"‡"), L'\u2021' }, |
|
{ std::wstring(L"•"), L'\u2022' }, |
|
{ std::wstring(L"…"), L'\u2026' }, |
|
{ std::wstring(L"‰"), L'\u2030' }, |
|
{ std::wstring(L"′"), L'\u2032' }, |
|
{ std::wstring(L"″"), L'\u2033' }, |
|
{ std::wstring(L"‹"), L'\u2039' }, |
|
{ std::wstring(L"›"), L'\u203A' }, |
|
{ std::wstring(L"‾"), L'\u203E' }, |
|
{ std::wstring(L"⁄"), L'\u2044' }, |
|
{ std::wstring(L"€"), L'\u20AC' }, |
|
{ std::wstring(L"ℑ"), L'\u2111' }, |
|
{ std::wstring(L"℘"), L'\u2118' }, |
|
{ std::wstring(L"ℜ"), L'\u211C' }, |
|
{ std::wstring(L"™"), L'\u2122' }, |
|
{ std::wstring(L"ℵ"), L'\u2135' }, |
|
{ std::wstring(L"←"), L'\u2190' }, |
|
{ std::wstring(L"↑"), L'\u2191' }, |
|
{ std::wstring(L"→"), L'\u2192' }, |
|
{ std::wstring(L"↓"), L'\u2193' }, |
|
{ std::wstring(L"↔"), L'\u2194' }, |
|
{ std::wstring(L"↵"), L'\u21B5' }, |
|
{ std::wstring(L"⇐"), L'\u21D0' }, |
|
{ std::wstring(L"⇑"), L'\u21D1' }, |
|
{ std::wstring(L"⇒"), L'\u21D2' }, |
|
{ std::wstring(L"⇓"), L'\u21D3' }, |
|
{ std::wstring(L"⇔"), L'\u21D4' }, |
|
{ std::wstring(L"∀"), L'\u2200' }, |
|
{ std::wstring(L"∂"), L'\u2202' }, |
|
{ std::wstring(L"∃"), L'\u2203' }, |
|
{ std::wstring(L"∅"), L'\u2205' }, |
|
{ std::wstring(L"∇"), L'\u2207' }, |
|
{ std::wstring(L"∈"), L'\u2208' }, |
|
{ std::wstring(L"∉"), L'\u2209' }, |
|
{ std::wstring(L"∋"), L'\u220B' }, |
|
{ std::wstring(L"∏"), L'\u220F' }, |
|
{ std::wstring(L"∑"), L'\u2211' }, |
|
{ std::wstring(L"−"), L'\u2212' }, |
|
{ std::wstring(L"∗"), L'\u2217' }, |
|
{ std::wstring(L"√"), L'\u221A' }, |
|
{ std::wstring(L"∝"), L'\u221D' }, |
|
{ std::wstring(L"∞"), L'\u221E' }, |
|
{ std::wstring(L"∠"), L'\u2220' }, |
|
{ std::wstring(L"∧"), L'\u2227' }, |
|
{ std::wstring(L"∨"), L'\u2228' }, |
|
{ std::wstring(L"∩"), L'\u2229' }, |
|
{ std::wstring(L"∪"), L'\u222A' }, |
|
{ std::wstring(L"∫"), L'\u222B' }, |
|
{ std::wstring(L"∴"), L'\u2234' }, |
|
{ std::wstring(L"∼"), L'\u223C' }, |
|
{ std::wstring(L"≅"), L'\u2245' }, |
|
{ std::wstring(L"≈"), L'\u2248' }, |
|
{ std::wstring(L"≠"), L'\u2260' }, |
|
{ std::wstring(L"≡"), L'\u2261' }, |
|
{ std::wstring(L"≤"), L'\u2264' }, |
|
{ std::wstring(L"≥"), L'\u2265' }, |
|
{ std::wstring(L"⊂"), L'\u2282' }, |
|
{ std::wstring(L"⊃"), L'\u2283' }, |
|
{ std::wstring(L"⊄"), L'\u2284' }, |
|
{ std::wstring(L"⊆"), L'\u2286' }, |
|
{ std::wstring(L"⊇"), L'\u2287' }, |
|
{ std::wstring(L"⊕"), L'\u2295' }, |
|
{ std::wstring(L"⊗"), L'\u2297' }, |
|
{ std::wstring(L"⊥"), L'\u22A5' }, |
|
{ std::wstring(L"⋅"), L'\u22C5' }, |
|
{ std::wstring(L"⌈"), L'\u2308' }, |
|
{ std::wstring(L"⌉"), L'\u2309' }, |
|
{ std::wstring(L"⌊"), L'\u230A' }, |
|
{ std::wstring(L"⌋"), L'\u230B' }, |
|
{ std::wstring(L"⟨"), L'\u2329' }, |
|
{ std::wstring(L"⟩"), L'\u232A' }, |
|
{ std::wstring(L"◊"), L'\u25CA' }, |
|
{ std::wstring(L"♠"), L'\u2660' }, |
|
{ std::wstring(L"♣"), L'\u2663' }, |
|
{ std::wstring(L"♥"), L'\u2665' }, |
|
{ std::wstring(L"♦"), L'\u2666' } |
|
}; |
|
|
|
// Decodes one character reference given as UCS-4 text.
//
// ptr  points at the leading '&' of the reference.
// len  is the number of characters in the reference, including the leading
//      '&' and the final character, which is taken to be ';'.
//
// Recognised forms:
//   &#DDD;   decimal numeric reference
//   &#xHH;   hexadecimal numeric reference ('x' or 'X')
//   &DDD;    decimal digits without '#'
//   &name;   named reference found in ENTITY_MAP
//
// Returns the decoded code point, or 0 when the reference is not recognised,
// is malformed, or denotes a value of 0 or above U+10FFFF.
inline gunichar
get_entity(gunichar *ptr, size_t len) {
    // The shortest usable reference is '&', one character, ';'.
    if (!ptr || len < 3)
        return gunichar(0);

    // Work out the base and where the digit run starts, if this is numeric.
    unsigned base = 0;
    size_t first = 0;
    if (ptr[1] == gunichar(L'#')) {
        if (len > 4 && (ptr[2] == gunichar(L'x') || ptr[2] == gunichar(L'X'))) {
            base = 16;
            first = 3;
        } else if (len > 3) {
            base = 10;
            first = 2;
        }
    } else if (ptr[1] >= gunichar(L'0') && ptr[1] <= gunichar(L'9')) {
        base = 10;
        first = 1;
    }

    if (base) {
        gunichar value = 0;
        bool valid = true;
        // The digit run is everything between the prefix and the last character.
        for (size_t ii = first; ii + 1 < len; ++ii) {
            const gunichar ch = ptr[ii];
            unsigned digit = 0;
            if (ch >= gunichar(L'0') && ch <= gunichar(L'9')) {
                digit = ch - gunichar(L'0');
            } else if (base == 16 && ch >= gunichar(L'a') && ch <= gunichar(L'f')) {
                digit = 10 + (ch - gunichar(L'a'));
            } else if (base == 16 && ch >= gunichar(L'A') && ch <= gunichar(L'F')) {
                digit = 10 + (ch - gunichar(L'A'));
            } else {
                valid = false;
                break;
            }
            value = value * base + digit;
            // Checking on every step also keeps the accumulator from overflowing.
            if (value > gunichar(0x10FFFF)) {
                valid = false;
                break;
            }
        }
        if (valid && value)
            return value;
    }

    // Named reference: copy element by element so the lookup does not depend
    // on wchar_t and gunichar having the same width.
    std::wstring key;
    key.reserve(len);
    for (size_t ii = 0; ii < len; ++ii)
        key.push_back(wchar_t(ptr[ii]));

    std::map<std::wstring,gunichar>::const_iterator it = ENTITY_MAP.find(key);
    return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
|
|
|
|
|
// UTF-8 convenience overload: converts the first `len` bytes at `ptr` to
// UCS-4, decodes them with the gunichar overload of get_entity(), and frees
// the temporary buffer.  Returns whatever that overload returns.
inline gunichar
get_entity(char *ptr, size_t len) {
    glong nchars = 0;
    gunichar *wide = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &nchars);
    const gunichar decoded = get_entity(wide,nchars);
    g_free(wide);
    return decoded;
}
|
|
|
|
|
// Returns a copy of `in` with leading and trailing bytes below '!' (space and
// ASCII control characters) removed.
//
// Bytes are compared as unsigned values, so UTF-8 lead and continuation bytes
// (>= 0x80) are never mistaken for blanks on platforms where char is signed.
inline std::string
trim(const std::string& in)
{
    const auto blank_p = [](char ch) {
        return static_cast<unsigned char>(ch) < static_cast<unsigned char>('!');
    };

    std::size_t start = 0;
    std::size_t limit = in.size();
    while (start < limit && blank_p(in[start])) ++start;
    while (start < limit && blank_p(in[limit-1])) --limit;
    return in.substr(start,limit-start);
}
|
|
|
|
|
// Breaks `in` into whitespace-delimited fields using stream extraction.
// Runs of whitespace act as a single separator, so no empty fields are
// produced; an empty or all-whitespace input yields an empty vector.
inline std::vector<std::string>
split(const std::string& in)
{
    std::istringstream reader(in);
    std::vector<std::string> fields;
    for (std::string field; reader >> field; )
        fields.push_back(field);
    return fields;
}
|
|
|
}; |
|
|
|
|
|
#ifdef TOKENIZER_NAMESPACE |
|
namespace TOKENIZER_NAMESPACE { |
|
#endif |
|
|
|
|
|
void |
|
Tokenizer::set_config_dir(const std::string& dir) { |
|
if (dir.empty()) { |
|
cfg_dir = "."; |
|
} else { |
|
cfg_dir.assign(dir); |
|
} |
|
} |
|
|
|
|
|
// Builds a tokenizer from a Parameters bundle.
//
// Each option flag is copied from the corresponding Parameters field.  A
// thread count of zero is replaced by 1.  english_p is set for language "en";
// latin_p is set for "fr" or "it".  If a configuration path is supplied it is
// passed to set_config_dir().
Tokenizer::Tokenizer(const Parameters& opts)
    : nthreads(opts.nthreads ? opts.nthreads : 1)
    , chunksize(opts.chunksize)
    , lang_iso(opts.lang_iso)
    , english_p(opts.lang_iso.compare("en") == 0)
    , latin_p((!english_p)
              && (opts.lang_iso.compare("fr") == 0 || opts.lang_iso.compare("it") == 0))
    , skip_xml_p(opts.detag_p)
    , skip_alltags_p(opts.alltag_p)
    , entities_p(opts.entities_p)
    , escape_p(opts.escape_p)
    , unescape_p(opts.unescape_p)
    , aggressive_hyphen_p(opts.aggro_p)
    , supersub_p(opts.supersub_p)
    , url_p(opts.url_p)
    , downcase_p(opts.downcase_p)
    , normalize_p(opts.normalize_p)
    , penn_p(opts.penn_p)
    , narrow_latin_p(opts.narrow_latin_p)
    , narrow_kana_p(opts.narrow_kana_p)
    , refined_p(opts.refined_p)
    , drop_bad_p(opts.drop_bad_p)
    , splits_p(opts.split_p)
    , verbose_p(opts.verbose_p)
    , para_marks_p(opts.para_marks_p)
    , split_breaks_p(opts.split_breaks_p)
{
    // Only override the configuration directory when one was provided.
    if (opts.cfg_path)
        set_config_dir(opts.cfg_path);
}
|
|
|
|
|
|
|
|
|
|
|
// Releases the protected patterns held in prot_pat_vec.
// The entries that point at the file-level numprefixed_x and quasinumeric_x
// objects are not heap allocations and are left alone.
Tokenizer::~Tokenizer()
{
    for (auto it = prot_pat_vec.begin(); it != prot_pat_vec.end(); ++it) {
        const bool builtin_p = (*it == &numprefixed_x) || (*it == &quasinumeric_x);
        if (!builtin_p)
            delete *it;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
std::pair<int,int> |
|
Tokenizer::load_prefixes(std::ifstream& ifs) |
|
{ |
|
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)"); |
|
std::string line; |
|
int nnon = 0; |
|
int nnum = 0; |
|
|
|
while (std::getline(ifs,line)) { |
|
if (!line.empty() && line[0] != '#') { |
|
std::string prefix; |
|
if (RE2::PartialMatch(line,numonly,&prefix)) { |
|
nbpre_num_set.insert(prefix); |
|
gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0); |
|
nbpre_num_ucs4.insert(std::wstring((wchar_t *)x)); |
|
g_free(x); |
|
nnum++; |
|
} else { |
|
nbpre_gen_set.insert(line); |
|
gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0); |
|
nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x)); |
|
g_free(x); |
|
nnon++; |
|
} |
|
} |
|
} |
|
return std::make_pair(nnon,nnum); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
Tokenizer::init(const char *cfg_dir_optional) { |
|
if (cfg_dir_optional) |
|
set_config_dir(std::string(cfg_dir_optional)); |
|
|
|
std::string dir_path(cfg_dir); |
|
dir_path.append("/nonbreaking_prefixes"); |
|
if (::access(dir_path.c_str(),X_OK)) { |
|
dir_path = cfg_dir; |
|
} |
|
|
|
std::string nbpre_path(dir_path); |
|
nbpre_path.append("/nonbreaking_prefix.").append(lang_iso); |
|
|
|
|
|
if (::access(nbpre_path.c_str(),R_OK)) |
|
nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1); |
|
|
|
if (::access(nbpre_path.c_str(),R_OK) == 0) { |
|
std::ifstream cfg(nbpre_path.c_str()); |
|
try { |
|
std::pair<int,int> counts = load_prefixes(cfg); |
|
if (verbose_p) { |
|
std::cerr << "loaded " << counts.first << " non-numeric, " |
|
<< counts.second << " numeric prefixes from " |
|
<< nbpre_path << std::endl; |
|
} |
|
} catch (...) { |
|
std::ostringstream ess; |
|
ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__; |
|
throw std::runtime_error(ess.str()); |
|
} |
|
} else if (verbose_p) { |
|
std::cerr << "no prefix file found: " << nbpre_path << std::endl; |
|
} |
|
|
|
if (nbpre_gen_set.empty() && nbpre_num_set.empty()) { |
|
std::ostringstream ess; |
|
ess << "Error at " << __FILE__ << ":" << __LINE__ << " : " |
|
<< "No known abbreviations for language " << lang_iso; |
|
throw std::runtime_error(ess.str()); |
|
} |
|
|
|
std::string protpat_path(cfg_dir); |
|
protpat_path.append("/protected_pattern.").append(lang_iso); |
|
|
|
if (::access(protpat_path.c_str(),R_OK)) |
|
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1); |
|
|
|
prot_pat_vec.push_back(&numprefixed_x); |
|
prot_pat_vec.push_back(&quasinumeric_x); |
|
|
|
if (::access(protpat_path.c_str(),R_OK) == 0) { |
|
std::ifstream cfg(protpat_path.c_str()); |
|
char linebuf[1028]; |
|
int npat = 0; |
|
try { |
|
linebuf[0]='('; |
|
while (cfg.good()) { |
|
cfg.getline(linebuf+1,1024); |
|
if (linebuf[1] && linebuf[1] != '#') { |
|
strcat(linebuf,")"); |
|
prot_pat_vec.push_back(new RE2(linebuf)); |
|
npat++; |
|
} |
|
} |
|
} catch (...) { |
|
std::ostringstream ess; |
|
ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__; |
|
throw std::runtime_error(ess.str()); |
|
} |
|
if (verbose_p) { |
|
std::cerr << "loaded " << npat << " protected patterns from " |
|
<< protpat_path << std::endl; |
|
} |
|
} else if (verbose_p) { |
|
std::cerr << "no protected file found: " << protpat_path << std::endl; |
|
} |
|
} |
|
|
|
|
|
// Intentionally empty: this implementation performs no work when called.
void
Tokenizer::reset() {
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void |
|
Tokenizer::protected_tokenize(std::string& text) { |
|
std::vector<re2::StringPiece> words; |
|
re2::StringPiece textpc(text); |
|
int pos = 0; |
|
if (textpc[pos] == ' ') |
|
++pos; |
|
size_t next = text.find(' ',pos); |
|
while (next != std::string::npos) { |
|
if (next - pos) |
|
words.push_back(textpc.substr(pos,next-pos)); |
|
pos = next + 1; |
|
while (pos < textpc.size() && textpc[pos] == ' ') |
|
++pos; |
|
next = textpc.find(' ',pos); |
|
} |
|
if (pos < textpc.size() && textpc[pos] != ' ') |
|
words.push_back(textpc.substr(pos,textpc.size()-pos)); |
|
|
|
|
|
std::string outs; |
|
std::size_t nwords(words.size()); |
|
for (size_t ii = 0; ii < nwords; ++ii) { |
|
bool more_p = ii < nwords - 1; |
|
size_t len = words[ii].size(); |
|
bool sentence_break_p = len > 1 && words[ii][len-1] == '.'; |
|
|
|
|
|
if (sentence_break_p) { |
|
re2::StringPiece pfx(words[ii].substr(0,len-1)); |
|
std::string pfxs(pfx.as_string()); |
|
if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) { |
|
|
|
sentence_break_p = false; |
|
} else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) { |
|
|
|
sentence_break_p = false; |
|
} else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) { |
|
|
|
sentence_break_p = false; |
|
} else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) { |
|
|
|
sentence_break_p = false; |
|
} |
|
} |
|
|
|
outs.append(words[ii].data(),len); |
|
if (sentence_break_p) |
|
outs.append(" ."); |
|
if (more_p) |
|
outs.append(SPC_BYTE,1); |
|
} |
|
text.assign(outs.begin(),outs.end()); |
|
} |
|
|
|
|
|
bool |
|
Tokenizer::unescape(std::string& word) { |
|
std::ostringstream oss; |
|
std::size_t was = 0; |
|
std::size_t pos = 0; |
|
std::size_t len = 0; |
|
bool hit = false; |
|
for (std::size_t endp=0; |
|
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos; |
|
was = endp == std::string::npos ? pos : 1+endp) { |
|
len = endp - pos + 1; |
|
glong ulen(0); |
|
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen); |
|
gunichar gbuf[2] = { 0 }; |
|
if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) { |
|
gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0); |
|
if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) { |
|
|
|
oss << word.substr(was,1+endp-was); |
|
} else { |
|
if (was < pos) |
|
oss << word.substr(was,pos-was); |
|
oss << gstr; |
|
was += ulen; |
|
hit = true; |
|
} |
|
g_free(gstr); |
|
} else { |
|
oss << word.substr(was,1+endp-was); |
|
} |
|
g_free(gtmp); |
|
} |
|
if (was < word.size()) |
|
oss << word.substr(was); |
|
if (hit) |
|
word = oss.str(); |
|
return hit; |
|
} |
|
|
|
|
|
bool |
|
Tokenizer::escape(std::string& text) { |
|
bool mod_p = false; |
|
std::string outs; |
|
|
|
const char *pp = text.c_str(); |
|
const char *ep = pp + text.size(); |
|
const char *pt = pp; |
|
|
|
while (pt < ep) { |
|
if (*pt & 0x80) { |
|
const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep); |
|
if (!mk) { |
|
if (mod_p) |
|
outs.append(pp,pt-pp+1); |
|
} else { |
|
if (mod_p) |
|
outs.append(pp,mk-pp); |
|
pt = --mk; |
|
} |
|
pp = ++pt; |
|
continue; |
|
} |
|
|
|
const char *sequence_p = 0; |
|
if (*pt < '?') { |
|
if (*pt == '&') { |
|
|
|
const char *sc = strchr(pt,';'); |
|
if (!sc || sc-pt < 2 || sc-pt > 9) { |
|
sequence_p = ESCAPE_MOSES[3]; |
|
} |
|
} else if (*pt == '\'') { |
|
sequence_p = ESCAPE_MOSES[6]; |
|
} else if (*pt == '"') { |
|
sequence_p = ESCAPE_MOSES[7]; |
|
} |
|
} else if (*pt > ']') { |
|
if (*pt =='|') { |
|
sequence_p = ESCAPE_MOSES[0]; |
|
} |
|
} else if (*pt > 'Z') { |
|
if (*pt == '<') { |
|
sequence_p = ESCAPE_MOSES[4]; |
|
} else if (*pt == '>') { |
|
sequence_p = ESCAPE_MOSES[5]; |
|
} else if (*pt == '[') { |
|
sequence_p = ESCAPE_MOSES[1]; |
|
} else if (*pt == ']') { |
|
sequence_p = ESCAPE_MOSES[2]; |
|
} |
|
} |
|
|
|
if (sequence_p) { |
|
if (pt > pp) |
|
outs.append(pp,pt-pp); |
|
outs.append(sequence_p); |
|
mod_p = true; |
|
pp = ++pt; |
|
} else { |
|
++pt; |
|
} |
|
} |
|
|
|
if (mod_p) { |
|
if (pp < pt) { |
|
outs.append(pp,pt-pp); |
|
} |
|
text.assign(outs.begin(),outs.end()); |
|
} |
|
|
|
return mod_p; |
|
} |
|
|
|
|
|
std::string |
|
Tokenizer::penn_tokenize(const std::string& buf) |
|
{ |
|
static const char *comma_refs = "\\1 , \\2"; |
|
static const char *isolate_ref = " \\1 "; |
|
static const char *special_refs = "\\1 @\\2@ \\3"; |
|
|
|
std::string text(buf); |
|
std::string outs; |
|
if (skip_alltags_p) |
|
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE); |
|
|
|
|
|
size_t len = text.size(); |
|
if (len > 2 && text.substr(0,2) == "``") |
|
text.replace(0,2,"`` ",3); |
|
else if (text[0] == '"') |
|
text.replace(0,1,"`` ",3); |
|
else if (text[0] == '`' || text[0] == '\'') |
|
text.replace(0,1,"` ",2); |
|
static char one_gg[] = "\\1 ``"; |
|
RE2::GlobalReplace(&text,x1_v_d,one_gg); |
|
RE2::GlobalReplace(&text,x1_v_gg,one_gg); |
|
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2"); |
|
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` "); |
|
|
|
|
|
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11)) |
|
text.replace(pos,3,"MANYELIPSIS",11); |
|
|
|
|
|
RE2::GlobalReplace(&text,ndndcomma_x,comma_refs); |
|
RE2::GlobalReplace(&text,pdndcomma_x,comma_refs); |
|
RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs); |
|
|
|
|
|
RE2::GlobalReplace(&text,symbol_x,isolate_ref); |
|
|
|
|
|
RE2::GlobalReplace(&text,slash_x,special_refs); |
|
|
|
|
|
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3"); |
|
|
|
|
|
RE2::GlobalReplace(&text,qx_x,isolate_ref); |
|
|
|
|
|
RE2::GlobalReplace(&text,braces_x,isolate_ref); |
|
|
|
|
|
RE2::GlobalReplace(&text,"\\(","-LRB-"); |
|
RE2::GlobalReplace(&text,"\\[","-LSB-"); |
|
RE2::GlobalReplace(&text,"\\{","-LCB-"); |
|
RE2::GlobalReplace(&text,"\\)","-RRB-"); |
|
RE2::GlobalReplace(&text,"\\]","-RSB-"); |
|
RE2::GlobalReplace(&text,"\\}","-RCB-"); |
|
|
|
|
|
RE2::GlobalReplace(&text,"--"," -- "); |
|
|
|
|
|
|
|
len = text.size(); |
|
while (len > 1 && text[len-1] == ' ') --len; |
|
if (len < text.size()) |
|
text.assign(text.substr(0,len)); |
|
if (len > 2 && text[len-1] == '.') { |
|
if (text[len-2] != ' ') { |
|
text.assign(text.substr(0,len-1)); |
|
text.append(" . "); |
|
} else { |
|
text.assign(text.substr(0,len-1)); |
|
text.append(". "); |
|
} |
|
} else { |
|
text.append(SPC_BYTE,1); |
|
} |
|
std::string ntext(SPC_BYTE); |
|
ntext.append(text); |
|
|
|
|
|
RE2::GlobalReplace(&ntext,"\""," '' "); |
|
|
|
|
|
RE2::GlobalReplace(&ntext,endq_x,"\\1 ' "); |
|
RE2::GlobalReplace(&ntext,contract_x," '\\1 "); |
|
RE2::GlobalReplace(&ntext,"'ll "," 'll "); |
|
RE2::GlobalReplace(&ntext,"'re "," 're "); |
|
RE2::GlobalReplace(&ntext,"'ve "," 've "); |
|
RE2::GlobalReplace(&ntext,"n't "," n't "); |
|
RE2::GlobalReplace(&ntext,"'LL "," 'LL "); |
|
RE2::GlobalReplace(&ntext,"'RE "," 'RE "); |
|
RE2::GlobalReplace(&ntext,"'VE "," 'VE "); |
|
RE2::GlobalReplace(&ntext,"N'T "," N'T "); |
|
RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not "); |
|
RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye "); |
|
RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me "); |
|
RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na "); |
|
RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta "); |
|
RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me "); |
|
RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n "); |
|
RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n "); |
|
RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n "); |
|
RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n "); |
|
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na "); |
|
|
|
protected_tokenize(ntext); |
|
|
|
|
|
RE2::GlobalReplace(&ntext,"MANYELIPSIS","..."); |
|
|
|
|
|
RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE); |
|
|
|
|
|
if (escape_p) |
|
escape(ntext); |
|
|
|
|
|
outs.assign(ntext.substr(1,ntext.size()-2)); |
|
return outs; |
|
} |
|
|
|
|
|
std::string |
|
Tokenizer::quik_tokenize(const std::string& buf) |
|
{ |
|
std::string text(buf); |
|
size_t pos; |
|
int num = 0; |
|
|
|
|
|
|
|
|
|
std::vector<std::string> prot_stack; |
|
std::string match; |
|
|
|
for (auto& pat : prot_pat_vec) { |
|
pos = 0; |
|
while (RE2::PartialMatch(text.substr(pos),*pat,&match)) { |
|
pos = text.find(match,pos); |
|
if (pos == std::string::npos) |
|
break; |
|
size_t len = match.size(); |
|
if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') { |
|
char subst[32]; |
|
int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++); |
|
text.replace(pos,len,subst,nsubst); |
|
prot_stack.push_back(match); |
|
pos += nsubst; |
|
} else { |
|
pos += len; |
|
} |
|
} |
|
} |
|
|
|
const char *pt(text.c_str()); |
|
const char *ep(pt + text.size()); |
|
while (pt < ep && *pt >= 0 && *pt <= ' ') |
|
++pt; |
|
glong ulen(0); |
|
gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); |
|
gunichar *ucs4(usrc); |
|
gunichar *lim4(ucs4 + ulen); |
|
|
|
gunichar *nxt4 = ucs4; |
|
gunichar *ubuf(g_new0(gunichar,ulen*6+1)); |
|
gunichar *uptr(ubuf); |
|
|
|
gunichar prev_uch(0); |
|
gunichar next_uch(*ucs4); |
|
gunichar curr_uch(0); |
|
|
|
GUnicodeType curr_type(G_UNICODE_UNASSIGNED); |
|
GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED); |
|
GUnicodeType prev_type(G_UNICODE_UNASSIGNED); |
|
|
|
bool post_break_p = false; |
|
bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0'); |
|
bool in_url_p = false; |
|
int since_start = 0; |
|
int alpha_prefix = 0; |
|
int bad_length = 0; |
|
|
|
while (ucs4 < lim4) { |
|
prev_uch = curr_uch; |
|
prev_type = curr_type; |
|
curr_uch = next_uch; |
|
curr_type = next_type; |
|
|
|
if (++nxt4 >= lim4) { |
|
next_uch = 0; |
|
next_type = G_UNICODE_UNASSIGNED; |
|
} else { |
|
next_uch = *nxt4; |
|
next_type = g_unichar_type(next_uch); |
|
} |
|
|
|
if (url_p) { |
|
if (!in_url_p && *ucs4 < 0x80L) { |
|
if (!since_start) { |
|
if (std::isalpha(char(*ucs4))) |
|
alpha_prefix++; |
|
} else if (alpha_prefix == since_start |
|
&& char(*ucs4) == ':' |
|
&& next_type != G_UNICODE_SPACE_SEPARATOR) { |
|
in_url_p = true; |
|
} |
|
} |
|
} |
|
|
|
bool pre_break_p = false; |
|
const wchar_t *substitute_p = 0; |
|
|
|
if (post_break_p) { |
|
*uptr++ = gunichar(L' '); |
|
since_start = bad_length = 0; |
|
in_url_p = in_num_p = post_break_p = false; |
|
} |
|
|
|
retry: |
|
|
|
switch (curr_type) { |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
if (in_url_p || in_num_p) |
|
pre_break_p = true; |
|
|
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER) |
|
curr_uch = g_unichar_tolower(*ucs4); |
|
break; |
|
case G_UNICODE_SPACING_MARK: |
|
pre_break_p = true; |
|
in_num_p = false; |
|
curr_uch = 0; |
|
break; |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LETTER_NUMBER: |
|
case G_UNICODE_OTHER_NUMBER: |
|
if (!in_num_p && !in_url_p) { |
|
switch (prev_type) { |
|
case G_UNICODE_DASH_PUNCTUATION: |
|
case G_UNICODE_FORMAT: |
|
case G_UNICODE_OTHER_PUNCTUATION: |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
break; |
|
default: |
|
pre_break_p = true; |
|
} |
|
} |
|
in_num_p = true; |
|
break; |
|
case G_UNICODE_CONNECT_PUNCTUATION: |
|
if (curr_uch != gunichar(L'_')) { |
|
if (in_url_p) { |
|
in_url_p = false; |
|
post_break_p = pre_break_p = true; |
|
} |
|
} |
|
if (in_num_p) { |
|
post_break_p = pre_break_p = true; |
|
} else { |
|
switch (next_type) { |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
break; |
|
default: |
|
post_break_p = pre_break_p = true; |
|
} |
|
switch (prev_type) { |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
break; |
|
default: |
|
post_break_p = pre_break_p = true; |
|
} |
|
} |
|
break; |
|
case G_UNICODE_FORMAT: |
|
in_url_p = in_num_p = false; |
|
break; |
|
case G_UNICODE_DASH_PUNCTUATION: |
|
if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) { |
|
substitute_p = L"@-@"; |
|
post_break_p = pre_break_p = true; |
|
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) || |
|
( curr_uch > gunichar(L'\u2011') |
|
&& curr_uch != gunichar(L'\u30A0') |
|
&& curr_uch < gunichar(L'\uFE63') ) ) { |
|
|
|
post_break_p = pre_break_p = true; |
|
} else if (next_type == G_UNICODE_SPACE_SEPARATOR) { |
|
} else { |
|
if (prev_type == curr_type) { |
|
if (next_type != curr_type) { |
|
post_break_p = !in_url_p; |
|
} |
|
} else if (next_type == curr_type) { |
|
pre_break_p = !in_url_p; |
|
} else if ((prev_type == G_UNICODE_UPPERCASE_LETTER || |
|
prev_type == G_UNICODE_LOWERCASE_LETTER) && |
|
next_type == G_UNICODE_DECIMAL_NUMBER) { |
|
in_num_p = false; |
|
} else if (in_num_p || since_start == 0) { |
|
switch (next_type) { |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
case G_UNICODE_SPACE_SEPARATOR: |
|
in_num_p = false; |
|
break; |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LETTER_NUMBER: |
|
case G_UNICODE_OTHER_NUMBER: |
|
case G_UNICODE_OTHER_PUNCTUATION: |
|
break; |
|
default: |
|
post_break_p = true; |
|
pre_break_p = prev_uch != curr_uch; |
|
} |
|
} else if (in_url_p) { |
|
pre_break_p = curr_uch != gunichar(L'-'); |
|
} else { |
|
switch (prev_type) { |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LETTER_NUMBER: |
|
case G_UNICODE_OTHER_NUMBER: |
|
case G_UNICODE_OTHER_PUNCTUATION: |
|
switch (next_type) { |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_MODIFIER_LETTER: |
|
case G_UNICODE_OTHER_LETTER: |
|
case G_UNICODE_TITLECASE_LETTER: |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LETTER_NUMBER: |
|
case G_UNICODE_OTHER_NUMBER: |
|
break; |
|
case G_UNICODE_OTHER_PUNCTUATION: |
|
if (prev_type != next_type) |
|
break; |
|
default: |
|
post_break_p = pre_break_p = prev_uch != curr_uch; |
|
} |
|
break; |
|
default: |
|
post_break_p = pre_break_p = prev_uch != curr_uch; |
|
break; |
|
} |
|
} |
|
} |
|
break; |
|
case G_UNICODE_OTHER_PUNCTUATION: |
|
switch (curr_uch) { |
|
case gunichar(L':'): |
|
case gunichar(L'/'): |
|
if (refined_p && !in_url_p |
|
&& prev_type == G_UNICODE_DECIMAL_NUMBER |
|
&& next_type == G_UNICODE_DECIMAL_NUMBER) { |
|
break; |
|
} |
|
|
|
case gunichar(L'!'): |
|
case gunichar(L'#'): |
|
case gunichar(L';'): |
|
case gunichar(L'?'): |
|
case gunichar(L'@'): |
|
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; |
|
break; |
|
case gunichar(L'+'): |
|
post_break_p = pre_break_p = !in_num_p && since_start > 0; |
|
in_num_p = in_num_p || since_start == 0; |
|
break; |
|
case gunichar(L'&'): |
|
if (unescape_p) { |
|
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER |
|
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) { |
|
gunichar *eptr = nxt4; |
|
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED); |
|
for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) { |
|
eptr_type = g_unichar_type(*eptr); |
|
if (eptr_type != G_UNICODE_LOWERCASE_LETTER |
|
&& eptr_type != G_UNICODE_UPPERCASE_LETTER |
|
&& eptr_type != G_UNICODE_DECIMAL_NUMBER) |
|
break; |
|
} |
|
gunichar ech(0); |
|
if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) { |
|
curr_uch = ech; |
|
curr_type = g_unichar_type(ech); |
|
ucs4 = eptr; |
|
nxt4 = ++eptr; |
|
next_uch = *nxt4; |
|
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED; |
|
goto retry; |
|
} |
|
} |
|
} |
|
if (entities_p && !in_url_p) { |
|
gunichar *cur4 = nxt4; |
|
if (*cur4 == gunichar('#')) ++cur4; |
|
while (g_unichar_isalnum(*cur4)) ++cur4; |
|
if (cur4 > nxt4 && *cur4 == gunichar(';')) { |
|
if (since_start) { |
|
*uptr++ = gunichar(L' '); |
|
since_start = 0; |
|
} |
|
++cur4; |
|
memcpy(uptr,ucs4,cur4-ucs4); |
|
uptr += cur4-ucs4; |
|
ucs4 = cur4; |
|
*uptr++ = gunichar(L' '); |
|
pre_break_p = post_break_p = false; |
|
curr_uch = *ucs4; |
|
curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED; |
|
nxt4 = ++cur4; |
|
next_uch = *nxt4; |
|
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED; |
|
goto retry; |
|
} |
|
|
|
} |
|
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; |
|
if (escape_p) |
|
substitute_p = L"&"; |
|
break; |
|
case gunichar(L'\''): |
|
if (english_p) { |
|
if (!in_url_p) { |
|
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER |
|
|| next_type == G_UNICODE_UPPERCASE_LETTER; |
|
pre_break_p = true; |
|
if (next_letter_p && refined_p) { |
|
|
|
if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) { |
|
*(uptr - 1) = gunichar(L' '); |
|
*(uptr++) = prev_uch; |
|
pre_break_p = false; |
|
} |
|
} |
|
post_break_p = since_start == 0 |
|
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER); |
|
} |
|
} else if (latin_p) { |
|
post_break_p = !in_url_p; |
|
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; |
|
} else { |
|
post_break_p = pre_break_p = !in_url_p; |
|
} |
|
if (escape_p) |
|
substitute_p = L"'"; |
|
break; |
|
case gunichar(L'"'): |
|
post_break_p = pre_break_p = true; |
|
if (escape_p) |
|
substitute_p = L"""; |
|
break; |
|
case gunichar(L','): |
|
pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER; |
|
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; |
|
break; |
|
case gunichar(L'%'): |
|
if (refined_p) { |
|
pre_break_p = !in_num_p; |
|
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; |
|
} else { |
|
post_break_p = pre_break_p = true; |
|
} |
|
break; |
|
case gunichar(L'.'): |
|
if (prev_uch != '.') { |
|
if (!in_num_p) { |
|
switch (next_type) { |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
break; |
|
default: |
|
if (since_start > 0) { |
|
switch (prev_type) { |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_UPPERCASE_LETTER: { |
|
std::wstring k((wchar_t *)(uptr-since_start),since_start); |
|
if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) { |
|
|
|
} else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) { |
|
|
|
} else if (k.find(curr_uch) != std::wstring::npos) { |
|
if (since_start > 1) { |
|
GUnicodeType tclass = g_unichar_type(*(uptr-2)); |
|
switch (tclass) { |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
pre_break_p = true; |
|
break; |
|
default: |
|
break; |
|
} |
|
} |
|
|
|
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) || |
|
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) { |
|
|
|
} else { |
|
pre_break_p = true; |
|
} |
|
break; |
|
} |
|
default: |
|
pre_break_p = true; |
|
break; |
|
} |
|
} |
|
break; |
|
} |
|
} else { |
|
switch (next_type) { |
|
case G_UNICODE_DECIMAL_NUMBER: |
|
case G_UNICODE_LOWERCASE_LETTER: |
|
case G_UNICODE_UPPERCASE_LETTER: |
|
break; |
|
default: |
|
pre_break_p = true; |
|
} |
|
} |
|
} else if (next_uch != '.') { |
|
post_break_p = true; |
|
} |
|
break; |
|
default: |
|
post_break_p = pre_break_p = true; |
|
break; |
|
} |
|
break; |
|
case G_UNICODE_CLOSE_PUNCTUATION: |
|
case G_UNICODE_FINAL_PUNCTUATION: |
|
case G_UNICODE_INITIAL_PUNCTUATION: |
|
case G_UNICODE_OPEN_PUNCTUATION: |
|
switch (curr_uch) { |
|
case gunichar(L'('): |
|
case gunichar(L')'): |
|
break; |
|
case gunichar(L'['): |
|
if (escape_p) |
|
substitute_p = L"["; |
|
break; |
|
case gunichar(L']'): |
|
if (escape_p) |
|
substitute_p = L"]"; |
|
break; |
|
default: |
|
in_url_p = false; |
|
} |
|
post_break_p = pre_break_p = !in_url_p; |
|
break; |
|
case G_UNICODE_CURRENCY_SYMBOL: |
|
if (refined_p) { |
|
post_break_p = in_num_p; |
|
pre_break_p = !in_num_p; |
|
in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L','); |
|
} else { |
|
post_break_p = pre_break_p = true; |
|
in_num_p = false; |
|
} |
|
if (curr_uch != gunichar(L'$')) |
|
in_url_p = false; |
|
break; |
|
case G_UNICODE_MODIFIER_SYMBOL: |
|
case G_UNICODE_MATH_SYMBOL: |
|
switch (curr_uch) { |
|
case gunichar(L'`'): |
|
if (english_p) { |
|
if (!in_url_p) { |
|
pre_break_p = true; |
|
post_break_p = since_start == 0 || |
|
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); |
|
} |
|
} else if (latin_p) { |
|
post_break_p = !in_url_p; |
|
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; |
|
} else { |
|
post_break_p = pre_break_p = !in_url_p; |
|
} |
|
if (escape_p) |
|
substitute_p = L"'"; |
|
else |
|
curr_uch = gunichar(L'\''); |
|
break; |
|
case gunichar(L'|'): |
|
if (escape_p) |
|
substitute_p = L"|"; |
|
post_break_p = pre_break_p = true; |
|
break; |
|
case gunichar(L'<'): |
|
if (escape_p) |
|
substitute_p = L"<"; |
|
post_break_p = pre_break_p = true; |
|
break; |
|
case gunichar(L'>'): |
|
if (escape_p) |
|
substitute_p = L">"; |
|
post_break_p = pre_break_p = true; |
|
break; |
|
case gunichar(L'%'): |
|
post_break_p = in_num_p; |
|
pre_break_p = !in_num_p && !in_url_p; |
|
in_num_p = false; |
|
break; |
|
case gunichar(L'='): |
|
case gunichar(L'~'): |
|
in_num_p = false; |
|
post_break_p = pre_break_p = !in_url_p; |
|
break; |
|
case gunichar(L'+'): |
|
post_break_p = pre_break_p = !in_url_p; |
|
if (in_url_p) { |
|
in_num_p = false; |
|
} else if (refined_p) { |
|
|
|
bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER || |
|
next_uch == gunichar(L'.'); |
|
pre_break_p = !in_num_p; |
|
in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER; |
|
post_break_p = !in_num_p; |
|
} else { |
|
in_num_p = in_num_p || since_start == 0; |
|
} |
|
break; |
|
default: |
|
post_break_p = pre_break_p = true; |
|
break; |
|
} |
|
break; |
|
case G_UNICODE_OTHER_SYMBOL: |
|
post_break_p = pre_break_p = true; |
|
break; |
|
case G_UNICODE_CONTROL: |
|
if (drop_bad_p) { |
|
curr_uch = gunichar(L' '); |
|
} else if (curr_uch < gunichar(L' ')) { |
|
curr_uch = gunichar(L' '); |
|
} else if (curr_uch == gunichar(L'\u0092') && |
|
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) { |
|
|
|
if (english_p) { |
|
pre_break_p = true; |
|
post_break_p = since_start == 0 || |
|
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); |
|
} else if (latin_p) { |
|
post_break_p = true; |
|
pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; |
|
} else { |
|
post_break_p = pre_break_p = true; |
|
} |
|
if (escape_p) |
|
substitute_p = L"'"; |
|
else |
|
curr_uch = gunichar(L'\''); |
|
} else { |
|
post_break_p = pre_break_p = true; |
|
} |
|
in_url_p = in_num_p = false; |
|
break; |
|
case G_UNICODE_LINE_SEPARATOR: |
|
case G_UNICODE_SPACE_SEPARATOR: |
|
curr_uch = gunichar(L' '); |
|
in_url_p = in_num_p = false; |
|
break; |
|
case G_UNICODE_ENCLOSING_MARK: |
|
in_url_p = false; |
|
break; |
|
case G_UNICODE_NON_SPACING_MARK: |
|
case G_UNICODE_PRIVATE_USE: |
|
case G_UNICODE_SURROGATE: |
|
in_url_p = in_num_p = false; |
|
break; |
|
case G_UNICODE_UNASSIGNED: |
|
default: |
|
|
|
if (drop_bad_p) { |
|
curr_uch = 0; |
|
} else { |
|
pre_break_p = since_start > 0 && bad_length == 0; |
|
curr_type = G_UNICODE_UNASSIGNED; |
|
} |
|
in_url_p = in_num_p = false; |
|
break; |
|
} |
|
|
|
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) { |
|
if (since_start) { |
|
|
|
*uptr++ = gunichar(L' '); |
|
since_start = bad_length = 0; |
|
} |
|
if (curr_uch == gunichar(L' ')) |
|
|
|
curr_uch = 0; |
|
} |
|
|
|
if (substitute_p) { |
|
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) { |
|
*uptr++ = *sptr; |
|
since_start++; |
|
} |
|
in_url_p = in_num_p = false; |
|
} else if (curr_uch) { |
|
*uptr++ = curr_uch; |
|
since_start++; |
|
if (curr_type == G_UNICODE_UNASSIGNED) |
|
bad_length++; |
|
} |
|
|
|
ucs4 = nxt4; |
|
} |
|
|
|
glong nbytes = 0; |
|
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); |
|
if (utf8[nbytes-1] == ' ') |
|
--nbytes; |
|
text.assign((const char *)utf8,(const char *)(utf8 + nbytes)); |
|
g_free(utf8); |
|
g_free(usrc); |
|
g_free(ubuf); |
|
|
|
|
|
if (supersub_p) |
|
RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3"); |
|
|
|
|
|
num = 0; |
|
for (auto& prot : prot_stack) { |
|
char subst[32]; |
|
snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++); |
|
size_t loc = text.find(subst); |
|
while (loc != std::string::npos) { |
|
text.replace(loc,18,prot.data(),prot.size()); |
|
loc = text.find(subst,loc+18); |
|
} |
|
} |
|
|
|
|
|
if (escape_p) |
|
escape(text); |
|
|
|
return text; |
|
} |
|
|
|
|
|
std::size_t |
|
Tokenizer::tokenize(std::istream& is, std::ostream& os) |
|
{ |
|
std::size_t line_no = 0; |
|
std::size_t perchunk = chunksize ? chunksize : 2000; |
|
std::vector< std::vector< std::string > > lines(nthreads); |
|
std::vector< std::vector< std::string > > results(nthreads); |
|
std::vector< boost::thread > workers(nthreads); |
|
bool done_p = !(is.good() && os.good()); |
|
|
|
|
|
for (std::size_t tranche = 0; !done_p; ++tranche) { |
|
|
|
|
|
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) { |
|
|
|
lines[ithread].resize(perchunk); |
|
std::size_t line_pos = 0; |
|
|
|
for ( ; line_pos < perchunk; ++line_pos) { |
|
|
|
std::string istr; |
|
std::getline(is,istr); |
|
|
|
if (skip_alltags_p) { |
|
RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE); |
|
istr = trim(istr); |
|
} |
|
line_no++; |
|
|
|
if (istr.empty()) { |
|
if (is.eof()) { |
|
done_p = true; |
|
lines[ithread].resize(line_pos); |
|
results[ithread].resize(line_pos); |
|
break; |
|
} |
|
lines[ithread][line_pos].clear(); |
|
} else if (skip_xml_p && |
|
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { |
|
lines[ithread][line_pos].clear(); |
|
} else { |
|
lines[ithread][line_pos] = |
|
std::string(SPC_BYTE).append(istr).append(SPC_BYTE); |
|
} |
|
} |
|
|
|
if (line_pos) { |
|
workers[ithread] = |
|
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread])); |
|
} |
|
} |
|
|
|
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) { |
|
if (!workers[ithread].joinable()) |
|
continue; |
|
|
|
workers[ithread].join(); |
|
|
|
std::size_t nres = results[ithread].size(); |
|
std::size_t nlin = lines[ithread].size(); |
|
|
|
if (nlin != nres) { |
|
std::ostringstream emsg; |
|
emsg << "Tranche " << tranche |
|
<< " worker " << ithread << "/" << nthreads |
|
<< " |lines|==" << nlin << " != |results|==" << nres; |
|
throw std::runtime_error(emsg.str()); |
|
} |
|
|
|
for (std::size_t ires = 0; ires < nres; ++ires) |
|
os << results[ithread][ires] << std::endl; |
|
|
|
} |
|
|
|
if (verbose_p) { |
|
std::cerr << line_no << ' '; |
|
std::cerr.flush(); |
|
} |
|
|
|
} |
|
|
|
return line_no; |
|
} |
|
|
|
|
|
std::string |
|
Tokenizer::detokenize(const std::string& buf) |
|
{ |
|
std::vector<std::string> words = split(trim(buf)); |
|
|
|
std::size_t squotes = 0; |
|
std::size_t dquotes = 0; |
|
std::string prepends(""); |
|
|
|
std::ostringstream oss; |
|
|
|
std::size_t nwords = words.size(); |
|
std::size_t iword = 0; |
|
|
|
if (unescape_p) |
|
for (auto &word: words) |
|
unescape(word); |
|
|
|
for (auto &word: words) { |
|
if (RE2::FullMatch(word,right_x)) { |
|
if (iword) |
|
oss << SPC_BYTE; |
|
oss << word; |
|
prepends.clear(); |
|
} else if (RE2::FullMatch(word,left_x)) { |
|
oss << word; |
|
prepends = SPC_BYTE; |
|
} else if (english_p && iword |
|
&& RE2::FullMatch(word,curr_en_x) |
|
&& RE2::FullMatch(words[iword-1],pre_en_x)) { |
|
oss << word; |
|
prepends = SPC_BYTE; |
|
} else if (latin_p && iword < nwords - 2 |
|
&& RE2::FullMatch(word,curr_fr_x) |
|
&& RE2::FullMatch(words[iword+1],post_fr_x)) { |
|
oss << prepends << word; |
|
prepends.clear(); |
|
} else if (word.size() == 1) { |
|
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) || |
|
(word.at(0) == '"' && ((dquotes % 2) == 0))) { |
|
if (english_p && iword |
|
&& word.at(0) == '\'' |
|
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') { |
|
oss << word; |
|
prepends = SPC_BYTE; |
|
} else { |
|
oss << prepends << word; |
|
prepends.clear(); |
|
if (word.at(0) == '\'') |
|
squotes++; |
|
else |
|
dquotes++; |
|
} |
|
} else { |
|
if (std::isalnum(word.at(0))) |
|
oss << prepends; |
|
oss << word; |
|
prepends = SPC_BYTE; |
|
if (word.at(0) == '\'') |
|
squotes++; |
|
else if (word.at(0) == '"') |
|
dquotes++; |
|
} |
|
} else { |
|
oss << prepends << word; |
|
prepends = SPC_BYTE; |
|
} |
|
iword++; |
|
} |
|
|
|
|
|
std::string text(oss.str()); |
|
RE2::GlobalReplace(&text," +",SPC_BYTE); |
|
RE2::GlobalReplace(&text,"\n ","\n"); |
|
RE2::GlobalReplace(&text," \n","\n"); |
|
return trim(text); |
|
} |
|
|
|
|
|
std::size_t |
|
Tokenizer::detokenize(std::istream& is, std::ostream& os) |
|
{ |
|
size_t line_no = 0; |
|
while (is.good() && os.good()) { |
|
std::string istr; |
|
std::getline(is,istr); |
|
line_no ++; |
|
if (istr.empty()) |
|
continue; |
|
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { |
|
os << istr << std::endl; |
|
} else { |
|
os << detokenize(istr) << std::endl; |
|
} |
|
} |
|
return line_no; |
|
} |
|
|
|
|
|
// Split one line of UTF-8 text into sentence-like parts.
//
// The input is decoded to UCS-4 and copied, character by character, into the
// work buffer `uout`.  While copying, each character is classified and a
// short history (`seq`/`pos`) is kept of what followed the most recent stop
// or mark.  When an uppercase character or the end of the input is reached,
// that history decides whether one of the recorded blank positions becomes a
// break.  Positions in `suppress` are never used as breaks; they are added
// by the abbreviation check further down.
//
// Returns the parts as UTF-8 strings (empty segments are dropped).  When
// continuation_ptr is non-null it is overwritten for every segment examined
// in the final loop, so on return it tells whether the last segment was
// non-empty after trailing blanks were trimmed.
std::vector<std::string> |

Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) { |

std::vector<std::string> parts; |

glong ncp = 0; |

glong ocp = 0; |

glong icp = 0; |

// Decode the input; ncp receives the number of code points produced.
gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp); |

// Nothing decoded: return no parts.
if (ncp == 0) { |

g_free(ucs4); |

return parts; |

} |

// Zero-filled output buffer with room for 2*ncp code points; ocp is the
// write index into it.
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar)); |



// Named code points used by the classification below.
const wchar_t GENL_HYPH = L'\u2010'; |

const wchar_t IDEO_STOP = L'\u3002'; |

const wchar_t KANA_MDOT = L'\u30FB'; |

const wchar_t WAVE_DASH = L'\u301C'; |



const wchar_t KANA_DHYP = L'\u30A0'; |

const wchar_t SMAL_HYPH = L'\uFE63'; |

const wchar_t WIDE_EXCL = L'\uFF01'; |

const wchar_t WIDE_PCTS = L'\uFF05'; |



const wchar_t WIDE_STOP = L'\uFF0E'; |

const wchar_t WIDE_QUES = L'\uFF1F'; |

const wchar_t INVERT_QM = L'\u00BF'; |

const wchar_t INVERT_EX = L'\u00A1'; |



wchar_t currwc = 0; |



// [init_word, fini_word) is the span of the current word in uout;
// finilen counts closing-type characters seen since that word and
// dotslen counts consecutive stops.
std::size_t init_word = 0; |

std::size_t fini_word = 0; |

std::size_t finilen = 0; |

std::size_t dotslen = 0; |



// Capacity of the seq/pos history below.
const std::size_t SEQ_LIM = 6; |



// seq[i] is the class of the i-th recorded element of the pending sequence
// and pos[i] its offset in uout; seqpos is the number of entries in use.
charclass_t prev_class = empty; |

charclass_t curr_class = empty; |

std::vector<charclass_t> seq(SEQ_LIM, empty); |

std::vector<std::size_t> pos(SEQ_LIM, 0); |

std::size_t seqpos = 0; |



GUnicodeType curr_type = G_UNICODE_UNASSIGNED; |



bool curr_word_p = false; |



// breaks: uout offsets at which the text is cut, in the order found.
// suppress: uout offsets that must not become breaks.
std::vector<std::size_t> breaks; |

std::set<std::size_t> suppress; |



// The bound is inclusive: index ncp (one past the last decoded code point)
// is processed too, which is how the `icp == ncp` end-of-input cases below
// are reached.
for (; icp <= ncp; ++icp) { |

currwc = wchar_t(ucs4[icp]); |

curr_type = g_unichar_type(currwc); |

prev_class = curr_class; |


// Step 1: map the Unicode type to a charclass_t and decide whether the
// character can be part of a word (curr_word_p).



switch (curr_type) { |

case G_UNICODE_DECIMAL_NUMBER: |

case G_UNICODE_OTHER_NUMBER: |

curr_class = numba; |

curr_word_p = true; |

break; |

case G_UNICODE_LOWERCASE_LETTER: |

case G_UNICODE_MODIFIER_LETTER: |

case G_UNICODE_OTHER_LETTER: |

curr_class = letta; |

curr_word_p = true; |

break; |

case G_UNICODE_UPPERCASE_LETTER: |

case G_UNICODE_TITLECASE_LETTER: |

curr_class = upper; |

curr_word_p = true; |

break; |

case G_UNICODE_OPEN_PUNCTUATION: |

case G_UNICODE_INITIAL_PUNCTUATION: |

curr_class = pinit; |

curr_word_p = false; |

break; |

case G_UNICODE_DASH_PUNCTUATION: |

curr_class = hyphn; |

// Dashes up to GENL_HYPH and from SMAL_HYPH upward count as word
// characters; in between only WAVE_DASH..KANA_DHYP do.
if (currwc <= GENL_HYPH) { |

curr_word_p = true; |

} else if (currwc >= SMAL_HYPH) { |

curr_word_p = true; |

} else { |

curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP); |

} |

break; |

case G_UNICODE_CLOSE_PUNCTUATION: |

case G_UNICODE_FINAL_PUNCTUATION: |

curr_class = pfini; |

curr_word_p = false; |

break; |

case G_UNICODE_OTHER_PUNCTUATION: |

if (currwc == L'\'' || currwc == L'"') { |

curr_class = quote; |

curr_word_p = false; |

} else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) { |

curr_class = stops; |

curr_word_p = true; |

} else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) { |

curr_class = marks; |

curr_word_p = false; |

} else if (currwc == INVERT_QM || currwc == INVERT_EX) { |

curr_class = pinit; |

curr_word_p = false; |

} else if ( currwc == L'%' || currwc == WIDE_PCTS) { |

curr_class = pfpct; |

curr_word_p = true; |

} else { |

curr_class = empty; |

curr_word_p = false; |

} |

break; |

default: |

// Anything else is a blank when it is not graphic, otherwise `empty`.
if (!g_unichar_isgraph(currwc)) { |

curr_class = blank; |

} else { |

curr_class = empty; |

} |

curr_word_p = false; |

break; |

} |





// Step 2: word and dot bookkeeping.  check_abbr_p is only raised in the
// `dotslen` branch: a word has been recorded (fini_word > init_word), the
// previous character was a stop, at least one sequence entry exists and
// exactly one stop was counted.






bool check_abbr_p = false; |

if (curr_class == stops) { |

if (prev_class != stops) { |

dotslen = 1; |

} else { |

dotslen++; |

} |

} else if (curr_word_p) { |

if (!fini_word) { |

init_word = ocp; |

} |

fini_word = ocp+1; |

dotslen = finilen = 0; |

// NOTE(review): the range tests on charclass_t here and below rely on the
// enumerator order of charclass_t, which is declared outside this chunk.
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) { |

finilen++; |

dotslen = 0; |

init_word = fini_word = 0; |

} else if (dotslen) { |

if (fini_word > init_word) { |

if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen) |

check_abbr_p = false; |

else |

check_abbr_p = dotslen < 2; |

} |

dotslen = 0; |

} else { |

init_word = fini_word = 0; |

} |



// Step 3: abbreviation check.  Any of the three tests below can add the
// current output position to `suppress` and drop the pending sequence.
if (check_abbr_p) { |



// k is the current word as stored in uout[init_word, fini_word).
// NOTE(review): the cast reinterprets gunichar storage as wchar_t, so it
// presumably requires both types to have the same width - confirm for all
// target platforms.
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word); |

// (a) the word is listed in nbpre_gen_ucs4 and no closing-type characters
// were counted since it.
if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) { |

suppress.insert(std::size_t(ocp)); |

seqpos = 0; |

} else { |

// (b) acronym test over uout[init_word, ocp): acro_p is set by a '.', and
// cleared again by any later character that is not '.', '-' or an uppercase
// letter; found_upper_p records an uppercase letter seen after a '.'.
bool acro_p = false; |

bool found_upper_p = false; |

for (glong ii = init_word; ii < ocp; ++ii) { |

if (uout[ii] == L'.') { |

acro_p = true; |

} else if (acro_p) { |

if (uout[ii] != L'.' && uout[ii] != L'-') { |

GUnicodeType i_type = g_unichar_type(uout[ii]); |

if (i_type != G_UNICODE_UPPERCASE_LETTER) { |

acro_p = false; |

} else { |

found_upper_p = true; |

} |

} |

} |

} |

if (acro_p && found_upper_p) { |

suppress.insert(std::size_t(ocp)); |

seqpos = 0; |

} else { |


// (c) look ahead in the input from icp with a small state machine.
// States 0-2 skip non-graphic characters (states 0 and 1 also skip opening
// punctuation and quotes); state 3 is entered on an uppercase letter or a
// decimal digit.  num_p survives only if that character is a decimal digit
// and no opening punctuation was skipped while in state 0.



int fcp = icp; |

int state = (curr_class == pinit || curr_class == quote) ? 1 : 0; |

bool num_p = true; |

while (fcp < ncp) { |

GUnicodeType f_type = g_unichar_type(ucs4[fcp]); |

// NOTE: despite its name, f_white is true for graphic characters; the
// `!f_white` tests below are the ones that match blanks.
bool f_white = g_unichar_isgraph(ucs4[fcp]); |

switch (state) { |

case 0: |

if (!f_white) { |

++fcp; |

continue; |

} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION || |

ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) { |

num_p = false; |

state = 1; |

++fcp; |

continue; |

} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) { |

if (num_p) |

num_p = f_type == G_UNICODE_DECIMAL_NUMBER; |

state = 3; |

++fcp; |

} |

break; |

case 1: |

if (!f_white) { |

++fcp; |

state = 2; |

continue; |

} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION || |

ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) { |

++fcp; |

continue; |

} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) { |

if (num_p) |

num_p = f_type == G_UNICODE_DECIMAL_NUMBER; |

state = 3; |

++fcp; |

} |

break; |

case 2: |

if (!f_white) { |

++fcp; |

continue; |

} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) { |

if (num_p) |

num_p = f_type == G_UNICODE_DECIMAL_NUMBER; |

state = 3; |

++fcp; |

break; |

} |

break; |

} |

// Reached whenever the switch did not `continue`: the look-ahead is over.
break; |

} |

// Suppress only when a digit was found and the word is in nbpre_num_ucs4.
if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) { |

suppress.insert(std::size_t(ocp)); |

seqpos = 0; |

} |

} |

} |

init_word = fini_word = 0; |

} |



// Step 4: maintain the pending sequence.  A full history is discarded.
if (seqpos >= SEQ_LIM) { |

seqpos = 0; |

} |



// A stop or mark either starts a sequence (seqpos == 0), continues one, or
// resets it; the branches that `continue` have already copied the character.
if (curr_class == stops || curr_class == marks) { |

if (!seqpos) { |

seq[seqpos] = curr_class; |

pos[seqpos] = ocp; |

seqpos++; |

uout[ocp++] = gunichar(currwc); |

continue; |

} else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) { |



if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) { |

seqpos--; |

uout[ocp++] = gunichar(currwc); |

continue; |

} |

seqpos = 0; |

} else if (seq[seqpos-1] != curr_class) { |

seqpos = 0; |

} else if (curr_class == marks) { |

seqpos = 0; |

} else { |

uout[ocp++] = gunichar(currwc); |

continue; |

} |

} |



// No sequence pending: copy the character through.  A blank is written as a
// single ' ' and only when the previous class was not blank as well.
if (!seqpos) { |

if (curr_class != blank) { |

uout[ocp++] = gunichar(currwc); |

} else if (curr_class != prev_class) { |

uout[ocp++] = L' '; |

} |

continue; |

} |



// Blank inside a sequence: record it once per run of blanks.  Only at the
// end of the input does control continue to the break decision below.
if (curr_class == blank) { |

if (prev_class != blank) { |

seq[seqpos] = blank; |

pos[seqpos] = ocp; |

seqpos++; |

uout[ocp++] = L' '; |

} |

if (icp < ncp) |

continue; |

} |



// Characters in the quote..pfini range extend the sequence (a new entry is
// added only when the previous class was outside that range).
if (curr_class >= quote && curr_class <= pfini) { |

if (prev_class < quote || prev_class > pfini) { |

seq[seqpos] = curr_class; |

pos[seqpos] = ocp; |

seqpos++; |

} else if (curr_class == quote && prev_class != curr_class) { |

curr_class = prev_class; |

} else if (prev_class == quote) { |

seq[seqpos] = prev_class = curr_class; |

} |

uout[ocp++] = gunichar(currwc); |

continue; |

} |



// Step 5: break decision.  It is evaluated only for an uppercase character
// or at the end of the input, and only when the sequence starts with a stop
// or a mark.  iblank is the seq index whose uout offset becomes the break;
// 0 means "no break".  Every character that reaches this point ends the
// pending sequence (seqpos is reset after it is copied).














std::size_t iblank = 0; |

if (curr_class == upper || icp == ncp) { |

if (seqpos && (seq[0] == stops || seq[0] == marks)) { |

switch (seqpos) { |

case 2: |

if (seq[1] == blank) |

iblank = 1; |

break; |

case 3: |

switch (seq[1]) { |

case blank: |

if (seq[2] == quote || seq[2] == pinit) |

iblank = 1; |

break; |

case quote: |

case pfini: |

if (seq[2] == blank) |

iblank = 2; |

break; |

default: |

break; |

} |

break; |

case 4: |

switch (seq[1]) { |

case blank: |

iblank = 1; |

switch (seq[2]) { |

case quote: |

switch (seq[3]) { |

case quote: |

case pinit: |

break; |

case blank: |

iblank = 3; |

break; |

default: |

iblank = 0; |

break; |

} |

break; |

case pinit: |

if (seq[3] != blank) |

iblank = 0; |

break; |

case pfini: |

if (seq[3] == blank) |

iblank = 3; |

break; |

default: |

iblank = 0; |

break; |

} |

break; |

case quote: |

case pfini: |

iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0; |

break; |

default: |

iblank = 0; |

break; |

} |

break; |

case 5: |

iblank = (seq[1] == blank) ? 2 : 1; |

if (seq[iblank] == quote || seq[iblank] == pfini) |

iblank++; |

if (seq[iblank] != blank) { |

iblank = 0; |

} else { |

if (seq[iblank+1] != quote && seq[iblank+1] != pinit) { |

iblank = 0; |

} else if (iblank+2 < seqpos) { |

if (seq[iblank+2] != blank) |

iblank = 0; |

} |

} |

break; |

} |

} |

// Record the break unless that offset is suppressed; inserting it into
// `suppress` afterwards keeps the same offset from being recorded twice.
if (iblank && suppress.find(pos[iblank]) == suppress.end()) { |

breaks.push_back(pos[iblank]); |

suppress.insert(pos[iblank]); |

} |

} |



uout[ocp++] = gunichar(currwc); |

seqpos = 0; |

} |



// Step 6: cut uout at the recorded breaks.  For each segment, nextpos is
// fixed first (one past the break offset), then trailing newlines, spaces
// and other non-graphic characters are trimmed before the segment is
// converted back to UTF-8.
std::vector<std::size_t>::iterator it = breaks.begin(); |

glong iop = 0; |

while (iop < ocp) { |

glong endpos = it == breaks.end() ? ocp : *it++; |

glong nextpos = endpos + 1; |

while (endpos > iop) { |

std::size_t chkpos = endpos-1; |

if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') { |

endpos = chkpos; |

continue; |

} |

if (g_unichar_isgraph(uout[chkpos])) |

break; |

endpos = chkpos; |

} |

// Only non-empty segments are returned.
if (endpos > iop) { |

gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0); |

parts.push_back(std::string(pre)); |

g_free(pre); |

} |

// Overwritten on every pass, so the caller sees the value for the last
// segment examined.
if (continuation_ptr) |

*continuation_ptr = endpos > iop; |

iop = nextpos; |

} |



// Release both GLib-allocated buffers.
g_free(uout); |

g_free(ucs4); |



return parts; |

} |
|
|
|
|
|
std::pair<std::size_t,std::size_t> |
|
Tokenizer::splitter(std::istream& is, std::ostream& os) |
|
{ |
|
std::pair<std::size_t,std::size_t> counts = { 0, 0 }; |
|
bool continuation_p = false; |
|
bool pending_gap = false; |
|
bool paragraph_p = false; |
|
|
|
while (is.good() && os.good()) { |
|
std::string istr; |
|
|
|
std::getline(is,istr); |
|
counts.first++; |
|
|
|
if (istr.empty() && (is.eof() ||!para_marks_p)) |
|
continue; |
|
|
|
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) |
|
continue; |
|
|
|
std::vector<std::string> sentences(splitter(istr,&continuation_p)); |
|
if (sentences.empty()) { |
|
if (!paragraph_p) { |
|
if (pending_gap) |
|
os << std::endl; |
|
pending_gap = false; |
|
if (para_marks_p) |
|
os << "<P>" << std::endl; |
|
paragraph_p = true; |
|
} |
|
continue; |
|
} |
|
|
|
paragraph_p = false; |
|
std::size_t nsents = sentences.size(); |
|
counts.second += nsents; |
|
|
|
if (pending_gap) { |
|
os << " "; |
|
pending_gap = false; |
|
} |
|
|
|
for (std::size_t ii = 0; ii < nsents-1; ++ii) |
|
os << sentences[ii] << std::endl; |
|
|
|
os << sentences[nsents-1]; |
|
|
|
if (continuation_p) |
|
pending_gap = !split_breaks_p; |
|
if (!pending_gap) |
|
os << std::endl; |
|
} |
|
|
|
if (pending_gap) |
|
os << std::endl; |
|
|
|
return counts; |
|
} |
|
|
|
|
|
#ifdef TOKENIZER_NAMESPACE |
|
}; |
|
#endif |
|
|
|
|