import string

from textsearch import TextSearch
from contractions import contractions_dict, leftovers_dict
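
# Tokenization in this module is rule-based: each rule is a plain string replacement
# (pattern -> replacement padded with whitespace/newlines) registered in a TextSearch
# automaton; word_tokenize/sent_tokenize apply the replacements and split the result.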

ABBREVS = (
    "a.m.",
    "adm.",
    "bros.",
    "co.",
    "corp.",
    "d.c.",
    "dr.",
    "e.g.",
    "gen.",
    "gov.",
    "i.e.",
    "inc.",
    "jr.",
    "ltd.",
    "md.",
    "messrs.",
    "mo.",
    "mont.",
    "mr.",
    "mrs.",
    "ms.",
    "p.m.",
    "ph.d.",
    "rep.",
    "rev.",
    "sen.",
    "st.",
    "vs.",
)


class Tokenizer:
    def __init__(
        self,
        handle_http=False,
        handle_domains=False,
        numbers=True,
        combine_punctuation=True,
        eol="\n",
        currencies=("$",),
        protected_words=None,
        contractions=True,
        language="en",
        abbrevs=ABBREVS,
    ):
        # note: the set(), set() arguments should fall back to just using __iter__
        # of the automaton for a speed boost
        if language != "en" and contractions:
            raise ValueError("No contractions known for languages other than English.")
        self.contractions = contractions
        self.tokenizer = None
        self.handle_http = handle_http
        self.handle_domains = handle_domains
        self.combine_punctuation = combine_punctuation
        self.numbers = numbers
        self.eol = eol
        self.currencies = currencies or []
        self.protected_words = protected_words or []
        self.abbrevs = abbrevs
        self.explain_dict = {}
        self.setup()
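
    # setup() builds the TextSearch automaton and registers every replacement rule:
    # number/punctuation base cases, currencies, protected words, optional URL and
    # domain handlers, contractions and abbreviations.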
    def setup(self):
        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
        self.add_base_cases()
        self.add_currencies()
        self.add_words(self.protected_words)
        if self.handle_http:
            self.tokenizer.add_http_handler(keep_result=True)
            for word in ["http://", "https://", "www."]:
                self.explain_dict[word] = (
                    "regex: when it finds '{}' it will stop after it finds a space.".format(word)
                )
        if self.handle_domains:
            self.add_domain_handler()
        if self.contractions:
            if self.contractions is True:
                self.contractions = {}
                self.contractions.update(contractions_dict)
                self.contractions.update(leftovers_dict)
            self.add_words(self.contractions)
        if self.abbrevs:
            self.add_words(self.abbrevs)
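
    # Registers each word (or pattern -> replacement pair) as-is plus UPPERCASE and
    # Titlecase variants, so protected words survive regardless of casing.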
    def add_words(self, words):
        words = words.items() if isinstance(words, dict) else words
        # next(iter(...)) instead of indexing so that sets are handled as well
        if words and isinstance(words, (list, set, tuple)) and isinstance(next(iter(words)), str):
            words = [(x, x) for x in words]
        for x, y in words:
            REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
            REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
            REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
            self.add(x, y, REASON_AS_IS)
            self.add(x.upper(), y.upper(), REASON_UPPER)
            if y:
                self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)

    def add_domain_handler(self):
        import re

        from tldextract.tldextract import TLD_EXTRACTOR

        valid_re = re.compile("^[a-zA-Z.]+$")
        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
        for x in tlds:
            self.add(x, x, "Added by domain handler, keeps the token intact.")
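
    # Base rules: keep digit+punctuation pairs intact, collapse runs of punctuation,
    # turn sentence-ending characters into their own token followed by a newline, and
    # protect single-letter abbreviations like " a.".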
    def add_base_cases(self):
        if self.numbers:
            for x in "0123456789":
                self.keep(x + ",")
                self.keep(x + ".")

        # self.tokenizer.add(" !", " ! ")

        if self.combine_punctuation:
            # combine multiples
            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
            for s in "!.?-":
                for i in range(2, 10):
                    # one of these is a splitting char
                    if i == 1 and s == "-":
                        continue
                    c = s * i
                    e = s * 3 if i > 1 else s
                    # end = "$<EOS>$" if i == 1 or s != "-" else " "
                    end = " \n" if i == 1 or s != "-" else " "
                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))

            for i in range(2, 10):
                # self.tokenizer.add("\n" * i, "$<EOS>$")
                self.add("\n" * i, " \n ", "merges newlines")

        for s in "!.?-\n":
            self.add(s, " " + s + "\n", "Splits on '{}' and creates a new sentence.".format(s))

        self.split("- ")
        self.split("...")

        # does not work
        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")

        self.split("!?")
        self.split("!?!")
        self.split("!!?")
        self.split("!??")
        self.split("?!!")
        self.split("?!?")
        self.split("??!")

        for x in string.ascii_letters:
            self.keep(" " + x + ".")

        # for x in string.ascii_letters:
        #     self.tokenizer.add("\n" + x, "\n" + x)

        for s in ":;,":
            self.split(s, "Splits on '{}' (punctuation)".format(s))

        # quotes (make sure we add all the exceptions)
        self.split("'")
        self.split('"')
    def keep(self, x, reason=None):
        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
        self.tokenizer.add(x, x)
        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()

    def split(self, x, reason=None):
        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
        self.tokenizer.add(x, " {} ".format(x))
        self.explain_dict[x] = (
            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
        )

    def drop(self, x, reason=None):
        """ Whenever it finds x, it will remove it but add a split. """
        self.tokenizer.add(x, " ")
        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()

    def strip(self, x, reason=None):
        """ Whenever it finds x, it will remove it without splitting. """
        self.tokenizer.add(x, "")
        self.explain_dict[x] = (
            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
        )

    def add(self, x, y, reason):
        self.tokenizer.add(x, y)
        self.explain_dict[x] = reason
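
    # explain() reports, for every registered pattern containing the given substring,
    # the replacement it maps to and the reason it was added.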
    def explain(self, char_or_chars):
        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
        if not keys:
            return {
                "explanation": "No explanation, meaning there is nothing specified for the input"
            }
        return [
            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
            for x in keys
        ]

    def remove(self, x):
        if x in self.tokenizer:
            self.tokenizer.remove(x)
            del self.explain_dict[x]
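
    # Currency symbols become standalone tokens; "$5." / "$5," style prefixes are
    # rewritten so the digit keeps its trailing punctuation and is not read as
    # sentence-ending punctuation.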
    def add_currencies(self):
        for currency in self.currencies:
            self.split(currency)

            for num in "0123456789":
                # to prevent the . and , from being treated as punct
                for punc in ",.":
                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
                    self.add(s, r, "protecting currency from being seen as a number.")
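
    # A space is prepended to the input before replacement because several patterns
    # (e.g. " a." for single letters) anchor on a preceding space.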
    def word_tokenize(self, z, return_entities=False, to_lower=False):
        if return_entities:
            a, b = self.tokenizer.replace(" " + z, return_entities=True)
            return a.split(), b
        res = self.tokenizer.replace(" " + z).split()
        if to_lower:
            res = [x.lower() for x in res]
        return res

    def word_newlined_tokenize(self, z):
        sentences = self.sent_tokenize(z)
        # guard: empty or whitespace-only input yields no sentences
        if not sentences:
            return []
        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]

    def sent_tokenize(self, z):
        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]


t = Tokenizer(handle_http=True, handle_domains=False)
word_tokenize = t.word_tokenize
sent_tokenize = t.sent_tokenize
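

if __name__ == "__main__":
    # Illustrative usage only, not part of the library API above: exercises the
    # module-level helpers built from the default Tokenizer(handle_http=True).
    # Output depends on the registered rules; no specific tokens are asserted here.
    print(word_tokenize("Mr. Smith paid $5.50 on http://example.com, didn't he?"))
    print(sent_tokenize("First sentence. Second one! And a third?"))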