asynchronousai committed
Commit 42e8afb · verified · 1 parent: f621a6c

Create tok.py

Files changed (1)
  1. tok.py +243 -0
tok.py ADDED
@@ -0,0 +1,243 @@
+ import string
+ from textsearch import TextSearch
+ from contractions import contractions_dict, leftovers_dict
+
+ ABBREVS = (
+     "a.m.",
+     "adm.",
+     "bros.",
+     "co.",
+     "corp.",
+     "d.c.",
+     "dr.",
+     "e.g.",
+     "gen.",
+     "gov.",
+     "i.e.",
+     "inc.",
+     "jr.",
+     "ltd.",
+     "md.",
+     "messrs.",
+     "mo.",
+     "mont.",
+     "mr.",
+     "mrs.",
+     "ms.",
+     "p.m.",
+     "ph.d.",
+     "rep.",
+     "rev.",
+     "sen.",
+     "st.",
+     "vs.",
+ )
+
+
+ class Tokenizer:
+     def __init__(
+         self,
+         handle_http=False,
+         handle_domains=False,
+         numbers=True,
+         combine_punctuation=True,
+         eol="\n",
+         currencies=("$",),
+         protected_words=None,
+         contractions=True,
+         language="en",
+         abbrevs=ABBREVS,
+     ):
+         # passing set(), set() makes TextSearch fall back to the automaton's __iter__ for a speed boost
+         if language != "en" and contractions:
+             raise ValueError("No contractions known for languages other than English.")
+         self.contractions = contractions
+         self.tokenizer = None
+         self.handle_http = handle_http
+         self.handle_domains = handle_domains
+         self.combine_punctuation = combine_punctuation
+         self.numbers = numbers
+         self.eol = eol
+         self.currencies = currencies or []
+         self.protected_words = protected_words or []
+         self.abbrevs = abbrevs
+         self.explain_dict = {}
+         self.setup()
+
+     def setup(self):
+         self.tokenizer = TextSearch("sensitive", "norm", set(), set())
+         self.add_base_cases()
+         self.add_currencies()
+         self.add_words(self.protected_words)
+         if self.handle_http:
+             self.tokenizer.add_http_handler(keep_result=True)
+             for word in ["http://", "https://", "www."]:
+                 self.explain_dict[
+                     word
+                 ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
+         if self.handle_domains:
+             self.add_domain_handler()
+         if self.contractions:
+             if self.contractions is True:
+                 # default: use the bundled contraction mappings
+                 self.contractions = {}
+                 self.contractions.update(contractions_dict)
+                 self.contractions.update(leftovers_dict)
+             self.add_words(self.contractions)
+         if self.abbrevs:
+             self.add_words(self.abbrevs)
+
+     def add_words(self, words):
+         words = words.items() if isinstance(words, dict) else words
+         # use next(iter(...)) so this also works for sets, which do not support indexing
+         if words and isinstance(words, (list, set, tuple)) and isinstance(next(iter(words)), str):
+             words = [(x, x) for x in words]
+         REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
+         REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
+         REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
+         for x, y in words:
+             self.add(x, y, REASON_AS_IS)
+             self.add(x.upper(), y.upper(), REASON_UPPER)
+             if y:
+                 self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)
+
+     def add_domain_handler(self):
+         import re
+         from tldextract.tldextract import TLD_EXTRACTOR
+
+         valid_re = re.compile("^[a-zA-Z.]+$")
+         tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
+
+         for x in tlds:
+             self.add(x, x, "Added by domain handler, keeps the token existing.")
+
+     def add_base_cases(self):
+         if self.numbers:
+             for x in "0123456789":
+                 self.keep(x + ",")
+                 self.keep(x + ".")
+
+         # self.tokenizer.add(" !", " ! ")
+
+         if self.combine_punctuation:
+             # combine multiples
+             R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
+             for s in "!.?-":
+                 for i in range(2, 10):
+                     # one of these is a splitting char
+                     if i == 1 and s == "-":
+                         continue
+                     c = s * i
+                     e = s * 3 if i > 1 else s
+                     # end = "$<EOS>$" if i == 1 or s != "-" else " "
+                     end = " \n" if i == 1 or s != "-" else " "
+                     self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))
+
+             for i in range(2, 10):
+                 # self.tokenizer.add("\n" * i, "$<EOS>$")
+                 self.add("\n" * i, " \n ", "merges newlines")
+
+         for s in "!.?-\n":
+             self.add(s, " " + s + "\n", "Splits on '{}' and starts a new sentence.".format(s))
+
+         self.split("- ")
+
+         self.split("...")
+
+         # does not work
+         # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
+
+         self.split("!?")
+         self.split("!?!")
+         self.split("!!?")
+         self.split("!??")
+         self.split("?!!")
+         self.split("?!?")
+         self.split("??!")
+
+         for x in string.ascii_letters:
+             self.keep(" " + x + ".")
+
+         # for x in string.ascii_letters:
+         #     self.tokenizer.add("\n" + x, "\n" + x)
+
+         for s in ":;,":
+             self.split(s, "Splits on '{}' (punctuation)".format(s))
+
+         # quotes (make sure we add all the exceptions)
+         self.split("'")
+         self.split('"')
+
+     def keep(self, x, reason=None):
+         """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
+         self.tokenizer.add(x, x)
+         self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()
+
+     def split(self, x, reason=None):
+         """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
+         self.tokenizer.add(x, " {} ".format(x))
+         self.explain_dict[x] = (
+             reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
+         )
+
+     def drop(self, x, reason=None):
+         """ Whenever it finds x, it will remove it but add a split."""
+         self.tokenizer.add(x, " ")
+         self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()
+
+     def strip(self, x, reason=None):
+         """ Whenever it finds x, it will remove it without splitting. """
+         self.tokenizer.add(x, "")
+         self.explain_dict[x] = (
+             reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
+         )
+
+     def add(self, x, y, reason):
+         self.tokenizer.add(x, y)
+         self.explain_dict[x] = reason
+
+     def explain(self, char_or_chars):
+         keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
+         if not keys:
+             return {
+                 "explanation": "No explanation, meaning there is nothing specified for the input"
+             }
+         return [
+             {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
+             for x in keys
+         ]
+
+     def remove(self, x):
+         if x in self.tokenizer:
+             self.tokenizer.remove(x)
+             del self.explain_dict[x]
+
+     def add_currencies(self):
+         for currency in self.currencies:
+             self.split(currency)
+
+             for num in "0123456789":
+                 # to prevent the . and , from being treated as punct
+                 for punc in ",.":
+                     s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
+                     r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
+                     self.add(s, r, "protecting currency from being seen as a number.")
+
+     def word_tokenize(self, z, return_entities=False, to_lower=False):
+         if return_entities:
+             a, b = self.tokenizer.replace(" " + z, return_entities=True)
+             return a.split(), b
+         res = self.tokenizer.replace(" " + z).split()
+         if to_lower:
+             res = [x.lower() for x in res]
+         return res
+
+     def word_newlined_tokenize(self, z):
+         sentences = self.sent_tokenize(z)
+         return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]
+
+     def sent_tokenize(self, z):
+         return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
+
+
241
+ t = Tokenizer(handle_http=True, handle_domains=False)
242
+ word_tokenize = t.word_tokenize
243
+ sent_tokenize = t.sent_tokenize
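
For reference, a minimal usage sketch of the module-level helpers defined at the bottom of this file, assuming tok.py is importable and the textsearch and contractions packages are installed. The tokens shown in the comments are illustrative of the rules above (contraction expansion, protected abbreviations, punctuation splitting), not guaranteed output.

    from tok import word_tokenize, sent_tokenize

    # word_tokenize splits a string into word tokens, expanding contractions
    # and protecting abbreviations such as "Dr." from being split on the period.
    print(word_tokenize("I've seen Dr. Smith!"))
    # e.g. ['I', 'have', 'seen', 'Dr.', 'Smith', '!']

    # sent_tokenize returns one token list per sentence, splitting on the
    # sentence-ending characters registered in add_base_cases().
    print(sent_tokenize("Hello there. How are you?"))
    # e.g. [['Hello', 'there', '.'], ['How', 'are', 'you', '?']]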