asynchronousai committed on
Commit d494042 · verified · 1 Parent(s): 575a607

Delete tok.py

Files changed (1)
  1. tok.py +0 -195
tok.py DELETED
@@ -1,195 +0,0 @@
- import string
- from textsearch import TextSearch
-
- class Tokenizer:
-     def __init__(
-         self,
-         handle_http=False,
-         handle_domains=False,
-         numbers=True,
-         combine_punctuation=True,
-         eol="\n",
-         currencies=("$",),
-         protected_words=None,
-         language="en",
-     ):
-         self.tokenizer = None
-         self.handle_http = handle_http
-         self.handle_domains = handle_domains
-         self.combine_punctuation = combine_punctuation
-         self.numbers = numbers
-         self.eol = eol
-         self.currencies = currencies or []
-         self.protected_words = protected_words or []
-         self.explain_dict = {}
-         self.setup()
-
-     def setup(self):
-         self.tokenizer = TextSearch("sensitive", "norm", set(), set())
-         self.add_base_cases()
-         self.add_currencies()
-         self.add_words(self.protected_words)
-         if self.handle_http:
-             self.tokenizer.add_http_handler(keep_result=True)
-             for word in ["http://", "https://", "www."]:
-                 self.explain_dict[
-                     word
-                 ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
-         if self.handle_domains:
-             self.add_domain_handler()
-
-     def add_words(self, words):
-         words = words.items() if isinstance(words, dict) else words
-         if words and isinstance(words, (list, set, tuple)) and isinstance(words[0], str):
-             words = [(x, x) for x in words]
-         for x, y in words:
-             REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
-             REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
-             REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
-             self.add(x, y, REASON_AS_IS)
-             self.add(x.upper(), y.upper(), REASON_UPPER)
-             if y:
-                 self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)
-
-     def add_domain_handler(self):
-         import re
-         from tldextract.tldextract import TLD_EXTRACTOR
-
-         valid_re = re.compile("^[a-zA-Z.]+$")
-         tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
-
-         for x in tlds:
-             self.add(x, x, "Added by domain handler, keeps the token existing.")
-
-     def add_base_cases(self):
-         if self.numbers:
-             for x in "0123456789":
-                 self.keep(x + ",")
-                 self.keep(x + ".")
-
-         # self.tokenizer.add(" !", " ! ")
-
-         if self.combine_punctuation:
-             # combine multiples
-             R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
-             for s in "!.?-":
-                 for i in range(2, 10):
-                     # one of these is a splitting char
-                     if i == 1 and s == "-":
-                         continue
-                     c = s * i
-                     e = s * 3 if i > 1 else s
-                     # end = "$<EOS>$" if i == 1 or s != "-" else " "
-                     end = " \n" if i == 1 or s != "-" else " "
-                     self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))
-
-             for i in range(2, 10):
-                 # self.tokenizer.add("\n" * i, "$<EOS>$")
-                 self.add("\n" * i, " \n ", "merges newlines")
-
-         for s in "!.?-\n":
-             self.add(s, " " + s + "\n", "Splits on '{}' and creating a new sentence.".format(s))
-
-         self.split("- ")
-
-         self.split("...")
-
-         # does not work
-         # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
-
-         self.split("!?")
-         self.split("!?!")
-         self.split("!!?")
-         self.split("!??")
-         self.split("?!!")
-         self.split("?!?")
-         self.split("??!")
-
-         for x in string.ascii_letters:
-             self.keep(" " + x + ".")
-
-         # for x in string.ascii_letters:
-         #     self.tokenizer.add("\n" + x, "\n" + x)
-
-         for s in ":;,":
-             self.split(s, "Splits on '{}' (punctuation)")
-
-         # quotes (make sure we add all the exeptions)
-         self.split("'")
-         self.split('"')
-
-     def keep(self, x, reason=None):
-         """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
-         self.tokenizer.add(x, x)
-         self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()
-
-     def split(self, x, reason=None):
-         """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
-         self.tokenizer.add(x, " {} ".format(x))
-         self.explain_dict[x] = (
-             reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
-         )
-
-     def drop(self, x, reason=None):
-         """ Whenever it finds x, it will remove it but add a split."""
-         self.tokenizer.add(x, " ")
-         self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()
-
-     def strip(self, x, reason=None):
-         """ Whenever it finds x, it will remove it without splitting. """
-         self.tokenizer.add(x, "")
-         self.explain_dict[x] = (
-             reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
-         )
-
-     def add(self, x, y, reason):
-         self.tokenizer.add(x, y)
-         self.explain_dict[x] = reason
-
-     def explain(self, char_or_chars):
-         keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
-         if not keys:
-             return {
-                 "explanation": "No explanation, meaning there is nothing specified for the input"
-             }
-         return [
-             {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
-             for x in keys
-         ]
-
-     def remove(self, x):
-         if x in self.tokenizer:
-             self.tokenizer.remove(x)
-             del self.explain_dict[x]
-
-     def add_currencies(self):
-         for currency in self.currencies:
-             self.split(currency)
-
-             for num in "0123456789":
-                 # to prevent the . and , from being treated as punct
-                 for punc in ",.":
-                     s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
-                     r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
-                     self.add(s, r, "protecting currency from being seen as a number.")
-
-     def word_tokenize(self, z, return_entities=False, to_lower=False):
-         if return_entities:
-             a, b = self.tokenizer.replace(" " + z, return_entities=True)
-             return a.split(), b
-         res = self.tokenizer.replace(" " + z).split()
-         if to_lower:
-             res = [x.lower() for x in res]
-         return res
-
-     def word_newlined_tokenize(self, z):
-         sentences = self.sent_tokenize(z)
-         return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]
-
-     def sent_tokenize(self, z):
-         return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
-
-
- t = Tokenizer(handle_http=True, handle_domains=False)
- word_tokenize = t.word_tokenize
- sent_tokenize = t.sent_tokenize
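
For context, the removed module built a default Tokenizer(handle_http=True) and exported its word_tokenize and sent_tokenize as module-level functions. Below is a minimal usage sketch, assuming the file is still importable as `tok` and the `textsearch` dependency is installed; the input strings are illustrative, not taken from the repository.

# Usage sketch for the deleted tok.py (assumes `textsearch` is installed
# and tok.py is on the import path; example inputs are illustrative).
from tok import word_tokenize, sent_tokenize

# Punctuation such as ',' and '!' is surrounded by whitespace and becomes
# its own token; with handle_http=True the http handler keeps URLs together.
print(word_tokenize("Check https://example.com it works!"))

# sent_tokenize splits on sentence-ending punctuation ('.', '!', '?')
# and returns one list of tokens per sentence.
print(sent_tokenize("First sentence. Second one!"))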