asynchronousai committed
Commit 42e8afb · verified · 1 parent: f621a6c

Create tok.py

Files changed (1)
  1. tok.py +243 -0
tok.py ADDED
@@ -0,0 +1,243 @@
+ import string
+ from textsearch import TextSearch
+ from contractions import contractions_dict, leftovers_dict
+
+ ABBREVS = (
+     "a.m.",
+     "adm.",
+     "bros.",
+     "co.",
+     "corp.",
+     "d.c.",
+     "dr.",
+     "e.g.",
+     "gen.",
+     "gov.",
+     "i.e.",
+     "inc.",
+     "jr.",
+     "ltd.",
+     "md.",
+     "messrs.",
+     "mo.",
+     "mont.",
+     "mr.",
+     "mrs.",
+     "ms.",
+     "p.m.",
+     "ph.d.",
+     "rep.",
+     "rev.",
+     "sen.",
+     "st.",
+     "vs.",
+ )
+
+
+ class Tokenizer:
+     def __init__(
+         self,
+         handle_http=False,
+         handle_domains=False,
+         numbers=True,
+         combine_punctuation=True,
+         eol="\n",
+         currencies=("$",),
+         protected_words=None,
+         contractions=True,
+         language="en",
+         abbrevs=ABBREVS,
+     ):
+         # passing set(), set() makes TextSearch fall back to the automaton's __iter__ for a speed boost
+         if language != "en" and contractions:
+             raise ValueError("No contractions known for languages other than English.")
+         self.contractions = contractions
+         self.tokenizer = None
+         self.handle_http = handle_http
+         self.handle_domains = handle_domains
+         self.combine_punctuation = combine_punctuation
+         self.numbers = numbers
+         self.eol = eol
+         self.currencies = currencies or []
+         self.protected_words = protected_words or []
+         self.abbrevs = abbrevs
+         self.explain_dict = {}
+         self.setup()
+
+     def setup(self):
+         self.tokenizer = TextSearch("sensitive", "norm", set(), set())
+         self.add_base_cases()
+         self.add_currencies()
+         self.add_words(self.protected_words)
+         if self.handle_http:
+             self.tokenizer.add_http_handler(keep_result=True)
+             for word in ["http://", "https://", "www."]:
+                 self.explain_dict[
+                     word
+                 ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
+         if self.handle_domains:
+             self.add_domain_handler()
+         if self.contractions:
+             if self.contractions is True:
+                 # default: use the bundled contraction mappings
+                 self.contractions = {}
+                 self.contractions.update(contractions_dict)
+                 self.contractions.update(leftovers_dict)
+             self.add_words(self.contractions)
+         if self.abbrevs:
+             self.add_words(self.abbrevs)
+
+     def add_words(self, words):
+         words = words.items() if isinstance(words, dict) else words
+         # use next(iter(...)) so this also works for sets, which do not support indexing
+         if words and isinstance(words, (list, set, tuple)) and isinstance(next(iter(words)), str):
+             words = [(x, x) for x in words]
+         REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
+         REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
+         REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
+         for x, y in words:
+             self.add(x, y, REASON_AS_IS)
+             self.add(x.upper(), y.upper(), REASON_UPPER)
+             if y:
+                 self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)
+
+     def add_domain_handler(self):
+         import re
+         from tldextract.tldextract import TLD_EXTRACTOR
+
+         valid_re = re.compile("^[a-zA-Z.]+$")
+         tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
+
+         for x in tlds:
+             self.add(x, x, "Added by domain handler, keeps the token existing.")
+
+     def add_base_cases(self):
+         if self.numbers:
+             for x in "0123456789":
+                 self.keep(x + ",")
+                 self.keep(x + ".")
+
+         # self.tokenizer.add(" !", " ! ")
+
+         if self.combine_punctuation:
+             # combine multiples
+             R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
+             for s in "!.?-":
+                 for i in range(2, 10):
+                     # one of these is a splitting char
+                     if i == 1 and s == "-":
+                         continue
+                     c = s * i
+                     e = s * 3 if i > 1 else s
+                     # end = "$<EOS>$" if i == 1 or s != "-" else " "
+                     end = " \n" if i == 1 or s != "-" else " "
+                     self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))
+
+             for i in range(2, 10):
+                 # self.tokenizer.add("\n" * i, "$<EOS>$")
+                 self.add("\n" * i, " \n ", "merges newlines")
+
+         for s in "!.?-\n":
+             self.add(s, " " + s + "\n", "Splits on '{}' and starts a new sentence.".format(s))
+
+         self.split("- ")
+
+         self.split("...")
+
+         # does not work
+         # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
+
+         self.split("!?")
+         self.split("!?!")
+         self.split("!!?")
+         self.split("!??")
+         self.split("?!!")
+         self.split("?!?")
+         self.split("??!")
+
+         for x in string.ascii_letters:
+             self.keep(" " + x + ".")
+
+         # for x in string.ascii_letters:
+         #     self.tokenizer.add("\n" + x, "\n" + x)
+
+         for s in ":;,":
+             self.split(s, "Splits on '{}' (punctuation)".format(s))
+
+         # quotes (make sure we add all the exceptions)
+         self.split("'")
+         self.split('"')
+
+     def keep(self, x, reason=None):
+         """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
+         self.tokenizer.add(x, x)
+         self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()
+
+     def split(self, x, reason=None):
+         """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
+         self.tokenizer.add(x, " {} ".format(x))
+         self.explain_dict[x] = (
+             reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
+         )
+
+     def drop(self, x, reason=None):
+         """ Whenever it finds x, it will remove it but add a split."""
+         self.tokenizer.add(x, " ")
+         self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()
+
+     def strip(self, x, reason=None):
+         """ Whenever it finds x, it will remove it without splitting. """
+         self.tokenizer.add(x, "")
+         self.explain_dict[x] = (
+             reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
+         )
+
+     def add(self, x, y, reason):
+         self.tokenizer.add(x, y)
+         self.explain_dict[x] = reason
+
+     def explain(self, char_or_chars):
+         keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
+         if not keys:
+             return {
+                 "explanation": "No explanation, meaning there is nothing specified for the input"
+             }
+         return [
+             {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
+             for x in keys
+         ]
+
+     def remove(self, x):
+         if x in self.tokenizer:
+             self.tokenizer.remove(x)
+             del self.explain_dict[x]
+
+     def add_currencies(self):
+         for currency in self.currencies:
+             self.split(currency)
+
+             for num in "0123456789":
+                 # to prevent the . and , from being treated as punct
+                 for punc in ",.":
+                     s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
+                     r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
+                     self.add(s, r, "protecting currency from being seen as a number.")
+
+     def word_tokenize(self, z, return_entities=False, to_lower=False):
+         if return_entities:
+             a, b = self.tokenizer.replace(" " + z, return_entities=True)
+             return a.split(), b
+         res = self.tokenizer.replace(" " + z).split()
+         if to_lower:
+             res = [x.lower() for x in res]
+         return res
+
+     def word_newlined_tokenize(self, z):
+         sentences = self.sent_tokenize(z)
+         return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]
+
+     def sent_tokenize(self, z):
+         return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
+
+
241
+ t = Tokenizer(handle_http=True, handle_domains=False)
242
+ word_tokenize = t.word_tokenize
243
+ sent_tokenize = t.sent_tokenize
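
For reference, a minimal usage sketch of the module-level helpers defined at the bottom of this file, assuming tok.py is importable and the textsearch and contractions packages are installed. The tokens shown in the comments are illustrative of the rules above (contraction expansion, protected abbreviations, punctuation splitting), not guaranteed output.

    from tok import word_tokenize, sent_tokenize

    # word_tokenize splits a string into word tokens, expanding contractions
    # and protecting abbreviations such as "Dr." from being split on the period.
    print(word_tokenize("I've seen Dr. Smith!"))
    # e.g. ['I', 'have', 'seen', 'Dr.', 'Smith', '!']

    # sent_tokenize returns one token list per sentence, splitting on the
    # sentence-ending characters registered in add_base_cases().
    print(sent_tokenize("Hello there. How are you?"))
    # e.g. [['Hello', 'there', '.'], ['How', 'are', 'you', '?']]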