asynchronousai committed on
Commit
bcfa582
·
verified ·
1 Parent(s): b1111ce

Update tok.py

Browse files
Files changed (1) hide show
  1. tok.py +0 -48
tok.py CHANGED
@@ -1,38 +1,5 @@
1
  import string
2
  from textsearch import TextSearch
3
- from contractions import contractions_dict, leftovers_dict
4
-
5
- ABBREVS = (
6
- "a.m.",
7
- "adm.",
8
- "bros.",
9
- "co.",
10
- "corp.",
11
- "d.c.",
12
- "dr.",
13
- "e.g.",
14
- "gen.",
15
- "gov.",
16
- "i.e.",
17
- "inc.",
18
- "jr.",
19
- "ltd.",
20
- "md.",
21
- "messrs.",
22
- "mo.",
23
- "mont.",
24
- "mr.",
25
- "mrs.",
26
- "ms.",
27
- "p.m.",
28
- "ph.d.",
29
- "rep.",
30
- "rev.",
31
- "sen.",
32
- "st.",
33
- "vs.",
34
- )
35
-
36
 
37
  class Tokenizer:
38
  def __init__(
@@ -44,14 +11,8 @@ class Tokenizer:
44
  eol="\n",
45
  currencies=("$",),
46
  protected_words=None,
47
- contractions=True,
48
  language="en",
49
- abbrevs=ABBREVS,
50
  ):
51
- # set() set() should fallback to just using __iter__ of automaton for a speedboost
52
- if language != "en" and contractions:
53
- raise ValueError("No contractions known for languages other than English.")
54
- self.contractions = contractions
55
  self.tokenizer = None
56
  self.handle_http = handle_http
57
  self.handle_domains = handle_domains
@@ -60,7 +21,6 @@ class Tokenizer:
60
  self.eol = eol
61
  self.currencies = currencies or []
62
  self.protected_words = protected_words or []
63
- self.abbrevs = abbrevs
64
  self.explain_dict = {}
65
  self.setup()
66
 
@@ -77,14 +37,6 @@ class Tokenizer:
77
  ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
78
  if self.handle_domains:
79
  self.add_domain_handler()
80
- if self.contractions:
81
- if self.contractions == True:
82
- self.contractions = {}
83
- self.contractions.update(contractions_dict)
84
- self.contractions.update(leftovers_dict)
85
- self.add_words(self.contractions)
86
- if self.abbrevs:
87
- self.add_words(self.abbrevs)
88
 
89
  def add_words(self, words):
90
  words = words.items() if isinstance(words, dict) else words
 
1
  import string
2
  from textsearch import TextSearch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  class Tokenizer:
5
  def __init__(
 
11
  eol="\n",
12
  currencies=("$",),
13
  protected_words=None,
 
14
  language="en",
 
15
  ):
 
 
 
 
16
  self.tokenizer = None
17
  self.handle_http = handle_http
18
  self.handle_domains = handle_domains
 
21
  self.eol = eol
22
  self.currencies = currencies or []
23
  self.protected_words = protected_words or []
 
24
  self.explain_dict = {}
25
  self.setup()
26
 
 
37
  ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
38
  if self.handle_domains:
39
  self.add_domain_handler()
 
 
 
 
 
 
 
 
40
 
41
  def add_words(self, words):
42
  words = words.items() if isinstance(words, dict) else words