asynchronousai committed · verified
Commit 55601de · 1 Parent(s): 42e8afb

Update app.py

Files changed (1)
  1. app.py +3 -46
app.py CHANGED
@@ -1,36 +1,7 @@
 import gradio as gr
 import io
 import numpy as np
-
-# Trie
-class TrieNode:
-    def __init__(self):
-        self.children = {}
-        self.is_end_of_token = False
-class Trie:
-    def __init__(self):
-        self.root = TrieNode()
-
-    def insert(self, token):
-        node = self.root
-        for char in token:
-            if char not in node.children:
-                node.children[char] = TrieNode()
-            node = node.children[char]
-        node.is_end_of_token = True
-
-    def search_longest_prefix(self, text, start):
-        node = self.root
-        longest_match = None
-        current_pos = start
-
-        while current_pos < len(text) and text[current_pos] in node.children:
-            node = node.children[text[current_pos]]
-            if node.is_end_of_token:
-                longest_match = current_pos
-            current_pos += 1
-
-        return longest_match
+from tok import Tokenizer

 # Vector Loader
 def load_vectors(fname):
@@ -44,23 +15,9 @@ def load_vectors(fname):
 vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')

 # Tokenizer
+tokenizer = Tokenizer(protected_words=sorted_vector)
 def tokenize(text):
-    trie = Trie()
-    for token in sorted_vector:
-        trie.insert(token)
-
-    result = []
-    start = 0
-
-    while start < len(text):
-        longest_match = trie.search_longest_prefix(text, start)
-        if longest_match is not None:
-            result.append(text[start:longest_match+1])
-            start = longest_match + 1
-        else:
-            start += 1
-
-    return result
+    return tokenizer.word_tokenize(text)

 # Interface
 def onInput(paragraph, progress = gr.Progress()):
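
The commit swaps the hand-rolled Trie (greedy longest-prefix matching, rebuilt on every tokenize() call) for the tok package's Tokenizer, constructed once at module level with the fastText vocabulary passed as protected_words. Below is a minimal sketch of the new code path, assuming tok is installed (pip install tok) and using a small hypothetical word list in place of sorted_vector; only the Tokenizer(protected_words=...) constructor and word_tokenize call shown in the diff are relied on.

    from tok import Tokenizer

    # Hypothetical stand-in for sorted_vector, which app.py loads from
    # wiki-news-300d-1M.vec; any iterable of vocabulary tokens works here.
    vocab = ["don't", "can't", "state-of-the-art"]

    # Build the tokenizer once, as the commit does at module level,
    # rather than rebuilding a trie on every call.
    tokenizer = Tokenizer(protected_words=vocab)

    def tokenize(text):
        # Same one-liner the commit introduces: delegate splitting to tok.
        return tokenizer.word_tokenize(text)

    print(tokenize("It can't be state-of-the-art, can it?"))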