Update app.py
app.py CHANGED
@@ -1,36 +1,7 @@
 import gradio as gr
 import io
 import numpy as np
-
-# Trie
-class TrieNode:
-    def __init__(self):
-        self.children = {}
-        self.is_end_of_token = False
-class Trie:
-    def __init__(self):
-        self.root = TrieNode()
-
-    def insert(self, token):
-        node = self.root
-        for char in token:
-            if char not in node.children:
-                node.children[char] = TrieNode()
-            node = node.children[char]
-        node.is_end_of_token = True
-
-    def search_longest_prefix(self, text, start):
-        node = self.root
-        longest_match = None
-        current_pos = start
-
-        while current_pos < len(text) and text[current_pos] in node.children:
-            node = node.children[text[current_pos]]
-            if node.is_end_of_token:
-                longest_match = current_pos
-            current_pos += 1
-
-        return longest_match
+from tok import Tokenizer
 
 # Vector Loader
 def load_vectors(fname):
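The removed hunk is a hand-rolled greedy longest-match tokenizer: a character trie over the vocabulary, plus a scan that repeatedly takes the longest known token starting at the current position and skips one character when nothing matches. As written it had two problems the rewrite sidesteps: tokenize() re-inserted the entire vocabulary into the trie on every call, and the trie global it used is never constructed anywhere in the hunks shown. A minimal self-contained sketch of the same algorithm, with nested dicts standing in for TrieNode and the trie built once up front (helper names here are illustrative, not from the repo):

# Sketch: the greedy longest-match tokenization the removed code implemented.
END = "$end$"  # marker key: a complete vocabulary token ends at this node

def build_trie(vocab):
    root = {}
    for token in vocab:
        node = root
        for char in token:
            node = node.setdefault(char, {})
        node[END] = True
    return root

def greedy_tokenize(trie, text):
    result, start = [], 0
    while start < len(text):
        node, end, pos = trie, None, start
        # walk as deep as the trie allows, remembering the last complete token
        while pos < len(text) and text[pos] in node:
            node = node[text[pos]]
            if END in node:
                end = pos  # inclusive index of the longest match so far
            pos += 1
        if end is not None:
            result.append(text[start:end + 1])
            start = end + 1
        else:
            start += 1  # no vocabulary token starts here; skip one character
    return result

trie = build_trie(["cat", "cater", "er"])
print(greedy_tokenize(trie, "caterpillar"))  # ['cater']; unmatched characters are dropped

Lookup cost per position is proportional to the length of the match rather than to vocabulary size, which is the point of the trie.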
@@ -44,23 +15,9 @@ def load_vectors(fname):
 vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')
 
 # Tokenizer
+tokenizer = Tokenizer(protected_words=sorted_vector)
 def tokenize(text):
-
-    for token in sorted_vector:
-        trie.insert(token)
-
-    result = []
-    start = 0
-
-    while start < len(text):
-        longest_match = trie.search_longest_prefix(text, start)
-        if longest_match is not None:
-            result.append(text[start:longest_match+1])
-            start = longest_match + 1
-        else:
-            start += 1
-
-    return result
+    return tokenizer.word_tokenize(text)
 
 # Interface
 def onInput(paragraph, progress = gr.Progress()):
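After the change, tokenization is delegated to the tok package, with the fastText vocabulary passed as protected_words so that known tokens survive as single units. A quick usage sketch of the rewritten tokenize() with a toy vocabulary standing in for sorted_vector (assuming tok is installed, e.g. pip install tok; the toy words and the expected behavior noted below are illustrative):

from tok import Tokenizer

# Toy stand-in for sorted_vector (the fastText vocabulary from load_vectors).
protected = ["state-of-the-art", "U.S."]
tokenizer = Tokenizer(protected_words=protected)

def tokenize(text):
    return tokenizer.word_tokenize(text)

# Protected words are expected to come through as single tokens
# instead of being split on their internal punctuation.
print(tokenize("A state-of-the-art model built in the U.S."))

Constructing the Tokenizer once at import time, as the diff does, also fixes the old per-call vocabulary rebuild.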