Update app.py
app.py CHANGED
@@ -1,7 +1,8 @@
 import gradio as gr
 import io
 import numpy as np
-
+import ctypes
+
 
 # Vector Loader
 def load_vectors(fname):
@@ -11,20 +12,33 @@ def load_vectors(fname):
         tokens = line.rstrip().split(' ')
         data[tokens[0]] = np.array(list(map(float, tokens[1:]))) # Convert to NumPy array
     del fin
-    return data
-vectors …
+    return data
+vectors = load_vectors('wiki-news-300d-1M.vec')
+tokens = [token.encode('utf-8') for token in vectors.keys()]
 
 # Tokenizer
-…
+lib = ctypes.CDLL('./tokenizer.so')
+
+lib.tokenize.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
+lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
 def tokenize(text):
-…
+    text = text.encode('utf-8')
+    num_tokens = len(tokens)
+    tokens_array = (ctypes.c_char_p * num_tokens)(*tokens)
+
+    result_size = ctypes.c_int()
+
+    result = lib.tokenize(text, tokens_array, num_tokens, ctypes.byref(result_size))
+
+    python_tokens = [result[i].decode('utf-8') for i in range(result_size.value)]
+    lib.free_tokens(result, result_size.value)
+
+    return python_tokens
 
 # Interface
-def onInput(paragraph, progress = gr.Progress()):
-    progress(0, "Tokenizing...")
+def onInput(paragraph):
     tokens = tokenize(paragraph)
 
-    progress(0.1, "Initializing merged vector...")
     if not tokens: # Handle case with no tokens found
         return np.zeros(300).tolist() # Return a zero vector of appropriate dimension
 
@@ -33,17 +47,17 @@ def onInput(paragraph, progress = gr.Progress()):
     # Merge vectors using NumPy
     totalTokens = len(tokens)
     for ind, token in enumerate(tokens):
-        completion = 0.…
-        …
+        completion = 0.2*((ind+1)/totalTokens)
+
+        if token not in vectors:
+            continue
 
         vector = vectors[token]
         merged_vector += vector
 
     # Normalize
-    progress(0.9, "Normalizing...")
     merged_vector /= len(tokens)
 
-    progress(1, "Converting to list...")
    return merged_vector.tolist() # Convert back to list for output
 
 demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
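
Note on the ctypes binding added above: the commit declares argtypes and restype for lib.tokenize, but calls lib.free_tokens without declaring its signature, so ctypes falls back to its default argument handling. A minimal sketch of the declaration the call site implies, assuming tokenizer.so exports something like free_tokens(char **tokens, int count) — this signature is an inference from the call, not part of the commit:

    # Assumed signature, inferred from the call
    # lib.free_tokens(result, result_size.value) in the commit above.
    lib.free_tokens.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_int]
    lib.free_tokens.restype = None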
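
For a quick local check of the new pipeline, a hypothetical smoke test (it assumes tokenizer.so and the fastText-style wiki-news-300d-1M.vec sit next to app.py, matching the paths used in the commit; on Spaces the demo is served automatically, so the explicit launch() is only for local runs):

    if __name__ == "__main__":
        vec = onInput("hello world")  # average of the tokens' word vectors
        print(len(vec))               # expect 300, the embedding dimension
        demo.launch()                 # serve the Gradio interface locally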