asynchronousai committed (verified)
Commit e96c4ee · 1 Parent(s): 24adeaa

Update app.py

Files changed (1):
  1. app.py +26 -12
app.py CHANGED

@@ -1,7 +1,8 @@
 import gradio as gr
 import io
 import numpy as np
-from tok import Tokenizer
+import ctypes
+
 
 # Vector Loader
 def load_vectors(fname):
@@ -11,20 +12,33 @@ def load_vectors(fname):
         tokens = line.rstrip().split(' ')
         data[tokens[0]] = np.array(list(map(float, tokens[1:]))) # Convert to NumPy array
     del fin
-    return data, sorted(data.keys(), key=len, reverse=True)
-vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')
+    return data
+vectors = load_vectors('wiki-news-300d-1M.vec')
+tokens = [token.encode('utf-8') for token in vectors.keys()]
 
 # Tokenizer
-tokenizer = Tokenizer(protected_words=sorted_vector)
+lib = ctypes.CDLL('./tokenizer.so')
+
+lib.tokenize.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
+lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
 def tokenize(text):
-    return tokenizer.word_tokenize(text)
+    text = text.encode('utf-8')
+    num_tokens = len(tokens)
+    tokens_array = (ctypes.c_char_p * num_tokens)(*tokens)
+
+    result_size = ctypes.c_int()
+
+    result = lib.tokenize(text, tokens_array, num_tokens, ctypes.byref(result_size))
+
+    python_tokens = [result[i].decode('utf-8') for i in range(result_size.value)]
+    lib.free_tokens(result, result_size.value)
+
+    return python_tokens
 
 # Interface
-def onInput(paragraph, progress = gr.Progress()):
-    progress(0, "Tokenizing...")
+def onInput(paragraph):
     tokens = tokenize(paragraph)
 
-    progress(0.1, "Initializing merged vector...")
     if not tokens: # Handle case with no tokens found
         return np.zeros(300).tolist() # Return a zero vector of appropriate dimension
 
@@ -33,17 +47,17 @@ def onInput(paragraph, progress = gr.Progress()):
     # Merge vectors using NumPy
     totalTokens = len(tokens)
     for ind, token in enumerate(tokens):
-        completion = 0.7*((ind+1)/totalTokens)
-        progress(0.1 + completion, f"Merging {token}, Token #{tokens.index(token)+1}/{len(tokens)}")
+        completion = 0.2*((ind+1)/totalTokens)
+
+        if token not in vectors:
+            continue
 
         vector = vectors[token]
         merged_vector += vector
 
     # Normalize
-    progress(0.9, "Normalizing...")
     merged_vector /= len(tokens)
 
-    progress(1, "Converting to list...")
     return merged_vector.tolist() # Convert back to list for output
 
 demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
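
The new tokenize() delegates to a compiled library, tokenizer.so, which is not included in this commit. Based only on the ctypes declarations above, a conforming library would export two functions, tokenize and free_tokens. The sketch below is a hypothetical, minimal stand-in that satisfies those signatures (it splits on whitespace and ignores the protected word list), not the actual library used by the app.

/*
 * Hypothetical counterpart for tokenizer.so (not part of this commit).
 * Only the exported names (tokenize, free_tokens) and the ctypes
 * signatures declared in app.py are known; this toy implementation
 * splits on whitespace, while the real library presumably matches
 * against the protected word list passed in `words`.
 *
 * Build sketch: gcc -shared -fPIC -o tokenizer.so tokenizer.c
 */
#include <stdlib.h>
#include <string.h>

static char *dup_str(const char *s)
{
    char *p = malloc(strlen(s) + 1);
    if (p)
        strcpy(p, s);
    return p;
}

/* Matches: lib.tokenize.argtypes = [c_char_p, POINTER(c_char_p), c_int, POINTER(c_int)]
 *          lib.tokenize.restype  = POINTER(c_char_p)                                   */
char **tokenize(const char *text, char **words, int num_words, int *result_size)
{
    (void)words;        /* the protected word list is ignored in this sketch */
    (void)num_words;

    size_t cap = 16, count = 0;
    char **out = malloc(cap * sizeof *out);
    char *copy = dup_str(text);

    for (char *tok = strtok(copy, " \t\r\n"); tok != NULL; tok = strtok(NULL, " \t\r\n")) {
        if (count == cap) {
            cap *= 2;
            out = realloc(out, cap * sizeof *out);
        }
        out[count++] = dup_str(tok);
    }

    free(copy);
    *result_size = (int)count;
    return out;         /* released by the caller through free_tokens() */
}

/* Matches the call lib.free_tokens(result, result_size.value) in app.py. */
void free_tokens(char **toks, int count)
{
    for (int i = 0; i < count; ++i)
        free(toks[i]);
    free(toks);
}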