asynchronousai committed
Commit 8d5a24c · verified · 1 Parent(s): cbb233c

Update app.py

Files changed (1): app.py (+30, -32)
app.py CHANGED
@@ -1,42 +1,39 @@
  import gradio as gr
- import io
  import numpy as np
- import ctypes
-
-
- # Vector Loader
- def load_vectors(fname):
-     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
-     data = {}
-     for line in fin:
-         tokens = line.rstrip().split(' ')
-         data[tokens[0]] = np.array(list(map(float, tokens[1:]))) # Convert to NumPy array
-     del fin
-     return data
- vectors = load_vectors('wiki-news-300d-1M.vec')
- tokens = [token.encode('utf-8') for token in vectors.keys()]
+ import json
+ import pickle as pkl
+ from transformers import AutoTokenizer
+ import re
+ # Vector Loader
+ vectors = pkl.load(open("vectors.pkl", "rb"))
+ vocab = [word.lower() for word in vectors.keys()]

  # Tokenizer
- lib = ctypes.CDLL('./tokenizer.so')
- lib.tokenize.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
- lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+ def make_alphanumeric(input_string):
+     return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)

  def tokenize(text):
-     text = text.encode('utf-8')
-     num_tokens = len(tokens)
-     tokens_array = (ctypes.c_char_p * num_tokens)(*tokens)
-
-     result_size = ctypes.c_int()
-
-     result = lib.tokenize(text, tokens_array, num_tokens, ctypes.byref(result_size))
-
-     python_tokens = [result[i].decode('utf-8') for i in range(result_size.value)]
-     lib.free_tokens(result, result_size.value)
-     return python_tokens
+     # Check data
+     if len(text) == 0:
+         gr.Error("No text provided.")
+     elif len(text) > 4096:
+         gr.Error("Text too long.")
+
+     # Filter
+     text = make_alphanumeric(text.lower())
+     pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
+     pre_tokenized_text = [word for word, offset in pre_tokenize_result]
+
+     tokens = []
+     for word in pre_tokenized_text:
+         if word in vocab:
+             tokens.append(word)
+     return tokens

  # Interface
- def onInput(paragraph):
+ def onInput(paragraph, progress = gr.Progress()):
      tokens = tokenize(paragraph)

      if not tokens: # Handle case with no tokens found
@@ -48,6 +45,7 @@ def onInput(paragraph):
      totalTokens = len(tokens)
      for ind, token in enumerate(tokens):
          completion = 0.2*((ind+1)/totalTokens)
+         progress(0.6 + completion, f"Merging {token}, Token #{tokens.index(token)+1}/{len(tokens)}")

          if token not in vectors:
              continue
@@ -58,7 +56,7 @@
      # Normalize
      merged_vector /= len(tokens)

-     return merged_vector.tolist() # Convert back to list for output
+     return merged_vector.tolist(), json.dumps(tokens)

- demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
+ demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
  demo.launch()
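
Note: the commit drops the runtime parsing of wiki-news-300d-1M.vec in favor of unpickling a prebuilt vectors.pkl. That pickle is not produced anywhere in this diff, so the following is only a sketch of an offline step that could create it, reusing the logic of the removed load_vectors() helper; the float32 dtype and skipping the .vec header line are assumptions, not part of the commit.

# Hypothetical offline script (not in this commit): build vectors.pkl from the
# FastText wiki-news-300d-1M.vec file, mirroring the removed load_vectors() helper.
import io
import pickle as pkl
import numpy as np

def load_vectors(fname):
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        next(fin)  # assumption: skip the ".vec" header line ("<num_words> <dim>")
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])), dtype=np.float32)
    return data

if __name__ == "__main__":
    vectors = load_vectors('wiki-news-300d-1M.vec')
    with open('vectors.pkl', 'wb') as f:
        pkl.dump(vectors, f)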
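
The new tokenize() also swaps the ctypes tokenizer.so library for the fast BERT tokenizer's pre-tokenizer plus a vocabulary filter. Below is a minimal standalone sketch of that path, with a stand-in vocabulary in place of the keys of vectors.pkl; it uses backend_tokenizer, the public alias for the _tokenizer attribute referenced in the commit, and assumes bert-base-cased can be downloaded.

# Minimal sketch of the new pre-tokenization + vocabulary filter, with stand-in data.
import re
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
vocab = {"hello", "world"}  # stand-in for [word.lower() for word in vectors.keys()]

def make_alphanumeric(input_string):
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)

text = make_alphanumeric("Hello, world! 123".lower())  # -> "hello world 123"
# pre_tokenize_str splits on whitespace/punctuation and returns (word, (start, end)) pairs,
# e.g. [('hello', (0, 5)), ('world', (6, 11)), ('123', (12, 15))].
pairs = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
tokens = [word for word, offset in pairs if word in vocab]
print(tokens)  # ['hello', 'world'] ('123' is dropped: not in the vocabulary)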