Spaces:
Sleeping
Sleeping
File size: 1,910 Bytes
7a8cb87 94f57b8 4d7bc75 e96c4ee 4d7bc75 f621a6c e96c4ee 4d7bc75 f621a6c e96c4ee 7a8cb87 e96c4ee 4d7bc75 f621a6c e96c4ee 7a8cb87 4d7bc75 f621a6c 4d7bc75 f621a6c e96c4ee f621a6c 4d7bc75 7a8cb87 4d7bc75 7a8cb87 4d7bc75 7a8cb87 f621a6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import gradio as gr
import io
import numpy as np
import ctypes
# Vector Loader
def load_vectors(fname):
    """Load whitespace-separated word vectors from a fastText-style .vec file.

    Each data line is expected to look like ``<token> <f1> <f2> ... <fN>``.
    NOTE(review): a fastText .vec file starts with a "<count> <dim>" header
    line; this loader stores it like any other entry (key "<count>") — the
    caller filters unknown tokens, so this matches the original behavior.

    Args:
        fname: path to the .vec file (UTF-8; undecodable bytes are ignored).

    Returns:
        dict mapping token (str) -> np.ndarray of floats.
    """
    data = {}
    # 'with' guarantees the file is closed even on error; the original
    # relied on `del fin`, which only closes via CPython refcounting.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            parts = line.rstrip().split(' ')
            data[parts[0]] = np.array([float(x) for x in parts[1:]])
    return data
# Load the full pretrained vocabulary into memory (wiki-news, 300-dim vectors).
vectors = load_vectors('wiki-news-300d-1M.vec')
# Pre-encode every vocabulary token to UTF-8 bytes for the C tokenizer below.
tokens = [token.encode('utf-8') for token in vectors.keys()]
# Tokenizer
# Native tokenizer library. Prototype:
#   char **tokenize(const char *text, char **vocab, int vocab_len, int *out_len)
# NOTE(review): lib.free_tokens (used in tokenize() below) has no
# argtypes/restype declared here — confirm its C signature matches the
# default ctypes conventions.
lib = ctypes.CDLL('./tokenizer.so')
lib.tokenize.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p), ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
def tokenize(text):
    """Split *text* into tokens using the native tokenizer library.

    The whole pre-encoded vocabulary (module-level ``tokens``) is passed to
    the C function on every call; the C side returns a char** array that we
    decode and then release via ``lib.free_tokens``.

    Args:
        text: input string to tokenize.

    Returns:
        list of token strings (UTF-8 decoded).
    """
    encoded = text.encode('utf-8')
    vocab_size = len(tokens)
    # Build a ctypes char* array view of the vocabulary for the C call.
    vocab_array = (ctypes.c_char_p * vocab_size)(*tokens)
    out_count = ctypes.c_int()
    raw = lib.tokenize(encoded, vocab_array, vocab_size, ctypes.byref(out_count))
    decoded = [raw[i].decode('utf-8') for i in range(out_count.value)]
    # Hand the C-allocated result array back to the library for freeing.
    lib.free_tokens(raw, out_count.value)
    return decoded
# Interface
def onInput(paragraph):
    """Gradio callback: embed *paragraph* as a 300-dim bag-of-words vector.

    Tokenizes the input with the native tokenizer, sums the pretrained
    vectors of every in-vocabulary token, then divides by the TOTAL token
    count (out-of-vocabulary tokens dilute the average).

    Fixes vs. original: removed the dead ``completion`` progress variable
    (computed each iteration, never used) and the local name ``tokens``
    that shadowed the module-level vocabulary list.

    Args:
        paragraph: raw user text from the Gradio textbox.

    Returns:
        list of 300 floats (all zeros when nothing tokenizes).
    """
    words = tokenize(paragraph)
    if not words:
        # No tokens found: keep the output shape stable with a zero vector.
        return np.zeros(300).tolist()
    merged_vector = np.zeros(300)  # wiki-news vectors are 300-dimensional
    for word in words:
        # Skip tokens with no pretrained vector.
        if word in vectors:
            merged_vector += vector if False else vectors[word]
    # Average over ALL tokens, matching the original behaviour.
    # NOTE(review): dividing by the number of *matched* tokens may be the
    # intended "normalization" — confirm before changing.
    merged_vector /= len(words)
    return merged_vector.tolist()  # Convert back to a plain list for output
# Wire the embedding callback into a simple text-in/text-out Gradio UI.
# (Also drops the stray trailing "|" scrape artifact, which is not valid Python.)
demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
demo.launch()