"""Gradio demo that maps a paragraph to the mean of its word vectors.

On import this module loads the word vectors from
``wiki-news-300d-1M.vec``, loads the native tokenizer from
``./tokenizer.so`` and launches the Gradio interface.
"""

import ctypes
import io

import gradio as gr
import numpy as np

# Length of the zero vector that onInput starts from and returns when
# the tokenizer yields nothing.
VECTOR_DIM = 300


# Vector Loader
def _is_header(fields):
    """Return True when *fields* look like a ``<count> <dim>`` header line."""
    if len(fields) != 2:
        return False
    try:
        int(fields[0])
        int(fields[1])
    except ValueError:
        return False
    return True


def load_vectors(fname):
    """Load a text word-vector file into a ``{word: np.ndarray}`` dict.

    Each data line is split on single spaces: the first field is the word
    and the remaining fields are parsed as floats.

    Args:
        fname: Path of the vector file, read as UTF-8 with undecodable
            bytes ignored.

    Returns:
        A dict mapping each word to a NumPy array of its components.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component on a data line is not a valid float.
    """
    data = {}
    # A context manager guarantees the handle is closed, even on a parse error.
    with io.open(fname, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        for line_number, line in enumerate(fin):
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise register '' with an empty vector.
                continue
            fields = stripped.split(' ')
            if line_number == 0 and _is_header(fields):
                # fastText .vec files start with "<count> <dim>"; storing it
                # as a word would add a bogus 1-element vector to the table.
                continue
            data[fields[0]] = np.array([float(value) for value in fields[1:]])
    return data


vectors = load_vectors('wiki-news-300d-1M.vec')
tokens = [token.encode('utf-8') for token in vectors.keys()]

# Tokenizer
lib = ctypes.CDLL('./tokenizer.so')
lib.tokenize.argtypes = [
    ctypes.c_char_p,
    ctypes.POINTER(ctypes.c_char_p),
    ctypes.c_int,
    ctypes.POINTER(ctypes.c_int),
]
lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
# Declared to match how free_tokens is called below.
# NOTE(review): the return value is ignored; confirm the C signature.
lib.free_tokens.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_int]
lib.free_tokens.restype = None

# Built once instead of on every request: the vocabulary never changes after
# load, and unpacking it into a ctypes array is proportional to its size.
# NOTE(review): assumes lib.tokenize does not modify this array; confirm
# against the tokenizer source.
_VOCAB_SIZE = len(tokens)
_VOCAB_ARRAY = (ctypes.c_char_p * _VOCAB_SIZE)(*tokens)


def tokenize(text):
    """Split *text* into tokens using the native tokenizer.

    The text is UTF-8 encoded and handed to ``lib.tokenize`` together with
    the vocabulary array; the returned C strings are decoded as UTF-8 and
    the native result is released with ``lib.free_tokens``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings, in the order the library returned them.
    """
    result_size = ctypes.c_int()
    result = lib.tokenize(text.encode('utf-8'), _VOCAB_ARRAY, _VOCAB_SIZE,
                          ctypes.byref(result_size))
    if not result:
        # NULL pointer: nothing to read and nothing to free.
        return []
    try:
        return [result[i].decode('utf-8') for i in range(result_size.value)]
    finally:
        # Release the native buffer even if decoding raises.
        lib.free_tokens(result, result_size.value)


# Interface
def onInput(paragraph):
    """Return the averaged word vector of *paragraph* as a list of floats.

    Vectors of all tokens present in ``vectors`` are summed and the sum is
    divided by the total number of tokens, so tokens without a vector still
    count towards the divisor.

    Args:
        paragraph: The input text.

    Returns:
        A list of ``VECTOR_DIM`` floats; all zeros when no tokens are found.
    """
    found_tokens = tokenize(paragraph)
    if not found_tokens:
        # Handle case with no tokens found.
        return np.zeros(VECTOR_DIM).tolist()

    merged_vector = np.zeros(VECTOR_DIM)
    for token in found_tokens:
        vector = vectors.get(token)
        if vector is not None:
            merged_vector += vector

    # Normalize by the number of tokens (including those without a vector).
    merged_vector /= len(found_tokens)

    return merged_vector.tolist()  # Convert back to list for output


demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
demo.launch()