import io

import gradio as gr
import numpy as np
from tok import Tokenizer

VECTOR_DIM = 300  # wiki-news-300d-1M.vec contains 300-dimensional vectors


# Vector loader
def load_vectors(fname):
    """Load fastText-style .vec embeddings into a dict of word -> NumPy vector."""
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if len(tokens) == 2:
                continue  # skip the "word_count dimension" header line of .vec files
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))  # convert to NumPy array
    # Sort longest-first so longer vocabulary entries take precedence as protected words
    return data, sorted(data.keys(), key=len, reverse=True)


vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')

# Tokenizer: protect known vocabulary entries so tokenization keeps them intact
tokenizer = Tokenizer(protected_words=sorted_vector)


def tokenize(text):
    return tokenizer.word_tokenize(text)


# Interface
def onInput(paragraph, progress=gr.Progress()):
    progress(0, desc="Tokenizing...")
    tokens = tokenize(paragraph)

    progress(0.1, desc="Initializing merged vector...")
    if not tokens:
        # No tokens found: return a zero vector of the embedding dimension
        return np.zeros(VECTOR_DIM).tolist()

    merged_vector = np.zeros(VECTOR_DIM)

    # Merge vectors using NumPy
    totalTokens = len(tokens)
    known = 0
    for ind, token in enumerate(tokens):
        completion = 0.7 * ((ind + 1) / totalTokens)
        progress(0.1 + completion, desc=f"Merging {token}, token #{ind + 1}/{totalTokens}")
        if token in vectors:  # skip out-of-vocabulary tokens instead of raising KeyError
            merged_vector += vectors[token]
            known += 1

    # Average the summed vectors
    progress(0.9, desc="Averaging...")
    if known:
        merged_vector /= known

    progress(1, desc="Converting to list...")
    return merged_vector.tolist()  # convert back to a list for output


demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
demo.launch()
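
# --- Usage sketch (assumption, not part of the app itself) ---
# With the app running, the averaged vector can be fetched programmatically
# via gradio_client. The local address and the auto-generated "/predict"
# endpoint below are assumptions for a default local launch; adjust them
# for your deployment before uncommenting.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict("the quick brown fox", api_name="/predict")
#   print(result)  # stringified 300-dimensional averaged vector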