Spaces:
Sleeping
Sleeping
import gradio as gr | |
import io | |
import numpy as np | |
from tok import Tokenizer | |
# Vector Loader | |
def load_vectors(fname):
    """Load word vectors from a space-delimited text file.

    Each data line holds a word followed by its space-separated float
    components. A leading fastText-style header line (two integers: the
    vocabulary size and the dimension) is skipped instead of being stored
    as a word, and blank lines are ignored.

    Args:
        fname: Path of the vector file to read.

    Returns:
        A tuple ``(data, words)``: ``data`` maps each word to a NumPy array
        of its components, and ``words`` lists the words longest-first.
    """
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_no, line in enumerate(fin):
            stripped = line.rstrip()
            if not stripped:
                continue
            tokens = stripped.split(' ')
            # "<count> <dim>" on the first line is file metadata, not a word.
            if line_no == 0 and len(tokens) == 2 and all(t.isdigit() for t in tokens):
                continue
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]], dtype=float)
    return data, sorted(data, key=len, reverse=True)
# Load the embedding table once at import time; the second value is the
# vocabulary ordered longest-first.
vectors, sorted_vector = load_vectors(fname='wiki-news-300d-1M.vec')

# Tokenizer: the vocabulary is handed over as its protected words.
tokenizer = Tokenizer(
    protected_words=sorted_vector,
)
def tokenize(text):
    """Return the tokens the module-level tokenizer produces for *text*."""
    words = tokenizer.word_tokenize(text)
    return words
# Interface | |
def onInput(paragraph, progress = gr.Progress()):
    """Average the word vectors of the tokens found in *paragraph*.

    Tokens with no usable entry in ``vectors`` are skipped, so unknown
    words no longer abort the request with ``KeyError``; the mean is taken
    over the tokens that were actually merged.

    Args:
        paragraph: Text to embed.
        progress: Gradio progress tracker, called with a completion
            fraction and a status message.

    Returns:
        A list of 300 floats: the mean vector, or all zeros when no token
        could be merged.
    """
    progress(0, "Tokenizing...")
    tokens = tokenize(paragraph)
    progress(0.1, "Initializing merged vector...")
    merged_vector = np.zeros(300)  # Assuming vectors are 300-dimensional
    if not tokens:  # Nothing to merge: return the zero vector
        return merged_vector.tolist()

    totalTokens = len(tokens)
    merged_count = 0
    for ind, token in enumerate(tokens, start=1):
        # Use the loop position: list.index() is O(n) per call and reports
        # the first occurrence for repeated tokens.
        progress(0.1 + 0.7 * (ind / totalTokens), f"Merging {token}, Token #{ind}/{totalTokens}")
        vector = vectors.get(token)
        # Skip out-of-vocabulary tokens and entries whose size differs from
        # the accumulator (they would broadcast into or break the sum).
        if vector is None or vector.shape != merged_vector.shape:
            continue
        merged_vector += vector
        merged_count += 1

    # Normalize over the tokens that contributed a vector
    progress(0.9, "Normalizing...")
    if merged_count:
        merged_vector /= merged_count
    progress(1, "Converting to list...")
    return merged_vector.tolist()  # Convert back to list for output
# Build the Gradio interface around onInput (text in, text out) and start it.
demo = gr.Interface(
    fn=onInput,
    inputs="text",
    outputs="text",
)
demo.launch()