Spaces:
Sleeping
Sleeping
File size: 1,638 Bytes
7a8cb87 94f57b8 4d7bc75 55601de 4d7bc75 f621a6c 4d7bc75 f621a6c 55601de 7a8cb87 55601de 4d7bc75 f621a6c 7a8cb87 4d7bc75 f621a6c 4d7bc75 f621a6c 4d7bc75 f621a6c 4d7bc75 7a8cb87 f621a6c 4d7bc75 7a8cb87 f621a6c 4d7bc75 7a8cb87 f621a6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import gradio as gr
import io
import numpy as np
from tok import Tokenizer
# Vector Loader
def load_vectors(fname):
    """Load fastText-style word vectors from *fname*.

    Each line is expected to be a word followed by its space-separated
    float components.

    Args:
        fname: Path to the ``.vec`` file.

    Returns:
        tuple: ``(data, words)`` where ``data`` maps word -> ``np.ndarray``
        vector and ``words`` is the vocabulary sorted longest-first.
    """
    # NOTE(review): official fastText .vec files start with a "count dim"
    # header line; this loader ingests it as a fake one-component entry
    # (as the original did) — confirm whether it should be skipped.
    data = {}
    # `with` guarantees the handle is closed; the original relied on
    # `del fin`, which only closes via CPython refcounting.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if not tokens or not tokens[0]:
                continue  # skip blank/malformed lines defensively
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    # Longest-first ordering so multi-character entries take precedence
    # when used as protected words downstream.
    return data, sorted(data.keys(), key=len, reverse=True)
# Load the 300-d fastText vectors once at import time; `sorted_vector`
# lists the vocabulary longest-word-first.
vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')
# Tokenizer
# Feed the whole vocabulary as protected words.
# NOTE(review): presumably `tok.Tokenizer` keeps protected_words intact
# as single tokens — confirm against the tok package documentation.
tokenizer = Tokenizer(protected_words=sorted_vector)
def tokenize(text):
    """Word-tokenize *text* with the module-level vocabulary-aware tokenizer."""
    words = tokenizer.word_tokenize(text)
    return words
# Interface
def onInput(paragraph, progress = gr.Progress()):
    """Embed *paragraph* as the mean of its tokens' word vectors.

    Args:
        paragraph: Input text to embed.
        progress: Gradio progress tracker (default injected by gradio).

    Returns:
        list[float]: 300-dimensional mean vector (all zeros when the
        text yields no tokens or no token is in the vocabulary).
    """
    progress(0, "Tokenizing...")
    tokens = tokenize(paragraph)
    progress(0.1, "Initializing merged vector...")
    if not tokens:  # Handle case with no tokens found
        return np.zeros(300).tolist()  # Zero vector of appropriate dimension
    merged_vector = np.zeros(300)  # Vectors are 300-dimensional
    totalTokens = len(tokens)
    merged_count = 0  # How many tokens actually contributed a vector
    for ind, token in enumerate(tokens):
        completion = 0.7*((ind+1)/totalTokens)
        # Use the loop index: the original `tokens.index(token)` was O(n)
        # per iteration and reported the FIRST occurrence for duplicates.
        progress(0.1 + completion, f"Merging {token}, Token #{ind+1}/{totalTokens}")
        vector = vectors.get(token)
        if vector is None:
            continue  # Out-of-vocabulary token: skip instead of KeyError
        merged_vector += vector
        merged_count += 1
    # Normalize by the number of vectors actually merged (equals
    # len(tokens) when every token is in-vocabulary, as before).
    progress(0.9, "Normalizing...")
    if merged_count:
        merged_vector /= merged_count
    progress(1, "Converting to list...")
    return merged_vector.tolist()  # Convert back to list for output
# Minimal Gradio UI: paragraph in, averaged embedding out (rendered as text).
demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
demo.launch()