import gradio as gr
import numpy as np
import json
import pickle as pkl
from transformers import AutoTokenizer
import re

# Vector Loader
vectors = pkl.load(open("vectors.pkl", "rb"))
vocab = [word.lower() for word in vectors.keys()]
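
# Sanity check (illustrative addition, not from the original Space): the rest of
# the app assumes every entry in vectors.pkl is a 300-dimensional word vector,
# so verify that assumption once at startup.
if vectors:
    _sample = np.asarray(next(iter(vectors.values())))
    assert _sample.shape == (300,), f"Expected 300-dim vectors, got shape {_sample.shape}"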

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_alphanumeric(input_string):
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)

def tokenize(text):
    # Check data
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")
    # Filter to lowercase alphanumeric text
    text = make_alphanumeric(text.lower())
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    # Keep only words present in the loaded vocabulary
    tokens = []
    for word in pre_tokenized_text:
        if word in vocab:
            tokens.append(word)
    return tokens
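
# Illustrative behaviour (assumes "hello" and "world" are keys in vectors.pkl):
# punctuation is stripped and the text lowercased, so tokenize("Hello, world!")
# would return ["hello", "world"]; out-of-vocabulary words are dropped.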

# Interface
def onInput(paragraph, progress=gr.Progress()):
    tokens = tokenize(paragraph)
    if not tokens:  # Handle case with no tokens found
        # Return a zero vector of the expected dimension and an empty token list
        return np.zeros(300).tolist(), json.dumps([])
    merged_vector = np.zeros(300)  # Assuming vectors are 300-dimensional
    # Merge vectors using NumPy
    totalTokens = len(tokens)
    for ind, token in enumerate(tokens):
        completion = 0.2 * ((ind + 1) / totalTokens)
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{totalTokens}")
        if token not in vectors:
            continue
        vector = vectors[token]
        merged_vector += vector
    # Normalize: average the summed word vectors over the number of tokens
    merged_vector /= totalTokens
    return merged_vector.tolist(), json.dumps(tokens)

demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()
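
# Example client call (sketch, not part of the original Space): with the app
# running locally on Gradio's default port, the endpoint can be queried through
# gradio_client. The URL and api_name below are assumptions for a default launch.
#
# from gradio_client import Client
# client = Client("http://127.0.0.1:7860")
# vector_text, tokens_json = client.predict("The quick brown fox", api_name="/predict")
# print(tokens_json)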