import gradio as gr
import numpy as np
import json
import pickle as pkl
from transformers import AutoTokenizer
import re

# Vector Loader
vectors = pkl.load(open("vectors.pkl", "rb"))
vocab = {word.lower() for word in vectors.keys()}  # set for fast membership checks
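# The contents of vectors.pkl are not defined in this file; the code below assumes
# a dict mapping words to 300-dimensional NumPy arrays, e.g. (hypothetical):
#   vectors = {"cat": np.zeros(300), "dog": np.zeros(300), ...}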

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_alphanumeric(input_string):
    # Strip everything except letters, digits, and spaces
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)
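# Example: make_alphanumeric("Hello, world!") returns "Hello world"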

def tokenize(text):
    # Validate input length before doing any work
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")

    # Filter to lowercase alphanumeric text, then split it with BERT's pre-tokenizer
    text = make_alphanumeric(text.lower())
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]

    # Keep only words that appear in the vocabulary
    tokens = [word for word in pre_tokenized_text if word in vocab]
    return tokens
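# Example (assuming "the", "cat", and "sat" all have entries in vectors.pkl):
#   tokenize("The cat sat!")  ->  ["the", "cat", "sat"]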

# Interface: average the vectors of all recognized tokens into a single
# bag-of-words paragraph embedding.
def onInput(paragraph, progress=gr.Progress()):
    tokens = tokenize(paragraph)
    if not tokens:  # No known tokens found
        return np.zeros(300).tolist(), json.dumps([])  # Zero vector of the expected dimension

    merged_vector = np.zeros(300)  # Assuming vectors are 300-dimensional

    # Merge vectors using NumPy
    totalTokens = len(tokens)
    for ind, token in enumerate(tokens):
        completion = 0.2 * ((ind + 1) / totalTokens)
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{totalTokens}")
        if token not in vectors:
            continue
        merged_vector += vectors[token]

    # Normalize by the token count (a simple average)
    merged_vector /= totalTokens
    return merged_vector.tolist(), json.dumps(tokens)

demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()
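# demo.launch() serves the app locally (by default at http://127.0.0.1:7860);
# launch(share=True) would additionally create a temporary public link.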