import gradio as gr
import numpy as np
import json
import pickle as pkl
import re

from transformers import AutoTokenizer

# Vector loader: maps words to their embedding vectors.
vectors = pkl.load(open("vectors.pkl", "rb"))
vocab = {word.lower() for word in vectors.keys()}  # set for O(1) membership checks

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

VECTOR_DIM = 300  # assumed dimensionality of the stored vectors


def make_alphanumeric(input_string):
    """Strip everything except letters, digits, and spaces."""
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)


def tokenize(text):
    # Validate input; gr.Error is an exception and must be raised,
    # not just constructed, to surface in the UI.
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")

    # Normalize and filter the text.
    text = make_alphanumeric(text.lower())

    # Use the tokenizer's pre-tokenizer to split the text into words.
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]

    # Keep only words we have vectors for.
    return [word for word in pre_tokenized_text if word in vocab]


# Interface
def onInput(paragraph, progress=gr.Progress()):
    tokens = tokenize(paragraph)
    if not tokens:
        # No known tokens: return a zero vector and an empty token list,
        # matching the two outputs the interface expects.
        return np.zeros(VECTOR_DIM).tolist(), json.dumps([])

    merged_vector = np.zeros(VECTOR_DIM)
    total_tokens = len(tokens)
    merged_count = 0

    # Sum the vectors of all recognized tokens.
    for ind, token in enumerate(tokens):
        completion = 0.2 * ((ind + 1) / total_tokens)
        # Use the loop index rather than tokens.index(), which is O(n)
        # and wrong for duplicate tokens.
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{total_tokens}")
        if token not in vectors:
            continue
        merged_vector += vectors[token]
        merged_count += 1

    # Normalize: average over the vectors actually merged,
    # not over all tokens, some of which may have been skipped.
    if merged_count:
        merged_vector /= merged_count

    return merged_vector.tolist(), json.dumps(tokens)


demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()
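# --- Usage sketch (hypothetical) ---
# A minimal illustration of what the averaged paragraph vector enables:
# comparing two texts by cosine similarity. The sample sentences are
# placeholders, and this assumes "vectors.pkl" holds plain NumPy arrays.
# Since demo.launch() blocks, run a check like this before launching:
#
#   toks_a = tokenize("cats are small furry animals")
#   toks_b = tokenize("dogs are loyal furry pets")
#   vec_a = np.mean([vectors[t] for t in toks_a if t in vectors], axis=0)
#   vec_b = np.mean([vectors[t] for t in toks_b if t in vectors], axis=0)
#   sim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-9)
#   print(f"Cosine similarity: {sim:.3f}")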