import gradio as gr
import numpy as np
import json
import pickle as pkl
from transformers import AutoTokenizer
import re

# Load the pre-trained word vectors: a dict mapping words to 300-dimensional arrays.
# Build a lowercase vocabulary set for fast membership checks.
vectors = pkl.load(open("vectors.pkl", "rb"))
vocab = {word.lower() for word in vectors.keys()}

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_alphanumeric(input_string):
    # Keep only letters, digits, and spaces.
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)

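# Example with a hypothetical input: make_alphanumeric("Hello, world!") -> "Hello world"
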
def tokenize(text):
    # Validate the input before tokenizing.
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")

    # Lowercase, strip punctuation, then split into words with BERT's pre-tokenizer.
    text = make_alphanumeric(text.lower())
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]

    # Keep only words that appear in the vector vocabulary.
    tokens = []
    for word in pre_tokenized_text:
        if word in vocab:
            tokens.append(word)
    return tokens

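# Rough usage sketch (assumes the words below are in the loaded vocabulary):
#   tokenize("A quick fox!") -> ["a", "quick", "fox"]   # out-of-vocabulary words are dropped
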
def onInput(paragraph, progress=gr.Progress()):
    tokens = tokenize(paragraph)

    # No in-vocabulary tokens: return a zero vector and an empty token list.
    if not tokens:
        return np.zeros(300).tolist(), json.dumps([])

    merged_vector = np.zeros(300)

    totalTokens = len(tokens)
    for ind, token in enumerate(tokens):
        # Report progress for the merge step.
        completion = 0.2 * ((ind + 1) / totalTokens)
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{totalTokens}")

        # Skip tokens that have no stored vector.
        if token not in vectors:
            continue

        merged_vector += vectors[token]

    # Average the summed vectors over the number of tokens.
    merged_vector /= totalTokens

    return merged_vector.tolist(), json.dumps(tokens)

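# onInput returns a pair: the averaged 300-dimensional vector (as a list) and a JSON
# string of the merged tokens, matching the two Interface outputs below.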
demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()
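# Note: depending on the Gradio version, progress bars (gr.Progress) may require the
# queue to be enabled, e.g. demo.queue() before demo.launch().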