Spaces:
Sleeping
Sleeping
File size: 1,808 Bytes
7a8cb87 4d7bc75 8d5a24c 4d7bc75 f621a6c 8d5a24c e96c4ee 7a8cb87 8d5a24c e96c4ee 4d7bc75 f621a6c 8d5a24c 7a8cb87 4d7bc75 f621a6c 4d7bc75 f621a6c e96c4ee 8d5a24c e96c4ee f621a6c 4d7bc75 7a8cb87 4d7bc75 7a8cb87 8d5a24c 7a8cb87 8d5a24c f621a6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import gradio as gr
import numpy as np
import json
import pickle as pkl
from transformers import AutoTokenizer
import re
# --- Model assets ---------------------------------------------------------
# Word-embedding lookup table: {word: vector}.
# NOTE(review): pickle.load is unsafe on untrusted files — only ship a
# trusted vectors.pkl with this Space.
with open("vectors.pkl", "rb") as f:  # context manager: no leaked handle
    vectors = pkl.load(f)
# Lower-cased vocabulary as a set: O(1) membership tests in tokenize()
# instead of the original list's O(n) scan per word.
vocab = {word.lower() for word in vectors}
# Tokenizer — only its pre-tokenizer (whitespace/punctuation split) is used.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def make_alphanumeric(input_string):
    """Return *input_string* with every character that is not a letter,
    digit, or space removed."""
    non_alnum = r'[^a-zA-Z0-9 ]'
    return re.sub(non_alnum, '', input_string)
def tokenize(text):
    """Normalize *text* and return the in-vocabulary word tokens.

    Lower-cases, strips non-alphanumerics, pre-tokenizes with the BERT
    pre-tokenizer, and keeps only words present in the loaded vocabulary.

    Raises:
        gr.Error: if *text* is empty or longer than 4096 characters.
    """
    # Validate input. gr.Error must be *raised* to surface in the Gradio UI;
    # the original merely constructed the exception, which did nothing.
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")
    # Filter: lower-case and drop punctuation before pre-tokenization.
    text = make_alphanumeric(text.lower())
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    # Build the membership set once: O(1) lookups instead of an O(n) list
    # scan for every word. (No-op if vocab is already a set.)
    known = set(vocab)
    return [word for word in pre_tokenized_text if word in known]
# Interface
def onInput(paragraph, progress = gr.Progress()):
    """Gradio handler: embed *paragraph* as the mean of its token vectors.

    Returns:
        (vector_as_list, tokens_as_json) — always a 2-tuple so both
        interface outputs are populated on every code path.
    """
    tokens = tokenize(paragraph)
    if not tokens:
        # Bug fix: the original returned a single value here while the
        # normal path returns two, breaking the 2-output interface.
        return np.zeros(300).tolist(), json.dumps([])
    merged_vector = np.zeros(300)  # assumes 300-d embeddings — TODO confirm
    totalTokens = len(tokens)
    matched = 0  # how many tokens actually contributed a vector
    for ind, token in enumerate(tokens):
        completion = 0.2 * ((ind + 1) / totalTokens)
        # Use the loop index directly: tokens.index(token) is O(n) and
        # reports the *first* occurrence for duplicated tokens.
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{totalTokens}")
        if token not in vectors:
            continue
        merged_vector += vectors[token]
        matched += 1
    # Normalize by the number of vectors actually merged, not by the raw
    # token count (which over-divides when tokens are skipped), and guard
    # against division when nothing matched.
    if matched:
        merged_vector /= matched
    return merged_vector.tolist(), json.dumps(tokens)
# Wire the handler into a simple UI: one text input, two outputs
# (the embedding rendered as text, the token list as JSON).
# Fix: dropped the stray trailing "|" residue that made this line invalid.
demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()