# token-explorer / app.py
import gradio as gr
from faiss import IndexFlatIP, IndexFlatL2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
input_embeddings = np.load("bert_input_embeddings.npy")
unnormalized_input_embeddings = np.load("unnormalized_bert_input_embeddings.npy")
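
# The two .npy files are assumed (going by their names) to hold BERT's
# word-piece input-embedding matrix in L2-normalized and raw form. A
# hypothetical sketch of how they could have been produced, not part of
# this Space:
#
#     from transformers import AutoModel
#     model = AutoModel.from_pretrained("bert-large-uncased")
#     emb = model.get_input_embeddings().weight.detach().numpy()
#     np.save("unnormalized_bert_input_embeddings.npy", emb)
#     np.save("bert_input_embeddings.npy",
#             emb / np.linalg.norm(emb, axis=1, keepdims=True))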

# Flat (exact) FAISS indexes: L2 and inner-product (IP) search, over both the
# normalized and the raw embedding matrices.
index_L2 = IndexFlatL2(input_embeddings.shape[-1])
index_L2.add(input_embeddings)
index_IP = IndexFlatIP(input_embeddings.shape[-1])
index_IP.add(input_embeddings)
index_L2_unnormalized = IndexFlatL2(unnormalized_input_embeddings.shape[-1])
index_L2_unnormalized.add(unnormalized_input_embeddings)
index_IP_unnormalized = IndexFlatIP(unnormalized_input_embeddings.shape[-1])
index_IP_unnormalized.add(unnormalized_input_embeddings)
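
# Note: only index_L2_unnormalized is queried in search() below. The two
# indexes over the normalized matrix are also redundant for ranking: for unit
# vectors a and b, ||a - b||^2 = 2 - 2<a, b>, so L2 and inner-product search
# over normalized embeddings return neighbors in the same order.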

# Invert the tokenizer vocab into an id -> token mapping; sorting by id means
# Series.take() positional lookups line up with FAISS row ids.
vocab = {v: k for k, v in tokenizer.vocab.items()}
lookup_table = pd.Series(vocab).sort_index()

def get_first_subword(word):
    """Return the vocab id of `word`, falling back to the id of its first
    word-piece when the word is not itself a vocabulary entry."""
    try:
        return tokenizer.vocab[word]
    except KeyError:
        return tokenizer(word, add_special_tokens=False)["input_ids"][0]
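
# For example, get_first_subword("queen") returns the id of "queen" itself,
# while an out-of-vocabulary word resolves to the id of its first word-piece.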

def search(token_to_lookup, num_neighbors=200):
    """Return the query token's nearest whole-word and subword neighbors."""
    i = get_first_subword(token_to_lookup)
    _, I = index_L2_unnormalized.search(unnormalized_input_embeddings[i:i+1], num_neighbors)
    hits = lookup_table.take(I[0])
    results = hits.values[1:]  # drop the first hit: the query token itself
    return [r for r in results if "##" not in r], [r for r in results if "##" in r]
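
# Usage sketch: search("king") returns a list of whole-word neighbors and a
# list of "##"-prefixed subword neighbors; the actual tokens depend on the
# precomputed embedding files loaded above.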
iface = gr.Interface(
fn=search,
#inputs=[gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."), gr.Number(value=50, label="number of neighbors")],
inputs=gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."),
outputs=[gr.Textbox(label="Nearest tokens"), gr.Textbox(label="Nearest subwords")],
examples=[
["##logy"],
["##ness"],
["##nity"],
["responded"],
["queen"],
["king"],
["hospital"],
["disease"],
["grammar"],
["philosophy"],
["aristotle"],
["##ting"],
["woman"],
["man"]
],
)
# Queueing moved from launch(enable_queue=...) to .queue() in newer Gradio.
iface.queue().launch(debug=True, show_error=True)