File size: 2,114 Bytes
9b95338
bb788ed
2623e85
 
8ceef3d
2623e85
8ceef3d
 
2623e85
a147e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2623e85
 
 
 
 
 
 
 
 
a147e52
95d2476
a147e52
d9adf81
7825e2d
 
d9adf81
 
 
 
c83bbd7
c294475
ba399d6
46aec1c
cfc29b0
 
c3ed860
9638484
cfc29b0
f834bd4
b016be0
3d77f49
 
 
6ad4ef6
3d77f49
 
 
 
cfc29b0
d9adf81
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from faiss import IndexFlatIP, IndexFlatL2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer


# Load the BERT tokenizer and two precomputed input-embedding tables.
# "bert_input_embeddings.npy" is presumably the normalized table and
# "unnormalized_bert_input_embeddings.npy" the raw one — TODO confirm;
# both are expected to be (vocab_size, dim) float arrays.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
input_embeddings = np.load("bert_input_embeddings.npy")
unnormalized_input_embeddings = np.load("unnormalized_bert_input_embeddings.npy")

# Build exact (flat) FAISS indexes over each table, one per metric:
# L2 distance and inner product.
# NOTE(review): only index_L2_unnormalized is queried by search() below;
# the other three indexes are built but never used in this file.
index_L2 = IndexFlatL2(input_embeddings.shape[-1])  
index_L2.add(input_embeddings)

index_IP = IndexFlatIP(input_embeddings.shape[-1])  
index_IP.add(input_embeddings)

index_L2_unnormalized = IndexFlatL2(unnormalized_input_embeddings.shape[-1])  
index_L2_unnormalized.add(unnormalized_input_embeddings)

index_IP_unnormalized = IndexFlatIP(unnormalized_input_embeddings.shape[-1])  
index_IP_unnormalized.add(unnormalized_input_embeddings)


# Invert the tokenizer vocab (token -> id) into an id -> token mapping,
# then store it as a pandas Series sorted by id so that positional
# .take(ids) in search() maps FAISS row indices back to token strings.
vocab = {v:k for k,v in tokenizer.vocab.items()}
lookup_table = pd.Series(vocab).sort_index()

def get_first_subword(word: str) -> int:
    """Return the vocabulary id of *word*, or of its first subword piece.

    Tries a direct vocabulary lookup first; if the word is not a whole
    vocabulary entry, tokenizes it (without special tokens) and returns
    the id of its first word piece.
    """
    try:
        # Fast path: the word is itself a vocabulary entry.
        return tokenizer.vocab[word]
    except KeyError:
        # Was a bare `except:` — catch only the expected vocab miss so
        # unrelated errors (e.g. KeyboardInterrupt) are not swallowed.
        return tokenizer(word, add_special_tokens=False)['input_ids'][0]

def search(token_to_lookup, num_neighbors=200):
    """Find the nearest vocabulary neighbors of *token_to_lookup*.

    Looks up the token's (first-subword) row in the unnormalized-embedding
    L2 index and splits the neighboring tokens into whole words and
    "##"-prefixed subword pieces.
    """
    token_id = get_first_subword(token_to_lookup)
    query_vector = unnormalized_input_embeddings[token_id:token_id + 1]
    _distances, neighbor_ids = index_L2_unnormalized.search(query_vector, num_neighbors)
    # Drop the first hit — it is the query token itself.
    neighbor_tokens = lookup_table.take(neighbor_ids[0]).values[1:]
    whole_words = [tok for tok in neighbor_tokens if "##" not in tok]
    subword_pieces = [tok for tok in neighbor_tokens if "##" in tok]
    # The second output is wrapped in an extra list, matching the
    # original return shape exactly.
    return whole_words, [subword_pieces]


# Gradio UI: one token input mapped to two text outputs — the whole-word
# neighbors and the "##" subword-piece neighbors returned by search() —
# plus a set of clickable example tokens.
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."),
    outputs=[gr.Textbox(label="Nearest tokens"), gr.Textbox(label="Nearest subwords")],
    examples=[
        ["##logy"],
        ["##ness"],
        ["##nity"],
        ["responded"],
        ["queen"],
        ["king"],
        ["hospital"],
        ["disease"],
        ["grammar"],
        ["philosophy"],
        ["aristotle"],
        ["##ting"],
        ["woman"],
        ["man"]
    ],
)
# Launch with request queueing enabled and verbose error reporting.
iface.launch(enable_queue=True, debug=True, show_error=True)