File size: 1,979 Bytes
9b95338
bb788ed
2623e85
 
8ceef3d
2623e85
8ceef3d
 
0bad089
a147e52
 
0bad089
 
a147e52
0bad089
 
a147e52
 
2623e85
 
 
 
 
 
 
 
 
9c44dc5
95d2476
0bad089
d9adf81
7825e2d
b8dfa9f
60220a8
d9adf81
 
 
 
c83bbd7
2bbff8e
 
46aec1c
cfc29b0
 
c3ed860
9638484
cfc29b0
f834bd4
b016be0
3d77f49
 
 
6ad4ef6
3d77f49
 
 
 
cfc29b0
d9adf81
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
from faiss import IndexFlatIP, IndexFlatL2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer


# Module-level state shared by the search functions below.
# BERT's WordPiece tokenizer provides the token<->id vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
# Precomputed BERT input-embedding matrices, one row per vocab id.
# normalized = L2-normalized rows (so inner product == cosine similarity);
# unnormalized = raw embedding rows. Assumes shape (vocab_size, dim) — TODO confirm.
normalized_input_embeddings = np.load("normalized_bert_input_embeddings.npy")
unnormalized_input_embeddings = np.load("unnormalized_bert_input_embeddings.npy")

# Exact (flat) FAISS index over raw embeddings using Euclidean distance.
# NOTE(review): built but never queried in this file — possibly kept for
# experimentation; confirm before removing.
index_L2 = IndexFlatL2(unnormalized_input_embeddings.shape[-1])  
index_L2.add(unnormalized_input_embeddings)

# Exact (flat) inner-product index over the normalized embeddings; this is
# the index `search` queries (inner product on unit vectors = cosine sim).
index_IP_normalized = IndexFlatIP(normalized_input_embeddings.shape[-1])  
index_IP_normalized.add(normalized_input_embeddings)


# Invert the tokenizer vocab (token -> id) into id -> token, then build a
# pandas Series indexed by vocab id so neighbor ids map back to token strings.
vocab = {v:k for k,v in tokenizer.vocab.items()}
lookup_table = pd.Series(vocab).sort_index()

def get_first_subword(word):
    """Return the vocab id of *word*, or of its first subword.

    Tries a direct vocabulary lookup first; if the word is not a single
    vocabulary entry, tokenizes it (without [CLS]/[SEP]) and returns the
    id of its first WordPiece.

    Args:
        word: a token or arbitrary word string.

    Returns:
        int: a BERT vocabulary id.
    """
    try:
        # Fast path: the word is itself a vocabulary entry.
        return tokenizer.vocab[word]
    except KeyError:
        # Only a missing-key failure is expected here; a bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit.
        return tokenizer(word, add_special_tokens=False)['input_ids'][0]

def search(token_to_lookup, num_neighbors=50):
    """Find the nearest vocabulary neighbors of a token.

    Resolves the token to a vocab id, queries the cosine-similarity index
    (inner product over L2-normalized embeddings), and splits the hits into
    whole-word tokens and "##"-prefixed subword continuations.

    Args:
        token_to_lookup: token (or arbitrary word; its first subword is used).
        num_neighbors: number of neighbors to retrieve. Defaults to 50 —
            required because the Gradio interface wires only the textbox
            input, so this function is called with a single argument.

    Returns:
        tuple[list[str], list[str]]: (whole-word neighbors, "##" subword
        neighbors), nearest first, excluding the query token itself and
        "[unused...]" placeholder entries.
    """
    token_id = get_first_subword(token_to_lookup)
    # FAISS expects a 2-D batch of query vectors, hence the (1, dim) slice.
    _, neighbor_ids = index_IP_normalized.search(
        normalized_input_embeddings[token_id : token_id + 1], num_neighbors
    )
    hits = lookup_table.take(neighbor_ids[0])
    # Drop the first hit: the query token is always its own nearest neighbor.
    results = [r for r in hits.values[1:] if "[unused" not in r]
    whole_words = [r for r in results if "##" not in r]
    subwords = [r for r in results if "##" in r]
    return whole_words, subwords


# Gradio UI: one textbox in, two textboxes out (whole-word neighbors and
# "##"-prefixed subword neighbors), with clickable example tokens.
iface = gr.Interface(
    fn=search,
    
    # NOTE(review): `search` declares a second `num_neighbors` parameter, but
    # only the textbox input is wired here — confirm the function provides a
    # default, otherwise Gradio's single-argument call will raise TypeError.
    # The commented-out variant below wired a Number input for it.
    #inputs=[gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."), gr.Number(value=50, label="number of neighbors")],
    inputs=gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."),
    outputs=[gr.Textbox(label="Nearest tokens"), gr.Textbox(label="Nearest subwords")],
    # One-element rows: each example fills the single textbox input.
    examples=[
        ["##logy"],
        ["##ness"],
        ["##nity"],
        ["responded"],
        ["queen"],
        ["king"],
        ["hospital"],
        ["disease"],
        ["grammar"],
        ["philosophy"],
        ["aristotle"],
        ["##ting"],
        ["woman"],
        ["man"]
    ],
)
# Queue requests and surface tracebacks in the UI; debug keeps the process
# attached with verbose logging.
iface.launch(enable_queue=True, debug=True, show_error=True)