File size: 2,307 Bytes
9b95338
bb788ed
2623e85
 
8ceef3d
2623e85
8ceef3d
 
b830b93
 
a147e52
b830b93
 
a147e52
b830b93
 
a147e52
 
2623e85
 
 
 
 
 
 
 
 
3003e77
95d2476
b830b93
26b15f7
 
 
 
b830b93
26b15f7
 
 
 
 
d9adf81
 
 
 
c83bbd7
3003e77
 
26b15f7
cfc29b0
 
c3ed860
19c7d3c
cfc29b0
a546915
f834bd4
b016be0
3d77f49
 
 
6ad4ef6
3d77f49
 
 
 
cfc29b0
d9adf81
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
from faiss import IndexFlatIP, IndexFlatL2
import pandas as pd
import numpy as np
from transformers import AutoTokenizer


# Tokenizer whose vocabulary ids are used as row positions into the matrices
# below (search() slices them with a token id).
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

# Pre-computed embedding matrices loaded from disk, one file per variant.
normalized = np.load("embeddings/bert-large-uncased/normalized.npy")
unnormalized = np.load("embeddings/bert-large-uncased/unnormalized.npy")

# Exact inner-product index over the normalized matrix.
index_IP = IndexFlatIP(normalized.shape[-1])
index_IP.add(normalized)

# Exact L2-distance index over the unnormalized matrix.
index_L2 = IndexFlatL2(unnormalized.shape[-1])
index_L2.add(unnormalized)


# Inverted vocabulary (id -> token string), stored as a Series ordered by id
# so that positional lookups line up with the ids returned by the indexes.
vocab = {token_id: token for token, token_id in tokenizer.vocab.items()}
lookup_table = pd.Series(vocab).sort_index()

def get_first_subword(word):
    """Return the vocabulary id to use for ``word``.

    If ``word`` is itself an entry of the tokenizer vocabulary, its id is
    returned directly. Otherwise ``word`` is tokenized without special
    tokens and the id of the first resulting piece is returned.

    Args:
        word: The text to look up.

    Returns:
        The integer vocabulary id of ``word`` or of its first subword.

    Raises:
        IndexError: If tokenizing ``word`` yields no ids at all.
    """
    try:
        return tokenizer.vocab[word]
    except KeyError:
        # Not a whole vocabulary entry: fall through to subword tokenization.
        # Only KeyError is handled so unrelated failures are not masked.
        pass

    subword_ids = tokenizer(word, add_special_tokens=False)['input_ids']
    if not subword_ids:
        # Same exception type as indexing the empty list would raise, but
        # with a message that explains what went wrong.
        raise IndexError(f"{word!r} does not produce any subword tokens")
    return subword_ids[0]

def _nearest_tokens(index, vectors, token_id, num_neighbors):
    """Return the tokens closest to row ``token_id`` of ``vectors`` in ``index``.

    Results keep the order returned by the index. The query token itself and
    any "[unused...]" placeholder entries are removed.
    """
    _, hit_ids = index.search(vectors[token_id:token_id + 1], num_neighbors)
    hit_ids = hit_ids[0]
    # faiss pads the label row with -1 when it finds fewer than k results, and
    # a positional take(-1) would silently return the last vocabulary entry.
    # The query token is dropped by id rather than by assuming it ranks first.
    hit_ids = hit_ids[(hit_ids >= 0) & (hit_ids != token_id)]
    tokens = lookup_table.take(hit_ids).values
    return [token for token in tokens if "[unused" not in token]


def search(token_to_lookup, num_neighbors=250):
    """Find the vocabulary entries nearest to a token under two metrics.

    The token is resolved to a vocabulary id with ``get_first_subword`` and
    then queried against the inner-product index (normalized embeddings) and
    the L2 index (unnormalized embeddings).

    Args:
        token_to_lookup: Token text to search for.
        num_neighbors: Number of hits requested from each index; the query
            token itself is removed from the results. Values below 1 yield
            empty results.

    Returns:
        A 4-tuple of lists:
        (IP hits without "##", IP hits containing "##",
         L2 hits without "##", L2 hits containing "##").

    Raises:
        Whatever ``get_first_subword`` raises for text it cannot resolve.
    """
    k = int(num_neighbors)  # UI widgets may hand over a float
    if k < 1:
        # Nothing was requested, so skip the index query entirely.
        return [], [], [], []

    token_id = get_first_subword(token_to_lookup)
    results_IP = _nearest_tokens(index_IP, normalized, token_id, k)
    results_L2 = _nearest_tokens(index_L2, unnormalized, token_id, k)

    return (
        [token for token in results_IP if "##" not in token],
        [token for token in results_IP if "##" in token],
        [token for token in results_L2 if "##" not in token],
        [token for token in results_L2 if "##" in token],
    )


# Web UI: one text input, four text outputs matching the 4-tuple from search().
iface = gr.Interface(
    fn=search,
    # Only the token is exposed, so search() runs with its default
    # num_neighbors. A gr.Slider could be added as a second input to make the
    # neighbor count adjustable.
    inputs=gr.Textbox(lines=1, label="Vocabulary Token", placeholder="Enter token..."),
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "IP-Nearest tokens",
            "IP-Nearest subwords",
            "L2-Nearest tokens",
            "L2-Nearest subwords",
        )
    ],
    # Each example is a one-element row because there is a single input.
    examples=[
        [token]
        for token in (
            "##logy",
            "##ness",
            "##ity",
            "responded",
            "sadness",
            "queen",
            "king",
            "hospital",
            "disease",
            "grammar",
            "philosophy",
            "aristotle",
            "##ting",
            "woman",
            "man",
        )
    ],
)
# NOTE(review): newer Gradio releases appear to configure queuing through
# iface.queue() rather than launch(enable_queue=...) -- confirm against the
# Gradio version this app is pinned to.
iface.launch(enable_queue=True, debug=True, show_error=True)