Spaces:

bigscience-data
/

token-explorer

Build error

christopher committed on Aug 18, 2022

Commit

2623e85

1 Parent(s): c97076c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

 import gradio as gr
+from faiss import IndexFlatIP
+import pandas as pd
+import numpy as np
+input_embeddings = np.load("bert_input_embeddings.npy")
+index = IndexFlatIP(input_embeddings.shape[-1])
+index.add(input_embeddings)
+vocab = {v:k for k,v in tokenizer.vocab.items()}
+lookup_table = pd.Series(vocab).sort_index()
+def get_first_subword(word):
+    try:
+        return tokenizer.vocab[word]
+    except:
+        return tokenizer(word, add_special_tokens=False)['input_ids'][0]
+word_to_lookup = "##lity"
+num_neighbors = 20
+num_neighbors = 20
+i = get_first_subword(word_to_lookup)
+_ , I = index.search(input_embeddings[i:i+1], num_neighbors)
+hits = lookup_table.take(I[0])
+hits.values