Christopher Akiki committed
Commit b830b93 · 1 Parent(s): 8ee1828
Minor fixes
app.py CHANGED
@@ -6,14 +6,14 @@ from transformers import AutoTokenizer
 
 
 tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
-
-
+normalized = np.load("embeddings/bert-large-uncased/normalized.npy")
+unnormalized = np.load("embeddings/bert-large-uncased/unnormalized.npy")
 
-index_L2 = IndexFlatL2(
-index_L2.add(
+index_L2 = IndexFlatL2(unnormalized.shape[-1])
+index_L2.add(unnormalized)
 
-index_IP = IndexFlatIP(
-index_IP.add(
+index_IP = IndexFlatIP(normalized.shape[-1])
+index_IP.add(normalized)
 
 
 vocab = {v:k for k,v in tokenizer.vocab.items()}
@@ -27,12 +27,12 @@ def get_first_subword(word):
 
 def search(token_to_lookup, num_neighbors=250):
     i = get_first_subword(token_to_lookup)
-    _ , I_IP = index_IP.search(
+    _ , I_IP = index_IP.search(normalized[i:i+1], num_neighbors)
     hits_IP = lookup_table.take(I_IP[0])
     results_IP = hits_IP.values[1:]
     results_IP = [r for r in results_IP if not "[unused" in r]
 
-    _ , I_L2 = index_L2.search(
+    _ , I_L2 = index_L2.search(unnormalized[i:i+1], num_neighbors)
    hits_L2 = lookup_table.take(I_L2[0])
    results_L2 = hits_L2.values[1:]
    results_L2 = [r for r in results_L2 if not "[unused" in r]
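For context, the fix wires up the two FAISS indexes that search() queries: IndexFlatL2 over the raw (unnormalized) BERT embedding matrix, and IndexFlatIP over the row-normalized copy, where inner product is equivalent to cosine similarity. Below is a minimal, self-contained sketch of the same pattern; the random matrix, dimensions, and token id are stand-ins for the Space's .npy embeddings, lookup_table, and get_first_subword helper, which are not reproduced here.

# Minimal sketch (not part of the commit): the same two-index pattern with
# stand-in data. A random float32 matrix replaces the embeddings loaded from
# the Space's .npy files, and a fixed token id replaces get_first_subword().
import numpy as np
from faiss import IndexFlatL2, IndexFlatIP

vocab_size, dim = 1000, 64                       # stand-ins for BERT-large's vocab size and hidden size
unnormalized = np.random.rand(vocab_size, dim).astype("float32")
normalized = unnormalized / np.linalg.norm(unnormalized, axis=1, keepdims=True)

index_L2 = IndexFlatL2(unnormalized.shape[-1])   # exact L2-distance search
index_L2.add(unnormalized)

index_IP = IndexFlatIP(normalized.shape[-1])     # exact inner-product (cosine) search
index_IP.add(normalized)

i = 42                                           # some token id
_, I_IP = index_IP.search(normalized[i:i+1], 5)  # FAISS expects a 2-D batch of queries, hence i:i+1
_, I_L2 = index_L2.search(unnormalized[i:i+1], 5)
print(I_IP[0], I_L2[0])                          # neighbor token ids; the first hit is the query itself

The i:i+1 slices in the committed code serve the same purpose: index.search() takes a batch of query vectors, so a single embedding row has to stay two-dimensional.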