Jonas Leeb commited on
Commit
5355a96
·
1 Parent(s): dfc89a9

small updates

Browse files
Files changed (1) hide show
  1. app.py +16 -1
app.py CHANGED
@@ -149,7 +149,7 @@ class ArxivSearch:
149
  y=reduced_data[:, 1],
150
  z=reduced_data[:, 2],
151
  mode='markers',
152
- marker=dict(size=3.5, color='#cccccc', opacity=0.35),
153
  name='All Documents'
154
  )
155
  layout = go.Layout(
@@ -223,6 +223,21 @@ class ArxivSearch:
223
  top_indices = sims.argsort()[::-1][:top_n]
224
  return [(i, sims[i]) for i in top_indices]
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  def load_model(self, embedding):
227
  if embedding == "tfidf":
228
  self.tfidf_matrix = load_npz("TF-IDF embeddings/tfidf_matrix_train.npz")
 
149
  y=reduced_data[:, 1],
150
  z=reduced_data[:, 2],
151
  mode='markers',
152
+ marker=dict(size=3.5, color="#ffffff", opacity=0.2),
153
  name='All Documents'
154
  )
155
  layout = go.Layout(
 
223
  top_indices = sims.argsort()[::-1][:top_n]
224
  return [(i, sims[i]) for i in top_indices]
225
 
226
+ def bert_search_2(self, query, top_n=5):
227
+ with torch.no_grad():
228
+ inputs = self.tokenizer(query, return_tensors="pt", truncation=True, padding=True)
229
+ outputs = self.model(**inputs)
230
+ token_embeddings = outputs.last_hidden_state
231
+ attention_mask = inputs['attention_mask']
232
+ mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
233
+ sentence_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
234
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
235
+ query_vec = sentence_embeddings / sum_mask
236
+ self.query_encoding = query_vec
237
+ sims = cosine_similarity(query_vec, self.bert_embeddings).flatten()
238
+ top_indices = sims.argsort()[::-1][:top_n]
239
+ return [(i, sims[i]) for i in top_indices]
240
+
241
  def load_model(self, embedding):
242
  if embedding == "tfidf":
243
  self.tfidf_matrix = load_npz("TF-IDF embeddings/tfidf_matrix_train.npz")