anzorq committed on
Commit
f4077f7
·
1 Parent(s): 108b0a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -10,13 +10,13 @@ def save_embeddings(sentences, filename):
10
def load_embeddings(filename):
    """Deserialize the object stored at ``filename`` using ``torch.load``.

    The stored data is mapped onto the CPU device while loading.
    """
    cpu_device = torch.device('cpu')
    return torch.load(filename, map_location=cpu_device)
12
 
13
def preprocess_model_descriptions(file_path):
    """Read a headerless two-column TSV of ids and descriptions.

    Each candidate encoding is tried in order until the file decodes.
    Rows whose description is missing are dropped.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A ``(model_ids, descriptions)`` tuple of index-aligned lists.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
    """
    # NOTE: 'latin-1' maps every possible byte to a character, so it never
    # fails to decode; 'utf-16' after it is only a last-resort entry.
    encodings = ['utf-8', 'latin-1', 'utf-16']
    last_error = None
    for encoding in encodings:
        try:
            # The encoding must be passed explicitly; without it every
            # attempt is the same default (UTF-8) read.
            df = pd.read_csv(
                file_path,
                sep='\t',
                header=None,
                names=['model_id', 'description'],
                encoding=encoding,
            )
        except UnicodeDecodeError as err:
            last_error = err
            continue
        break
    else:
        # UnicodeDecodeError takes (encoding, object, start, end, reason);
        # building it from a lone message string raises TypeError instead.
        raise UnicodeDecodeError(
            '/'.join(encodings),
            b'',
            0,
            1,
            "Unable to decode the file using the available encodings.",
        ) from last_error

    df = df.dropna(subset=['description'])
    return df['model_id'].tolist(), df['description'].tolist()
28
 
29
def perform_similarity_search(query_embeddings, embeddings, model_ids, descriptions, top_k=10):
    """Rank stored descriptions by cosine similarity to each query.

    Args:
        query_embeddings: Query embedding(s), passed as the first argument
            of ``util.cos_sim``.
        embeddings: Stored embeddings to compare against. Presumably
            index-aligned with ``model_ids`` and ``descriptions`` -- verify
            against how the embeddings file was produced.
        model_ids: Ids zipped position-by-position with the scores.
        descriptions: Descriptions zipped position-by-position with the
            scores.
        top_k: Maximum number of rows kept per query.

    Returns:
        A DataFrame with columns ``model_id``, ``description`` and
        ``score`` holding the ``top_k`` highest-scoring rows of each query,
        concatenated in query order.
    """
    # cos_sim yields one row of scores per query. Iterate those rows
    # directly instead of indexing them by position in ``query_embeddings``,
    # which only lines up when the queries are already a 2-D batch.
    score_rows = util.cos_sim(query_embeddings, embeddings).tolist()

    results = []
    for scores in score_rows:
        ranked = sorted(
            zip(model_ids, descriptions, scores),
            key=lambda row: row[2],
            reverse=True,
        )
        results.extend(ranked[:top_k])

    return pd.DataFrame(results, columns=["model_id", "description", "score"])
39
 
40
# Sentence-embedding model; its encode() method turns queries into vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Ids and descriptions parsed from the TSV, plus the saved embeddings
# (presumably computed from the same descriptions in the same order -- verify).
model_ids, descriptions = preprocess_model_descriptions('hf_spaces_descriptions.tsv')
embeddings = load_embeddings('embeddings_hf_spaces_descriptions.pt')
44
 
45
  with gr.Blocks() as demo:
@@ -50,9 +50,8 @@ with gr.Blocks() as demo:
50
 
51
def search(query):
    """Embed ``query`` and return its best-matching rows as a DataFrame."""
    # NOTE(review): ``num_results`` is defined outside the visible hunks --
    # confirm it is a plain int, since it ends up as a slice bound.
    encoded_query = model.encode([query], convert_to_tensor=True)
    matches = perform_similarity_search(
        encoded_query, embeddings, model_ids, descriptions, top_k=num_results
    )
    return matches
54
 
55
  input.submit(search, inputs=input, outputs=df_output)
56
- button.click(search, inputs=input, outputs=df_output)
57
 
58
  demo.launch()
 
10
def load_embeddings(filename):
    """Deserialize the object stored at ``filename`` using ``torch.load``.

    The stored data is mapped onto the CPU device while loading.
    """
    cpu_device = torch.device('cpu')
    return torch.load(filename, map_location=cpu_device)
12
 
13
def preprocess_space_descriptions(file_path):
    """Read a headerless two-column TSV of ids and descriptions.

    Each candidate encoding is tried in order until the file decodes.
    Rows whose description is missing are dropped.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A ``(space_ids, descriptions)`` tuple of index-aligned lists.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
    """
    # NOTE: 'latin-1' maps every possible byte to a character, so it never
    # fails to decode; 'utf-16' after it is only a last-resort entry.
    encodings = ['utf-8', 'latin-1', 'utf-16']
    last_error = None
    for encoding in encodings:
        try:
            # The encoding must be passed explicitly; without it every
            # attempt is the same default (UTF-8) read.
            df = pd.read_csv(
                file_path,
                sep='\t',
                header=None,
                names=['space_id', 'description'],
                encoding=encoding,
            )
        except UnicodeDecodeError as err:
            last_error = err
            continue
        break
    else:
        # UnicodeDecodeError takes (encoding, object, start, end, reason);
        # building it from a lone message string raises TypeError instead.
        raise UnicodeDecodeError(
            '/'.join(encodings),
            b'',
            0,
            1,
            "Unable to decode the file using the available encodings.",
        ) from last_error

    df = df.dropna(subset=['description'])
    return df['space_id'].tolist(), df['description'].tolist()
28
 
29
def perform_similarity_search(query_embeddings, embeddings, space_ids, descriptions, top_k=10):
    """Rank stored descriptions by cosine similarity to each query.

    Args:
        query_embeddings: Query embedding(s), passed as the first argument
            of ``util.cos_sim``.
        embeddings: Stored embeddings to compare against. Presumably
            index-aligned with ``space_ids`` and ``descriptions`` -- verify
            against how the embeddings file was produced.
        space_ids: Ids zipped position-by-position with the scores.
        descriptions: Descriptions zipped position-by-position with the
            scores.
        top_k: Maximum number of rows kept per query.

    Returns:
        A DataFrame with columns ``space_id``, ``description`` and
        ``score`` holding the ``top_k`` highest-scoring rows of each query,
        concatenated in query order.
    """
    # cos_sim yields one row of scores per query. Iterate those rows
    # directly instead of indexing them by position in ``query_embeddings``,
    # which only lines up when the queries are already a 2-D batch.
    score_rows = util.cos_sim(query_embeddings, embeddings).tolist()

    results = []
    for scores in score_rows:
        ranked = sorted(
            zip(space_ids, descriptions, scores),
            key=lambda row: row[2],
            reverse=True,
        )
        results.extend(ranked[:top_k])

    return pd.DataFrame(results, columns=["space_id", "description", "score"])
39
 
40
# Sentence-embedding model; its encode() method turns queries into vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Ids and descriptions parsed from the TSV, plus the saved embeddings
# (presumably computed from the same descriptions in the same order -- verify).
space_ids, descriptions = preprocess_space_descriptions('hf_spaces_descriptions.tsv')
embeddings = load_embeddings('embeddings_hf_spaces_descriptions.pt')
44
 
45
  with gr.Blocks() as demo:
 
50
 
51
def search(query):
    """Embed ``query`` and return its best-matching rows as a DataFrame."""
    # NOTE(review): ``num_results`` is defined outside the visible hunks --
    # confirm it is a plain int, since it ends up as a slice bound.
    encoded_query = model.encode([query], convert_to_tensor=True)
    matches = perform_similarity_search(
        encoded_query, embeddings, space_ids, descriptions, top_k=num_results
    )
    return matches
54
 
55
  input.submit(search, inputs=input, outputs=df_output)
 
56
 
57
  demo.launch()