Tonic committed on
Commit
56e4f3d
·
verified ·
1 Parent(s): 2d4bf4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -3
app.py CHANGED
@@ -51,6 +51,11 @@ def clear_cuda_cache():
51
  def free_memory(*args):
52
  for arg in args:
53
  del arg
 
 
 
 
 
54
 
55
  @spaces.GPU
56
  def compute_embeddings(selected_task, input_text):
@@ -100,10 +105,8 @@ def compute_similarity(selected_task, sentence1, sentence2, extra_sentence1, ext
100
  free_memory(embeddings1, embeddings2, embeddings3, embeddings4)
101
 
102
  similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
103
-
104
  return similarity_scores
105
-
106
-
107
  @spaces.GPU
108
  def compute_cosine_similarity(emb1, emb2):
109
  tensor1 = torch.tensor(emb1).to(device).half()
@@ -112,7 +115,36 @@ def compute_cosine_similarity(emb1, emb2):
112
  free_memory(tensor1, tensor2)
113
  return similarity
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def app_interface():
 
 
116
  with gr.Blocks() as demo:
117
  gr.Markdown(title)
118
  gr.Markdown(description)
@@ -141,6 +173,43 @@ def app_interface():
141
  inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
142
  outputs=similarity_output
143
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  with gr.Row():
146
  with gr.Column():
 
51
  def free_memory(*args):
52
  for arg in args:
53
  del arg
54
+
55
def load_corpus_from_json(file_path):
    """Read and parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, returned unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would corrupt or reject non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
59
 
60
  @spaces.GPU
61
  def compute_embeddings(selected_task, input_text):
 
105
  free_memory(embeddings1, embeddings2, embeddings3, embeddings4)
106
 
107
  similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
 
108
  return similarity_scores
109
+
 
110
  @spaces.GPU
111
  def compute_cosine_similarity(emb1, emb2):
112
  tensor1 = torch.tensor(emb1).to(device).half()
 
115
  free_memory(tensor1, tensor2)
116
  return similarity
117
 
118
+
119
@spaces.GPU
def compute_embeddings_batch(input_texts, batch_size=16):
    """Embed several texts and return L2-normalised vectors as a NumPy array.

    Each text is wrapped in the ``Instruct: ...\\nQuery: ...`` template,
    tokenised with an appended EOS token, run through ``model``, pooled with
    ``last_token_pool`` and normalised.

    Args:
        input_texts: Iterable of texts to embed.
        batch_size: Maximum number of texts sent through the model at once.
            Chunking keeps peak memory bounded when a large corpus is embedded.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``input_texts`` is empty or ``batch_size`` is below 1.
    """
    max_length = 2042
    input_texts = list(input_texts)
    if not input_texts:
        raise ValueError("input_texts must contain at least one text")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # NOTE(review): ``task_description`` is not defined in this function; it is
    # presumably a module-level name - confirm it exists, otherwise this raises
    # NameError.
    processed_texts = [f'Instruct: {task_description}\nQuery: {text}' for text in input_texts]

    pooled_chunks = []
    # Inference only: disabling autograd avoids keeping activations alive for
    # a backward pass that never happens.
    with torch.no_grad():
        for start in range(0, len(processed_texts), batch_size):
            chunk = processed_texts[start:start + batch_size]
            # Reserve one position so the EOS token appended below still fits.
            batch_dict = tokenizer(chunk, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
            batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
            batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
            outputs = model(**batch_dict)
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            embeddings = F.normalize(embeddings, p=2, dim=1)
            # Move each chunk off the device right away so only one batch of
            # activations is resident at a time.
            pooled_chunks.append(embeddings.detach().cpu())

    return torch.cat(pooled_chunks, dim=0).numpy()
132
+
133
def semantic_search(query_embedding, corpus_embeddings, top_k=5):
    """Rank corpus rows by dot-product score against a query vector.

    Args:
        query_embedding: Query vector (array-like).
        corpus_embeddings: Corpus matrix with one row per entry (array-like).
        top_k: Maximum number of results to return. Values below 1 yield an
            empty result instead of a negative slice.

    Returns:
        A tuple ``(indices, scores)`` of NumPy arrays. ``indices`` holds the
        corpus row indices in descending score order and ``scores[j]`` is the
        score of ``indices[j]``.
    """
    corpus = np.asarray(corpus_embeddings)
    query = np.asarray(query_embedding)

    # Nothing to rank: avoid a shape error from np.dot on an empty corpus.
    if corpus.size == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    scores = np.dot(corpus, query.T).flatten()
    # A negative top_k would otherwise slice as "all but the last |top_k|".
    limit = max(int(top_k), 0)
    top_k_indices = np.argsort(scores)[::-1][:limit]
    return top_k_indices, scores[top_k_indices]
137
+
138
def search_similar_sentences(input_question, corpus_sentences, corpus_embeddings):
    """Return the corpus sentences that score highest against a question.

    Args:
        input_question: Question text to embed and search with.
        corpus_sentences: Sequence of sentences, indexable by corpus position.
        corpus_embeddings: Embeddings of ``corpus_sentences``, one row each.

    Returns:
        A list of ``(sentence, score)`` tuples in descending score order.
    """
    question_embedding = compute_embeddings_batch([input_question])[0]
    top_k_indices, top_k_scores = semantic_search(question_embedding, corpus_embeddings)
    # semantic_search returns scores already ordered by rank, so they must be
    # paired positionally with the indices. Indexing top_k_scores with a corpus
    # index picks the wrong score or raises IndexError.
    return [
        (corpus_sentences[index], score)
        for index, score in zip(top_k_indices, top_k_scores)
    ]
143
+
144
+
145
  def app_interface():
146
+ corpus_sentences = []
147
+ corpus_embeddings = []
148
  with gr.Blocks() as demo:
149
  gr.Markdown(title)
150
  gr.Markdown(description)
 
173
  inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
174
  outputs=similarity_output
175
  )
176
+ with gr.Tab("Load Corpus"):
177
+ json_uploader = gr.File(label="Upload JSON File")
178
+ load_corpus_button = gr.Button("Load Corpus")
179
+ corpus_status = gr.Textbox(label="Corpus Status", value="Corpus not loaded")
180
+
181
def load_corpus(file_info):
    """Load an uploaded JSON corpus and compute its embeddings.

    Args:
        file_info: Value delivered by the file-upload component, or ``None``
            when nothing was uploaded.

    Returns:
        A status message describing success or the error encountered.
    """
    if file_info is None:
        return "No file uploaded. Please upload a JSON file."
    try:
        # NOTE(review): the shape of the uploaded value presumably depends on
        # the Gradio version (dict with 'name', object with .name, or a plain
        # path string) - accept all three; confirm against the installed version.
        if isinstance(file_info, dict):
            file_path = file_info['name']
        else:
            file_path = getattr(file_info, 'name', file_info)

        # Compute everything first so a failure leaves the previous corpus intact.
        sentences = load_corpus_from_json(file_path)
        embeddings = compute_embeddings_batch(sentences)

        # corpus_sentences / corpus_embeddings are lists created in the
        # enclosing function and read by the search callback. A ``global``
        # statement would bind unrelated module-level names instead, so the
        # shared lists are updated in place.
        corpus_sentences[:] = sentences
        corpus_embeddings[:] = embeddings
        return "Corpus loaded successfully with {} sentences.".format(len(corpus_sentences))
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return "Error loading corpus: {}".format(e)
191
+
192
+ load_corpus_button.click(
193
+ fn=load_corpus,
194
+ inputs=json_uploader,
195
+ outputs=corpus_status
196
+ )
197
+
198
+ with gr.Tab("Semantic Search"):
199
+ input_question_box = gr.Textbox(label="Enter your question")
200
+ search_button = gr.Button("Search")
201
+ search_results_output = gr.Textbox(label="Search Results")
202
+
203
def perform_search(input_question):
    """Search the loaded corpus for sentences similar to ``input_question``.

    Args:
        input_question: Question text entered by the user.

    Returns:
        The result of ``search_similar_sentences``, or a message asking the
        user to load a corpus when none is available.
    """
    # Use explicit length checks: truth-testing a multi-element NumPy array
    # raises ValueError, while len() works for lists and arrays alike.
    if len(corpus_sentences) == 0 or len(corpus_embeddings) == 0:
        return "Corpus is not loaded. Please load a corpus first."
    return search_similar_sentences(input_question, corpus_sentences, corpus_embeddings)
207
+
208
+ search_button.click(
209
+ fn=perform_search,
210
+ inputs=input_question_box,
211
+ outputs=search_results_output
212
+ )
213
 
214
  with gr.Row():
215
  with gr.Column():