Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -51,6 +51,11 @@ def clear_cuda_cache():
|
|
51 |
def free_memory(*args):
    """Best-effort release of objects the caller has finished with.

    A function cannot remove the *caller's* references to the objects it is
    handed: the previous ``for arg in args: del arg`` loop only unbound the
    loop variable and released nothing.  This version drops the function's own
    reference to the argument tuple and then runs a garbage-collection pass so
    that anything already unreachable (including reference cycles) is
    reclaimed immediately.

    Callers that want the objects gone must still drop their own references
    (``del name`` or rebinding) before or after calling this function.

    Args:
        *args: Objects the caller no longer needs.

    Returns:
        None.
    """
    import gc  # local import keeps this block self-contained

    # Drop the only reference this frame holds to the passed objects.
    del args
    # Reclaim whatever is unreachable right now, cycles included.
    gc.collect()
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
@spaces.GPU
|
56 |
def compute_embeddings(selected_task, input_text):
|
@@ -100,10 +105,8 @@ def compute_similarity(selected_task, sentence1, sentence2, extra_sentence1, ext
|
|
100 |
free_memory(embeddings1, embeddings2, embeddings3, embeddings4)
|
101 |
|
102 |
similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
|
103 |
-
|
104 |
return similarity_scores
|
105 |
-
|
106 |
-
|
107 |
@spaces.GPU
|
108 |
def compute_cosine_similarity(emb1, emb2):
|
109 |
tensor1 = torch.tensor(emb1).to(device).half()
|
@@ -112,7 +115,36 @@ def compute_cosine_similarity(emb1, emb2):
|
|
112 |
free_memory(tensor1, tensor2)
|
113 |
return similarity
|
114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
def app_interface():
|
|
|
|
|
116 |
with gr.Blocks() as demo:
|
117 |
gr.Markdown(title)
|
118 |
gr.Markdown(description)
|
@@ -141,6 +173,43 @@ def app_interface():
|
|
141 |
inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
|
142 |
outputs=similarity_output
|
143 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
with gr.Row():
|
146 |
with gr.Column():
|
|
|
51 |
def free_memory(*args):
    """Best-effort release of objects the caller has finished with.

    A function cannot remove the *caller's* references to the objects it is
    handed: the previous ``for arg in args: del arg`` loop only unbound the
    loop variable and released nothing.  This version drops the function's own
    reference to the argument tuple and then runs a garbage-collection pass so
    that anything already unreachable (including reference cycles) is
    reclaimed immediately.

    Callers that want the objects gone must still drop their own references
    (``del name`` or rebinding) before or after calling this function.

    Args:
        *args: Objects the caller no longer needs.

    Returns:
        None.
    """
    import gc  # local import keeps this block self-contained

    # Drop the only reference this frame holds to the passed objects.
    del args
    # Reclaim whatever is unreachable right now, cycles included.
    gc.collect()
|
54 |
+
|
55 |
+
def load_corpus_from_json(file_path):
    """Read a JSON file and return its parsed content.

    The file is opened explicitly as UTF-8 so that non-ASCII sentences decode
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.  The callers in this
        file treat it as a list of sentences; that shape is not validated
        here.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
|
59 |
|
60 |
@spaces.GPU
|
61 |
def compute_embeddings(selected_task, input_text):
|
|
|
105 |
free_memory(embeddings1, embeddings2, embeddings3, embeddings4)
|
106 |
|
107 |
similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
|
|
|
108 |
return similarity_scores
|
109 |
+
|
|
|
110 |
@spaces.GPU
|
111 |
def compute_cosine_similarity(emb1, emb2):
|
112 |
tensor1 = torch.tensor(emb1).to(device).half()
|
|
|
115 |
free_memory(tensor1, tensor2)
|
116 |
return similarity
|
117 |
|
118 |
+
|
119 |
+
@spaces.GPU
def compute_embeddings_batch(input_texts):
    """Embed a list of texts in a single forward pass.

    Every text is wrapped in the ``Instruct: ...\\nQuery: ...`` prompt, tokenised,
    terminated with the tokenizer's EOS token, padded into one batch, run
    through ``model`` and pooled with ``last_token_pool``.  The pooled vectors
    are L2-normalised along dimension 1.

    NOTE(review): ``task_description`` is not a parameter; it is read from the
    enclosing/module scope and must be defined there - confirm.
    NOTE(review): the whole list is processed as one batch, so a large corpus
    may exhaust GPU memory - consider chunking at the call site.

    Args:
        input_texts: Iterable of strings to embed.

    Returns:
        A NumPy array holding the normalised embeddings (presumably one row
        per input text - confirm against ``last_token_pool``).
    """
    max_length = 2042
    processed_texts = [f'Instruct: {task_description}\nQuery: {text}' for text in input_texts]

    # Tokenise to max_length - 1 so the EOS token appended below still fits.
    batch_dict = tokenizer(processed_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

    # Inference only: disabling autograd avoids keeping the computation graph
    # (and its activations) alive for the duration of the call.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.detach().cpu().numpy()
|
132 |
+
|
133 |
+
def semantic_search(query_embedding, corpus_embeddings, top_k=5):
    """Rank corpus embeddings by dot-product score against a query.

    Args:
        query_embedding: Query vector (1-D, or 2-D with a single row).
        corpus_embeddings: Array-like with one embedding per row.  An empty
            corpus (for example the initial ``[]``) yields empty results
            instead of a shape error.
        top_k: Maximum number of hits to return.  Values <= 0 yield empty
            results rather than silently slicing from the end.

    Returns:
        Tuple ``(indices, scores)``: ``indices`` are row positions in
        ``corpus_embeddings`` ordered from highest to lowest score, and
        ``scores`` are the matching dot products in the same order.
    """
    corpus = np.asarray(corpus_embeddings)
    if corpus.size == 0 or top_k <= 0:
        return np.empty(0, dtype=np.intp), np.empty(0, dtype=np.float64)

    scores = np.dot(corpus, np.asarray(query_embedding).T).flatten()
    # argsort is ascending, so reverse it before taking the best top_k.
    top_k_indices = np.argsort(scores)[::-1][:top_k]
    return top_k_indices, scores[top_k_indices]
|
137 |
+
|
138 |
+
def search_similar_sentences(input_question, corpus_sentences, corpus_embeddings):
    """Return the corpus sentences most similar to a question.

    The question is embedded with ``compute_embeddings_batch`` and ranked
    against ``corpus_embeddings`` with ``semantic_search`` (default ``top_k``).

    Args:
        input_question: The query text.
        corpus_sentences: Sequence of sentences, indexable by the positions
            that ``semantic_search`` returns.
        corpus_embeddings: Embeddings aligned row-for-row with
            ``corpus_sentences``.

    Returns:
        List of ``(sentence, score)`` tuples, best match first.
    """
    question_embedding = compute_embeddings_batch([input_question])[0]
    top_k_indices, top_k_scores = semantic_search(question_embedding, corpus_embeddings)
    # top_k_scores is already ordered like top_k_indices, so pair them by
    # position.  Indexing it with the corpus index picked the wrong score or
    # raised IndexError whenever an index was >= top_k.
    results = [
        (corpus_sentences[index], score)
        for index, score in zip(top_k_indices, top_k_scores)
    ]
    return results
|
143 |
+
|
144 |
+
|
145 |
def app_interface():
|
146 |
+
corpus_sentences = []
|
147 |
+
corpus_embeddings = []
|
148 |
with gr.Blocks() as demo:
|
149 |
gr.Markdown(title)
|
150 |
gr.Markdown(description)
|
|
|
173 |
inputs=[task_dropdown, sentence1_box, sentence2_box, extra_sentence1_box, extra_sentence2_box],
|
174 |
outputs=similarity_output
|
175 |
)
|
176 |
+
with gr.Tab("Load Corpus"):
|
177 |
+
json_uploader = gr.File(label="Upload JSON File")
|
178 |
+
load_corpus_button = gr.Button("Load Corpus")
|
179 |
+
corpus_status = gr.Textbox(label="Corpus Status", value="Corpus not loaded")
|
180 |
+
|
181 |
+
def load_corpus(file_info):
    """Load an uploaded JSON corpus and compute its embeddings.

    The parsed sentences and their embeddings are stored in the module-level
    globals ``corpus_sentences`` and ``corpus_embeddings``.

    Args:
        file_info: Value delivered by the upload component.  Accepted forms
            are a path string, a dict with a ``'name'`` key, or an object
            exposing a ``.name`` attribute (such as a temporary-file
            wrapper); ``None`` means nothing was uploaded.

    Returns:
        A status message describing success or the failure reason.
    """
    global corpus_sentences, corpus_embeddings

    if file_info is None:
        return "No file uploaded. Please upload a JSON file."
    try:
        # The upload payload differs between component configurations, so
        # resolve the on-disk path from whichever form was received.
        if isinstance(file_info, str):
            file_path = file_info
        elif isinstance(file_info, dict):
            file_path = file_info['name']
        else:
            file_path = file_info.name

        corpus_sentences = load_corpus_from_json(file_path)
        corpus_embeddings = compute_embeddings_batch(corpus_sentences)
        return "Corpus loaded successfully with {} sentences.".format(len(corpus_sentences))
    except Exception as e:
        # UI boundary: report the failure in the status box instead of crashing.
        return "Error loading corpus: {}".format(e)
|
191 |
+
|
192 |
+
load_corpus_button.click(
|
193 |
+
fn=load_corpus,
|
194 |
+
inputs=json_uploader,
|
195 |
+
outputs=corpus_status
|
196 |
+
)
|
197 |
+
|
198 |
+
with gr.Tab("Semantic Search"):
|
199 |
+
input_question_box = gr.Textbox(label="Enter your question")
|
200 |
+
search_button = gr.Button("Search")
|
201 |
+
search_results_output = gr.Textbox(label="Search Results")
|
202 |
+
|
203 |
+
def perform_search(input_question):
    """Run a semantic search for the question against the loaded corpus.

    ``load_corpus`` publishes the corpus through ``global`` statements, so the
    data is looked up in the module globals here; reading same-named locals of
    the enclosing function would always see the initial empty lists.

    Args:
        input_question: The query text entered by the user.

    Returns:
        The ``search_similar_sentences`` result, or a message asking the user
        to load a corpus first when none is available.
    """
    module_state = globals()
    sentences = module_state.get("corpus_sentences")
    embeddings = module_state.get("corpus_embeddings")

    # Use len() rather than truthiness: ``not ndarray`` raises ValueError for
    # arrays holding more than one element.
    if sentences is None or embeddings is None or len(sentences) == 0 or len(embeddings) == 0:
        return "Corpus is not loaded. Please load a corpus first."
    return search_similar_sentences(input_question, sentences, embeddings)
|
207 |
+
|
208 |
+
search_button.click(
|
209 |
+
fn=perform_search,
|
210 |
+
inputs=input_question_box,
|
211 |
+
outputs=search_results_output
|
212 |
+
)
|
213 |
|
214 |
with gr.Row():
|
215 |
with gr.Column():
|