Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,13 +3,19 @@ import pandas as pd
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import re
|
|
|
|
|
6 |
|
|
|
7 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
8 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
|
|
|
|
9 |
df = pd.read_csv("cleaned1.csv")
|
10 |
df2 = pd.read_csv("cleaned2.csv")
|
11 |
df3 = pd.read_csv("cleaned3.csv")
|
12 |
|
|
|
13 |
embeddings = torch.load("embeddings1_1.pt")
|
14 |
embeddings2 = torch.load("embeddings2_1.pt")
|
15 |
embeddings3 = torch.load("embeddings3_1.pt")
|
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
|
|
18 |
embeddingsa2 = torch.load("embeddings2.pt")
|
19 |
embeddingsa3 = torch.load("embeddings3.pt")
|
20 |
|
|
|
21 |
df_questions = df["question"].values
|
22 |
df_links = df["link"].values
|
23 |
df2_questions = df2["question"].values
|
@@ -25,9 +32,6 @@ df2_links = df2["link"].values
|
|
25 |
df3_questions = df3["question"].values
|
26 |
df3_links = df3["url"].values
|
27 |
|
28 |
-
|
29 |
-
import re
|
30 |
-
|
31 |
ARABIC_STOPWORDS = {
|
32 |
'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
33 |
'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
@@ -43,8 +47,36 @@ def arabic_word_tokenize(text):
|
|
43 |
tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
|
44 |
return [t for t in tokens if t not in ARABIC_STOPWORDS]
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
def compute_word_overlap(query, questions):
|
|
|
48 |
query_words = set(arabic_word_tokenize(query))
|
49 |
if len(query_words) == 0:
|
50 |
return [0.0] * len(questions)
|
@@ -56,7 +88,7 @@ def compute_word_overlap(query, questions):
|
|
56 |
overlaps.append(0.0)
|
57 |
continue
|
58 |
|
59 |
-
# Use Jaccard similarity (intersection over union)
|
60 |
intersection = len(query_words & q_words)
|
61 |
union = len(query_words | q_words)
|
62 |
jaccard = intersection / union if union > 0 else 0.0
|
@@ -70,15 +102,23 @@ def compute_word_overlap(query, questions):
|
|
70 |
|
71 |
return overlaps
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def predict(text):
|
74 |
-
print(f"Received
|
75 |
if not text or text.strip() == "":
|
76 |
return "No query provided"
|
77 |
|
|
|
78 |
query_embedding = model.encode(text, convert_to_tensor=True)
|
79 |
query_embeddinga = modela.encode(text, convert_to_tensor=True)
|
80 |
|
81 |
-
# Cosine similarities
|
82 |
sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
|
83 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
|
84 |
sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
|
@@ -86,143 +126,161 @@ def predict(text):
|
|
86 |
sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
|
87 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
88 |
|
89 |
-
#
|
|
|
|
|
|
|
|
|
|
|
90 |
word_overlap1 = compute_word_overlap(text, df_questions)
|
91 |
word_overlap2 = compute_word_overlap(text, df2_questions)
|
92 |
word_overlap3 = compute_word_overlap(text, df3_questions)
|
93 |
|
94 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
query_words = arabic_word_tokenize(text)
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
else:
|
103 |
-
# Long queries: prioritize semantic
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
word_score = float(word_overlap1[i])
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
boost = 0.0
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
"link": df_links[i],
|
123 |
-
"cosine_score": semantic_score,
|
124 |
-
"word_overlap_score": word_score,
|
125 |
-
"combined_score": combined_score
|
126 |
-
})
|
127 |
-
|
128 |
-
# Collect top2 with better scoring
|
129 |
-
combined2 = []
|
130 |
-
for i in range(len(df2_questions)):
|
131 |
-
semantic_score = float(sim_scores2[i].cpu().item())
|
132 |
-
word_score = float(word_overlap2[i])
|
133 |
-
|
134 |
-
if semantic_score > 0.5 and word_score > 0.3:
|
135 |
-
boost = 0.1
|
136 |
-
else:
|
137 |
-
boost = 0.0
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
if semantic_score > 0.5 and word_score > 0.3:
|
156 |
-
boost = 0.1
|
157 |
-
else:
|
158 |
-
boost = 0.0
|
159 |
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
# Sort by combined score and get top 3
|
173 |
by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
|
174 |
-
|
175 |
|
176 |
-
# Get
|
177 |
-
|
178 |
|
179 |
-
#
|
180 |
-
|
181 |
-
|
182 |
-
for item in
|
183 |
-
if item["question"] not in
|
184 |
-
|
185 |
break
|
186 |
|
187 |
-
#
|
188 |
-
by_semantic = sorted(combined_results, key=lambda x: x["
|
189 |
semantic_pick = None
|
190 |
-
|
191 |
-
|
192 |
-
excluded_questions.add(word_pick["question"])
|
193 |
|
194 |
for item in by_semantic:
|
195 |
-
if item["question"] not in
|
196 |
semantic_pick = item
|
197 |
break
|
198 |
|
199 |
# Combine results
|
200 |
-
final_results =
|
201 |
-
if
|
202 |
-
final_results.append(
|
203 |
if semantic_pick:
|
204 |
final_results.append(semantic_pick)
|
205 |
|
206 |
-
return final_results
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
211 |
|
212 |
results = {
|
213 |
|
214 |
"top2": top2,
|
215 |
"top3": top3,
|
216 |
"top1": top1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
}
|
218 |
|
219 |
return results
|
220 |
|
221 |
-
title = "Search
|
222 |
iface = gr.Interface(
|
223 |
fn=predict,
|
224 |
-
inputs=[gr.Textbox(label="
|
225 |
outputs='json',
|
226 |
title=title,
|
|
|
227 |
)
|
228 |
-
|
|
|
|
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import re
|
6 |
+
from rank_bm25 import BM25Okapi
|
7 |
+
import numpy as np
|
8 |
|
9 |
+
# Load models
|
10 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
11 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
12 |
+
|
13 |
+
# Load data
|
14 |
df = pd.read_csv("cleaned1.csv")
|
15 |
df2 = pd.read_csv("cleaned2.csv")
|
16 |
df3 = pd.read_csv("cleaned3.csv")
|
17 |
|
18 |
+
# Load pre-computed embeddings
|
19 |
embeddings = torch.load("embeddings1_1.pt")
|
20 |
embeddings2 = torch.load("embeddings2_1.pt")
|
21 |
embeddings3 = torch.load("embeddings3_1.pt")
|
|
|
24 |
embeddingsa2 = torch.load("embeddings2.pt")
|
25 |
embeddingsa3 = torch.load("embeddings3.pt")
|
26 |
|
27 |
+
# Extract questions and links
|
28 |
df_questions = df["question"].values
|
29 |
df_links = df["link"].values
|
30 |
df2_questions = df2["question"].values
|
|
|
32 |
df3_questions = df3["question"].values
|
33 |
df3_links = df3["url"].values
|
34 |
|
|
|
|
|
|
|
35 |
ARABIC_STOPWORDS = {
|
36 |
'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
37 |
'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
|
|
47 |
tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
|
48 |
return [t for t in tokens if t not in ARABIC_STOPWORDS]
|
49 |
|
50 |
+
def prepare_bm25_corpus(questions):
    """Tokenize every question so the result can be fed to a BM25 index.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list holding one token list per question, in input order, as
        produced by ``arabic_word_tokenize``.
    """
    return [arabic_word_tokenize(entry) for entry in questions]
|
57 |
+
|
58 |
+
# Initialize BM25 models for each dataset
|
59 |
+
print("Initializing BM25 models...")
|
60 |
+
bm25_corpus1 = prepare_bm25_corpus(df_questions)
|
61 |
+
bm25_corpus2 = prepare_bm25_corpus(df2_questions)
|
62 |
+
bm25_corpus3 = prepare_bm25_corpus(df3_questions)
|
63 |
+
|
64 |
+
bm25_model1 = BM25Okapi(bm25_corpus1)
|
65 |
+
bm25_model2 = BM25Okapi(bm25_corpus2)
|
66 |
+
bm25_model3 = BM25Okapi(bm25_corpus3)
|
67 |
+
print("BM25 models initialized!")
|
68 |
+
|
69 |
+
def compute_bm25_scores(query, bm25_model):
    """Score every indexed document against *query* with BM25.

    Args:
        query: Raw query text; it is tokenized with ``arabic_word_tokenize``.
        bm25_model: A fitted ``rank_bm25`` index (e.g. ``BM25Okapi``).

    Returns:
        The result of ``bm25_model.get_scores`` for the query tokens, or an
        all-zero array with one entry per indexed document when the query
        yields no usable tokens.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        # rank_bm25 models expose the document count as ``corpus_size``; they
        # have no ``corpus`` attribute, so reading one here would raise
        # AttributeError for any query without Arabic tokens.
        return np.zeros(bm25_model.corpus_size)

    return bm25_model.get_scores(query_tokens)
|
77 |
|
78 |
def compute_word_overlap(query, questions):
|
79 |
+
"""Enhanced word overlap computation"""
|
80 |
query_words = set(arabic_word_tokenize(query))
|
81 |
if len(query_words) == 0:
|
82 |
return [0.0] * len(questions)
|
|
|
88 |
overlaps.append(0.0)
|
89 |
continue
|
90 |
|
91 |
+
# Use Jaccard similarity (intersection over union)
|
92 |
intersection = len(query_words & q_words)
|
93 |
union = len(query_words | q_words)
|
94 |
jaccard = intersection / union if union > 0 else 0.0
|
|
|
102 |
|
103 |
return overlaps
|
104 |
|
105 |
+
def normalize_scores(scores):
    """Min-max scale *scores* into the range [0, 1].

    Args:
        scores: Sequence or array of numeric scores.

    Returns:
        A float ``numpy`` array with the same shape as the input. The result
        is empty for empty input and all zeros when every score is equal.
    """
    # Force a float dtype so integer input never yields an integer result.
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        # np.min / np.max raise ValueError on zero-size arrays.
        return scores

    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        # Constant input: there is no range to scale by.
        return np.zeros_like(scores)
    return (scores - lowest) / spread
|
111 |
+
|
112 |
def predict(text):
|
113 |
+
print(f"Received query: {text}")
|
114 |
if not text or text.strip() == "":
|
115 |
return "No query provided"
|
116 |
|
117 |
+
# Semantic similarity scores
|
118 |
query_embedding = model.encode(text, convert_to_tensor=True)
|
119 |
query_embeddinga = modela.encode(text, convert_to_tensor=True)
|
120 |
|
121 |
+
# Cosine similarities (averaged from two models)
|
122 |
sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
|
123 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
|
124 |
sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
|
|
|
126 |
sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
|
127 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
128 |
|
129 |
+
# BM25 scores
|
130 |
+
bm25_scores1 = compute_bm25_scores(text, bm25_model1)
|
131 |
+
bm25_scores2 = compute_bm25_scores(text, bm25_model2)
|
132 |
+
bm25_scores3 = compute_bm25_scores(text, bm25_model3)
|
133 |
+
|
134 |
+
# Word overlap scores
|
135 |
word_overlap1 = compute_word_overlap(text, df_questions)
|
136 |
word_overlap2 = compute_word_overlap(text, df2_questions)
|
137 |
word_overlap3 = compute_word_overlap(text, df3_questions)
|
138 |
|
139 |
+
# Normalize all scores for fair combination
|
140 |
+
norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
|
141 |
+
norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
|
142 |
+
norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
|
143 |
+
|
144 |
+
norm_bm25_1 = normalize_scores(bm25_scores1)
|
145 |
+
norm_bm25_2 = normalize_scores(bm25_scores2)
|
146 |
+
norm_bm25_3 = normalize_scores(bm25_scores3)
|
147 |
+
|
148 |
+
norm_word1 = normalize_scores(word_overlap1)
|
149 |
+
norm_word2 = normalize_scores(word_overlap2)
|
150 |
+
norm_word3 = normalize_scores(word_overlap3)
|
151 |
+
|
152 |
+
# Adaptive weighting based on query characteristics
|
153 |
query_words = arabic_word_tokenize(text)
|
154 |
+
query_length = len(query_words)
|
155 |
+
|
156 |
+
if query_length <= 2:
|
157 |
+
# Short queries: prioritize exact matches (BM25 + word overlap)
|
158 |
+
semantic_weight = 0.3
|
159 |
+
bm25_weight = 0.4
|
160 |
+
word_weight = 0.3
|
161 |
+
elif query_length <= 5:
|
162 |
+
# Medium queries: balanced approach
|
163 |
+
semantic_weight = 0.4
|
164 |
+
bm25_weight = 0.35
|
165 |
+
word_weight = 0.25
|
166 |
else:
|
167 |
+
# Long queries: prioritize semantic understanding
|
168 |
+
semantic_weight = 0.5
|
169 |
+
bm25_weight = 0.3
|
170 |
+
word_weight = 0.2
|
171 |
+
|
172 |
+
def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
    """Build one scored record per question for a single dataset.

    The three per-row scores are blended using ``semantic_weight``,
    ``bm25_weight`` and ``word_weight``, which are read from the surrounding
    scope, and a small bonus is added when a row clears the "strong"
    threshold on one or more individual metrics.

    Args:
        questions: Indexable collection of question texts.
        links: Collection of links, indexed in parallel with ``questions``.
        norm_semantic: Per-row semantic scores, indexed in parallel.
        norm_bm25: Per-row BM25 scores, indexed in parallel.
        norm_word: Per-row word-overlap scores, indexed in parallel.

    Returns:
        A list of dicts with the keys ``question``, ``link``,
        ``semantic_score``, ``bm25_score``, ``word_overlap_score`` and
        ``combined_score`` (weighted blend plus bonus).
    """
    records = []

    for idx, question in enumerate(questions):
        sem = float(norm_semantic[idx])
        bm25 = float(norm_bm25[idx])
        overlap = float(norm_word[idx])

        weighted = (semantic_weight * sem +
                    bm25_weight * bm25 +
                    word_weight * overlap)

        # Count how many individual metrics clear their "strong" threshold.
        strong_signals = int(sem > 0.7) + int(bm25 > 0.7) + int(overlap > 0.5)
        if strong_signals >= 2:
            bonus = 0.1
        elif strong_signals == 1:
            bonus = 0.05
        else:
            bonus = 0.0

        records.append({
            "question": question,
            "link": links[idx],
            "semantic_score": sem,
            "bm25_score": bm25,
            "word_overlap_score": overlap,
            "combined_score": weighted + bonus,
        })

    return records
|
211 |
+
|
212 |
+
# Create combined results for all datasets
|
213 |
+
combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
|
214 |
+
combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
|
215 |
+
combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
|
216 |
+
|
217 |
+
def get_diverse_top_results(combined_results, top_k=5):
    """Pick a short, varied result list from scored records.

    The three records with the highest ``combined_score`` come first. They
    are followed by the best ``bm25_score`` record whose question is not
    already selected, then by the best ``semantic_score`` record whose
    question is not already selected.

    Args:
        combined_results: List of dicts carrying at least ``question``,
            ``combined_score``, ``bm25_score`` and ``semantic_score``.
        top_k: Maximum number of records to return.

    Returns:
        A new list of at most ``top_k`` records in the order described above.
    """
    def best_unused(score_key, used):
        # A single pass replaces a full sort. max() keeps the first of equally
        # scored records, which is exactly what a stable descending sort
        # followed by "first unused record" would select.
        return max(
            (rec for rec in combined_results if rec["question"] not in used),
            key=lambda rec: rec[score_key],
            default=None,
        )

    top_combined = sorted(
        combined_results, key=lambda rec: rec["combined_score"], reverse=True
    )[:3]

    # Questions already chosen; the extra picks must not repeat them.
    used_questions = {rec["question"] for rec in top_combined}

    bm25_pick = best_unused("bm25_score", used_questions)
    if bm25_pick is not None:
        used_questions.add(bm25_pick["question"])

    semantic_pick = best_unused("semantic_score", used_questions)

    final_results = list(top_combined)
    final_results.extend(
        pick for pick in (bm25_pick, semantic_pick) if pick is not None
    )
    return final_results[:top_k]
|
253 |
|
254 |
+
# Get top results for each dataset
|
255 |
+
top1 = get_diverse_top_results(combined1)
|
256 |
+
top2 = get_diverse_top_results(combined2)
|
257 |
+
top3 = get_diverse_top_results(combined3)
|
258 |
|
259 |
results = {
|
260 |
|
261 |
"top2": top2,
|
262 |
"top3": top3,
|
263 |
"top1": top1,
|
264 |
+
"query_info": {
|
265 |
+
"query_length": query_length,
|
266 |
+
"weights": {
|
267 |
+
"semantic": semantic_weight,
|
268 |
+
"bm25": bm25_weight,
|
269 |
+
"word_overlap": word_weight
|
270 |
+
}
|
271 |
+
}
|
272 |
}
|
273 |
|
274 |
return results
|
275 |
|
276 |
+
title = "Enhanced Search with BM25"
|
277 |
iface = gr.Interface(
|
278 |
fn=predict,
|
279 |
+
inputs=[gr.Textbox(label="Search Query", lines=3)],
|
280 |
outputs='json',
|
281 |
title=title,
|
282 |
+
description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
|
283 |
)
|
284 |
+
|
285 |
+
if __name__ == "__main__":
|
286 |
+
iface.launch()
|