mohbay committed
Commit 2f4967b · verified · 1 Parent(s): 6dce45a

Update app.py

Files changed (1)
  1. app.py +162 -104
app.py CHANGED
@@ -3,13 +3,19 @@ import pandas as pd
  from sentence_transformers import SentenceTransformer, util
  import gradio as gr
  import re

  model = SentenceTransformer("distilbert-base-multilingual-cased")
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

  df = pd.read_csv("cleaned1.csv")
  df2 = pd.read_csv("cleaned2.csv")
  df3 = pd.read_csv("cleaned3.csv")

  embeddings = torch.load("embeddings1_1.pt")
  embeddings2 = torch.load("embeddings2_1.pt")
  embeddings3 = torch.load("embeddings3_1.pt")
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
  embeddingsa2 = torch.load("embeddings2.pt")
  embeddingsa3 = torch.load("embeddings3.pt")

  df_questions = df["question"].values
  df_links = df["link"].values
  df2_questions = df2["question"].values
@@ -25,9 +32,6 @@ df2_links = df2["link"].values
  df3_questions = df3["question"].values
  df3_links = df3["url"].values

-
- import re
-
  ARABIC_STOPWORDS = {
      'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
      'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
@@ -43,8 +47,36 @@ def arabic_word_tokenize(text):
      tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
      return [t for t in tokens if t not in ARABIC_STOPWORDS]


  def compute_word_overlap(query, questions):
      query_words = set(arabic_word_tokenize(query))
      if len(query_words) == 0:
          return [0.0] * len(questions)
@@ -56,7 +88,7 @@ def compute_word_overlap(query, questions):
              overlaps.append(0.0)
              continue

-         # Use Jaccard similarity (intersection over union) instead of just coverage
          intersection = len(query_words & q_words)
          union = len(query_words | q_words)
          jaccard = intersection / union if union > 0 else 0.0
@@ -70,15 +102,23 @@ def compute_word_overlap(query, questions):

      return overlaps

  def predict(text):
-     print(f"Received POST data: {text}")
      if not text or text.strip() == "":
          return "No query provided"

      query_embedding = model.encode(text, convert_to_tensor=True)
      query_embeddinga = modela.encode(text, convert_to_tensor=True)

-     # Cosine similarities
      sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                     util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
      sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
@@ -86,143 +126,161 @@ def predict(text):
      sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                     util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

-     # Enhanced word overlaps
      word_overlap1 = compute_word_overlap(text, df_questions)
      word_overlap2 = compute_word_overlap(text, df2_questions)
      word_overlap3 = compute_word_overlap(text, df3_questions)

-     # Adaptive weighting based on query length
      query_words = arabic_word_tokenize(text)
-     if len(query_words) <= 2:
-         # Short queries: prioritize exact word matches
-         weight = 0.6
-     elif len(query_words) <= 5:
-         # Medium queries: balanced
-         weight = 0.4
      else:
-         # Long queries: prioritize semantic similarity
-         weight = 0.25
-
-     # Collect top1 with better scoring
-     combined1 = []
-     for i in range(len(df_questions)):
-         semantic_score = float(sim_scores1[i].cpu().item())
-         word_score = float(word_overlap1[i])

-         # Boost results that have both good semantic AND word overlap
-         if semantic_score > 0.5 and word_score > 0.3:
-             boost = 0.1
-         else:
-             boost = 0.0

-         combined_score = semantic_score + weight * word_score + boost
-
-         combined1.append({
-             "question": df_questions[i],
-             "link": df_links[i],
-             "cosine_score": semantic_score,
-             "word_overlap_score": word_score,
-             "combined_score": combined_score
-         })
-
-     # Collect top2 with better scoring
-     combined2 = []
-     for i in range(len(df2_questions)):
-         semantic_score = float(sim_scores2[i].cpu().item())
-         word_score = float(word_overlap2[i])
-
-         if semantic_score > 0.5 and word_score > 0.3:
-             boost = 0.1
-         else:
-             boost = 0.0

-         combined_score = semantic_score + weight * word_score + boost
-
-         combined2.append({
-             "question": df2_questions[i],
-             "link": df2_links[i],
-             "cosine_score": semantic_score,
-             "word_overlap_score": word_score,
-             "combined_score": combined_score
-         })
-
-     # Collect top3 with better scoring
-     combined3 = []
-     for i in range(len(df3_questions)):
-         semantic_score = float(sim_scores3[i].cpu().item())
-         word_score = float(word_overlap3[i])
-
-         if semantic_score > 0.5 and word_score > 0.3:
-             boost = 0.1
-         else:
-             boost = 0.0

-         combined_score = semantic_score + weight * word_score + boost

-         combined3.append({
-             "question": df3_questions[i],
-             "link": df3_links[i],
-             "cosine_score": semantic_score,
-             "word_overlap_score": word_score,
-             "combined_score": combined_score
-         })
-
-     # Get top results with mixed ranking strategy
-     def get_mixed_top_results(combined_results):
-         # Sort by combined score and get top 3
          by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
-         top_3_combined = by_combined[:3]

-         # Get the questions from top 3 to avoid duplicates
-         top_3_questions = {item["question"] for item in top_3_combined}

-         # Sort by word overlap score and find first one not in top 3
-         by_word = sorted(combined_results, key=lambda x: x["word_overlap_score"], reverse=True)
-         word_pick = None
-         for item in by_word:
-             if item["question"] not in top_3_questions:
-                 word_pick = item
                  break

-         # Sort by semantic score and find first one not in top 3 or word pick
-         by_semantic = sorted(combined_results, key=lambda x: x["cosine_score"], reverse=True)
          semantic_pick = None
-         excluded_questions = top_3_questions.copy()
-         if word_pick:
-             excluded_questions.add(word_pick["question"])

          for item in by_semantic:
-             if item["question"] not in excluded_questions:
                  semantic_pick = item
                  break

          # Combine results
-         final_results = top_3_combined.copy()
-         if word_pick:
-             final_results.append(word_pick)
          if semantic_pick:
              final_results.append(semantic_pick)

-         return final_results

-     top1 = get_mixed_top_results(combined1)
-     top2 = get_mixed_top_results(combined2)
-     top3 = get_mixed_top_results(combined3)

      results = {

          "top2": top2,
          "top3": top3,
          "top1": top1,
      }

      return results

- title = "Search CSV"
  iface = gr.Interface(
      fn=predict,
-     inputs=[gr.Textbox(label="text", lines=3)],
      outputs='json',
      title=title,
  )
- iface.launch()

  from sentence_transformers import SentenceTransformer, util
  import gradio as gr
  import re
+ from rank_bm25 import BM25Okapi
+ import numpy as np

+ # Load models
  model = SentenceTransformer("distilbert-base-multilingual-cased")
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+ # Load data
  df = pd.read_csv("cleaned1.csv")
  df2 = pd.read_csv("cleaned2.csv")
  df3 = pd.read_csv("cleaned3.csv")

+ # Load pre-computed embeddings
  embeddings = torch.load("embeddings1_1.pt")
  embeddings2 = torch.load("embeddings2_1.pt")
  embeddings3 = torch.load("embeddings3_1.pt")

  embeddingsa2 = torch.load("embeddings2.pt")
  embeddingsa3 = torch.load("embeddings3.pt")

+ # Extract questions and links
  df_questions = df["question"].values
  df_links = df["link"].values
  df2_questions = df2["question"].values

  df3_questions = df3["question"].values
  df3_links = df3["url"].values

  ARABIC_STOPWORDS = {
      'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
      'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',

      tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
      return [t for t in tokens if t not in ARABIC_STOPWORDS]

+ def prepare_bm25_corpus(questions):
+     """Prepare tokenized corpus for BM25"""
+     tokenized_corpus = []
+     for question in questions:
+         tokens = arabic_word_tokenize(question)
+         tokenized_corpus.append(tokens)
+     return tokenized_corpus
+
+ # Initialize BM25 models for each dataset
+ print("Initializing BM25 models...")
+ bm25_corpus1 = prepare_bm25_corpus(df_questions)
+ bm25_corpus2 = prepare_bm25_corpus(df2_questions)
+ bm25_corpus3 = prepare_bm25_corpus(df3_questions)
+
+ bm25_model1 = BM25Okapi(bm25_corpus1)
+ bm25_model2 = BM25Okapi(bm25_corpus2)
+ bm25_model3 = BM25Okapi(bm25_corpus3)
+ print("BM25 models initialized!")
+
+ def compute_bm25_scores(query, bm25_model):
+     """Compute BM25 scores for a query"""
+     query_tokens = arabic_word_tokenize(query)
+     if not query_tokens:
+         return np.zeros(len(bm25_model.corpus))
+
+     scores = bm25_model.get_scores(query_tokens)
+     return scores

  def compute_word_overlap(query, questions):
+     """Enhanced word overlap computation"""
      query_words = set(arabic_word_tokenize(query))
      if len(query_words) == 0:
          return [0.0] * len(questions)

              overlaps.append(0.0)
              continue

+         # Use Jaccard similarity (intersection over union)
          intersection = len(query_words & q_words)
          union = len(query_words | q_words)
          jaccard = intersection / union if union > 0 else 0.0

      return overlaps

+ def normalize_scores(scores):
+     """Normalize scores to 0-1 range"""
+     scores = np.array(scores)
+     if np.max(scores) == np.min(scores):
+         return np.zeros_like(scores)
+     return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
+
  def predict(text):
+     print(f"Received query: {text}")
      if not text or text.strip() == "":
          return "No query provided"

+     # Semantic similarity scores
      query_embedding = model.encode(text, convert_to_tensor=True)
      query_embeddinga = modela.encode(text, convert_to_tensor=True)

+     # Cosine similarities (averaged from two models)
      sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                     util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
      sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +

      sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                     util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

+     # BM25 scores
+     bm25_scores1 = compute_bm25_scores(text, bm25_model1)
+     bm25_scores2 = compute_bm25_scores(text, bm25_model2)
+     bm25_scores3 = compute_bm25_scores(text, bm25_model3)
+
+     # Word overlap scores
      word_overlap1 = compute_word_overlap(text, df_questions)
      word_overlap2 = compute_word_overlap(text, df2_questions)
      word_overlap3 = compute_word_overlap(text, df3_questions)

+     # Normalize all scores for fair combination
+     norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
+     norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
+     norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
+
+     norm_bm25_1 = normalize_scores(bm25_scores1)
+     norm_bm25_2 = normalize_scores(bm25_scores2)
+     norm_bm25_3 = normalize_scores(bm25_scores3)
+
+     norm_word1 = normalize_scores(word_overlap1)
+     norm_word2 = normalize_scores(word_overlap2)
+     norm_word3 = normalize_scores(word_overlap3)
+
+     # Adaptive weighting based on query characteristics
      query_words = arabic_word_tokenize(text)
+     query_length = len(query_words)
+
+     if query_length <= 2:
+         # Short queries: prioritize exact matches (BM25 + word overlap)
+         semantic_weight = 0.3
+         bm25_weight = 0.4
+         word_weight = 0.3
+     elif query_length <= 5:
+         # Medium queries: balanced approach
+         semantic_weight = 0.4
+         bm25_weight = 0.35
+         word_weight = 0.25
      else:
+         # Long queries: prioritize semantic understanding
+         semantic_weight = 0.5
+         bm25_weight = 0.3
+         word_weight = 0.2
+
+     def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
+         combined_results = []

+         for i in range(len(questions)):
+             semantic_score = float(norm_semantic[i])
+             bm25_score = float(norm_bm25[i])
+             word_score = float(norm_word[i])

+             # Enhanced scoring with BM25
+             combined_score = (semantic_weight * semantic_score +
+                               bm25_weight * bm25_score +
+                               word_weight * word_score)

+             # Boost results that perform well across multiple metrics
+             high_performance_count = sum([
+                 semantic_score > 0.7,
+                 bm25_score > 0.7,
+                 word_score > 0.5
+             ])
+
+             if high_performance_count >= 2:
+                 boost = 0.1
+             elif high_performance_count >= 1:
+                 boost = 0.05
+             else:
+                 boost = 0.0
+
+             final_score = combined_score + boost

+             combined_results.append({
+                 "question": questions[i],
+                 "link": links[i],
+                 "semantic_score": semantic_score,
+                 "bm25_score": bm25_score,
+                 "word_overlap_score": word_score,
+                 "combined_score": final_score
+             })

+         return combined_results
+
+     # Create combined results for all datasets
+     combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
+     combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
+     combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
+
+     def get_diverse_top_results(combined_results, top_k=5):
+         """Get diverse top results using multiple ranking strategies"""
+         # Sort by combined score and get top candidates
          by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
+         top_combined = by_combined[:3]

+         # Get questions from top combined to avoid duplicates
+         used_questions = {item["question"] for item in top_combined}

+         # Add best BM25 result not already included
+         by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
+         bm25_pick = None
+         for item in by_bm25:
+             if item["question"] not in used_questions:
+                 bm25_pick = item
                  break

+         # Add best semantic result not already included
+         by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
          semantic_pick = None
+         if bm25_pick:
+             used_questions.add(bm25_pick["question"])

          for item in by_semantic:
+             if item["question"] not in used_questions:
                  semantic_pick = item
                  break

          # Combine results
+         final_results = top_combined.copy()
+         if bm25_pick:
+             final_results.append(bm25_pick)
          if semantic_pick:
              final_results.append(semantic_pick)

+         return final_results[:top_k]

+     # Get top results for each dataset
+     top1 = get_diverse_top_results(combined1)
+     top2 = get_diverse_top_results(combined2)
+     top3 = get_diverse_top_results(combined3)

      results = {

          "top2": top2,
          "top3": top3,
          "top1": top1,
+         "query_info": {
+             "query_length": query_length,
+             "weights": {
+                 "semantic": semantic_weight,
+                 "bm25": bm25_weight,
+                 "word_overlap": word_weight
+             }
+         }
      }

      return results

+ title = "Enhanced Search with BM25"
  iface = gr.Interface(
      fn=predict,
+     inputs=[gr.Textbox(label="Search Query", lines=3)],
      outputs='json',
      title=title,
+     description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
  )
+
+ if __name__ == "__main__":
+     iface.launch()
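
A quick way to sanity-check the updated pipeline is to call the new predict function directly. The sketch below is illustrative only and is not part of the commit; it assumes app.py sits next to its CSV, .pt and rank_bm25 dependencies, and the Arabic query is just a placeholder.

# Minimal local smoke test (illustrative sketch, not part of the commit).
# Importing app loads both SentenceTransformer models, the CSVs, the saved
# embeddings, and builds the three BM25 indexes at module level.
from app import predict

results = predict("حكم صلاة الجماعة")  # placeholder Arabic query
for item in results["top1"]:
    print(round(item["combined_score"], 3), item["question"], item["link"])

# The adaptive weights chosen for this query length are echoed back in query_info.
print(results["query_info"])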