Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

mohbay committed on Jul 8

Commit

2f4967b

verified ·

1 Parent(s): 6dce45a

Update app.py

Browse files

Files changed (1) hide show

app.py +162 -104

app.py CHANGED Viewed

@@ -3,13 +3,19 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
@@ -25,9 +32,6 @@ df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
-import re
 ARABIC_STOPWORDS = {
     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
@@ -43,8 +47,36 @@ def arabic_word_tokenize(text):
     tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
     return [t for t in tokens if t not in ARABIC_STOPWORDS]
 def compute_word_overlap(query, questions):
     query_words = set(arabic_word_tokenize(query))
     if len(query_words) == 0:
         return [0.0] * len(questions)
@@ -56,7 +88,7 @@ def compute_word_overlap(query, questions):
             overlaps.append(0.0)
             continue
-        # Use Jaccard similarity (intersection over union) instead of just coverage
         intersection = len(query_words & q_words)
         union = len(query_words | q_words)
         jaccard = intersection / union if union > 0 else 0.0
@@ -70,15 +102,23 @@ def compute_word_overlap(query, questions):
     return overlaps
 def predict(text):
-    print(f"Received POST data: {text}")
     if not text or text.strip() == "":
         return "No query provided"
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-    # Cosine similarities
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
@@ -86,143 +126,161 @@ def predict(text):
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
-    # Enhanced word overlaps
     word_overlap1 = compute_word_overlap(text, df_questions)
     word_overlap2 = compute_word_overlap(text, df2_questions)
     word_overlap3 = compute_word_overlap(text, df3_questions)
-    # Adaptive weighting based on query length
     query_words = arabic_word_tokenize(text)
-    if len(query_words) <= 2:
-        # Short queries: prioritize exact word matches
-        weight = 0.6
-    elif len(query_words) <= 5:
-        # Medium queries: balanced
-        weight = 0.4
     else:
-        # Long queries: prioritize semantic similarity
-        weight = 0.25
-    # Collect top1 with better scoring
-    combined1 = []
-    for i in range(len(df_questions)):
-        semantic_score = float(sim_scores1[i].cpu().item())
-        word_score = float(word_overlap1[i])
-        # Boost results that have both good semantic AND word overlap
-        if semantic_score > 0.5 and word_score > 0.3:
-            boost = 0.1
-        else:
-            boost = 0.0
-        combined_score = semantic_score + weight * word_score + boost
-        combined1.append({
-            "question": df_questions[i],
-            "link": df_links[i],
-            "cosine_score": semantic_score,
-            "word_overlap_score": word_score,
-            "combined_score": combined_score
-        })
-    # Collect top2 with better scoring
-    combined2 = []
-    for i in range(len(df2_questions)):
-        semantic_score = float(sim_scores2[i].cpu().item())
-        word_score = float(word_overlap2[i])
-        if semantic_score > 0.5 and word_score > 0.3:
-            boost = 0.1
-        else:
-            boost = 0.0
-        combined_score = semantic_score + weight * word_score + boost
-        combined2.append({
-            "question": df2_questions[i],
-            "link": df2_links[i],
-            "cosine_score": semantic_score,
-            "word_overlap_score": word_score,
-            "combined_score": combined_score
-        })
-    # Collect top3 with better scoring
-    combined3 = []
-    for i in range(len(df3_questions)):
-        semantic_score = float(sim_scores3[i].cpu().item())
-        word_score = float(word_overlap3[i])
-        if semantic_score > 0.5 and word_score > 0.3:
-            boost = 0.1
-        else:
-            boost = 0.0
-        combined_score = semantic_score + weight * word_score + boost
-        combined3.append({
-            "question": df3_questions[i],
-            "link": df3_links[i],
-            "cosine_score": semantic_score,
-            "word_overlap_score": word_score,
-            "combined_score": combined_score
-        })
-    # Get top results with mixed ranking strategy
-    def get_mixed_top_results(combined_results):
-        # Sort by combined score and get top 3
         by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
-        top_3_combined = by_combined[:3]
-        # Get the questions from top 3 to avoid duplicates
-        top_3_questions = {item["question"] for item in top_3_combined}
-        # Sort by word overlap score and find first one not in top 3
-        by_word = sorted(combined_results, key=lambda x: x["word_overlap_score"], reverse=True)
-        word_pick = None
-        for item in by_word:
-            if item["question"] not in top_3_questions:
-                word_pick = item
                 break
-        # Sort by semantic score and find first one not in top 3 or word pick
-        by_semantic = sorted(combined_results, key=lambda x: x["cosine_score"], reverse=True)
         semantic_pick = None
-        excluded_questions = top_3_questions.copy()
-        if word_pick:
-            excluded_questions.add(word_pick["question"])
         for item in by_semantic:
-            if item["question"] not in excluded_questions:
                 semantic_pick = item
                 break
         # Combine results
-        final_results = top_3_combined.copy()
-        if word_pick:
-            final_results.append(word_pick)
         if semantic_pick:
             final_results.append(semantic_pick)
-        return final_results
-    top1 = get_mixed_top_results(combined1)
-    top2 = get_mixed_top_results(combined2)
-    top3 = get_mixed_top_results(combined3)
     results = {
         "top2": top2,
         "top3": top3,
         "top1": top1,
     }
     return results
-title = "Search CSV"
 iface = gr.Interface(
     fn=predict,
-    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
 )
-iface.launch()

 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
+from rank_bm25 import BM25Okapi
+import numpy as np
+# Load models
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+# Load data
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
+# Load pre-computed embeddings
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")
+# Extract questions and links
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
 ARABIC_STOPWORDS = {
     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
     tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
     return [t for t in tokens if t not in ARABIC_STOPWORDS]
def prepare_bm25_corpus(questions):
    """Tokenize every question into the list-of-token-lists form BM25 expects.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one token list per question, in input order, produced
        by ``arabic_word_tokenize``.
    """
    return [arabic_word_tokenize(entry) for entry in questions]
# Build one BM25 index per dataset at import time; the three corpora are
# tokenized first, then each tokenized corpus gets its own BM25Okapi model.
print("Initializing BM25 models...")
bm25_corpus1, bm25_corpus2, bm25_corpus3 = (
    prepare_bm25_corpus(question_set)
    for question_set in (df_questions, df2_questions, df3_questions)
)
bm25_model1, bm25_model2, bm25_model3 = (
    BM25Okapi(tokenized)
    for tokenized in (bm25_corpus1, bm25_corpus2, bm25_corpus3)
)
print("BM25 models initialized!")
def compute_bm25_scores(query, bm25_model):
    """Score every indexed document against ``query`` with BM25.

    Args:
        query: Raw query string; it is tokenized with
            ``arabic_word_tokenize`` before scoring.
        bm25_model: A fitted ``rank_bm25`` model (e.g. ``BM25Okapi``).

    Returns:
        A NumPy array holding one score per document in the model's corpus.
        When the query produces no tokens the array is all zeros.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        # rank_bm25 models record the number of indexed documents as
        # ``corpus_size`` and do not keep the corpus itself, so reading
        # ``bm25_model.corpus`` here would raise AttributeError.
        return np.zeros(bm25_model.corpus_size)
    return bm25_model.get_scores(query_tokens)
 def compute_word_overlap(query, questions):
+    """Enhanced word overlap computation"""
     query_words = set(arabic_word_tokenize(query))
     if len(query_words) == 0:
         return [0.0] * len(questions)
             overlaps.append(0.0)
             continue
+        # Use Jaccard similarity (intersection over union)
         intersection = len(query_words & q_words)
         union = len(query_words | q_words)
         jaccard = intersection / union if union > 0 else 0.0
     return overlaps
def normalize_scores(scores):
    """Min-max scale ``scores`` into the 0-1 range.

    Args:
        scores: Sequence or array of numeric scores.

    Returns:
        A float NumPy array with the same length as ``scores``. Empty input
        yields an empty array, and input whose values are all equal yields
        all zeros (there is no spread to scale by).
    """
    # Always work in float so constant integer input still gives float zeros.
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        # min()/max() raise ValueError on zero-size arrays; nothing to scale.
        return scores
    low = scores.min()
    high = scores.max()
    if high == low:
        return np.zeros_like(scores)
    return (scores - low) / (high - low)
 def predict(text):
+    print(f"Received query: {text}")
     if not text or text.strip() == "":
         return "No query provided"
+    # Semantic similarity scores
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)
+    # Cosine similarities (averaged from two models)
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
+    # BM25 scores
+    bm25_scores1 = compute_bm25_scores(text, bm25_model1)
+    bm25_scores2 = compute_bm25_scores(text, bm25_model2)
+    bm25_scores3 = compute_bm25_scores(text, bm25_model3)
+    # Word overlap scores
     word_overlap1 = compute_word_overlap(text, df_questions)
     word_overlap2 = compute_word_overlap(text, df2_questions)
     word_overlap3 = compute_word_overlap(text, df3_questions)
+    # Normalize all scores for fair combination
+    norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
+    norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
+    norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
+    norm_bm25_1 = normalize_scores(bm25_scores1)
+    norm_bm25_2 = normalize_scores(bm25_scores2)
+    norm_bm25_3 = normalize_scores(bm25_scores3)
+    norm_word1 = normalize_scores(word_overlap1)
+    norm_word2 = normalize_scores(word_overlap2)
+    norm_word3 = normalize_scores(word_overlap3)
+    # Adaptive weighting based on query characteristics
     query_words = arabic_word_tokenize(text)
+    query_length = len(query_words)
+    if query_length <= 2:
+        # Short queries: prioritize exact matches (BM25 + word overlap)
+        semantic_weight = 0.3
+        bm25_weight = 0.4
+        word_weight = 0.3
+    elif query_length <= 5:
+        # Medium queries: balanced approach
+        semantic_weight = 0.4
+        bm25_weight = 0.35
+        word_weight = 0.25
     else:
+        # Long queries: prioritize semantic understanding
+        semantic_weight = 0.5
+        bm25_weight = 0.3
+        word_weight = 0.2
+    def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
+        combined_results = []
+        for i in range(len(questions)):
+            semantic_score = float(norm_semantic[i])
+            bm25_score = float(norm_bm25[i])
+            word_score = float(norm_word[i])
+            # Enhanced scoring with BM25
+            combined_score = (semantic_weight * semantic_score +
+                            bm25_weight * bm25_score +
+                            word_weight * word_score)
+            # Boost results that perform well across multiple metrics
+            high_performance_count = sum([
+                semantic_score > 0.7,
+                bm25_score > 0.7,
+                word_score > 0.5
+            ])
+            if high_performance_count >= 2:
+                boost = 0.1
+            elif high_performance_count >= 1:
+                boost = 0.05
+            else:
+                boost = 0.0
+            final_score = combined_score + boost
+            combined_results.append({
+                "question": questions[i],
+                "link": links[i],
+                "semantic_score": semantic_score,
+                "bm25_score": bm25_score,
+                "word_overlap_score": word_score,
+                "combined_score": final_score
+            })
+        return combined_results
+    # Create combined results for all datasets
+    combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
+    combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
+    combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
+    def get_diverse_top_results(combined_results, top_k=5):
+        """Get diverse top results using multiple ranking strategies"""
+        # Sort by combined score and get top candidates
         by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
+        top_combined = by_combined[:3]
+        # Get questions from top combined to avoid duplicates
+        used_questions = {item["question"] for item in top_combined}
+        # Add best BM25 result not already included
+        by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
+        bm25_pick = None
+        for item in by_bm25:
+            if item["question"] not in used_questions:
+                bm25_pick = item
                 break
+        # Add best semantic result not already included
+        by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
         semantic_pick = None
+        if bm25_pick:
+            used_questions.add(bm25_pick["question"])
         for item in by_semantic:
+            if item["question"] not in used_questions:
                 semantic_pick = item
                 break
         # Combine results
+        final_results = top_combined.copy()
+        if bm25_pick:
+            final_results.append(bm25_pick)
         if semantic_pick:
             final_results.append(semantic_pick)
+        return final_results[:top_k]
+    # Get top results for each dataset
+    top1 = get_diverse_top_results(combined1)
+    top2 = get_diverse_top_results(combined2)
+    top3 = get_diverse_top_results(combined3)
     results = {
         "top2": top2,
         "top3": top3,
         "top1": top1,
+        "query_info": {
+            "query_length": query_length,
+            "weights": {
+                "semantic": semantic_weight,
+                "bm25": bm25_weight,
+                "word_overlap": word_weight
+            }
+        }
     }
     return results
+title = "Enhanced Search with BM25"
 iface = gr.Interface(
     fn=predict,
+    inputs=[gr.Textbox(label="Search Query", lines=3)],
     outputs='json',
     title=title,
+    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
 )
+if __name__ == "__main__":
+    iface.launch()