Update app.py
app.py
CHANGED
@@ -30,6 +30,15 @@ from mistralai import Mistral
from dotenv import load_dotenv
import re
from typing import List, Tuple
+from rank_bm25 import BM25Okapi
+from typing import List, Dict
+import numpy as np
+from math import log
+from collections import Counter
+import numpy as np
+from typing import List, Dict, Tuple
+import datetime
+CURRENT_YEAR = datetime.datetime.now().year

# Automatically get the current year
current_year = datetime.datetime.now().year
@@ -252,8 +261,7 @@ def scrape_with_newspaper(url):
    return ""

def rephrase_query(chat_history, query, temperature=0.2):
+    system_prompt = """You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:

1. Determine if the new query is a continuation of the previous conversation or an entirely new topic.

@@ -271,42 +279,56 @@ You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
   - Ensure that entities from the previous context are properly quoted if they appear in the rephrased query.

4. For both continuations and new topics:
+   - First, check if the query contains words indicating current information (e.g., "today", "now", "current", "latest"):
+     - If present, do NOT add any date operators to the query
+   - Otherwise, if the query mentions a specific time period (e.g., a quarter, year, or date range):
+     - Add appropriate "after:" and "before:" operators to the end of the rephrased query.
+     - Use the format "after:YYYY-MM-DD before:YYYY-MM-DD" for date ranges.
+     - For quarters, use the start and end dates of the following quarter (when results would typically be reported).
+   - If no specific time period is mentioned and no current-time indicators are present:
+     - Append "after: {CURRENT_YEAR}" to the end of the rephrased query.
+   - Ensure there is a space before "after:" and "before:" for proper formatting.
+   - Do not use quotes or the "+" operator when adding dates.

5. **Output**:
   - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
   - Do not include any additional commentary or explanation.

### Example Scenarios
+
+**Scenario 1: Query About Current Information**
+- **User Query**: "What's the stock price of Apple today?"
+- **Rephrased Query**: "What's the stock price of \"Apple\" today"
+
+**Scenario 2: New Topic with Specific Quarter**
+- **User Query**: "How did Bank of America perform during Q2 2024?"
+- **Rephrased Query**: "How did \"Bank of America\" perform during Q2 2024 after:2024-07-01 before:2024-09-30"
+
+**Scenario 3: Continuation with Date Range**
+- **Previous Query**: "What were Apple's sales figures for 2023?"
+- **User Query**: "How about for the first half of 2024?"
+- **Rephrased Query**: "How about \"Apple\"'s sales figures for the first half of 2024 after:2024-01-01 before:2024-06-30"
+
+**Scenario 4: Current Status Query**
+- **User Query**: "What is the current market share of Toyota and Honda in the US?"
+- **Rephrased Query**: "What is the current market share of \"Toyota\" and \"Honda\" in the \"US\""
+
+**Scenario 5: Query Without Recognizable Entities but with Time Period**
+- **User Query**: "What were the major scientific breakthroughs in 2024?"
+- **Rephrased Query**: "What were the major scientific breakthroughs in 2024 after:2024-01-01 before:2024-12-31"
"""
+
+    # Create the user prompt with the chat history and current query
+    user_prompt = f"""Conversation context: {chat_history}
New query: {query}
+Current year: {CURRENT_YEAR}
+Rephrased query:"""
+
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
+
    try:
        logger.info(f"Sending rephrasing request to LLM with temperature {temperature}")
        response = client.chat_completion(

@@ -316,10 +338,12 @@ Rephrased query:
        )
        logger.info("Received rephrased query from LLM")
        rephrased_question = response.choices[0].message.content.strip()
+
        # Remove surrounding quotes if present
        if (rephrased_question.startswith('"') and rephrased_question.endswith('"')) or \
           (rephrased_question.startswith("'") and rephrased_question.endswith("'")):
            rephrased_question = rephrased_question[1:-1].strip()
+
        logger.info(f"Rephrased Query (cleaned): {rephrased_question}")
        return rephrased_question
    except Exception as e:
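The quarter rule in item 4 of the prompt maps a mentioned quarter to the date range of the following quarter, when results are typically reported, which is how Scenario 2 turns Q2 2024 into after:2024-07-01 before:2024-09-30. A minimal sketch of that mapping in plain Python (a hypothetical helper for illustration only; app.py leaves this arithmetic to the LLM via the prompt):

# Hypothetical helper, not part of app.py: shows the "following quarter" date math
# that the system prompt asks the model to perform.
def following_quarter_range(year: int, quarter: int) -> str:
    # Q1 results are reported in Q2, ..., Q4 results in Q1 of the next year
    ranges = {
        1: (f"{year}-04-01", f"{year}-06-30"),
        2: (f"{year}-07-01", f"{year}-09-30"),
        3: (f"{year}-10-01", f"{year}-12-31"),
        4: (f"{year + 1}-01-01", f"{year + 1}-03-31"),
    }
    start, end = ranges[quarter]
    return f"after:{start} before:{end}"

# Matches Scenario 2: Q2 2024 -> "after:2024-07-01 before:2024-09-30"
print(following_quarter_range(2024, 2))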
@@ -332,52 +356,161 @@ def extract_entity_domain(query):
    matches = re.findall(domain_pattern, query)
    return matches[0] if matches else None

+class BM25:
+    def __init__(self, k1: float = 1.5, b: float = 0.75):
+        self.k1 = k1  # term frequency saturation parameter
+        self.b = b  # length normalization parameter
+        self.corpus_size = 0
+        self.doc_lengths = []
+        self.avgdl = 0
+        self.doc_freqs = []
+        self.idf = {}
+        self.doc_vectors = []
+
+    def fit(self, corpus: List[str]):
+        """
+        Fit BM25 parameters to the corpus
+
+        Args:
+            corpus: List of document strings
+        """
+        self.corpus_size = len(corpus)
+
+        # Calculate document lengths and average document length
+        self.doc_lengths = []
+        for doc in corpus:
+            words = doc.lower().split()
+            self.doc_lengths.append(len(words))
+        self.avgdl = sum(self.doc_lengths) / self.corpus_size
+
+        # Calculate document frequencies
+        df = Counter()
+        self.doc_vectors = []
+
+        for doc in corpus:
+            words = doc.lower().split()
+            doc_words = set(words)
+            for word in doc_words:
+                df[word] += 1
+            self.doc_vectors.append(Counter(words))
+
+        # Calculate inverse document frequency
+        self.idf = {}
+        for word, freq in df.items():
+            self.idf[word] = log((self.corpus_size - freq + 0.5) / (freq + 0.5))
+
+    def get_scores(self, query: str) -> np.ndarray:
+        """
+        Calculate BM25 scores for the query against all documents
+
+        Args:
+            query: Query string
+
+        Returns:
+            numpy array of scores for each document
+        """
+        scores = np.zeros(self.corpus_size)
+        query_words = query.lower().split()
+
+        for word in query_words:
+            if word not in self.idf:
+                continue
+
+            qi = self.idf[word]
+            for idx, doc_vector in enumerate(self.doc_vectors):
+                if word not in doc_vector:
+                    continue
+
+                score = (qi * doc_vector[word] * (self.k1 + 1) /
+                         (doc_vector[word] + self.k1 * (1 - self.b + self.b *
+                          self.doc_lengths[idx] / self.avgdl)))
+                scores[idx] += score
+
+        return scores
+
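For reference, a minimal standalone sketch of how the BM25 class above could be exercised (the corpus and query are made up for illustration). fit() learns document lengths and IDF values from the corpus; get_scores() then rewards documents that share rare query terms, with k1 saturating repeated terms and b normalizing for document length:

# Illustrative toy corpus; any list of strings works
corpus = [
    "Apple reports record quarterly revenue",
    "Bank of America announces Q2 results",
    "Toyota and Honda expand US market share",
]
bm25 = BM25()
bm25.fit(corpus)
scores = bm25.get_scores("Apple quarterly revenue")
# One score per document; the first document scores highest here because it
# contains all three query terms and none of them appear elsewhere in the corpus.
print(scores)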
+def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
+    """
+    Prepare documents for BM25 ranking by combining title and content
+
+    Args:
+        documents: List of document dictionaries
+
+    Returns:
+        Tuple of (document texts, original documents)
+    """
+    doc_texts = []
+    for doc in documents:
+        # Combine title and content for better matching
+        doc_text = f"{doc['title']} {doc['content']}"
+        doc_texts.append(doc_text)
+    return doc_texts, documents
+
449 |
+
def rerank_documents_with_priority(query: str, documents: List[Dict], entity_domain: str,
|
450 |
+
similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
|
451 |
try:
|
452 |
+
if not documents:
|
453 |
+
logger.warning("No documents to rerank.")
|
454 |
+
return documents
|
455 |
+
|
456 |
+
# Step 1: Prepare documents for BM25
|
457 |
+
doc_texts, original_docs = prepare_documents_for_bm25(documents)
|
458 |
+
|
459 |
+
# Step 2: Initialize and fit BM25
|
460 |
+
bm25 = BM25()
|
461 |
+
bm25.fit(doc_texts)
|
462 |
+
|
463 |
+
# Step 3: Get BM25 scores
|
464 |
+
bm25_scores = bm25.get_scores(query)
|
465 |
+
|
466 |
+
# Step 4: Get semantic similarity scores
|
467 |
query_embedding = similarity_model.encode(query, convert_to_tensor=True)
|
468 |
doc_summaries = [doc['summary'] for doc in documents]
|
469 |
+
doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
|
470 |
+
semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
|
471 |
|
472 |
+
# Step 5: Combine scores (normalize first)
|
473 |
+
bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
|
474 |
+
semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
|
475 |
|
476 |
+
# Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
|
477 |
+
combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
|
478 |
|
479 |
+
# Create scored documents with combined scores
|
480 |
+
scored_documents = list(zip(documents, combined_scores))
|
481 |
|
482 |
+
# Sort by domain priority and combined score
|
|
|
|
|
|
|
483 |
scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
|
484 |
|
485 |
+
# Filter similar documents
|
486 |
filtered_docs = []
|
487 |
+
added_contents = []
|
488 |
+
|
489 |
for doc, score in scored_documents:
|
490 |
+
if score < 0.3: # Minimum relevance threshold
|
491 |
continue
|
492 |
+
|
493 |
# Check similarity with already selected documents
|
494 |
+
doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
|
495 |
is_similar = False
|
496 |
+
|
497 |
+
for content in added_contents:
|
498 |
+
content_embedding = similarity_model.encode(content, convert_to_tensor=True)
|
499 |
+
similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
|
|
|
500 |
if similarity > similarity_threshold:
|
501 |
is_similar = True
|
502 |
break
|
503 |
|
504 |
if not is_similar:
|
505 |
filtered_docs.append(doc)
|
506 |
+
added_contents.append(doc['summary'])
|
507 |
|
508 |
if len(filtered_docs) >= max_results:
|
509 |
break
|
510 |
|
511 |
+
logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
|
512 |
return filtered_docs
|
513 |
+
|
514 |
except Exception as e:
|
515 |
logger.error(f"Error during reranking documents: {e}")
|
516 |
return documents[:max_results] # Fallback to first max_results documents if reranking fails
|
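To make the Step 5 blending concrete, here is a small numpy-only sketch of the same min-max normalization and 0.4/0.6 weighting (the raw score values are invented; in app.py the BM25 scores come from bm25.get_scores() and the semantic scores from the sentence-transformers similarity_model):

import numpy as np

# Invented raw scores for three documents
bm25_scores = np.array([4.2, 1.1, 0.0])         # lexical scores, arbitrary scale
semantic_scores = np.array([0.55, 0.80, 0.20])  # cosine similarities

# Min-max normalization, as in rerank_documents_with_priority
bm25_norm = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
semantic_norm = (semantic_scores - semantic_scores.min()) / (semantic_scores.max() - semantic_scores.min())

# 0.4 weight on BM25, 0.6 on semantic similarity
combined = 0.4 * bm25_norm + 0.6 * semantic_norm
print(combined)  # approximately [0.75, 0.70, 0.0]

As in the function above, this normalization assumes the scores are not all identical; if they were, the max-min denominator would be zero.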