Shreyas094 committed on
Commit
e181e71
·
verified ·
1 Parent(s): c476da1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -50
app.py CHANGED
@@ -32,6 +32,10 @@ import io
32
  import requests
33
  from duckduckgo_search import DDGS
34
  import random
 
 
 
 
35
 
36
  # Load environment variables from a .env file
37
  load_dotenv()
@@ -264,44 +268,43 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
264
  return ""
265
 
266
  def rephrase_query(chat_history, query, temperature=0.2):
267
- system_prompt = """
268
  You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
269
 
270
- 1. **Entity Identification and Domain Mapping**:
271
- - Analyze the user's query to identify the main entity (e.g., organizations, brands, products).
272
- - For each identified entity, determine its official domain name from your knowledgebase. For example, "Golomt Bank" corresponds to "golomtbank.com".
273
- - Append the operator "+" followed by the domain to the original query to refine the search intent. For example:
274
- - Original Query: "What is the latest news on Golomt Bank?"
275
- - Enhanced Query: "What is the latest news on Golomt Bank + golomtbank.com"
276
-
277
- 2. **Query Rephrasing Based on Conversation Context**:
278
- - **Assess Continuation or New Topic**:
279
- - Determine whether the new query is a continuation of the ongoing conversation or introduces a new, unrelated topic.
280
- - **If Continuation**:
281
- - Incorporate the most relevant details from the context to make the rephrased query more specific and aligned with the ongoing conversation.
282
- - **If New Topic**:
283
- - Rewrite the query to ensure clarity, precision, and suitability for a standalone search, avoiding any irrelevant context from the conversation.
284
-
285
- 3. **Output**:
286
  - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
287
  - Do not include any additional commentary or explanation.
288
 
289
  ### Example Scenarios
290
 
291
- **Scenario 1: Continuation with Entity**
292
 
293
  - **User Query**: "What is the latest news on Golomt Bank?"
294
- - **Rephrased Query**: "What is the latest news on Golomt Bank + golomtbank.com"
295
 
296
- **Scenario 2: New Topic with Entity**
297
 
298
- - **User Query**: "Tell me about the new features of the iPhone."
299
- - **Rephrased Query**: "Tell me about the new features of the iPhone + apple.com"
300
 
301
- **Scenario 3: Query Without Recognizable Entity**
302
 
303
  - **User Query**: "How does photosynthesis work?"
304
- - **Rephrased Query**: "How does photosynthesis work?"
305
 
306
  """
307
 
@@ -340,7 +343,7 @@ Rephrased query:
340
  logger.error(f"Error rephrasing query with LLM: {e}")
341
  return query # Fallback to original query if rephrasing fails
342
 
343
- def rerank_documents(query, documents):
344
  try:
345
  # Step 1: Encode the query and document summaries
346
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
@@ -348,33 +351,47 @@ def rerank_documents(query, documents):
348
 
349
  if not doc_summaries:
350
  logger.warning("No document summaries to rerank.")
351
- return documents # Return original documents if there's nothing to rerank
352
 
353
  doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
354
 
355
  # Step 2: Compute Cosine Similarity
356
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
357
 
358
- # Step 3: Compute Dot Product Similarity
359
- dot_product_scores = torch.matmul(query_embedding, doc_embeddings.T)
360
-
361
- # Ensure dot_product_scores is a 1-D tensor
362
- if dot_product_scores.dim() == 0:
363
- dot_product_scores = dot_product_scores.unsqueeze(0)
364
-
365
- # Combine documents, cosine scores, and dot product scores
366
- scored_documents = list(zip(documents, cosine_scores, dot_product_scores))
367
 
368
- # Step 4: Sort documents by cosine similarity score
369
  scored_documents.sort(key=lambda x: x[1], reverse=True)
370
 
371
- # Step 5: Return only the top 5 documents
372
- reranked_docs = [doc[0] for doc in scored_documents[:5]]
373
- logger.info(f"Reranked to top {len(reranked_docs)} documents.")
374
- return reranked_docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  except Exception as e:
376
  logger.error(f"Error during reranking documents: {e}")
377
- return documents[:5] # Fallback to first 5 documents if reranking fails
378
 
379
  def compute_similarity(text1, text2):
380
  # Encode the texts
@@ -394,24 +411,30 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
394
  return True
395
 
396
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
397
- system_prompt = """You are a world class AI assistant. Your task is to assess whether the given text is relevant to the user's query and provide a brief summary if it is relevant."""
398
 
399
  user_prompt = f"""
400
  Query: {query}
401
 
 
402
  Document Content:
403
- {document['content']}
404
 
405
  Instructions:
406
- 1. Assess if the document is relevant to the QUERY made by the user.
407
- 2. If relevant, summarize the main points in 1-2 sentences.
 
 
 
 
 
408
  3. If not relevant, simply state "Not relevant".
409
 
410
  Your response should be in the following format:
411
  Relevant: [Yes/No]
412
- Summary: [Your 1-2 sentence summary if relevant, or "Not relevant" if not]
413
 
414
- Remember to focus on financial aspects and implications in your assessment and summary.
415
  """
416
 
417
  messages = [
@@ -422,7 +445,7 @@ Remember to focus on financial aspects and implications in your assessment and s
422
  try:
423
  response = llm_client.chat_completion(
424
  messages=messages,
425
- max_tokens=150,
426
  temperature=temperature,
427
  top_p=0.9
428
  )
@@ -637,9 +660,10 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
637
  relevant_documents = []
638
  unique_summaries = []
639
  for doc in scraped_content:
 
640
  assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
641
  relevance, summary = assessment.split('\n', 1)
642
-
643
  if relevance.strip().lower() == "relevant: yes":
644
  summary_text = summary.replace("Summary: ", "").strip()
645
 
@@ -660,7 +684,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
660
  logger.debug(f"Assessment result: {assessment}")
661
 
662
  # Step 4: Rerank documents based on similarity to query
663
- reranked_docs = rerank_documents(rephrased_query, relevant_documents)
664
 
665
  if not reranked_docs:
666
  logger.warning("No documents remained after reranking.")
 
32
  import requests
33
  from duckduckgo_search import DDGS
34
  import random
35
+ import datetime
36
+
37
+ # Automatically get the current year
38
+ current_year = datetime.datetime.now().year
39
 
40
  # Load environment variables from a .env file
41
  load_dotenv()
 
268
  return ""
269
 
270
  def rephrase_query(chat_history, query, temperature=0.2):
271
+ system_prompt = f"""
272
  You are a highly intelligent and context-aware conversational assistant. Your tasks are as follows:
273
 
274
+ 1. **Entity Identification and Quotation**:
275
+ - Analyze the user's query to identify the main entities (e.g., organizations, brands, products, locations).
276
+ - For each identified entity, enclose ONLY the entity itself in double quotes within the query.
277
+ - If no identifiable entities are found, proceed without adding quotes.
278
+
279
+ 2. **Query Preservation**:
280
+ - Maintain the entire original query, including any parts after commas or other punctuation.
281
+ - Do not remove or truncate any part of the original query.
282
+
283
+ 3. **Appending Current Year**:
284
+ - Append "after: {current_year}" to the end of the rephrased query.
285
+ - Ensure there is a space before "after:" for proper formatting.
286
+ - Do not use quotes or the "+" operator when adding the year.
287
+
288
+ 4. **Output**:
 
289
  - Return ONLY the rephrased query, ensuring it is concise, clear, and contextually accurate.
290
  - Do not include any additional commentary or explanation.
291
 
292
  ### Example Scenarios
293
 
294
+ **Scenario 1: Query with One Entity**
295
 
296
  - **User Query**: "What is the latest news on Golomt Bank?"
297
+ - **Rephrased Query**: "What is the latest news on \"Golomt Bank\" after: {current_year}"
298
 
299
+ **Scenario 2: Query with Multiple Entities and Comma**
300
 
301
+ - **User Query**: "What is the latest news about Prospect Capital, did the rating change?"
302
+ - **Rephrased Query**: "What is the latest news about \"Prospect Capital\", did the rating change after: {current_year}"
303
 
304
+ **Scenario 3: Query Without Recognizable Entities**
305
 
306
  - **User Query**: "How does photosynthesis work?"
307
+ - **Rephrased Query**: "How does photosynthesis work? after: {current_year}"
308
 
309
  """
310
 
 
343
  logger.error(f"Error rephrasing query with LLM: {e}")
344
  return query # Fallback to original query if rephrasing fails
345
 
346
+ def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5):
347
  try:
348
  # Step 1: Encode the query and document summaries
349
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
 
351
 
352
  if not doc_summaries:
353
  logger.warning("No document summaries to rerank.")
354
+ return documents
355
 
356
  doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
357
 
358
  # Step 2: Compute Cosine Similarity
359
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
360
 
361
+ # Combine documents and cosine scores
362
+ scored_documents = list(zip(documents, cosine_scores))
 
 
 
 
 
 
 
363
 
364
+ # Step 3: Sort documents by cosine similarity score
365
  scored_documents.sort(key=lambda x: x[1], reverse=True)
366
 
367
+ # Step 4: Filter out similar documents
368
+ filtered_docs = []
369
+ for doc, score in scored_documents:
370
+ if score < 0.5: # If similarity to query is too low, skip
371
+ continue
372
+
373
+ # Check similarity with already selected documents
374
+ is_similar = False
375
+ for selected_doc in filtered_docs:
376
+ similarity = util.pytorch_cos_sim(
377
+ similarity_model.encode(doc['summary'], convert_to_tensor=True),
378
+ similarity_model.encode(selected_doc['summary'], convert_to_tensor=True)
379
+ )
380
+ if similarity > similarity_threshold:
381
+ is_similar = True
382
+ break
383
+
384
+ if not is_similar:
385
+ filtered_docs.append(doc)
386
+
387
+ if len(filtered_docs) >= max_results:
388
+ break
389
+
390
+ logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents.")
391
+ return filtered_docs
392
  except Exception as e:
393
  logger.error(f"Error during reranking documents: {e}")
394
+ return documents[:max_results] # Fallback to first max_results documents if reranking fails
395
 
396
  def compute_similarity(text1, text2):
397
  # Encode the texts
 
411
  return True
412
 
413
  def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
414
+ system_prompt = """You are a world-class AI assistant specializing in financial news analysis. Your task is to assess the relevance of a given document to a user's query and provide a detailed summary if it's relevant."""
415
 
416
  user_prompt = f"""
417
  Query: {query}
418
 
419
+ Document Title: {document['title']}
420
  Document Content:
421
+ {document['content'][:1000]} # Limit to first 1000 characters for efficiency
422
 
423
  Instructions:
424
+ 1. Assess if the document is relevant to the QUERY made by the user.
425
+ 2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
426
+ - Key facts and figures
427
+ - Dates of events or announcements
428
+ - Names of important entities mentioned
429
+ - Any financial metrics or changes reported
430
+ - The potential impact or significance of the news
431
  3. If not relevant, simply state "Not relevant".
432
 
433
  Your response should be in the following format:
434
  Relevant: [Yes/No]
435
+ Summary: [Your detailed summary if relevant, or "Not relevant" if not]
436
 
437
+ Remember to focus on financial aspects and implications in your assessment and summary. Aim to make the summary distinctive, highlighting what makes this particular news item unique compared to similar news.
438
  """
439
 
440
  messages = [
 
445
  try:
446
  response = llm_client.chat_completion(
447
  messages=messages,
448
+ max_tokens=300, # Increased to allow for more detailed summaries
449
  temperature=temperature,
450
  top_p=0.9
451
  )
 
660
  relevant_documents = []
661
  unique_summaries = []
662
  for doc in scraped_content:
663
+ # In the search_and_scrape function
664
  assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
665
  relevance, summary = assessment.split('\n', 1)
666
+
667
  if relevance.strip().lower() == "relevant: yes":
668
  summary_text = summary.replace("Summary: ", "").strip()
669
 
 
684
  logger.debug(f"Assessment result: {assessment}")
685
 
686
  # Step 4: Rerank documents based on similarity to query
687
+ reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
688
 
689
  if not reranked_docs:
690
  logger.warning("No documents remained after reranking.")