Shreyas094 committed
Update app.py
app.py
CHANGED
@@ -39,6 +39,8 @@ from typing import List, Dict, Tuple
 import datetime
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any
+import spacy
+from textblob import TextBlob
 
 # Automatically get the current year
 CURRENT_YEAR = datetime.datetime.now().year
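Note: spacy.load('en_core_web_sm'), used by the new ranking class below, assumes the small English model has been installed separately from the spacy package itself (the class also references torch, which sentence-transformers already pulls in). A minimal defensive load, sketched here rather than taken from this commit:

import spacy

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # The model data is not bundled with the spacy package; fetch it once, then load
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')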
@@ -108,8 +110,6 @@ mistral_client = Mistral(api_key=MISTRAL_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-
-
 # Step 1: Create a base class for AI models
 class AIModel(ABC):
     @abstractmethod
@@ -447,12 +447,6 @@ Rephrased query:"""
         logger.error(f"Error rephrasing query with LLM: {e}")
         return query  # Fallback to original query if rephrasing fails
 
-def extract_entity_domain(query):
-    # Use a simple regex pattern to extract domain names from the query
-    domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'
-    matches = re.findall(domain_pattern, query)
-    return matches[0] if matches else None
-
 class BM25:
     def __init__(self, k1: float = 1.5, b: float = 0.75):
         self.k1 = k1  # term frequency saturation parameter
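For context on the parameters above: k1 caps how much repeated occurrences of a term can raise the score, and b controls how strongly scores are normalized by document length. A standalone sketch of the standard Okapi BM25 per-term score (shown for reference; this repo's BM25 class may differ in details):

import math

def bm25_term_score(tf, df, n_docs, doc_len, avg_doc_len, k1=1.5, b=0.75):
    # Smoothed inverse document frequency of the query term
    idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
    # Term frequency saturated by k1 and length-normalized via b
    return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))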
@@ -542,75 +536,212 @@ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]
         doc_texts.append(doc_text)
     return doc_texts, documents
 
+
+class ImprovedRanking:
+    def __init__(self):
+        # Load spacy for text analysis
+        self.nlp = spacy.load('en_core_web_sm')
+
+    def analyze_query(self, query: str) -> Dict:
+        """
+        Analyze query to determine appropriate weights
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Dictionary with query analysis results
+        """
+        doc = self.nlp(query)
+
+        analysis = {
+            'word_count': len(query.split()),
+            'has_entities': bool(doc.ents),
+            'is_question': any(token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
+            'sentiment': TextBlob(query).sentiment.polarity
+        }
+
+        return analysis
+
+    def get_adaptive_weights(self, query: str) -> Tuple[float, float]:
+        """
+        Calculate adaptive weights based on query characteristics
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Tuple of (bm25_weight, semantic_weight)
+        """
+        analysis = self.analyze_query(query)
+
+        # Base weights
+        bm25_weight = 0.4
+        semantic_weight = 0.6
+
+        # Adjust weights based on query characteristics
+        if analysis['word_count'] <= 2:
+            # Short queries: favor keyword matching
+            bm25_weight = 0.6
+            semantic_weight = 0.4
+        elif analysis['word_count'] >= 6:
+            # Long queries: favor semantic understanding
+            bm25_weight = 0.3
+            semantic_weight = 0.7
+
+        if analysis['has_entities']:
+            # Queries with named entities: increase keyword importance
+            bm25_weight += 0.1
+            semantic_weight -= 0.1
+
+        if analysis['is_question']:
+            # Questions: favor semantic understanding
+            bm25_weight -= 0.1
+            semantic_weight += 0.1
+
+        # Normalize weights to ensure they sum to 1
+        total = bm25_weight + semantic_weight
+        return bm25_weight/total, semantic_weight/total
+
+    def calculate_relevance_score(self, doc: Dict, query: str, similarity_model) -> float:
+        """
+        Calculate comprehensive relevance score for a document
+
+        Args:
+            doc: Document dictionary with title and content
+            query: Search query string
+            similarity_model: Model for computing semantic similarity
+
+        Returns:
+            Float representing document relevance score
+        """
+        # 1. Title relevance (30%)
+        title_embedding = similarity_model.encode(doc['title'], convert_to_tensor=True)
+        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+        title_similarity = torch.cosine_similarity(title_embedding, query_embedding, dim=0).item()
+
+        # 2. Content relevance (40%)
+        # Use first 512 tokens of content to avoid memory issues
+        content_preview = ' '.join(doc['content'].split()[:512])
+        content_embedding = similarity_model.encode(content_preview, convert_to_tensor=True)
+        content_similarity = torch.cosine_similarity(content_embedding, query_embedding, dim=0).item()
+
+        # 3. Query term presence (20%)
+        query_terms = set(query.lower().split())
+        title_terms = set(doc['title'].lower().split())
+        content_terms = set(content_preview.lower().split())
+
+        title_term_overlap = len(query_terms & title_terms) / len(query_terms)
+        content_term_overlap = len(query_terms & content_terms) / len(query_terms)
+
+        # 4. Document quality indicators (10%)
+        quality_score = self.assess_document_quality(doc)
+
+        # Combine scores with weights
+        final_score = (
+            title_similarity * 0.3 +
+            content_similarity * 0.4 +
+            ((title_term_overlap + content_term_overlap) / 2) * 0.2 +
+            quality_score * 0.1
+        )
+
+        return final_score
+
+    def assess_document_quality(self, doc: Dict) -> float:
+        """
+        Assess document quality based on various metrics
+
+        Args:
+            doc: Document dictionary
+
+        Returns:
+            Float representing document quality score
+        """
+        score = 0.0
+
+        # 1. Length score (longer documents often have more information)
+        content_length = len(doc['content'].split())
+        length_score = min(content_length / 1000, 1.0)  # Cap at 1000 words
+
+        # 2. Text structure score
+        has_paragraphs = doc['content'].count('\n\n') > 0
+        has_sections = bool(re.findall(r'\n[A-Z][^.!?]*[:]\n', doc['content']))
+
+        # 3. Writing quality score (using basic metrics)
+        blob = TextBlob(doc['content'])
+        sentences = blob.sentences
+        avg_sentence_length = sum(len(str(s).split()) for s in sentences) / len(sentences) if sentences else 0
+        sentence_score = 1.0 if 10 <= avg_sentence_length <= 25 else 0.5
+
+        # Combine quality metrics
+        score = (
+            length_score * 0.4 +
+            (has_paragraphs * 0.2 + has_sections * 0.2) +
+            sentence_score * 0.2
+        )
+
+        return score
+
 # Now modify the rerank_documents_with_priority function to include BM25 ranking
-def rerank_documents_with_priority(query, documents, entity_domain,
-                                   similarity_threshold=0.95, max_results=5):
+def rerank_documents_improved(query: str, documents: List[Dict],
+                              similarity_model, max_results: int = 5) -> List[Dict]:
+    """
+    Rerank documents using improved scoring system
+
+    Args:
+        query: Search query string
+        documents: List of document dictionaries
+        similarity_model: Model for computing semantic similarity
+        max_results: Maximum number of results to return
+
+    Returns:
+        List of reranked documents
+    """
+    ranker = ImprovedRanking()
+
     try:
         if not documents:
-            logger.warning("No documents to rerank.")
             return documents
 
+        # Get adaptive weights based on query
+        bm25_weight, semantic_weight = ranker.get_adaptive_weights(query)
+
+        # Prepare documents for BM25
         doc_texts, original_docs = prepare_documents_for_bm25(documents)
 
+        # Initialize and fit BM25
         bm25 = BM25()
         bm25.fit(doc_texts)
 
+        # Get BM25 scores
        bm25_scores = bm25.get_scores(query)
 
-        # ... [semantic-similarity scoring lines truncated in the rendered diff]
+        # Calculate comprehensive relevance scores
+        relevance_scores = [
+            ranker.calculate_relevance_score(doc, query, similarity_model)
+            for doc in documents
+        ]
 
+        # Normalize scores
         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
-        # ... [semantic-score normalization line truncated in the rendered diff]
+        relevance_scores_norm = (np.array(relevance_scores) - np.min(relevance_scores)) / (np.max(relevance_scores) - np.min(relevance_scores))
 
-        # Combine scores
-        combined_scores = ...  # [combination line truncated in the rendered diff]
+        # Combine scores using adaptive weights
+        final_scores = (bm25_weight * bm25_scores_norm +
+                        semantic_weight * relevance_scores_norm)
 
-        # Create scored documents
-        scored_documents = list(zip(documents, combined_scores))
+        # Create scored documents
+        scored_documents = list(zip(documents, final_scores))
 
-        # Sort by combined score
-        scored_documents.sort(key=lambda x: x[1], reverse=True)
+        # Sort by final score
+        scored_documents.sort(key=lambda x: x[1], reverse=True)
 
-        filtered_docs = []
-        added_contents = []
-
-        for doc, score in scored_documents:
-            if score < 0.3:  # Minimum relevance threshold
-                continue
-
-            # Check similarity with already selected documents
-            doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
-            is_similar = False
-
-            for content in added_contents:
-                content_embedding = similarity_model.encode(content, convert_to_tensor=True)
-                similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
-                if similarity > similarity_threshold:
-                    is_similar = True
-                    break
-
-            if not is_similar:
-                filtered_docs.append(doc)
-                added_contents.append(doc['summary'])
-
-            if len(filtered_docs) >= max_results:
-                break
-
-        logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
-        return filtered_docs
+        # Return top results
+        return [doc for doc, score in scored_documents[:max_results]]
 
     except Exception as e:
-        logger.error(f"Error during reranking: {e}")
+        logger.error(f"Error during improved reranking: {e}")
         return documents[:max_results]
 
 def compute_similarity(text1, text2):
     # Encode the texts
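To illustrate the adaptive weighting above with hypothetical queries (actual numbers depend on spaCy's tagging):

ranker = ImprovedRanking()

# Two words -> (0.6, 0.4); +0.1 to BM25 if "Tesla" is tagged as an entity,
# giving roughly (0.7, 0.3) after normalization
print(ranker.get_adaptive_weights("Tesla earnings"))

# Nine words and a WRB tag on "How" -> (0.3, 0.7) shifted to roughly (0.2, 0.8),
# assuming no named entities are tagged in the query
print(ranker.get_adaptive_weights("How did the market react to the latest announcement?"))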
@@ -630,7 +761,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    system_prompt = """You are a world-class AI assistant specializing in news analysis. Your task is to assess the relevance of the given document to the user's query and, if relevant, provide a detailed summary."""
+    system_prompt = """You are a world-class AI assistant specializing in news analysis and document summarization. Your task is to provide a comprehensive and detailed summary of the given document that captures its key points and relevance to the user's query."""
 
     user_prompt = f"""
Query: {query}
@@ -640,20 +771,24 @@ Document Content:
 {document['content'][:1000]}  # Limit to first 1000 characters for efficiency
 
 Instructions:
-1. Determine whether this document is relevant to the query.
-2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
+1. Provide a detailed summary that captures the unique aspects of this document. Include:
    - Key facts and figures
    - Dates of events or announcements
    - Names of important entities mentioned
    - Any metrics or changes reported
-   - The potential impact or significance of the news
+   - The potential impact or significance of the content
+2. Focus on aspects that are most relevant to the user's query
+3. Ensure the summary is distinctive and highlights what makes this particular document unique
+4. Include any specific context that helps understand the document's significance
 
 Your response should be in the following format:
-Relevant: [Yes/No]
-Summary: [Your detailed summary if relevant, or "Not relevant" if not]
+Summary: [Your detailed summary]
 
-Remember to focus on key aspects and implications in your assessment and summary.
+Remember to:
+- Highlight the most important information first
+- Include specific numbers, dates, and facts when available
+- Connect the information to the user's query where relevant
+- Focus on what makes this document unique or noteworthy
 """
 
     messages = [
@@ -664,15 +799,22 @@ Remember to focus on key aspects and implications in your assessment and summary
     try:
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=300,
             temperature=temperature,
             top_p=0.9,
             frequency_penalty=1.4
         )
-        return response.choices[0].message.content.strip()
+        summary = response.choices[0].message.content.strip()
+
+        # If the summary starts with "Summary: ", remove it
+        if summary.startswith("Summary: "):
+            summary = summary[9:].strip()
+
+        # Always return format as if document was relevant
+        return f"Relevant: Yes\nSummary: {summary}"
     except Exception as e:
-        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
-        return "Relevant: No\nSummary: Error occurred while assessing the document."
+        logger.error(f"Error summarizing with LLM: {e}")
+        return f"Relevant: Yes\nSummary: Error occurred while summarizing the document: {str(e)}"
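The new return value keeps the old two-line "Relevant: ...\nSummary: ..." contract, so the caller's parsing stays unchanged; a minimal illustration of that contract:

assessment = "Relevant: Yes\nSummary: Example summary of the document."
relevance, summary = assessment.split('\n', 1)
assert relevance.strip().lower() == "relevant: yes"
print(summary.replace("Summary: ", "").strip())

Since the function now always reports "Relevant: Yes" (even on errors), relevance filtering effectively moves from this LLM gate to the scoring and reranking stages downstream.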
@@ -775,6 +917,9 @@ def search_and_scrape(
     use_pydf2: bool = True
 ):
     try:
+        # Initialize ImprovedRanking instead of DocumentRanker
+        document_ranker = ImprovedRanking()
+
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
         logger.info(f"Rephrased Query: {rephrased_query}")
@@ -783,12 +928,7 @@ def search_and_scrape(
         logger.info("No need to perform search based on the rephrased query.")
         return "No search needed for the provided input."
 
-        # Step 2: Extract entity domain
-        entity_domain = extract_entity_domain(rephrased_query)
-        logger.info(f"Extracted entity domain: {entity_domain}")
-
-        # Step 3: Perform search
-        # Search query parameters
+        # [Search parameters and request handling remain the same...]
         params = {
             'q': rephrased_query,
             'format': 'json',
@@ -801,13 +941,11 @@ def search_and_scrape(
 
         # Remove empty parameters
         params = {k: v for k, v in params.items() if v != ""}
 
-        # If no engines are specified, set default engines
         if 'engines' not in params:
             params['engines'] = 'google'
             logger.info("No engines specified. Defaulting to 'google'.")
 
-        # Headers for SearXNG request
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Accept': 'application/json, text/javascript, */*; q=0.01',
@@ -823,18 +961,16 @@ def search_and_scrape(
 
         scraped_content = []
         page = 1
+
+        # Content scraping loop remains mostly the same, but add quality assessment
         while len(scraped_content) < num_results:
-            # Update params with current page
             params['pageno'] = page
 
-            # Send request to SearXNG
-            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
-            session = requests_retry_session()
-
             try:
+                session = requests_retry_session()
                 if method.upper() == "GET":
                     response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
                 else:
                     response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
 
                 response.raise_for_status()
@@ -843,9 +979,8 @@ def search_and_scrape(
                 return f"An error occurred during the search request: {e}"
 
             search_results = response.json()
-            logger.debug(f"SearXNG Response: {search_results}")
-
             results = search_results.get('results', [])
+
             if not results:
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
@@ -853,33 +988,40 @@ def search_and_scrape(
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
 
                 url = result.get('url', '')
                 title = result.get('title', 'No title')
 
                 if not is_valid_url(url):
                     logger.warning(f"Invalid URL: {url}")
                     continue
 
                 try:
                     logger.info(f"Processing content from: {url}")
                     content = scrape_full_content(url, max_chars, timeout, use_pydf2)
 
                     if content is None:
                         continue
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
                         continue
 
+                    # Add initial quality assessment
+                    doc_quality = document_ranker.assess_document_quality({
+                        "title": title,
+                        "content": content
+                    })
+
                     scraped_content.append({
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper",
+                        "quality_score": doc_quality
                     })
-                    logger.info(f"Successfully scraped content from {url}.")
+                    logger.info(f"Successfully scraped content from {url}. Quality score: {doc_quality}")
 
                 except requests.exceptions.RequestException as e:
                     logger.error(f"Error scraping {url}: {e}")
                 except Exception as e:
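To make the quality scale concrete, a rough worked example under stated assumptions (a hypothetical ~500-word article with paragraph breaks, no section headings, and 15-word average sentences):

# Mirrors assess_document_quality's arithmetic for a hypothetical article
length_score = min(500 / 1000, 1.0)               # 0.5
structure_score = True * 0.2 + False * 0.2        # paragraphs yes, headings no -> 0.2
sentence_score = 1.0                              # 10 <= 15 <= 25
print(length_score * 0.4 + structure_score + sentence_score * 0.2)  # -> ~0.6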
@@ -891,51 +1033,108 @@ def search_and_scrape(
             logger.warning("No content scraped from search results.")
             return "No content could be scraped from the search results."
 
-        # Step 4: Assess relevance, summarize, and check for uniqueness
+        # Modified relevance assessment with improved analysis
         relevant_documents = []
-        unique_summaries = []
+        unique_summaries = set()
+
         for doc in scraped_content:
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)
 
             if relevance.strip().lower() == "relevant: yes":
                 summary_text = summary.replace("Summary: ", "").strip()
 
-                if is_content_unique(summary_text, unique_summaries):
+                if is_content_unique(summary_text, unique_summaries, similarity_threshold=0.8):
+                    # Calculate comprehensive relevance score using new method
+                    relevance_score = document_ranker.calculate_relevance_score(
+                        {
+                            "title": doc['title'],
+                            "content": doc['content'],
+                            "summary": summary_text
+                        },
+                        rephrased_query,
+                        similarity_model
+                    )
+
                     relevant_documents.append({
                         "title": doc['title'],
                         "url": doc['url'],
+                        "content": doc['content'],
                         "summary": summary_text,
                         "scraper": doc['scraper'],
+                        "relevance_score": relevance_score,
+                        "quality_score": doc['quality_score']
                     })
-                    unique_summaries.append(summary_text)
-                else:
-                    logger.info(f"Skipping similar content: {doc['title']}")
+                    unique_summaries.add(summary_text)
 
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
-            return "No relevant and unique documents found."
+            return "No relevant and unique content found for the given query."
 
-        # Step 5: Rerank documents based on similarity to query and prioritize entity domain
-        reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
+        # Enhanced reranking using improved weights and BM25
+        try:
+            # Get query-adaptive weights
+            bm25_weight, semantic_weight = document_ranker.get_adaptive_weights(rephrased_query)
+            logger.info(f"Using adaptive weights - BM25: {bm25_weight}, Semantic: {semantic_weight}")
+
+            # Prepare documents for BM25
+            doc_texts = [f"{doc['title']} {doc['content']}" for doc in relevant_documents]
+
+            # Initialize and fit BM25
+            bm25 = BM25()
+            bm25.fit(doc_texts)
+
+            # Get BM25 scores
+            bm25_scores = bm25.get_scores(rephrased_query)
+
+            # Calculate semantic scores using title and content
+            query_embedding = similarity_model.encode(rephrased_query, convert_to_tensor=True)
+            doc_embeddings = similarity_model.encode(
+                [f"{doc['title']} {doc['summary']}" for doc in relevant_documents],
+                convert_to_tensor=True
+            )
+            semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+
+            # Get quality scores
+            quality_scores = np.array([doc['quality_score'] for doc in relevant_documents])
+
+            # Normalize all scores
+            bm25_scores_norm = normalize_scores(bm25_scores)
+            semantic_scores_norm = normalize_scores(semantic_scores.numpy())
+            quality_scores_norm = normalize_scores(quality_scores)
+            relevance_scores = normalize_scores(
+                np.array([doc['relevance_score'] for doc in relevant_documents])
+            )
+
+            # Combine scores with weights
+            final_scores = (
+                bm25_weight * bm25_scores_norm +
+                semantic_weight * semantic_scores_norm +
+                0.15 * quality_scores_norm +  # Add quality score weight
+                0.15 * relevance_scores  # Reduced from 0.2 to accommodate quality
+            )
+
+            # Create scored documents
+            scored_documents = list(zip(relevant_documents, final_scores))
+            scored_documents.sort(key=lambda x: x[1], reverse=True)
+
+            # Take top results
+            reranked_docs = [doc for doc, _ in scored_documents[:num_results]]
+
+        except Exception as e:
+            logger.error(f"Error during document reranking: {e}")
+            # Fallback to basic sorting by relevance and quality
+            reranked_docs = sorted(
+                relevant_documents,
+                key=lambda x: (x['relevance_score'] + x['quality_score']) / 2,
+                reverse=True
+            )[:num_results]
 
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
-            return "No relevant documents remained after reranking."
-
-        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, related documents.")
+            return "No relevant content found after filtering and ranking."
 
-        # Scrape full content for top results
-        for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], max_chars)
-            doc['full_content'] = full_content
-
-        # Prepare JSON for LLM
+        # Prepare final documents for LLM
         llm_input = {
             "query": query,
             "documents": [
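One observation on the combination above: bm25_weight and semantic_weight are normalized to sum to 1.0, so with the two fixed 0.15 terms the effective weights sum to about 1.3. That is harmless for rank ordering, but final_scores is then not on a [0, 1] scale. If a calibrated scale mattered, one option (not in this commit) would be to rescale all four weights:

import numpy as np

# Hypothetical weights as produced by get_adaptive_weights, plus the two fixed terms
bm25_weight, semantic_weight = 0.4, 0.6
weights = np.array([bm25_weight, semantic_weight, 0.15, 0.15])
weights = weights / weights.sum()  # same ranking, combined score stays in [0, 1]
print(weights.round(4))  # [0.3077 0.4615 0.1154 0.1154]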
@@ -943,12 +1142,13 @@ def search_and_scrape(
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
-                    "full_content": doc['full_content']
-                } for doc in reranked_docs
+                    "content": doc['content'],
+                    "quality_score": doc['quality_score']  # Include quality score
+                } for doc in reranked_docs
             ]
         }
 
+        # LLM Summarization
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
@@ -957,6 +1157,12 @@ def search_and_scrape(
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
+def normalize_scores(scores: np.ndarray) -> np.ndarray:
+    """Normalize scores to range [0, 1]"""
+    if np.all(scores == scores[0]):
+        return np.ones_like(scores)
+    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
+
 # Helper function to get the appropriate client for each model
 def get_client_for_model(model: str) -> Any:
     if model == "huggingface":
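The constant-input guard in normalize_scores matters: without it, np.min == np.max would zero the denominator. A quick check of both paths using the function defined above:

import numpy as np

print(normalize_scores(np.array([2.0, 4.0, 6.0])))   # -> [0.  0.5 1. ]
print(normalize_scores(np.array([3.0, 3.0, 3.0])))   # all equal -> [1. 1. 1.]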
@@ -970,6 +1176,7 @@ def get_client_for_model(model: str) -> Any:
     else:
         raise ValueError(f"Unsupported model: {model}")
 
+
 def chat_function(message: str, history: List[Tuple[str, str]], only_web_search: bool, num_results: int, max_chars: int, time_range: str, language: str, category: str, engines: List[str], safesearch: int, method: str, llm_temperature: float, model: str, use_pydf2: bool):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
@@ -1005,7 +1212,6 @@ def chat_function(message: str, history: List[Tuple[str, str]], only_web_search:
 
     yield response
 
-
 iface = gr.ChatInterface(
     chat_function,
     title="Web Scraper for News with Sentinel AI",
@@ -1044,30 +1250,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
-    iface.launch(
-        [launch keyword arguments truncated in the rendered diff]
-    )
+    iface.launch(share=False)