Shreyas094 committed
Update app.py
app.py
CHANGED
@@ -39,6 +39,8 @@ from typing import List, Dict, Tuple
 import datetime
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any
+import spacy
+from textblob import TextBlob
 
 # Automatically get the current year
 CURRENT_YEAR = datetime.datetime.now().year
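Note: spacy.load('en_core_web_sm'), used by the new ranking class below, assumes the small English model has been installed separately from the spacy package itself (the class also references torch, which sentence-transformers already pulls in). A minimal defensive load, sketched here rather than taken from this commit:

import spacy

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # The model data is not bundled with the spacy package; fetch it once, then load
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')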
@@ -108,8 +110,6 @@ mistral_client = Mistral(api_key=MISTRAL_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-
-
 # Step 1: Create a base class for AI models
 class AIModel(ABC):
     @abstractmethod
@@ -447,12 +447,6 @@ Rephrased query:"""
         logger.error(f"Error rephrasing query with LLM: {e}")
         return query  # Fallback to original query if rephrasing fails
 
-def extract_entity_domain(query):
-    # Use a simple regex pattern to extract domain names from the query
-    domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'
-    matches = re.findall(domain_pattern, query)
-    return matches[0] if matches else None
-
 class BM25:
     def __init__(self, k1: float = 1.5, b: float = 0.75):
         self.k1 = k1  # term frequency saturation parameter
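For context on the parameters above: k1 caps how much repeated occurrences of a term can raise the score, and b controls how strongly scores are normalized by document length. A standalone sketch of the standard Okapi BM25 per-term score (shown for reference; this repo's BM25 class may differ in details):

import math

def bm25_term_score(tf, df, n_docs, doc_len, avg_doc_len, k1=1.5, b=0.75):
    # Smoothed inverse document frequency of the query term
    idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
    # Term frequency saturated by k1 and length-normalized via b
    return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))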
@@ -542,75 +536,212 @@ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]
         doc_texts.append(doc_text)
     return doc_texts, documents
 
+
+class ImprovedRanking:
+    def __init__(self):
+        # Load spacy for text analysis
+        self.nlp = spacy.load('en_core_web_sm')
+
+    def analyze_query(self, query: str) -> Dict:
+        """
+        Analyze query to determine appropriate weights
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Dictionary with query analysis results
+        """
+        doc = self.nlp(query)
+
+        analysis = {
+            'word_count': len(query.split()),
+            'has_entities': bool(doc.ents),
+            'is_question': any(token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
+            'sentiment': TextBlob(query).sentiment.polarity
+        }
+
+        return analysis
+
+    def get_adaptive_weights(self, query: str) -> Tuple[float, float]:
+        """
+        Calculate adaptive weights based on query characteristics
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Tuple of (bm25_weight, semantic_weight)
+        """
+        analysis = self.analyze_query(query)
+
+        # Base weights
+        bm25_weight = 0.4
+        semantic_weight = 0.6
+
+        # Adjust weights based on query characteristics
+        if analysis['word_count'] <= 2:
+            # Short queries: favor keyword matching
+            bm25_weight = 0.6
+            semantic_weight = 0.4
+        elif analysis['word_count'] >= 6:
+            # Long queries: favor semantic understanding
+            bm25_weight = 0.3
+            semantic_weight = 0.7
+
+        if analysis['has_entities']:
+            # Queries with named entities: increase keyword importance
+            bm25_weight += 0.1
+            semantic_weight -= 0.1
+
+        if analysis['is_question']:
+            # Questions: favor semantic understanding
+            bm25_weight -= 0.1
+            semantic_weight += 0.1
+
+        # Normalize weights to ensure they sum to 1
+        total = bm25_weight + semantic_weight
+        return bm25_weight/total, semantic_weight/total
+
+    def calculate_relevance_score(self, doc: Dict, query: str, similarity_model) -> float:
+        """
+        Calculate comprehensive relevance score for a document
+
+        Args:
+            doc: Document dictionary with title and content
+            query: Search query string
+            similarity_model: Model for computing semantic similarity
+
+        Returns:
+            Float representing document relevance score
+        """
+        # 1. Title relevance (30%)
+        title_embedding = similarity_model.encode(doc['title'], convert_to_tensor=True)
+        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+        title_similarity = torch.cosine_similarity(title_embedding, query_embedding, dim=0).item()
+
+        # 2. Content relevance (40%)
+        # Use first 512 tokens of content to avoid memory issues
+        content_preview = ' '.join(doc['content'].split()[:512])
+        content_embedding = similarity_model.encode(content_preview, convert_to_tensor=True)
+        content_similarity = torch.cosine_similarity(content_embedding, query_embedding, dim=0).item()
+
+        # 3. Query term presence (20%)
+        query_terms = set(query.lower().split())
+        title_terms = set(doc['title'].lower().split())
+        content_terms = set(content_preview.lower().split())
+
+        title_term_overlap = len(query_terms & title_terms) / len(query_terms)
+        content_term_overlap = len(query_terms & content_terms) / len(query_terms)
+
+        # 4. Document quality indicators (10%)
+        quality_score = self.assess_document_quality(doc)
+
+        # Combine scores with weights
+        final_score = (
+            title_similarity * 0.3 +
+            content_similarity * 0.4 +
+            ((title_term_overlap + content_term_overlap) / 2) * 0.2 +
+            quality_score * 0.1
+        )
+
+        return final_score
+
+    def assess_document_quality(self, doc: Dict) -> float:
+        """
+        Assess document quality based on various metrics
+
+        Args:
+            doc: Document dictionary
+
+        Returns:
+            Float representing document quality score
+        """
+        score = 0.0
+
+        # 1. Length score (longer documents often have more information)
+        content_length = len(doc['content'].split())
+        length_score = min(content_length / 1000, 1.0)  # Cap at 1000 words
+
+        # 2. Text structure score
+        has_paragraphs = doc['content'].count('\n\n') > 0
+        has_sections = bool(re.findall(r'\n[A-Z][^.!?]*[:]\n', doc['content']))
+
+        # 3. Writing quality score (using basic metrics)
+        blob = TextBlob(doc['content'])
+        sentences = blob.sentences
+        avg_sentence_length = sum(len(str(s).split()) for s in sentences) / len(sentences) if sentences else 0
+        sentence_score = 1.0 if 10 <= avg_sentence_length <= 25 else 0.5
+
+        # Combine quality metrics
+        score = (
+            length_score * 0.4 +
+            (has_paragraphs * 0.2 + has_sections * 0.2) +
+            sentence_score * 0.2
+        )
+
+        return score
+
 # Now modify the rerank_documents_with_priority function to include BM25 ranking
-def rerank_documents_with_priority(query, documents, entity_domain,
-                                   similarity_threshold=0.95, max_results=5):
+def rerank_documents_improved(query: str, documents: List[Dict],
+                              similarity_model, max_results: int = 5) -> List[Dict]:
+    """
+    Rerank documents using improved scoring system
+
+    Args:
+        query: Search query string
+        documents: List of document dictionaries
+        similarity_model: Model for computing semantic similarity
+        max_results: Maximum number of results to return
+
+    Returns:
+        List of reranked documents
+    """
+    ranker = ImprovedRanking()
+
     try:
         if not documents:
-            logger.warning("No documents to rerank.")
             return documents
 
+        # Get adaptive weights based on query
+        bm25_weight, semantic_weight = ranker.get_adaptive_weights(query)
+
+        # Prepare documents for BM25
         doc_texts, original_docs = prepare_documents_for_bm25(documents)
 
+        # Initialize and fit BM25
         bm25 = BM25()
         bm25.fit(doc_texts)
 
+        # Get BM25 scores
        bm25_scores = bm25.get_scores(query)
 
-        # ... [semantic-similarity scoring lines truncated in the rendered diff]
+        # Calculate comprehensive relevance scores
+        relevance_scores = [
+            ranker.calculate_relevance_score(doc, query, similarity_model)
+            for doc in documents
+        ]
 
+        # Normalize scores
         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
-        # ... [semantic-score normalization line truncated in the rendered diff]
+        relevance_scores_norm = (np.array(relevance_scores) - np.min(relevance_scores)) / (np.max(relevance_scores) - np.min(relevance_scores))
 
-        # Combine scores
-        combined_scores = ...  # [combination line truncated in the rendered diff]
+        # Combine scores using adaptive weights
+        final_scores = (bm25_weight * bm25_scores_norm +
+                        semantic_weight * relevance_scores_norm)
 
-        # Create scored documents
-        scored_documents = list(zip(documents, combined_scores))
+        # Create scored documents
+        scored_documents = list(zip(documents, final_scores))
 
-        # Sort by combined score
-        scored_documents.sort(key=lambda x: x[1], reverse=True)
+        # Sort by final score
+        scored_documents.sort(key=lambda x: x[1], reverse=True)
 
-        filtered_docs = []
-        added_contents = []
-
-        for doc, score in scored_documents:
-            if score < 0.3:  # Minimum relevance threshold
-                continue
-
-            # Check similarity with already selected documents
-            doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
-            is_similar = False
-
-            for content in added_contents:
-                content_embedding = similarity_model.encode(content, convert_to_tensor=True)
-                similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
-                if similarity > similarity_threshold:
-                    is_similar = True
-                    break
-
-            if not is_similar:
-                filtered_docs.append(doc)
-                added_contents.append(doc['summary'])
-
-            if len(filtered_docs) >= max_results:
-                break
-
-        logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
-        return filtered_docs
+        # Return top results
+        return [doc for doc, score in scored_documents[:max_results]]
 
     except Exception as e:
-        logger.error(f"Error during reranking: {e}")
+        logger.error(f"Error during improved reranking: {e}")
         return documents[:max_results]
 
 def compute_similarity(text1, text2):
     # Encode the texts
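To illustrate the adaptive weighting above with hypothetical queries (actual numbers depend on spaCy's tagging):

ranker = ImprovedRanking()

# Two words -> (0.6, 0.4); +0.1 to BM25 if "Tesla" is tagged as an entity,
# giving roughly (0.7, 0.3) after normalization
print(ranker.get_adaptive_weights("Tesla earnings"))

# Nine words and a WRB tag on "How" -> (0.3, 0.7) shifted to roughly (0.2, 0.8),
# assuming no named entities are tagged in the query
print(ranker.get_adaptive_weights("How did the market react to the latest announcement?"))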
@@ -630,7 +761,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    system_prompt = """You are a world-class AI assistant specializing in news analysis. Your task is to assess the relevance of the given document to the user's query and, if relevant, provide a detailed summary."""
+    system_prompt = """You are a world-class AI assistant specializing in news analysis and document summarization. Your task is to provide a comprehensive and detailed summary of the given document that captures its key points and relevance to the user's query."""
 
     user_prompt = f"""
Query: {query}
@@ -640,20 +771,24 @@ Document Content:
 {document['content'][:1000]}  # Limit to first 1000 characters for efficiency
 
 Instructions:
-1. Determine whether this document is relevant to the query.
-2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
+1. Provide a detailed summary that captures the unique aspects of this document. Include:
    - Key facts and figures
    - Dates of events or announcements
    - Names of important entities mentioned
    - Any metrics or changes reported
-   - The potential impact or significance of the news
+   - The potential impact or significance of the content
+2. Focus on aspects that are most relevant to the user's query
+3. Ensure the summary is distinctive and highlights what makes this particular document unique
+4. Include any specific context that helps understand the document's significance
 
 Your response should be in the following format:
-Relevant: [Yes/No]
-Summary: [Your detailed summary if relevant, or "Not relevant" if not]
+Summary: [Your detailed summary]
 
-Remember to focus on key aspects and implications in your assessment and summary.
+Remember to:
+- Highlight the most important information first
+- Include specific numbers, dates, and facts when available
+- Connect the information to the user's query where relevant
+- Focus on what makes this document unique or noteworthy
 """
 
     messages = [
@@ -664,15 +799,22 @@ Remember to focus on key aspects and implications in your assessment and summary
     try:
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=300,
             temperature=temperature,
             top_p=0.9,
             frequency_penalty=1.4
         )
-        return response.choices[0].message.content.strip()
+        summary = response.choices[0].message.content.strip()
+
+        # If the summary starts with "Summary: ", remove it
+        if summary.startswith("Summary: "):
+            summary = summary[9:].strip()
+
+        # Always return format as if document was relevant
+        return f"Relevant: Yes\nSummary: {summary}"
     except Exception as e:
-        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
-        return "Relevant: No\nSummary: Error occurred while assessing the document."
+        logger.error(f"Error summarizing with LLM: {e}")
+        return f"Relevant: Yes\nSummary: Error occurred while summarizing the document: {str(e)}"
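The new return value keeps the old two-line "Relevant: ...\nSummary: ..." contract, so the caller's parsing stays unchanged; a minimal illustration of that contract:

assessment = "Relevant: Yes\nSummary: Example summary of the document."
relevance, summary = assessment.split('\n', 1)
assert relevance.strip().lower() == "relevant: yes"
print(summary.replace("Summary: ", "").strip())

Since the function now always reports "Relevant: Yes" (even on errors), relevance filtering effectively moves from this LLM gate to the scoring and reranking stages downstream.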
@@ -775,6 +917,9 @@ def search_and_scrape(
     use_pydf2: bool = True
 ):
     try:
+        # Initialize ImprovedRanking instead of DocumentRanker
+        document_ranker = ImprovedRanking()
+
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
         logger.info(f"Rephrased Query: {rephrased_query}")
@@ -783,12 +928,7 @@ def search_and_scrape(
         logger.info("No need to perform search based on the rephrased query.")
         return "No search needed for the provided input."
 
-        # Step 2: Extract entity domain
-        entity_domain = extract_entity_domain(rephrased_query)
-        logger.info(f"Extracted entity domain: {entity_domain}")
-
-        # Step 3: Perform search
-        # Search query parameters
+        # [Search parameters and request handling remain the same...]
         params = {
             'q': rephrased_query,
             'format': 'json',
@@ -801,13 +941,11 @@ def search_and_scrape(
 
         # Remove empty parameters
         params = {k: v for k, v in params.items() if v != ""}
 
-        # If no engines are specified, set default engines
         if 'engines' not in params:
             params['engines'] = 'google'
             logger.info("No engines specified. Defaulting to 'google'.")
 
-        # Headers for SearXNG request
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Accept': 'application/json, text/javascript, */*; q=0.01',
@@ -823,18 +961,16 @@ def search_and_scrape(
 
         scraped_content = []
         page = 1
+
+        # Content scraping loop remains mostly the same, but add quality assessment
         while len(scraped_content) < num_results:
-            # Update params with current page
             params['pageno'] = page
 
-            # Send request to SearXNG
-            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
-            session = requests_retry_session()
-
             try:
+                session = requests_retry_session()
                 if method.upper() == "GET":
                     response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
                 else:
                     response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
 
                 response.raise_for_status()
@@ -843,9 +979,8 @@ def search_and_scrape(
                 return f"An error occurred during the search request: {e}"
 
             search_results = response.json()
-            logger.debug(f"SearXNG Response: {search_results}")
-
             results = search_results.get('results', [])
+
             if not results:
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
@@ -853,33 +988,40 @@ def search_and_scrape(
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
 
                 url = result.get('url', '')
                 title = result.get('title', 'No title')
 
                 if not is_valid_url(url):
                     logger.warning(f"Invalid URL: {url}")
                     continue
 
                 try:
                     logger.info(f"Processing content from: {url}")
                     content = scrape_full_content(url, max_chars, timeout, use_pydf2)
 
                     if content is None:
                         continue
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
                         continue
 
+                    # Add initial quality assessment
+                    doc_quality = document_ranker.assess_document_quality({
+                        "title": title,
+                        "content": content
+                    })
+
                     scraped_content.append({
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper",
+                        "quality_score": doc_quality
                     })
-                    logger.info(f"Successfully scraped content from {url}.")
+                    logger.info(f"Successfully scraped content from {url}. Quality score: {doc_quality}")
 
                 except requests.exceptions.RequestException as e:
                     logger.error(f"Error scraping {url}: {e}")
                 except Exception as e:
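To make the quality scale concrete, a rough worked example under stated assumptions (a hypothetical ~500-word article with paragraph breaks, no section headings, and 15-word average sentences):

# Mirrors assess_document_quality's arithmetic for a hypothetical article
length_score = min(500 / 1000, 1.0)               # 0.5
structure_score = True * 0.2 + False * 0.2        # paragraphs yes, headings no -> 0.2
sentence_score = 1.0                              # 10 <= 15 <= 25
print(length_score * 0.4 + structure_score + sentence_score * 0.2)  # -> ~0.6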
@@ -891,51 +1033,108 @@ def search_and_scrape(
             logger.warning("No content scraped from search results.")
             return "No content could be scraped from the search results."
 
-        # Step 4: Assess relevance, summarize, and check for uniqueness
+        # Modified relevance assessment with improved analysis
         relevant_documents = []
-        unique_summaries = []
+        unique_summaries = set()
+
         for doc in scraped_content:
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)
 
             if relevance.strip().lower() == "relevant: yes":
                 summary_text = summary.replace("Summary: ", "").strip()
 
-                if is_content_unique(summary_text, unique_summaries):
+                if is_content_unique(summary_text, unique_summaries, similarity_threshold=0.8):
+                    # Calculate comprehensive relevance score using new method
+                    relevance_score = document_ranker.calculate_relevance_score(
+                        {
+                            "title": doc['title'],
+                            "content": doc['content'],
+                            "summary": summary_text
+                        },
+                        rephrased_query,
+                        similarity_model
+                    )
+
                     relevant_documents.append({
                         "title": doc['title'],
                         "url": doc['url'],
+                        "content": doc['content'],
                         "summary": summary_text,
                         "scraper": doc['scraper'],
+                        "relevance_score": relevance_score,
+                        "quality_score": doc['quality_score']
                     })
-                    unique_summaries.append(summary_text)
-                else:
-                    logger.info(f"Skipping similar content: {doc['title']}")
+                    unique_summaries.add(summary_text)
 
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
-            return "No relevant and unique documents found."
+            return "No relevant and unique content found for the given query."
 
-        # Step 5: Rerank documents based on similarity to query and prioritize entity domain
-        reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
+        # Enhanced reranking using improved weights and BM25
+        try:
+            # Get query-adaptive weights
+            bm25_weight, semantic_weight = document_ranker.get_adaptive_weights(rephrased_query)
+            logger.info(f"Using adaptive weights - BM25: {bm25_weight}, Semantic: {semantic_weight}")
+
+            # Prepare documents for BM25
+            doc_texts = [f"{doc['title']} {doc['content']}" for doc in relevant_documents]
+
+            # Initialize and fit BM25
+            bm25 = BM25()
+            bm25.fit(doc_texts)
+
+            # Get BM25 scores
+            bm25_scores = bm25.get_scores(rephrased_query)
+
+            # Calculate semantic scores using title and content
+            query_embedding = similarity_model.encode(rephrased_query, convert_to_tensor=True)
+            doc_embeddings = similarity_model.encode(
+                [f"{doc['title']} {doc['summary']}" for doc in relevant_documents],
+                convert_to_tensor=True
+            )
+            semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+
+            # Get quality scores
+            quality_scores = np.array([doc['quality_score'] for doc in relevant_documents])
+
+            # Normalize all scores
+            bm25_scores_norm = normalize_scores(bm25_scores)
+            semantic_scores_norm = normalize_scores(semantic_scores.numpy())
+            quality_scores_norm = normalize_scores(quality_scores)
+            relevance_scores = normalize_scores(
+                np.array([doc['relevance_score'] for doc in relevant_documents])
+            )
+
+            # Combine scores with weights
+            final_scores = (
+                bm25_weight * bm25_scores_norm +
+                semantic_weight * semantic_scores_norm +
+                0.15 * quality_scores_norm +  # Add quality score weight
+                0.15 * relevance_scores  # Reduced from 0.2 to accommodate quality
+            )
+
+            # Create scored documents
+            scored_documents = list(zip(relevant_documents, final_scores))
+            scored_documents.sort(key=lambda x: x[1], reverse=True)
+
+            # Take top results
+            reranked_docs = [doc for doc, _ in scored_documents[:num_results]]
+
+        except Exception as e:
+            logger.error(f"Error during document reranking: {e}")
+            # Fallback to basic sorting by relevance and quality
+            reranked_docs = sorted(
+                relevant_documents,
+                key=lambda x: (x['relevance_score'] + x['quality_score']) / 2,
+                reverse=True
+            )[:num_results]
 
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
-            return "No relevant documents remained after reranking."
-
-        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, related documents.")
+            return "No relevant content found after filtering and ranking."
 
-        # Scrape full content for top results
-        for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], max_chars)
-            doc['full_content'] = full_content
-
-        # Prepare JSON for LLM
+        # Prepare final documents for LLM
         llm_input = {
             "query": query,
             "documents": [
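One observation on the combination above: bm25_weight and semantic_weight are normalized to sum to 1.0, so with the two fixed 0.15 terms the effective weights sum to about 1.3. That is harmless for rank ordering, but final_scores is then not on a [0, 1] scale. If a calibrated scale mattered, one option (not in this commit) would be to rescale all four weights:

import numpy as np

# Hypothetical weights as produced by get_adaptive_weights, plus the two fixed terms
bm25_weight, semantic_weight = 0.4, 0.6
weights = np.array([bm25_weight, semantic_weight, 0.15, 0.15])
weights = weights / weights.sum()  # same ranking, combined score stays in [0, 1]
print(weights.round(4))  # [0.3077 0.4615 0.1154 0.1154]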
@@ -943,12 +1142,13 @@ def search_and_scrape(
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
-                    "full_content": doc['full_content']
-                } for doc in reranked_docs
+                    "content": doc['content'],
+                    "quality_score": doc['quality_score']  # Include quality score
+                } for doc in reranked_docs
             ]
         }
 
+        # LLM Summarization
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
@@ -957,6 +1157,12 @@ def search_and_scrape(
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
+def normalize_scores(scores: np.ndarray) -> np.ndarray:
+    """Normalize scores to range [0, 1]"""
+    if np.all(scores == scores[0]):
+        return np.ones_like(scores)
+    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
+
 # Helper function to get the appropriate client for each model
 def get_client_for_model(model: str) -> Any:
     if model == "huggingface":
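The constant-input guard in normalize_scores matters: without it, np.min == np.max would zero the denominator. A quick check of both paths using the function defined above:

import numpy as np

print(normalize_scores(np.array([2.0, 4.0, 6.0])))   # -> [0.  0.5 1. ]
print(normalize_scores(np.array([3.0, 3.0, 3.0])))   # all equal -> [1. 1. 1.]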
@@ -970,6 +1176,7 @@ def get_client_for_model(model: str) -> Any:
     else:
         raise ValueError(f"Unsupported model: {model}")
 
+
 def chat_function(message: str, history: List[Tuple[str, str]], only_web_search: bool, num_results: int, max_chars: int, time_range: str, language: str, category: str, engines: List[str], safesearch: int, method: str, llm_temperature: float, model: str, use_pydf2: bool):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
@@ -1005,7 +1212,6 @@ def chat_function(message: str, history: List[Tuple[str, str]], only_web_search:
 
     yield response
 
-
 iface = gr.ChatInterface(
     chat_function,
     title="Web Scraper for News with Sentinel AI",
@@ -1044,30 +1250,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
-    iface.launch(
-        [launch keyword arguments truncated in the rendered diff]
-    )
+    iface.launch(share=False)