Shreyas094 committed
Commit 7646c7a · verified · 1 Parent(s): 8962e02

Update app.py

Files changed (1):
  app.py +331 -151
app.py CHANGED
@@ -39,6 +39,8 @@ from typing import List, Dict, Tuple
 import datetime
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any
+import spacy
+from textblob import TextBlob
 
 # Automatically get the current year
 CURRENT_YEAR = datetime.datetime.now().year
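The two new imports add runtime dependencies beyond pip packages: spaCy needs a language model on disk. A minimal guarded load, assuming the en_core_web_sm model may be absent from the environment (the guard itself is not part of the commit):

import spacy

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # Model not present; fetch it once, then retry the load.
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

TextBlob similarly relies on NLTK corpora; running `python -m textblob.download_corpora` once installs them.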
@@ -108,8 +110,6 @@ mistral_client = Mistral(api_key=MISTRAL_API_KEY)
 # Initialize the similarity model
 similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-
-
 # Step 1: Create a base class for AI models
 class AIModel(ABC):
     @abstractmethod
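For reference, a minimal sketch of how this model pairs with util.cos_sim, the same call the scoring code below uses (illustrative strings only):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
query_emb = model.encode("central bank raises rates", convert_to_tensor=True)
doc_emb = model.encode("Fed hikes interest rates by 25 bps", convert_to_tensor=True)
score = util.cos_sim(query_emb, doc_emb).item()  # cosine similarity, roughly in [-1, 1]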
@@ -447,12 +447,6 @@ Rephrased query:"""
         logger.error(f"Error rephrasing query with LLM: {e}")
         return query  # Fallback to original query if rephrasing fails
 
-def extract_entity_domain(query):
-    # Use a simple regex pattern to extract domain names from the query
-    domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'
-    matches = re.findall(domain_pattern, query)
-    return matches[0] if matches else None
-
 class BM25:
     def __init__(self, k1: float = 1.5, b: float = 0.75):
         self.k1 = k1  # term frequency saturation parameter
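Only the first lines of BM25 are visible in this hunk; the full class lives in the unchanged part of app.py. A rough sketch of the fit/get_scores contract the rerankers below rely on, using the standard Okapi BM25 formula with the k1 and b parameters above (an illustration, not the commit's actual implementation):

import math
from collections import Counter
from typing import List

class BM25Sketch:
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        self.k1, self.b = k1, b

    def fit(self, docs: List[str]):
        self.docs = [d.lower().split() for d in docs]
        self.N = len(self.docs)
        self.avgdl = sum(len(d) for d in self.docs) / self.N
        self.df = Counter(t for d in self.docs for t in set(d))

    def get_scores(self, query: str) -> List[float]:
        scores = []
        for d in self.docs:
            tf = Counter(d)
            score = 0.0
            for term in query.lower().split():
                if term not in tf:
                    continue
                idf = math.log(1 + (self.N - self.df[term] + 0.5) / (self.df[term] + 0.5))
                norm = self.k1 * (1 - self.b + self.b * len(d) / self.avgdl)
                score += idf * tf[term] * (self.k1 + 1) / (tf[term] + norm)
            scores.append(score)
        return scores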
@@ -542,75 +536,212 @@ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
         doc_texts.append(doc_text)
     return doc_texts, documents
 
+
+class ImprovedRanking:
+    def __init__(self):
+        # Load spacy for text analysis
+        self.nlp = spacy.load('en_core_web_sm')
+
+    def analyze_query(self, query: str) -> Dict:
+        """
+        Analyze query to determine appropriate weights
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Dictionary with query analysis results
+        """
+        doc = self.nlp(query)
+
+        analysis = {
+            'word_count': len(query.split()),
+            'has_entities': bool(doc.ents),
+            'is_question': any(token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
+            'sentiment': TextBlob(query).sentiment.polarity
+        }
+
+        return analysis
+
+    def get_adaptive_weights(self, query: str) -> Tuple[float, float]:
+        """
+        Calculate adaptive weights based on query characteristics
+
+        Args:
+            query: Search query string
+
+        Returns:
+            Tuple of (bm25_weight, semantic_weight)
+        """
+        analysis = self.analyze_query(query)
+
+        # Base weights
+        bm25_weight = 0.4
+        semantic_weight = 0.6
+
+        # Adjust weights based on query characteristics
+        if analysis['word_count'] <= 2:
+            # Short queries: favor keyword matching
+            bm25_weight = 0.6
+            semantic_weight = 0.4
+        elif analysis['word_count'] >= 6:
+            # Long queries: favor semantic understanding
+            bm25_weight = 0.3
+            semantic_weight = 0.7
+
+        if analysis['has_entities']:
+            # Queries with named entities: increase keyword importance
+            bm25_weight += 0.1
+            semantic_weight -= 0.1
+
+        if analysis['is_question']:
+            # Questions: favor semantic understanding
+            bm25_weight -= 0.1
+            semantic_weight += 0.1
+
+        # Normalize weights to ensure they sum to 1
+        total = bm25_weight + semantic_weight
+        return bm25_weight/total, semantic_weight/total
+
+    def calculate_relevance_score(self, doc: Dict, query: str, similarity_model) -> float:
+        """
+        Calculate comprehensive relevance score for a document
+
+        Args:
+            doc: Document dictionary with title and content
+            query: Search query string
+            similarity_model: Model for computing semantic similarity
+
+        Returns:
+            Float representing document relevance score
+        """
+        # 1. Title relevance (30%)
+        title_embedding = similarity_model.encode(doc['title'], convert_to_tensor=True)
+        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+        title_similarity = torch.cosine_similarity(title_embedding, query_embedding, dim=0).item()
+
+        # 2. Content relevance (40%)
+        # Use first 512 tokens of content to avoid memory issues
+        content_preview = ' '.join(doc['content'].split()[:512])
+        content_embedding = similarity_model.encode(content_preview, convert_to_tensor=True)
+        content_similarity = torch.cosine_similarity(content_embedding, query_embedding, dim=0).item()
+
+        # 3. Query term presence (20%)
+        query_terms = set(query.lower().split())
+        title_terms = set(doc['title'].lower().split())
+        content_terms = set(content_preview.lower().split())
+
+        title_term_overlap = len(query_terms & title_terms) / len(query_terms)
+        content_term_overlap = len(query_terms & content_terms) / len(query_terms)
+
+        # 4. Document quality indicators (10%)
+        quality_score = self.assess_document_quality(doc)
+
+        # Combine scores with weights
+        final_score = (
+            title_similarity * 0.3 +
+            content_similarity * 0.4 +
+            ((title_term_overlap + content_term_overlap) / 2) * 0.2 +
+            quality_score * 0.1
+        )
+
+        return final_score
+
+    def assess_document_quality(self, doc: Dict) -> float:
+        """
+        Assess document quality based on various metrics
+
+        Args:
+            doc: Document dictionary
+
+        Returns:
+            Float representing document quality score
+        """
+        score = 0.0
+
+        # 1. Length score (longer documents often have more information)
+        content_length = len(doc['content'].split())
+        length_score = min(content_length / 1000, 1.0)  # Cap at 1000 words
+
+        # 2. Text structure score
+        has_paragraphs = doc['content'].count('\n\n') > 0
+        has_sections = bool(re.findall(r'\n[A-Z][^.!?]*[:]\n', doc['content']))
+
+        # 3. Writing quality score (using basic metrics)
+        blob = TextBlob(doc['content'])
+        sentences = blob.sentences
+        avg_sentence_length = sum(len(str(s).split()) for s in sentences) / len(sentences) if sentences else 0
+        sentence_score = 1.0 if 10 <= avg_sentence_length <= 25 else 0.5
+
+        # Combine quality metrics
+        score = (
+            length_score * 0.4 +
+            (has_paragraphs * 0.2 + has_sections * 0.2) +
+            sentence_score * 0.2
+        )
+
+        return score
+
 # Now modify the rerank_documents_with_priority function to include BM25 ranking
-def rerank_documents_with_priority(query: str, documents: List[Dict], entity_domain: str,
-                                   similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
+def rerank_documents_improved(query: str, documents: List[Dict],
+                              similarity_model, max_results: int = 5) -> List[Dict]:
+    """
+    Rerank documents using improved scoring system
+
+    Args:
+        query: Search query string
+        documents: List of document dictionaries
+        similarity_model: Model for computing semantic similarity
+        max_results: Maximum number of results to return
+
+    Returns:
+        List of reranked documents
+    """
+    ranker = ImprovedRanking()
+
     try:
         if not documents:
-            logger.warning("No documents to rerank.")
             return documents
 
-        # Step 1: Prepare documents for BM25
+        # Get adaptive weights based on query
+        bm25_weight, semantic_weight = ranker.get_adaptive_weights(query)
+
+        # Prepare documents for BM25
         doc_texts, original_docs = prepare_documents_for_bm25(documents)
 
-        # Step 2: Initialize and fit BM25
+        # Initialize and fit BM25
         bm25 = BM25()
         bm25.fit(doc_texts)
 
-        # Step 3: Get BM25 scores
+        # Get BM25 scores
         bm25_scores = bm25.get_scores(query)
 
-        # Step 4: Get semantic similarity scores
-        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
-        doc_summaries = [doc['summary'] for doc in documents]
-        doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
-        semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+        # Calculate comprehensive relevance scores
+        relevance_scores = [
+            ranker.calculate_relevance_score(doc, query, similarity_model)
+            for doc in documents
+        ]
 
-        # Step 5: Combine scores (normalize first)
+        # Normalize scores
         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
-        semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
+        relevance_scores_norm = (np.array(relevance_scores) - np.min(relevance_scores)) / (np.max(relevance_scores) - np.min(relevance_scores))
 
-        # Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
-        combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
+        # Combine scores using adaptive weights
+        final_scores = (bm25_weight * bm25_scores_norm +
+                        semantic_weight * relevance_scores_norm)
 
-        # Create scored documents with combined scores
-        scored_documents = list(zip(documents, combined_scores))
+        # Create scored documents
+        scored_documents = list(zip(documents, final_scores))
 
-        # Sort by domain priority and combined score
-        scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
+        # Sort by final score
+        scored_documents.sort(key=lambda x: x[1], reverse=True)
 
-        # Filter similar documents
-        filtered_docs = []
-        added_contents = []
-
-        for doc, score in scored_documents:
-            if score < 0.3:  # Minimum relevance threshold
-                continue
-
-            # Check similarity with already selected documents
-            doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
-            is_similar = False
-
-            for content in added_contents:
-                content_embedding = similarity_model.encode(content, convert_to_tensor=True)
-                similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
-                if similarity > similarity_threshold:
-                    is_similar = True
-                    break
-
-            if not is_similar:
-                filtered_docs.append(doc)
-                added_contents.append(doc['summary'])
-
-            if len(filtered_docs) >= max_results:
-                break
-
-        logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
-        return filtered_docs
+        # Return top results
+        return [doc for doc, score in scored_documents[:max_results]]
 
     except Exception as e:
-        logger.error(f"Error during reranking documents: {e}")
-        return documents[:max_results]  # Fallback to first max_results documents if reranking fails
+        logger.error(f"Error during improved reranking: {e}")
+        return documents[:max_results]
 
 def compute_similarity(text1, text2):
     # Encode the texts
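To see the adaptive weighting in action, a short usage sketch (the queries are invented; exact results depend on the spaCy tagger and entity recognizer):

ranker = ImprovedRanking()

# Two-word query: base weights shift to 0.6/0.4, and a detected entity
# would push them further toward BM25 before normalization.
print(ranker.get_adaptive_weights("Tesla earnings"))

# Long question: starts at 0.3/0.7, and the WP/WRB question check
# moves it to roughly 0.2/0.8.
print(ranker.get_adaptive_weights("what did the central bank announce about interest rates"))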
@@ -630,7 +761,7 @@ def is_content_unique(new_content, existing_contents, similarity_threshold=0.8):
     return True
 
 def assess_relevance_and_summarize(llm_client, query, document, temperature=0.2):
-    system_prompt = """You are a world-class AI assistant specializing in news analysis. Your task is to assess the relevance of a given document to a user's query and provide a detailed summary if it's relevant."""
+    system_prompt = """You are a world-class AI assistant specializing in news analysis and document summarization. Your task is to provide a comprehensive and detailed summary of the given document that captures its key points and relevance to the user's query."""
 
     user_prompt = f"""
 Query: {query}
@@ -640,20 +771,24 @@ Document Content:
 {document['content'][:1000]}  # Limit to first 1000 characters for efficiency
 
 Instructions:
-1. Assess if the document is relevant to the QUERY made by the user.
-2. If relevant, provide a detailed summary that captures the unique aspects of this particular news item. Include:
+1. Provide a detailed summary that captures the unique aspects of this document. Include:
    - Key facts and figures
    - Dates of events or announcements
    - Names of important entities mentioned
    - Any metrics or changes reported
-   - The potential impact or significance of the news
-3. If not relevant, simply state "Not relevant".
+   - The potential impact or significance of the content
+2. Focus on aspects that are most relevant to the user's query
+3. Ensure the summary is distinctive and highlights what makes this particular document unique
+4. Include any specific context that helps understand the document's significance
 
 Your response should be in the following format:
-Relevant: [Yes/No]
-Summary: [Your detailed summary if relevant, or "Not relevant" if not]
+Summary: [Your detailed summary]
 
-Remember to focus on key aspects and implications in your assessment and summary. Aim to make the summary distinctive, highlighting what makes this particular news item unique compared to similar news.
+Remember to:
+- Highlight the most important information first
+- Include specific numbers, dates, and facts when available
+- Connect the information to the user's query where relevant
+- Focus on what makes this document unique or noteworthy
 """
 
 messages = [
@@ -664,15 +799,22 @@ Remember to focus on key aspects and implications in your assessment and summary
     try:
         response = llm_client.chat_completion(
             messages=messages,
-            max_tokens=300,  # Increased to allow for more detailed summaries
+            max_tokens=300,
             temperature=temperature,
             top_p=0.9,
             frequency_penalty=1.4
         )
-        return response.choices[0].message.content.strip()
+        summary = response.choices[0].message.content.strip()
+
+        # If the summary starts with "Summary: ", remove it
+        if summary.startswith("Summary: "):
+            summary = summary[9:].strip()
+
+        # Always return format as if document was relevant
+        return f"Relevant: Yes\nSummary: {summary}"
     except Exception as e:
-        logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
-        return "Error: Unable to assess relevance and summarize"
+        logger.error(f"Error summarizing with LLM: {e}")
+        return f"Relevant: Yes\nSummary: Error occurred while summarizing the document: {str(e)}"
 
 def scrape_full_content(url, max_chars=3000, timeout=5, use_pydf2=True):
     try:
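The contract changes here: every call now returns a string of the form "Relevant: Yes\nSummary: ...", so the caller's two-line parse keeps working even though the model is no longer asked to judge relevance. A minimal sketch of that parse, mirroring the code in search_and_scrape:

assessment = "Relevant: Yes\nSummary: Fed raises rates by 25 bps; markets rally."
relevance, summary = assessment.split('\n', 1)
assert relevance.strip().lower() == "relevant: yes"  # now always true
summary_text = summary.replace("Summary: ", "").strip()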
@@ -775,6 +917,9 @@ def search_and_scrape(
     use_pydf2: bool = True
 ):
     try:
+        # Initialize ImprovedRanking instead of DocumentRanker
+        document_ranker = ImprovedRanking()
+
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
         logger.info(f"Rephrased Query: {rephrased_query}")
@@ -783,12 +928,7 @@ def search_and_scrape(
             logger.info("No need to perform search based on the rephrased query.")
             return "No search needed for the provided input."
 
-        # Step 2: Extract entity domain
-        entity_domain = extract_entity_domain(rephrased_query)
-        logger.info(f"Extracted entity domain: {entity_domain}")
-
-        # Step 3: Perform search
-        # Search query parameters
+        # [Search parameters and request handling remain the same...]
         params = {
             'q': rephrased_query,
             'format': 'json',
@@ -801,13 +941,11 @@ def search_and_scrape(
 
         # Remove empty parameters
         params = {k: v for k, v in params.items() if v != ""}
-
-        # If no engines are specified, set default engines
+
         if 'engines' not in params:
-            params['engines'] = 'google'  # Default to 'google' or any preferred engine
+            params['engines'] = 'google'
             logger.info("No engines specified. Defaulting to 'google'.")
 
-        # Headers for SearXNG request
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Accept': 'application/json, text/javascript, */*; q=0.01',
@@ -823,18 +961,16 @@ def search_and_scrape(
 
         scraped_content = []
         page = 1
+
+        # Content scraping loop remains mostly the same, but add quality assessment
         while len(scraped_content) < num_results:
-            # Update params with current page
             params['pageno'] = page
-
-            # Send request to SearXNG
-            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
-            session = requests_retry_session()
-
+
             try:
+                session = requests_retry_session()
                 if method.upper() == "GET":
                     response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-                else:  # POST
+                else:
                     response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
 
                 response.raise_for_status()
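requests_retry_session() is defined earlier in app.py and does not appear in this diff. It likely resembles the common urllib3 Retry recipe sketched below (an assumption, not the commit's code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504)):
    session = requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session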
@@ -843,9 +979,8 @@ def search_and_scrape(
                 return f"An error occurred during the search request: {e}"
 
             search_results = response.json()
-            logger.debug(f"SearXNG Response: {search_results}")
-
             results = search_results.get('results', [])
+
             if not results:
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
@@ -853,33 +988,40 @@ def search_and_scrape(
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
-
+
                 url = result.get('url', '')
                 title = result.get('title', 'No title')
-
+
                 if not is_valid_url(url):
                     logger.warning(f"Invalid URL: {url}")
                     continue
-
+
                 try:
                     logger.info(f"Processing content from: {url}")
-
                     content = scrape_full_content(url, max_chars, timeout, use_pydf2)
 
-                    if content is None:  # This means it's a PDF and use_pydf2 is False
+                    if content is None:
                         continue
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
                         continue
 
+                    # Add initial quality assessment
+                    doc_quality = document_ranker.assess_document_quality({
+                        "title": title,
+                        "content": content
+                    })
+
                     scraped_content.append({
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper",
+                        "quality_score": doc_quality
                     })
-                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+                    logger.info(f"Successfully scraped content from {url}. Quality score: {doc_quality}")
+
                 except requests.exceptions.RequestException as e:
                     logger.error(f"Error scraping {url}: {e}")
                 except Exception as e:
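Each scraped page now carries a quality_score from assess_document_quality. A toy example of the kind of value it produces, assuming the spaCy model and TextBlob corpora are installed:

ranker = ImprovedRanking()
doc = {
    "title": "Example article",
    "content": ("First paragraph with several sentences of news.\n\n"
                "Second paragraph adds figures and dates.")
}
print(ranker.assess_document_quality(doc))  # a float capped at 1.0; this short
                                            # two-paragraph text lands near 0.3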
@@ -891,51 +1033,108 @@ def search_and_scrape(
             logger.warning("No content scraped from search results.")
             return "No content could be scraped from the search results."
 
-        logger.info(f"Successfully scraped {len(scraped_content)} documents.")
-
-        # Step 4: Assess relevance, summarize, and check for uniqueness
+        # Modified relevance assessment with improved analysis
         relevant_documents = []
-        unique_summaries = []
+        unique_summaries = set()
+
         for doc in scraped_content:
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)
-
+
             if relevance.strip().lower() == "relevant: yes":
                 summary_text = summary.replace("Summary: ", "").strip()
 
-                if is_content_unique(summary_text, unique_summaries):
-                    doc_domain = urlparse(doc['url']).netloc
-                    is_entity_domain = doc_domain == entity_domain
+                if is_content_unique(summary_text, unique_summaries, similarity_threshold=0.8):
+                    # Calculate comprehensive relevance score using new method
+                    relevance_score = document_ranker.calculate_relevance_score(
+                        {
+                            "title": doc['title'],
+                            "content": doc['content'],
+                            "summary": summary_text
+                        },
+                        rephrased_query,
+                        similarity_model
+                    )
+
                     relevant_documents.append({
                         "title": doc['title'],
                         "url": doc['url'],
+                        "content": doc['content'],
                         "summary": summary_text,
                         "scraper": doc['scraper'],
-                        "is_entity_domain": is_entity_domain
+                        "relevance_score": relevance_score,
+                        "quality_score": doc['quality_score']
                     })
-                    unique_summaries.append(summary_text)
-                else:
-                    logger.info(f"Skipping similar content: {doc['title']}")
+                    unique_summaries.add(summary_text)
 
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
-            return "No relevant and unique news found for the given query."
+            return "No relevant and unique content found for the given query."
+
+        # Enhanced reranking using improved weights and BM25
+        try:
+            # Get query-adaptive weights
+            bm25_weight, semantic_weight = document_ranker.get_adaptive_weights(rephrased_query)
+            logger.info(f"Using adaptive weights - BM25: {bm25_weight}, Semantic: {semantic_weight}")
+
+            # Prepare documents for BM25
+            doc_texts = [f"{doc['title']} {doc['content']}" for doc in relevant_documents]
+
+            # Initialize and fit BM25
+            bm25 = BM25()
+            bm25.fit(doc_texts)
+
+            # Get BM25 scores
+            bm25_scores = bm25.get_scores(rephrased_query)
+
+            # Calculate semantic scores using title and content
+            query_embedding = similarity_model.encode(rephrased_query, convert_to_tensor=True)
+            doc_embeddings = similarity_model.encode(
+                [f"{doc['title']} {doc['summary']}" for doc in relevant_documents],
+                convert_to_tensor=True
+            )
+            semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+
+            # Get quality scores
+            quality_scores = np.array([doc['quality_score'] for doc in relevant_documents])
+
+            # Normalize all scores
+            bm25_scores_norm = normalize_scores(bm25_scores)
+            semantic_scores_norm = normalize_scores(semantic_scores.numpy())
+            quality_scores_norm = normalize_scores(quality_scores)
+            relevance_scores = normalize_scores(
+                np.array([doc['relevance_score'] for doc in relevant_documents])
+            )
+
+            # Combine scores with weights
+            final_scores = (
+                bm25_weight * bm25_scores_norm +
+                semantic_weight * semantic_scores_norm +
+                0.15 * quality_scores_norm +  # Add quality score weight
+                0.15 * relevance_scores  # Reduced from 0.2 to accommodate quality
+            )
+
+            # Create scored documents
+            scored_documents = list(zip(relevant_documents, final_scores))
+            scored_documents.sort(key=lambda x: x[1], reverse=True)
+
+            # Take top results
+            reranked_docs = [doc for doc, _ in scored_documents[:num_results]]
+
+        except Exception as e:
+            logger.error(f"Error during document reranking: {e}")
+            # Fallback to basic sorting by relevance and quality
+            reranked_docs = sorted(
+                relevant_documents,
+                key=lambda x: (x['relevance_score'] + x['quality_score']) / 2,
+                reverse=True
+            )[:num_results]
 
-        # Step 5: Rerank documents based on similarity to query and prioritize entity domain
-        reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
-
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
-            return "No relevant news found after filtering and ranking."
-
-        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, related documents.")
+            return "No relevant content found after filtering and ranking."
 
-        # Step 5: Scrape full content for top documents (up to num_results)
-        for doc in reranked_docs[:num_results]:
-            full_content = scrape_full_content(doc['url'], max_chars)
-            doc['full_content'] = full_content
-
-        # Prepare JSON for LLM
+        # Prepare final documents for LLM
         llm_input = {
             "query": query,
            "documents": [
@@ -943,12 +1142,13 @@ def search_and_scrape(
                 "title": doc['title'],
                 "url": doc['url'],
                 "summary": doc['summary'],
-                "full_content": doc['full_content']
-            } for doc in reranked_docs[:num_results]
+                "content": doc['content'],
+                "quality_score": doc['quality_score']  # Include quality score
+            } for doc in reranked_docs
             ]
         }
 
-        # Step 6: LLM Summarization
+        # LLM Summarization
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
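One detail of the combined score above: bm25_weight and semantic_weight already sum to 1, so the two extra 0.15 terms bring the total weight to 1.3. The ranking is unaffected because the same weights apply to every document, but the final scores are no longer on a [0, 1] scale. A toy sketch of the composition:

import numpy as np

bm25_weight, semantic_weight = 0.4, 0.6   # example adaptive weights
bm25 = np.array([1.0, 0.2, 0.0])          # all inputs already normalized
semantic = np.array([0.8, 1.0, 0.1])
quality = np.array([0.5, 0.9, 0.3])
relevance = np.array([0.7, 0.6, 0.2])

final = (bm25_weight * bm25 + semantic_weight * semantic
         + 0.15 * quality + 0.15 * relevance)
print(np.argsort(final)[::-1])  # document indices, best first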
@@ -957,6 +1157,12 @@ def search_and_scrape(
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
+def normalize_scores(scores: np.ndarray) -> np.ndarray:
+    """Normalize scores to range [0, 1]"""
+    if np.all(scores == scores[0]):
+        return np.ones_like(scores)
+    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
+
 # Helper function to get the appropriate client for each model
 def get_client_for_model(model: str) -> Any:
     if model == "huggingface":
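The constant-input guard in normalize_scores matters: without it, a batch of documents with identical scores would trigger a divide-by-zero. A quick check against the definition above:

import numpy as np

print(normalize_scores(np.array([3.0, 1.0, 2.0])))  # [1.  0.  0.5]
print(normalize_scores(np.array([2.0, 2.0, 2.0])))  # [1. 1. 1.], no 0/0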
@@ -970,6 +1176,7 @@ def get_client_for_model(model: str) -> Any:
     else:
         raise ValueError(f"Unsupported model: {model}")
 
+
 def chat_function(message: str, history: List[Tuple[str, str]], only_web_search: bool, num_results: int, max_chars: int, time_range: str, language: str, category: str, engines: List[str], safesearch: int, method: str, llm_temperature: float, model: str, use_pydf2: bool):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
@@ -1005,7 +1212,6 @@ def chat_function(message: str, history: List[Tuple[str, str]], only_web_search:
 
     yield response
 
-
 iface = gr.ChatInterface(
     chat_function,
     title="Web Scraper for News with Sentinel AI",
@@ -1044,30 +1250,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
-    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    iface.launch(share=False)
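Dropping the explicit host and port defers to Gradio's defaults (127.0.0.1:7860 when run locally); on Hugging Face Spaces the platform supplies its own server settings, so the hardcoded values were redundant there. If the old binding is ever needed again, the removed call can be restored as-is:

iface.launch(server_name="0.0.0.0", server_port=7860, share=False)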